27 #ifndef KALDI_CUDAMATRIX_CU_KERNELS_H_    28 #define KALDI_CUDAMATRIX_CU_KERNELS_H_    42 inline void cuda_add_row_sum_mat(
int Gr, 
int Bl, 
double* result,
    44                                  const double alpha, 
const double beta) {
    45   cudaD_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
    47 inline void cuda_add_row_sum_mat(
int Gr, 
int Bl, 
float* result,
    49                                  const float alpha, 
const float beta) {
    50   cudaF_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
    52 inline void cuda_add_col_sum_mat(
int Gr, 
int Bl, 
double* result,
    54                                  const double alpha, 
const double beta) {
    55   cudaD_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
    57 inline void cuda_add_col_sum_mat(
int Gr, 
int Bl, 
float* result,
    59                                  const float alpha, 
const float beta) {
    60   cudaF_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
    62 inline void cuda_add_cols(dim3 Gr, dim3 Bl, 
double* dst, 
const double* src,
    65   cudaD_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
    67 inline void cuda_add_cols(dim3 Gr, dim3 Bl, 
float* dst, 
const float* src,
    70   cudaF_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
    72 inline void cuda_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, 
const double alpha,
    73                                      const double* M, 
const int stride_M,
    75                                      const double beta, 
double* v) {
    76   cudaD_add_diag_mat_mat_MN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v);
    78 inline void cuda_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, 
const float alpha,
    79                                      const float* M, 
const int stride_M,
    81                                      const float beta, 
float* v) {
    82   cudaF_add_diag_mat_mat_MN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v);
    84 inline void cuda_add_diag_mat_mat_MNT(
int Gr, 
int Bl, 
const double alpha,
    86                                       const double* N, 
const int stride_N,
    87                                       const double beta, 
double* v) {
    88   cudaD_add_diag_mat_mat_MNT(Gr, Bl, alpha, M, dim_M, N, stride_N, beta, v);
    90 inline void cuda_add_diag_mat_mat_MNT(
int Gr, 
int Bl, 
const float alpha,
    92                                       const float* N, 
const int stride_N,
    93                                       const float beta, 
float* v) {
    94   cudaF_add_diag_mat_mat_MNT(Gr, Bl, alpha, M, dim_M, N, stride_N, beta, v);
    96 inline void cuda_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, 
const double alpha,
    97                                       const double* M, 
const int stride_M,
    99                                       const double beta, 
double* v,
   100                                       const int stride_v) {
   101   cudaD_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v,
   104 inline void cuda_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, 
const float alpha,
   105                                       const float* M, 
const int stride_M,
   107                                       const float beta, 
float* v,
   108                                       const int stride_v) {
   109   cudaF_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v,
   112 inline void cuda_add_diag_packed(
int Gr, 
int Bl, 
double* mat, 
double value,
   114   cudaD_add_diag_packed(Gr, Bl, mat, value, dim);
   116 inline void cuda_add_diag_packed(
int Gr, 
int Bl, 
float* mat, 
float value,
   118   cudaF_add_diag_packed(Gr, Bl, mat, value, dim);
   120 inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl, 
double alpha, 
double *mat,
   122                                   const double *mat2, 
int mat2_row_stride,
   123                                   int mat2_col_stride, 
double beta) {
   124   cudaD_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
   125                          mat2_row_stride, mat2_col_stride, beta);
   127 inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl, 
float alpha, 
float *mat,
   129                                   const float *mat2, 
int mat2_row_stride,
   130                                   int mat2_col_stride, 
float beta) {
   131   cudaF_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
   132                          mat2_row_stride, mat2_col_stride, beta);
   134 inline void cuda_add(dim3 Gr, dim3 Bl, 
double *mat, 
double value, 
MatrixDim d) {
   135   cudaD_add(Gr, Bl, mat, value, d);
   137 inline void cuda_add(dim3 Gr, dim3 Bl, 
float *mat, 
float value, 
MatrixDim d) {
   138   cudaF_add(Gr, Bl, mat, value, d);
   140 inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl, 
double *data, 
MatrixDim d,
   141                                   const double *Adata, 
int A_num_rows,
   142                                   int A_num_cols, 
int A_row_stride,
   145                                   int B_num_blocks, 
double alpha, 
double beta,
   147   cudaD_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols,
   148                          A_row_stride, A_col_stride, B_cu_data, B_num_blocks,
   149                          alpha, beta, B_trans);
   151 inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl, 
float *data, 
MatrixDim d,
   152                                   const float *Adata, 
int A_num_rows,
   153                                   int A_num_cols, 
int A_row_stride,
   156                                   int B_num_blocks, 
float alpha, 
float beta,
   158   cudaF_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols,
   159                          A_row_stride, A_col_stride, B_cu_data, B_num_blocks,
   160                          alpha, beta, B_trans);
   162 inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, 
double alpha,
   166   cudaD_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
   167                        d, src_stride, A_trans);
   169 inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, 
float alpha, 
const float *src,
   173   cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
   174                        d, src_stride, A_trans);
   176 inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, 
double alpha,
   179   cudaD_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
   181 inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, 
float alpha,
   184   cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
   186 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, 
double alpha, 
double *mat,
   188                                   int mat2_row_stride, 
int mat2_col_stride,
   189                                   const double *vec, 
double beta) {
   190   cudaD_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride,
   191                          mat2_col_stride, vec, beta);
   193 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, 
float alpha, 
float *mat,
   195                                   int mat2_row_stride, 
int mat2_col_stride,
   196                                   const float *vec, 
float beta) {
   197   cudaF_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride,
   198                          mat2_col_stride, vec, beta);
   200 inline void cuda_add_mat(dim3 Gr, dim3 Bl, 
double alpha, 
const double *src,
   203   cudaD_add_mat(Gr, Bl, alpha, src, dst, d, src_stride, A_trans);
   205 inline void cuda_add_mat(dim3 Gr, dim3 Bl, 
float alpha, 
const float *src,
   206                          float *dst, 
MatrixDim d, 
int src_stride, 
int A_trans) {
   207   cudaF_add_mat(Gr, Bl, alpha, src, dst, d, src_stride, A_trans);
   209 inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, 
double *data,
   210                                       const double *srcA_data,
   212                                       int srcA_stride, 
int srcB_stride,
   213                                       double alpha, 
double beta) {
   214   cudaD_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim,
   215                              srcA_stride, srcB_stride, alpha, beta);
   217 inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, 
float *data,
   218                                       const float *srcA_data,
   220                                       int srcA_stride, 
int srcB_stride,
   221                                       float alpha, 
float beta) {
   222   cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim,
   223                              srcA_stride, srcB_stride, alpha, beta);
   225 inline void cuda_add_row_ranges(dim3 Gr, dim3 Bl, 
double *data, 
MatrixDim dim,
   226                                 const double *src_data, 
MatrixDim src_dim,
   228   cudaD_add_row_ranges(Gr, Bl, data, dim, src_data, src_dim, indexes);
   230 inline void cuda_add_row_ranges(dim3 Gr, dim3 Bl, 
float *data, 
MatrixDim dim,
   231                                 const float *src_data, 
MatrixDim src_dim,
   233   cudaF_add_row_ranges(Gr, Bl, data, dim, src_data, src_dim, indexes);
   235 inline void cuda_add_rows(dim3 Gr, dim3 Bl, 
double alpha, 
double* dst,
   236                           const double* 
const * src, 
MatrixDim dst_dim) {
   237   cudaD_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim);
   239 inline void cuda_add_rows(dim3 Gr, dim3 Bl, 
float alpha, 
float* dst,
   240                           const float* 
const * src, 
MatrixDim dst_dim) {
   241   cudaF_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim);
   243 inline void cuda_add_rows(dim3 Gr, dim3 Bl, 
double alpha, 
double* dst,
   246   cudaD_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride);
   248 inline void cuda_add_rows(dim3 Gr, dim3 Bl, 
float alpha, 
float* dst,
   251   cudaF_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride);
   253 inline void cuda_mul_rows(dim3 Gr, dim3 Bl, 
double* dst,
   256   cudaD_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
   258 inline void cuda_mul_rows(dim3 Gr, dim3 Bl, 
float* dst,
   261   cudaF_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
   263 inline void cuda_add_smat(dim3 Gr, dim3 Bl, 
double* mat, 
MatrixDim mat_dim,
   264                           double alpha, 
const int* smat_row_ptr,
   265                           const int* smat_col_idx, 
const double* smat_val) {
   266   cudaD_add_smat(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
   269 inline void cuda_add_smat(dim3 Gr, dim3 Bl, 
float* mat, 
MatrixDim mat_dim,
   270                           float alpha, 
const int* smat_row_ptr,
   271                           const int* smat_col_idx, 
const float* smat_val) {
   272   cudaF_add_smat(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
   275 inline void cuda_add_smat_trans(dim3 Gr, dim3 Bl, 
double* mat,
   277                                 const int* smat_row_ptr,
   278                                 const int* smat_col_idx,
   279                                 const double* smat_val) {
   280   cudaD_add_smat_trans(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
   283 inline void cuda_add_smat_trans(dim3 Gr, dim3 Bl, 
float* mat, 
MatrixDim mat_dim,
   284                                 float alpha, 
const int* smat_row_ptr,
   285                                 const int* smat_col_idx,
   286                                 const float* smat_val) {
   287   cudaF_add_smat_trans(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
   290 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl, 
double alpha,
   291                              double* 
const * dst, 
const double* src,
   293   cudaD_add_to_rows_direct(Gr, Bl, alpha, dst, src, src_dim);
   295 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl, 
float alpha, 
float* 
const * dst,
   297   cudaF_add_to_rows_direct(Gr, Bl, alpha, dst, src, src_dim);
   299 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl, 
double alpha,
   300                              double* dst, 
const double* src,
   303   cudaD_add_to_rows(Gr, Bl, alpha, dst, src, reorder, src_dim, dst_stride);
   305 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl, 
float alpha,
   306                              float* dst, 
const float* src,
   309   cudaF_add_to_rows(Gr, Bl, alpha, dst, src, reorder, src_dim, dst_stride);
   311 inline void cuda_add_vec2(dim3 Gr, dim3 Bl, 
double *mat, 
const double *vec,
   312                           const double alpha, 
int dim) {
   313   cudaD_add_vec2(Gr, Bl, mat, vec, alpha, dim);
   315 inline void cuda_add_vec2(dim3 Gr, dim3 Bl, 
float *mat, 
const float *vec,
   316                           const float alpha, 
int dim) {
   317   cudaF_add_vec2(Gr, Bl, mat, vec, alpha, dim);
   319 inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, 
double alpha,
   320                                  const double *col, 
double beta, 
double *dst,
   322   cudaD_add_vec_to_cols(Gr, Bl, alpha, col, beta, dst, d);
   324 inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, 
float alpha,
   325                                  const float *col, 
float beta, 
float *dst,
   327   cudaF_add_vec_to_cols(Gr, Bl, alpha, col, beta, dst, d);
   329 inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, 
double alpha,
   330                                  const double *row, 
double beta, 
double *dst,
   332   cudaD_add_vec_to_rows(Gr, Bl, alpha, row, beta, dst, d);
   334 inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, 
float alpha,
   335                                  const float *row, 
float beta, 
float *dst,
   337   cudaF_add_vec_to_rows(Gr, Bl, alpha, row, beta, dst, d);
   339 inline void cuda_add_vec_vec(
int Gr, 
int Bl, 
double alpha, 
double* v,
   340                              const double* x, 
const double* y, 
double beta,
   342   cudaD_add_vec_vec(Gr, Bl, alpha, v, x, y, beta, dim);
   344 inline void cuda_add_vec_vec(
int Gr, 
int Bl, 
float alpha, 
float* v,
   345                              const float* x, 
const float* y, 
float beta,
   347   cudaF_add_vec_vec(Gr, Bl, alpha, v, x, y, beta, dim);
   349 inline cublasStatus_t cuda_axpy(cublasHandle_t handle, 
int n, 
double alpha,
   350                                 const double *x, 
int incx, 
double *y,
   352   return cublasDaxpy_v2(handle, n, &alpha, x, incx, y, incy);
   354 inline cublasStatus_t cuda_axpy(cublasHandle_t handle, 
int n, 
float alpha,
   355                                 const float *x, 
int incx, 
float *y, 
int incy) {
   356   return cublasSaxpy_v2(handle, n, &alpha, x, incx, y, incy);
   358 inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl,
   360                                    const double *C_data, 
int C_num_cols,
   361                                    int C_row_stride, 
int C_col_stride,
   362                                    const double *D_data, 
int D_row_stride,
   363                                    int D_col_stride, 
double alpha,
   365   cudaD_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
   366                           C_row_stride, C_col_stride, D_data, D_row_stride,
   367                           D_col_stride, alpha, beta);
   369 inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl,
   371                                    const float *C_data, 
int C_num_cols,
   372                                    int C_row_stride, 
int C_col_stride,
   373                                    const float *D_data, 
int D_row_stride,
   374                                    int D_col_stride, 
float alpha, 
float beta) {
   375   cudaF_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
   376                           C_row_stride, C_col_stride, D_data, D_row_stride,
   377                           D_col_stride, alpha, beta);
   379 inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, 
double *y,
   380                                       const double *x1, 
const double *x2,
   382                                       int x2_stride, 
int group_size) {
   383   cudaD_calc_group_max_deriv(Gr, Bl, y, x1, x2, y_dim, x1_stride, x2_stride,
   386 inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, 
float *y,
   387                                       const float *x1, 
const float *x2,
   389                                       int x2_stride, 
int group_size) {
   390   cudaF_calc_group_max_deriv(Gr, Bl, y, x1, x2, y_dim, x1_stride, x2_stride,
   396   cudaD_comp_obj_deriv(Gr, Bl, x, size, z, d, z2, d2, t);
   401   cudaF_comp_obj_deriv(Gr, Bl, x, size, z, d, z2, d2, t);
   403 inline void cuda_copy_col_from_mat_df(
int Gr, 
int Bl, 
double* v, 
int col,
   406   cudaD_copy_col_from_mat_df(Gr, Bl, v, col, mat, dmat, dim);
   408 inline void cuda_copy_col_from_mat_df(
int Gr, 
int Bl, 
double* v, 
int col,
   411   cudaF_copy_col_from_mat_df(Gr, Bl, v, col, mat, dmat, dim);
   413 inline void cuda_copy_col_from_mat_fd(
int Gr, 
int Bl, 
float* v, 
int col,
   416   cudaD_copy_col_from_mat_fd(Gr, Bl, v, col, mat, dmat, dim);
   418 inline void cuda_copy_col_from_mat_fd(
int Gr, 
int Bl, 
float* v, 
int col,
   421   cudaF_copy_col_from_mat_fd(Gr, Bl, v, col, mat, dmat, dim);
   423 inline void cuda_copy_cols(dim3 Gr, dim3 Bl, 
double* dst, 
const double* src,
   426   cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
   428 inline void cuda_copy_cols(dim3 Gr, dim3 Bl, 
float* dst, 
const float* src,
   431   cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
   433 inline void cuda_copy_cols_from_vec(dim3 Gr, dim3 Bl, 
double *mat_out,
   435   cudaD_copy_cols_from_vec(Gr, Bl, mat_out, d_out, v_in);
   437 inline void cuda_copy_cols_from_vec(dim3 Gr, dim3 Bl, 
float *mat_out,
   439   cudaF_copy_cols_from_vec(Gr, Bl, mat_out, d_out, v_in);
   441 inline void cuda_copy(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   444   cudaD_copy(Gr, Bl, y, x, copy_from, d_out, d_in);
   446 inline void cuda_copy(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
   449   cudaF_copy(Gr, Bl, y, x, copy_from, d_out, d_in);
   451 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, 
double* mat_out,
   454   cuda_copy_from_mat_dd(Gr, Bl, mat_out, mat_in, d_out, d_in);
   456 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, 
double* mat_out,
   459   cuda_copy_from_mat_df(Gr, Bl, mat_out, mat_in, d_out, d_in);
   461 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, 
float* mat_out,
   464   cuda_copy_from_mat_fd(Gr, Bl, mat_out, mat_in, d_out, d_in);
   466 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, 
float* mat_out,
   469   cuda_copy_from_mat_ff(Gr, Bl, mat_out, mat_in, d_out, d_in);
   471 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, 
double* mat_out,
   474   cuda_copy_from_mat_dd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
   476 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, 
double* mat_out,
   479   cuda_copy_from_mat_df_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
   481 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, 
float* mat_out,
   484   cuda_copy_from_mat_fd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
   486 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, 
float* mat_out,
   489   cuda_copy_from_mat_ff_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
   491 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, 
double* mat,
   492                                 MatrixDim mat_dim, 
const int* smat_row_ptr,
   493                                 const int* smat_col_idx,
   494                                 const double* smat_val) {
   495   cuda_copy_from_smat_dd(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
   498 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, 
double* mat,
   499                                 MatrixDim mat_dim, 
const int* smat_row_ptr,
   500                                 const int* smat_col_idx,
   501                                 const float* smat_val) {
   502   cuda_copy_from_smat_df(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
   505 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, 
float* mat, 
MatrixDim mat_dim,
   506                                 const int* smat_row_ptr,
   507                                 const int* smat_col_idx,
   508                                 const double* smat_val) {
   509   cuda_copy_from_smat_fd(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
   512 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, 
float* mat, 
MatrixDim mat_dim,
   513                                 const int* smat_row_ptr,
   514                                 const int* smat_col_idx,
   515                                 const float* smat_val) {
   516   cuda_copy_from_smat_ff(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
   519 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, 
double* mat,
   521                                       const int* smat_row_ptr,
   522                                       const int* smat_col_idx,
   523                                       const double* smat_val) {
   524   cuda_copy_from_smat_dd_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
   527 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, 
double* mat,
   529                                       const int* smat_row_ptr,
   530                                       const int* smat_col_idx,
   531                                       const float* smat_val) {
   532   cuda_copy_from_smat_df_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
   535 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, 
float* mat,
   537                                       const int* smat_row_ptr,
   538                                       const int* smat_col_idx,
   539                                       const double* smat_val) {
   540   cuda_copy_from_smat_fd_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
   543 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, 
float* mat,
   545                                       const int* smat_row_ptr,
   546                                       const int* smat_col_idx,
   547                                       const float* smat_val) {
   548   cuda_copy_from_smat_ff_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
   551 inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl, 
const double* x, 
double* y,
   553   cudaD_copy_from_sp(Gr, Bl, x, y, d_out);
   555 inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl, 
const float* x, 
float* y,
   557   cudaF_copy_from_sp(Gr, Bl, x, y, d_out);
   559 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, 
double* A, 
const double* B,
   561   cudaD_copy_from_tp(Gr, Bl, A, B, dmat);
   563 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, 
double* A, 
const float* B,
   565   cudaDF_copy_from_tp(Gr, Bl, A, B, dmat);
   567 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, 
float* A, 
const double* B,
   569   cudaFD_copy_from_tp(Gr, Bl, A, B, dmat);
   571 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, 
float* A, 
const float* B,
   573   cudaF_copy_from_tp(Gr, Bl, A, B, dmat);
   575 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, 
double* A,
   577   cudaD_copy_from_tp_trans(Gr, Bl, A, B, dmat);
   579 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, 
double* A, 
const float* B,
   581   cudaDF_copy_from_tp_trans(Gr, Bl, A, B, dmat);
   583 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, 
float* A, 
const double* B,
   585   cudaFD_copy_from_tp_trans(Gr, Bl, A, B, dmat);
   587 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, 
float* A, 
const float* B,
   589   cudaF_copy_from_tp_trans(Gr, Bl, A, B, dmat);
   591 inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, 
double* A, 
MatrixDim dimA) {
   592   cudaD_copy_low_upp(Gr, Bl, A, dimA);
   594 inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, 
float* A, 
MatrixDim dimA) {
   595   cudaF_copy_low_upp(Gr, Bl, A, dimA);
   597 inline void cuda_copy_rows(dim3 Gr, dim3 Bl, 
double* dst,
   598                            const double* 
const * src, 
MatrixDim dst_dim) {
   599   cudaD_copy_rows_direct(Gr, Bl, dst, src, dst_dim);
   601 inline void cuda_copy_rows(dim3 Gr, dim3 Bl, 
double* dst, 
const double* src,
   604   cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
   606 inline void cuda_copy_rows(dim3 Gr, dim3 Bl, 
float* dst,
   607                            const float* 
const * src, 
MatrixDim dst_dim) {
   608   cudaF_copy_rows_direct(Gr, Bl, dst, src, dst_dim);
   610 inline void cuda_copy_rows(dim3 Gr, dim3 Bl, 
float* dst, 
const float* src,
   613   cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
   615 inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, 
double *mat_out,
   617   cudaD_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
   619 inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, 
float *mat_out,
   621   cudaF_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
   623 inline void cuda_copy_to_rows(dim3 Gr, dim3 Bl, 
double* 
const * dst,
   625   cudaD_copy_to_rows_direct(Gr, Bl, dst, src, src_dim);
   627 inline void cuda_copy_to_rows(dim3 Gr, dim3 Bl, 
float* 
const * dst,
   629   cudaF_copy_to_rows_direct(Gr, Bl, dst, src, src_dim);
   631 inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, 
double* A, 
MatrixDim dimA) {
   632   cudaD_copy_upp_low(Gr, Bl, A, dimA);
   634 inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, 
float* A, 
MatrixDim dimA) {
   635   cudaF_copy_upp_low(Gr, Bl, A, dimA);
   637 inline void cuda_diff_group_pnorm(dim3 Gr, dim3 Bl, 
double *
id,
   638                                   const double *iv, 
const double *ov,
   640                                   int iv_stride, 
int ov_stride, 
int od_stride,
   641                                   int group_size, 
double power) {
   642   cudaD_diff_group_pnorm(Gr, Bl, 
id, iv, ov, od, id_dim, iv_stride, ov_stride,
   643                          od_stride, group_size, power);
   645 inline void cuda_diff_group_pnorm(dim3 Gr, dim3 Bl, 
float *
id, 
const float *iv,
   646                                   const float *ov, 
const float* od,
   648                                   int ov_stride, 
int od_stride, 
int group_size,
   650   cudaF_diff_group_pnorm(Gr, Bl, 
id, iv, ov, od, id_dim, iv_stride, ov_stride,
   651                          od_stride, group_size, power);
   653 inline void cuda_diff_log_softmax(dim3 Gr, dim3 Bl,
   655                                   const double* out_value,
   656                                   const int out_value_stride,
   657                                   const double* out_deriv,
   658                                   const int out_deriv_stride,
   660   cudaD_diff_log_softmax(Gr, Bl, in_deriv_dim, out_value, out_value_stride,
   661                          out_deriv, out_deriv_stride, in_deriv);
   663 inline void cuda_diff_log_softmax(dim3 Gr, dim3 Bl,
   665                                   const float* out_value,
   666                                   const int out_value_stride,
   667                                   const float* out_deriv,
   668                                   const int out_deriv_stride, 
float* in_deriv) {
   669   cudaF_diff_log_softmax(Gr, Bl, in_deriv_dim, out_value, out_value_stride,
   670                          out_deriv, out_deriv_stride, in_deriv);
   672 inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, 
const int cell_dim,
   673                                         const int have_dropout_mask,
   674                                         const int num_rows, 
const double* input,
   675                                         const int input_stride,
   676                                         const double* params,
   677                                         const int params_stride,
   678                                         const double* output_deriv,
   679                                         const int output_deriv_stride,
   680                                         const double* deriv_sum_in,
   681                                         const int deriv_sum_in_stride,
   682                                         const double* self_repair_config,
   683                                         double count, 
double* input_deriv,
   684                                         const int input_deriv_stride,
   685                                         double* params_deriv,
   686                                         const int params_deriv_stride,
   687                                         double* value_sum_out,
   688                                         const int value_sum_out_stride,
   689                                         double* deriv_sum_out,
   690                                         const int deriv_sum_out_stride,
   691                                         double* self_repair_sum_out,
   692                                         const int self_repair_sum_out_stride) {
   693   cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows,
   695                                params, params_stride, output_deriv,
   696                                output_deriv_stride, deriv_sum_in,
   697                                deriv_sum_in_stride, self_repair_config, count,
   698                                input_deriv, input_deriv_stride, params_deriv,
   699                                params_deriv_stride, value_sum_out,
   700                                value_sum_out_stride, deriv_sum_out,
   701                                deriv_sum_out_stride, self_repair_sum_out,
   702                                self_repair_sum_out_stride);
   704 inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, 
const int cell_dim,
   705                                         const int have_dropout_mask,
   706                                         const int num_rows, 
const float* input,
   707                                         const int input_stride,
   709                                         const int params_stride,
   710                                         const float* output_deriv,
   711                                         const int output_deriv_stride,
   712                                         const double* deriv_sum_in,
   713                                         const int deriv_sum_in_stride,
   714                                         const float* self_repair_config,
   715                                         double count, 
float* input_deriv,
   716                                         const int input_deriv_stride,
   718                                         const int params_deriv_stride,
   719                                         double* value_sum_out,
   720                                         const int value_sum_out_stride,
   721                                         double* deriv_sum_out,
   722                                         const int deriv_sum_out_stride,
   723                                         float* self_repair_sum_out,
   724                                         const int self_repair_sum_out_stride) {
   725   cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask,
   726                                num_rows, input, input_stride,
   727                                params, params_stride, output_deriv,
   728                                output_deriv_stride, deriv_sum_in,
   729                                deriv_sum_in_stride, self_repair_config, count,
   730                                input_deriv, input_deriv_stride, params_deriv,
   731                                params_deriv_stride, value_sum_out,
   732                                value_sum_out_stride, deriv_sum_out,
   733                                deriv_sum_out_stride, self_repair_sum_out,
   734                                self_repair_sum_out_stride);
   736 inline void cuda_diff_normalize_per_row(
size_t Gr, 
size_t Bl, 
double *
id,
   737                                         int id_stride, 
const double *iv,
   739                                         int od_stride, 
double target_rms,
   740                                         bool add_log_stddev) {
   741   cudaD_diff_normalize_per_row(Gr, Bl, 
id, id_stride, iv, iv_dim, od, od_stride,
   742                                target_rms, add_log_stddev);
   744 inline void cuda_diff_normalize_per_row(
size_t Gr, 
size_t Bl, 
float *
id,
   745                                         int id_stride, 
const float *iv,
   747                                         int od_stride, 
float target_rms,
   748                                         bool add_log_stddev) {
   749   cudaF_diff_normalize_per_row(Gr, Bl, 
id, id_stride, iv, iv_dim, od, od_stride,
   750                                target_rms, add_log_stddev);
   752 inline void cuda_diff_parametric_relu(dim3 Gr, dim3 Bl, 
double *eout,
   753                                       const double *e, 
const double *y,
   755                                       const double *a, 
const double *b) {
   756   cudaD_diff_parametric_relu(Gr, Bl, eout, e, y, d, e_stride, y_stride, a, b);
   758 inline void cuda_diff_parametric_relu(dim3 Gr, dim3 Bl, 
float *eout,
   759                                       const float *e, 
const float *y,
   761                                       const float *a, 
const float *b) {
   762   cudaF_diff_parametric_relu(Gr, Bl, eout, e, y, d, e_stride, y_stride, a, b);
   764 inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, 
double *eout, 
const double *e,
   767   cudaD_diff_sigmoid(Gr, Bl, eout, e, y, d, e_stride, y_stride);
   769 inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, 
float *eout, 
const float *e,
   772   cudaF_diff_sigmoid(Gr, Bl, eout, e, y, d, e_stride, y_stride);
   774 inline void cuda_diff_softmax(dim3 Gr, dim3 Bl, 
double* x, 
const MatrixDim dim,
   775                               const double* value, 
const int value_stride,
   776                               const double* diff, 
const int diff_stride) {
   777   cudaD_diff_softmax(Gr, Bl, x, dim, value, value_stride, diff, diff_stride);
   779 inline void cuda_diff_softmax(dim3 Gr, dim3 Bl, 
float* x, 
const MatrixDim dim,
   780                               const float* value, 
const int value_stride,
   781                               const float* diff, 
const int diff_stride) {
   782   cudaF_diff_softmax(Gr, Bl, x, dim, value, value_stride, diff, diff_stride);
   784 inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, 
double *eout, 
const double *e,
   787   cudaD_diff_tanh(Gr, Bl, eout, e, y, d, e_stride, y_stride);
   789 inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, 
float *eout, 
const float *e,
   792   cudaF_diff_tanh(Gr, Bl, eout, e, y, d, e_stride, y_stride);
   794 inline void cuda_ensure_nonzero(dim3 Gr, dim3 Bl, 
const double *x, 
MatrixDim d,
   795                                 double epsilon, 
int y_stride, 
double *y) {
   796   cudaD_ensure_nonzero(Gr, Bl, x, d, epsilon, y_stride, y);
   798 inline void cuda_ensure_nonzero(dim3 Gr, dim3 Bl, 
const float *x, 
MatrixDim d,
   799                                 float epsilon, 
int y_stride, 
float *y) {
   800   cudaF_ensure_nonzero(Gr, Bl, x, d, epsilon, y_stride, y);
   802 inline void cuda_diff_xent(dim3 Gr, dim3 Bl, 
const int32_cuda *vec_tgt,
   803                            double *mat_net_out, 
double *vec_log_post,
   805   cudaD_diff_xent(Gr, Bl, vec_tgt, mat_net_out, vec_log_post, d);
   807 inline void cuda_diff_xent(dim3 Gr, dim3 Bl, 
const int32_cuda *vec_tgt,
   808                            float *mat_net_out, 
float *vec_log_post,
   810   cudaF_diff_xent(Gr, Bl, vec_tgt, mat_net_out, vec_log_post, d);
   812 inline void cuda_div_elements(dim3 Gr, dim3 Bl, 
double *mat, 
const double *A,
   814   cudaD_div_elements(Gr, Bl, mat, A, dst_d, src_stride);
   816 inline void cuda_div_elements(dim3 Gr, dim3 Bl, 
float *mat, 
const float *A,
   818   cudaF_div_elements(Gr, Bl, mat, A, dst_d, src_stride);
   820 inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, 
double *mat,
   822   cudaD_div_rows_vec(Gr, Bl, mat, vec_div, d);
   824 inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, 
float *mat,
   826   cudaF_div_rows_vec(Gr, Bl, mat, vec_div, d);
   828 inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, 
const double *mat1,
   829                                     const double *mat2, 
double *mask,
   832   cudaD_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride,
   835 inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, 
const float *mat1,
   836                                     const float *mat2, 
float *mask,
   839   cudaF_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride,
   842 inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, 
const double *mat,
   845   cudaD_find_row_max_id(Gr, Bl, mat, vec_val, vec_id, d);
   847 inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, 
const float *mat,
   850   cudaF_find_row_max_id(Gr, Bl, mat, vec_val, vec_id, d);
   852 inline void cuda_group_max(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   853                            MatrixDim d, 
int src_stride, 
int group_size) {
   854   cudaD_group_max(Gr, Bl, y, x, d, src_stride, group_size);
   856 inline void cuda_group_max(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
   857                            MatrixDim d, 
int src_stride, 
int group_size) {
   858   cudaF_group_max(Gr, Bl, y, x, d, src_stride, group_size);
   860 inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   863   cudaD_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
   865 inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
   868   cudaF_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
   870 inline void cuda_group_spec_pnorm(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   873   cudaD_group_spec_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
   875 inline void cuda_group_spec_pnorm(dim3 Gr, dim3 Bl, 
float* y, 
const float* x,
   878   cudaF_group_spec_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
   880 inline void cuda_heaviside(dim3 Gr, dim3 Bl, 
double* y, 
const double* x,
   882   cudaD_heaviside(Gr, Bl, y, x, d, src_stride);
   884 inline void cuda_heaviside(dim3 Gr, dim3 Bl, 
float* y, 
const float* x,
   886   cudaF_heaviside(Gr, Bl, y, x, d, src_stride);
   888 inline void cuda_exp(dim3 Gr, dim3 Bl, 
double* y, 
const double* x,
   890   cudaD_exp(Gr, Bl, y, x, d, src_stride);
   892 inline void cuda_exp(dim3 Gr, dim3 Bl, 
float* y, 
const float* x,
   894   cudaF_exp(Gr, Bl, y, x, d, src_stride);
   896 inline void cuda_pow(dim3 Gr, dim3 Bl, 
double* y, 
const double* x, 
double power,
   898   cudaD_pow(Gr, Bl, y, x, power, d, src_stride);
   900 inline void cuda_pow(dim3 Gr, dim3 Bl, 
float* y, 
const float* x, 
float power,
   902   cudaF_pow(Gr, Bl, y, x, power, d, src_stride);
   904 inline void cuda_ceiling(dim3 Gr, dim3 Bl, 
double* y, 
const double* x, 
double ceiling_val,
   906   cudaD_ceiling(Gr, Bl, y, x, ceiling_val, dim, src_stride);
   908 inline void cuda_ceiling(dim3 Gr, dim3 Bl, 
float* y, 
const float* x, 
float ceiling_val,
   910   cudaF_ceiling(Gr, Bl, y, x, ceiling_val, dim, src_stride);
   912 inline void cuda_floor(dim3 Gr, dim3 Bl, 
double* y, 
const double* x, 
double floor_val,
   914   cudaD_floor(Gr, Bl, y, x, floor_val, dim, src_stride);
   916 inline void cuda_floor(dim3 Gr, dim3 Bl, 
float* y, 
const float* x, 
float floor_val,
   918   cudaF_floor(Gr, Bl, y, x, floor_val, dim, src_stride);
   920 inline void cuda_exp_limited(dim3 Gr, dim3 Bl, 
double* y, 
const double* x,
   921                              double lower_limit, 
double upper_limit, 
MatrixDim d, 
int src_stride) {
   922   cudaD_exp_limited(Gr, Bl, y, x, lower_limit, upper_limit, d, src_stride);
   924 inline void cuda_exp_limited(dim3 Gr, dim3 Bl, 
float* y, 
const float* x,
   925                              float lower_limit, 
float upper_limit, 
MatrixDim d, 
int src_stride) {
   926   cudaF_exp_limited(Gr, Bl, y, x, lower_limit, upper_limit, d, src_stride);
   928 inline void cuda_exp_special(dim3 Gr, dim3 Bl, 
double* y, 
const double* x,
   930   cudaD_exp_special(Gr, Bl, y, x, d, src_stride);
   932 inline void cuda_exp_special(dim3 Gr, dim3 Bl, 
float* y, 
const float* x,
   934   cudaF_exp_special(Gr, Bl, y, x, d, src_stride);
   936 inline void cuda_log(dim3 Gr, dim3 Bl, 
double* y, 
const double* x, 
MatrixDim d, 
int src_stride) {
   937   cudaD_log(Gr, Bl, y, x, d, src_stride);
   939 inline void cuda_log(dim3 Gr, dim3 Bl, 
float* y, 
const float* x, 
MatrixDim d, 
int src_stride) {
   940   cudaF_log(Gr, Bl, y, x, d, src_stride);
   942 inline void cuda_pow_abs(dim3 Gr, dim3 Bl, 
double* y, 
const double* x, 
double power,
   943                          bool include_sign, 
MatrixDim dim, 
int src_stride) {
   944   cudaD_pow_abs(Gr, Bl, y, x, power, include_sign, dim, src_stride);
   946 inline void cuda_pow_abs(dim3 Gr, dim3 Bl, 
float* y, 
const float* x, 
float power,
   947                          bool include_sign, 
MatrixDim dim, 
int src_stride) {
   948   cudaF_pow_abs(Gr, Bl, y, x, power, include_sign, dim, src_stride);
   950 inline void cuda_invert_elements(dim3 Gr, dim3 Bl, 
double *data, 
MatrixDim d) {
   951   cudaD_invert_elements(Gr, Bl, data, d);
   953 inline void cuda_invert_elements(dim3 Gr, dim3 Bl, 
float *data, 
MatrixDim d) {
   954   cudaF_invert_elements(Gr, Bl, data, d);
   956 inline void cuda_log_softmax_reduce(
size_t Gr, 
size_t Bl, 
double *y,
   959   cudaD_log_softmax_reduce(Gr, Bl, y, x, y_dim, x_stride);
   961 inline void cuda_log_softmax_reduce(
size_t Gr, 
size_t Bl, 
float *y,
   964   cudaF_log_softmax_reduce(Gr, Bl, y, x, y_dim, x_stride);
   966 inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, 
const double* in,
   967                                    const int in_stride, 
const double* params,
   968                                    const int params_stride,
   969                                    const int out_stride, 
const int cell_dim,
   970                                    const int have_dropout_mask,
   971                                    const int num_rows, 
double* out) {
   972   cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride,
   973                           out_stride, cell_dim, have_dropout_mask,
   976 inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, 
const float* in,
   977                                    const int in_stride, 
const float* params,
   978                                    const int params_stride,
   979                                    const int out_stride, 
const int cell_dim,
   980                                    const int have_dropout_mask,
   981                                    const int num_rows, 
float* out) {
   982   cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride,
   983                           out_stride, cell_dim, have_dropout_mask,
   986 inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, 
double *data,
   990   cudaD_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements);
   992 inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, 
float *data,
   996   cudaF_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements);
   998 inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl, 
MatrixDim dim,
  1001                                            const double* x, 
int s,
  1003   cudaD_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data);
  1005 inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl, 
MatrixDim dim,
  1008                                            const float* x, 
int s, 
float* data) {
  1009   cudaF_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data);
  1011 inline void cuda_matrix_add_to_elements(dim3 Gr, dim3 Bl, 
double alpha,
  1014   cudaD_matrix_add_to_elements(Gr, Bl, alpha, mat, dim, elements);
  1016 inline void cuda_matrix_add_to_elements(dim3 Gr, dim3 Bl, 
float alpha,
  1019   cudaF_matrix_add_to_elements(Gr, Bl, alpha, mat, dim, elements);
  1021 inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, 
const double *data,
  1023                                int indices_size, 
double *output) {
  1024   cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
  1026 inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, 
const float *data,
  1028                                int indices_size, 
float *output) {
  1029   cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
  1031 inline void cuda_vector_copy_elements(dim3 Gr, dim3 Bl, 
double *data, 
int dim,
  1032                                       const double *src_mat, 
int mat_stride,
  1035   cudaD_vector_copy_elements(Gr, Bl, data, dim, src_mat, mat_stride,
  1036                              transpose, elements);
  1038 inline void cuda_vector_copy_elements(dim3 Gr, dim3 Bl, 
float *data, 
int dim,
  1039                                       const float *src_mat, 
int mat_stride,
  1042   cudaF_vector_copy_elements(Gr, Bl, data, dim, src_mat, mat_stride,
  1043                              transpose, elements);
  1045 inline void cuda_max(dim3 Gr, dim3 Bl, 
double *mat, 
const double *A,
  1047   cudaD_max(Gr, Bl, mat, A, dst_d, src_stride);
  1049 inline void cuda_max(dim3 Gr, dim3 Bl, 
float *mat, 
const float *A,
  1051   cudaF_max(Gr, Bl, mat, A, dst_d, src_stride);
  1053 inline void cuda_max_mat_cols(
int Gr, 
int Bl, 
double* result, 
const double* mat,
  1055   cudaD_max_mat_cols(Gr, Bl, result, mat, d);
  1057 inline void cuda_max_mat_cols(
int Gr, 
int Bl, 
float* result, 
const float* mat,
  1059   cudaF_max_mat_cols(Gr, Bl, result, mat, d);
  1061 inline void cuda_min(dim3 Gr, dim3 Bl, 
double *mat, 
const double *other,
  1063   cudaD_min(Gr, Bl, mat, other, mat_d, other_stride);
  1065 inline void cuda_min(dim3 Gr, dim3 Bl, 
float *mat, 
const float *other,
  1067   cudaF_min(Gr, Bl, mat, other, mat_d, other_stride);
  1069 inline void cuda_min_mat_cols(
int Gr, 
int Bl, 
double* result, 
const double* mat,
  1071   cudaD_min_mat_cols(Gr, Bl, result, mat, d);
  1073 inline void cuda_min_mat_cols(
int Gr, 
int Bl, 
float* result, 
const float* mat,
  1075   cudaF_min_mat_cols(Gr, Bl, result, mat, d);
  1077 inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, 
double *mat,
  1079   cudaD_mul_cols_vec(Gr, Bl, mat, scale, d);
  1081 inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, 
float *mat, 
const float *scale,
  1083   cudaF_mul_cols_vec(Gr, Bl, mat, scale, d);
  1085 inline void cuda_mul_elements(dim3 Gr, dim3 Bl, 
double *mat, 
const double *A,
  1087   cudaD_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
  1089 inline void cuda_mul_elements(dim3 Gr, dim3 Bl, 
float *mat, 
const float *A,
  1091   cudaF_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
  1093 inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, 
double *y,
  1095                                     int src_stride, 
int group_size) {
  1096   cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size);
  1098 inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
  1101   cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size);
  1103 inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, 
double *mat,
  1105   cudaD_mul_rows_vec(Gr, Bl, mat, scale, d);
  1107 inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, 
float *mat, 
const float *scale,
  1109   cudaF_mul_rows_vec(Gr, Bl, mat, scale, d);
  1111 inline void cuda_normalize_per_row(
size_t Gr, 
size_t Bl, 
double *y,
  1112                                    int y_stride, 
const double *x, 
MatrixDim x_d,
  1113                                    double target_rms, 
bool add_log_stddev) {
  1114   cudaD_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms,
  1117 inline void cuda_normalize_per_row(
size_t Gr, 
size_t Bl, 
float *y, 
int y_stride,
  1119                                    float target_rms, 
bool add_log_stddev) {
  1120   cudaF_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms,
  1123 inline void cuda_one(
int Gr, 
int Bl, 
double* x, 
int dim) {
  1124   cudaD_one(Gr, Bl, x, dim);
  1126 inline void cuda_one(
int Gr, 
int Bl, 
float* x, 
int dim) {
  1127   cudaF_one(Gr, Bl, x, dim);
  1129 inline void cuda_parametric_relu(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
  1130                                  MatrixDim d, 
int src_stride, 
const double *a,
  1132   cudaD_parametric_relu(Gr, Bl, y, x, d, src_stride, a, b);
  1134 inline void cuda_parametric_relu(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
  1137   cudaF_parametric_relu(Gr, Bl, y, x, d, src_stride, a, b);
  1139 inline void cuda_randomize(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
  1142   cudaD_randomize(Gr, Bl, y, x, copy_from, d_out, d_in);
  1144 inline void cuda_randomize(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
  1147   cudaF_randomize(Gr, Bl, y, x, copy_from, d_out, d_in);
  1149 inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, 
double *wei, 
double *grad,
  1152   cudaD_regularize_l1(Gr, Bl, wei, grad, l1, lr, d, stride_grad);
  1154 inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, 
float *wei, 
float *grad,
  1157   cudaF_regularize_l1(Gr, Bl, wei, grad, l1, lr, d, stride_grad);
  1159 inline void cuda_replace_value(
int Gr, 
int Bl, 
double *v, 
int dim, 
double orig,
  1161   cudaD_replace_value(Gr, Bl, v, dim, orig, changed);
  1163 inline void cuda_replace_value(
int Gr, 
int Bl, 
float *v, 
int dim, 
float orig,
  1165   cudaF_replace_value(Gr, Bl, v, dim, orig, changed);
  1167 inline cublasStatus_t cuda_scal(cublasHandle_t handle, 
int n, 
double alpha,
  1168                                 double *x, 
int incx) {
  1169   return cublasDscal_v2(handle, n, &alpha, x, incx);
  1171 inline cublasStatus_t cuda_scal(cublasHandle_t handle, 
int n, 
float alpha,
  1172                                 float *x, 
int incx) {
  1173   return cublasSscal_v2(handle, n, &alpha, x, incx);
  1175 inline void cuda_scale_diag_packed(
int Gr, 
int Bl, 
double* mat, 
double value,
  1177   cudaD_scale_diag_packed(Gr, Bl, mat, value, dim);
  1179 inline void cuda_scale_diag_packed(
int Gr, 
int Bl, 
float* mat, 
float value,
  1181   cudaF_scale_diag_packed(Gr, Bl, mat, value, dim);
  1183 inline void cuda_scale(dim3 Gr, dim3 Bl, 
double *mat, 
double value,
  1185   cudaD_scale(Gr, Bl, mat, value, d);
  1187 inline void cuda_scale(dim3 Gr, dim3 Bl, 
float *mat, 
float value, 
MatrixDim d) {
  1188   cudaF_scale(Gr, Bl, mat, value, d);
  1190 inline void cuda_select_rows(dim3 Gr, dim3 Bl, 
const int* out_row_ptr,
  1191                              int* out_col_idx, 
double* out_val,
  1192                              const int* row_indexes,
  1193                              const int num_selected_rows, 
const int* in_row_ptr,
  1194                              const int* in_col_idx, 
const double* in_val) {
  1195   cudaD_select_rows(Gr, Bl, out_row_ptr, out_col_idx, out_val, row_indexes,
  1196                     num_selected_rows, in_row_ptr, in_col_idx, in_val);
  1198 inline void cuda_select_rows(dim3 Gr, dim3 Bl, 
const int* out_row_ptr,
  1199                              int* out_col_idx, 
float* out_val,
  1200                              const int* row_indexes,
  1201                              const int num_selected_rows, 
const int* in_row_ptr,
  1202                              const int* in_col_idx, 
const float* in_val) {
  1203   cudaF_select_rows(Gr, Bl, out_row_ptr, out_col_idx, out_val, row_indexes,
  1204                     num_selected_rows, in_row_ptr, in_col_idx, in_val);
  1206 inline void cuda_set_bias_params(
int Gr, 
int Bl, 
double* v, 
const double* a,
  1207                                  double param_1, 
double param_2, 
double param_3,
  1208                                  int* flag, 
int dim) {
  1209   cudaD_set_bias_params(Gr, Bl, v, a, param_1, param_2, param_3, flag, dim);
  1211 inline void cuda_set_bias_params(
int Gr, 
int Bl, 
float* v, 
const float* a,
  1212                                  float param_1, 
float param_2, 
float param_3,
  1213                                  int* flag, 
int dim) {
  1214   cudaF_set_bias_params(Gr, Bl, v, a, param_1, param_2, param_3, flag, dim);
  1216 inline void cuda_set_const(dim3 Gr, dim3 Bl, 
double *mat, 
double value,
  1218   cudaD_set_const(Gr, Bl, mat, value, d);
  1220 inline void cuda_set_const(dim3 Gr, dim3 Bl, 
float *mat, 
float value,
  1222   cudaF_set_const(Gr, Bl, mat, value, d);
  1224 inline void cuda_set_diag(
int Gr, 
int Bl, 
double* mat, 
double value,
  1226   cudaD_set_diag(Gr, Bl, mat, value, d);
  1228 inline void cuda_set_diag(
int Gr, 
int Bl, 
float* mat, 
float value,
  1230   cudaF_set_diag(Gr, Bl, mat, value, d);
  1232 inline void cuda_set_diag_packed(
int Gr, 
int Bl, 
double* mat, 
double value,
  1234   cudaD_set_diag_packed(Gr, Bl, mat, value, dim);
  1236 inline void cuda_set_diag_packed(
int Gr, 
int Bl, 
float* mat, 
float value,
  1238   cudaF_set_diag_packed(Gr, Bl, mat, value, dim);
  1240 inline void cuda_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, 
const double *A,
  1241                                      const double *B, 
const double *C,
  1243                                      int stride_b, 
int stride_c) {
  1244   cudaD_set_mat_mat_div_mat(Gr, Bl, A, B, C, dst, d, stride_a, stride_b,
  1247 inline void cuda_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, 
const float *A,
  1248                                      const float *B, 
const float *C, 
float *dst,
  1251   cudaF_set_mat_mat_div_mat(Gr, Bl, A, B, C, dst, d, stride_a, stride_b,
  1254 inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, 
double* mat,
  1256   cudaD_set_zero_above_diag(Gr, Bl, mat, d);
  1258 inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, 
float* mat,
  1260   cudaF_set_zero_above_diag(Gr, Bl, mat, d);
  1262 inline void cuda_sequence(dim3 Gr, dim3 Bl, 
int32_cuda* data, 
int length,
  1264   cuda_int32_sequence(Gr, Bl, data, length, base);
  1266 inline void cuda_sigmoid(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
  1268   cudaD_sigmoid(Gr, Bl, y, x, d, src_stride);
  1270 inline void cuda_sigmoid(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
  1272   cudaF_sigmoid(Gr, Bl, y, x, d, src_stride);
  1274 inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
  1276   cudaD_soft_hinge(Gr, Bl, y, x, d, src_stride);
  1278 inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
  1280   cudaF_soft_hinge(Gr, Bl, y, x, d, src_stride);
  1282 inline void cuda_softmax_reduce(
size_t Gr, 
size_t Bl, 
double *y,
  1283                                 const double *x, 
MatrixDim d, 
int src_stride) {
  1284   cudaD_softmax_reduce(Gr, Bl, y, x, d, src_stride);
  1286 inline void cuda_softmax_reduce(
size_t Gr, 
size_t Bl, 
float *y, 
const float *x,
  1288   cudaF_softmax_reduce(Gr, Bl, y, x, d, src_stride);
  1290 inline void cuda_splice(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
  1293   cudaD_splice(Gr, Bl, y, x, off, d_out, d_in);
  1295 inline void cuda_splice(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
  1298   cudaF_splice(Gr, Bl, y, x, off, d_out, d_in);
  1300 inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, 
double *data,
  1304   cudaD_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
  1306 inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, 
float *data, 
MatrixDim dim,
  1307                                    const float *src_data, 
MatrixDim src_dim,
  1309   cudaF_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
  1311 inline void cuda_sum_mat_cols(
int Gr, 
int Bl, 
double* result, 
const double* mat,
  1313   cudaD_sum_mat_cols(Gr, Bl, result, mat, d);
  1315 inline void cuda_sum_mat_cols(
int Gr, 
int Bl, 
float* result, 
const float* mat,
  1317   cudaF_sum_mat_cols(Gr, Bl, result, mat, d);
  1319 inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, 
double alpha, 
double beta,
  1320                             const double* T, 
MatrixDim tdim, 
double *S,
  1322   cudaD_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
  1324 inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, 
float alpha, 
float beta,
  1325                             const float* T, 
MatrixDim tdim, 
float *S,
  1327   cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
  1329 inline void cuda_take_lower(dim3 Gr, dim3 Bl, 
const double* x, 
double* y,
  1331   cudaD_take_lower(Gr, Bl, x, y, d_in);
  1333 inline void cuda_take_lower(dim3 Gr, dim3 Bl, 
const float* x, 
float* y,
  1335   cudaF_take_lower(Gr, Bl, x, y, d_in);
  1337 inline void cuda_take_mean(dim3 Gr, dim3 Bl, 
const double* x, 
double* y,
  1339   cudaD_take_mean(Gr, Bl, x, y, d_in);
  1341 inline void cuda_take_mean(dim3 Gr, dim3 Bl, 
const float* x, 
float* y,
  1343   cudaF_take_mean(Gr, Bl, x, y, d_in);
  1345 inline void cuda_take_upper(dim3 Gr, dim3 Bl, 
const double* x, 
double* y,
  1347   cudaD_take_upper(Gr, Bl, x, y, d_in);
  1349 inline void cuda_take_upper(dim3 Gr, dim3 Bl, 
const float* x, 
float* y,
  1351   cudaF_take_upper(Gr, Bl, x, y, d_in);
  1353 inline void cuda_tanh(dim3 Gr, dim3 Bl, 
double *y, 
const double *x, 
MatrixDim d,
  1355   cudaD_tanh(Gr, Bl, y, x, d, src_stride);
  1357 inline void cuda_tanh(dim3 Gr, dim3 Bl, 
float *y, 
const float *x, 
MatrixDim d,
  1359   cudaF_tanh(Gr, Bl, y, x, d, src_stride);
  1361 inline void cuda_trace(
int Gr, 
int Bl, 
double* mat, 
double* value, 
int dim) {
  1362   cudaD_trace(Gr, Bl, mat, value, dim);
  1364 inline void cuda_trace(
int Gr, 
int Bl, 
float* mat, 
float* value, 
int dim) {
  1365   cudaF_trace(Gr, Bl, mat, value, dim);
  1367 inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl, 
const double* A,
  1368                                const double* B, 
MatrixDim dA, 
int B_stride,
  1370   cudaD_trace_mat_mat(Gr, Bl, A, B, dA, B_stride, value);
  1372 inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl, 
const float* A, 
const float* B,
  1373                                MatrixDim dA, 
int B_stride, 
float* value) {
  1374   cudaF_trace_mat_mat(Gr, Bl, A, B, dA, B_stride, value);
  1376 inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl, 
const double* A,
  1378                                      int B_stride, 
double* value) {
  1379   cudaD_trace_mat_mat_trans(Gr, Bl, A, B, dA, B_stride, value);
  1381 inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl, 
const float* A,
  1382                                      const float* B, 
MatrixDim dA, 
int B_stride,
  1384   cudaF_trace_mat_mat_trans(Gr, Bl, A, B, dA, B_stride, value);
  1386 inline void cuda_trace_mat_smat(dim3 Gr, dim3 Bl, 
const double* mat,
  1387                                 MatrixDim mat_dim, 
const int* smat_row_ptr,
  1388                                 const int* smat_col_idx, 
const double* smat_val,
  1389                                 double* trace_vec) {
  1390   cudaD_trace_mat_smat(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
  1391                        smat_val, trace_vec);
  1393 inline void cuda_trace_mat_smat(dim3 Gr, dim3 Bl, 
const float* mat,
  1394                                 MatrixDim mat_dim, 
const int* smat_row_ptr,
  1395                                 const int* smat_col_idx, 
const float* smat_val,
  1397   cudaF_trace_mat_smat(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
  1398                        smat_val, trace_vec);
  1400 inline void cuda_trace_mat_smat_trans(dim3 Gr, dim3 Bl, 
const double* mat,
  1402                                       const int* smat_row_ptr,
  1403                                       const int* smat_col_idx,
  1404                                       const double* smat_val,
  1405                                       double* trace_vec) {
  1406   cudaD_trace_mat_smat_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
  1407                              smat_val, trace_vec);
  1409 inline void cuda_trace_mat_smat_trans(dim3 Gr, dim3 Bl, 
const float* mat,
  1411                                       const int* smat_row_ptr,
  1412                                       const int* smat_col_idx,
  1413                                       const float* smat_val, 
float* trace_vec) {
  1414   cudaF_trace_mat_smat_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
  1415                              smat_val, trace_vec);
  1417 inline void cuda_vec_apply_ceiling(
int Gr, 
int Bl, 
double* v, 
double floor_val,
  1418                                    float* num, 
int dim) {
  1419   cudaD_vec_apply_ceiling(Gr, Bl, v, floor_val, num, dim);
  1421 inline void cuda_vec_apply_ceiling(
int Gr, 
int Bl, 
float* v, 
float floor_val,
  1422                                    float* num, 
int dim) {
  1423   cudaF_vec_apply_ceiling(Gr, Bl, v, floor_val, num, dim);
  1425 inline void cuda_vec_apply_exp(
int Gr, 
int Bl, 
double* v, 
int dim) {
  1426   cudaD_vec_apply_exp(Gr, Bl, v, dim);
  1428 inline void cuda_vec_apply_exp(
int Gr, 
int Bl, 
float* v, 
int dim) {
  1429   cudaF_vec_apply_exp(Gr, Bl, v, dim);
  1431 inline void cuda_vec_apply_floor(
int Gr, 
int Bl, 
double* v, 
double floor_val,
  1432                                  float* num, 
int dim) {
  1433   cudaD_vec_apply_floor(Gr, Bl, v, floor_val, num, dim);
  1435 inline void cuda_vec_apply_floor(
int Gr, 
int Bl, 
float* v, 
float floor_val,
  1436                                  float* num, 
int dim) {
  1437   cudaF_vec_apply_floor(Gr, Bl, v, floor_val, num, dim);
  1439 inline void cuda_vec_apply_log(
int Gr, 
int Bl, 
double* v, 
double* flag,
  1441   cudaD_vec_apply_log(Gr, Bl, v, flag, dim);
  1443 inline void cuda_vec_apply_log(
int Gr, 
int Bl, 
float* v, 
float* flag, 
int dim) {
  1444   cudaF_vec_apply_log(Gr, Bl, v, flag, dim);
  1446 inline void cuda_vec_copy_diag_from_packed(
int Gr, 
int Bl, 
double *dst,
  1447                                            const double *src, 
int dim) {
  1448   cudaD_vec_copy_diag_from_packed(Gr, Bl, dst, src, dim);
  1450 inline void cuda_vec_copy_diag_from_packed(
int Gr, 
int Bl, 
float *dst,
  1451                                            const float *src, 
int dim) {
  1452   cudaF_vec_copy_diag_from_packed(Gr, Bl, dst, src, dim);
  1454 inline void cuda_vec_max(
int Gr, 
int Bl, 
const double* v, 
double* value,
  1456   cudaD_vec_max(Gr, Bl, v, value, dim, inc);
  1458 inline void cuda_vec_max(
int Gr, 
int Bl, 
const float* v, 
float* value, 
int dim,
  1460   cudaF_vec_max(Gr, Bl, v, value, dim, inc);
  1462 inline void cuda_vec_min(
int Gr, 
int Bl, 
const double* v, 
double* value,
  1464   cudaD_vec_min(Gr, Bl, v, value, dim, inc);
  1466 inline void cuda_vec_min(
int Gr, 
int Bl, 
const float* v, 
float* value, 
int dim,
  1468   cudaF_vec_min(Gr, Bl, v, value, dim, inc);
  1470 inline void cuda_vec_mul_elements(
int Gr, 
int Bl, 
double* v, 
const double* a,
  1472   cudaD_vec_mul_elements(Gr, Bl, v, a, dim);
  1474 inline void cuda_vec_mul_elements(
int Gr, 
int Bl, 
float* v, 
const float* a,
  1476   cudaF_vec_mul_elements(Gr, Bl, v, a, dim);
  1478 inline void cuda_vec_soft_max(
int Gr, 
int Bl, 
double* v, 
int dim) {
  1479   cudaD_vec_soft_max(Gr, Bl, v, dim);
  1481 inline void cuda_vec_soft_max(
int Gr, 
int Bl, 
float* v, 
int dim) {
  1482   cudaF_vec_soft_max(Gr, Bl, v, dim);
  1484 inline void cuda_vec_sum(
int Gr, 
int Bl, 
double* v, 
double* value, 
int dim,
  1486   cudaD_vec_sum(Gr, Bl, v, value, dim, inc);
  1488 inline void cuda_vec_sum(
int Gr, 
int Bl, 
float* v, 
float* value, 
int dim,
  1490   cudaF_vec_sum(Gr, Bl, v, value, dim, inc);
  1495 inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
  1498   cuda_compress_uint8_sign(Gr, Bl, src, dim, dest, dest_stride);
  1502 template <
typename I>
  1503 inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
  1506   KALDI_ERR << 
"Not implemented for this type.";
  1509 inline void cuda_mat_compress(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
  1511                               int dest_stride, 
float inv_scale,
  1512                               bool bounds_check) {
  1513   cuda_compress_int16(Gr, Bl, src, dim, dest, dest_stride,
  1514                       inv_scale, bounds_check);
  1516 inline void cuda_mat_compress(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
  1518                               int dest_stride, 
float inv_scale,
  1519                               bool bounds_check) {
  1520   cuda_compress_uint16(Gr, Bl, src, dim, dest, dest_stride,
  1521                        inv_scale, bounds_check);
  1523 inline void cuda_mat_compress(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
  1525                               int dest_stride, 
float inv_scale,
  1526                               bool bounds_check) {
  1527   cuda_compress_uint8(Gr, Bl, src, dim, dest, dest_stride,
  1528                       inv_scale, bounds_check);
  1530 inline void cuda_mat_compress(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
  1532                               int dest_stride, 
float inv_scale,
  1533                               bool bounds_check) {
  1534   cuda_compress_int8(Gr, Bl, src, dim, dest, dest_stride,
  1535                      inv_scale, bounds_check);
  1538 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, 
BaseFloat *dest,
  1540                                 int src_stride, 
float scale) {
  1541   cuda_uncompress_int8(Gr, Bl, dest, dim, src, src_stride, scale);
  1543 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, 
BaseFloat *dest,
  1545                                 int src_stride, 
float scale) {
  1546   cuda_uncompress_uint8(Gr, Bl, dest, dim, src, src_stride, scale);
  1548 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, 
BaseFloat *dest,
  1550                                 int src_stride, 
float scale) {
  1551   cuda_uncompress_int16(Gr, Bl, dest, dim, src, src_stride, scale);
  1553 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, 
BaseFloat *dest,
  1555                                 int src_stride, 
float scale) {
  1556   cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale);
  1559 inline void cuda_mat_copy_range_clamped(
  1560    int32_t row_start, int32_t row_end, int32_t num_cols,
  1561    const double *src, int32_t lds, 
  1562    int32_t clamp_low, int32_t clamp_high,
  1563    double *dst, int32_t ldd) {
  1564   cudaD_mat_copy_range_clamped(row_start, row_end, num_cols,
  1565       src, lds, clamp_low, clamp_high, dst, ldd);
  1568 inline void cuda_mat_copy_range_clamped(
  1569    int32_t row_start, int32_t row_end, int32_t num_cols,
  1570    const float *src, int32_t lds, 
  1571    int32_t clamp_low, int32_t clamp_high,
  1572    float *dst, int32_t ldd) {
  1573   cudaF_mat_copy_range_clamped(row_start, row_end, num_cols,
  1574       src, lds, clamp_low, clamp_high, dst, ldd);
  1577 inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
  1578     int32_t *num_cols, 
const float **inputs, int32_t *ldi, 
float **outputs,
  1580   cudaF_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi,
  1584 inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
  1585     int32_t *num_cols, 
const double **inputs, int32_t *ldi, 
double **outputs,
  1587   cudaD_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi,
 
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
 
Structure containing size of the matrix plus stride. 
 
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
 
int32_t MatrixIndexT_cuda