26 #ifndef KALDI_CUDAMATRIX_CU_KERNELS_ANSI_H_    27 #define KALDI_CUDAMATRIX_CU_KERNELS_ANSI_H_    36 #if (KALDI_DOUBLEPRECISION != 0)    43 void cudaD_add_row_sum_mat(
int Gr, 
int Bl, 
double* result, 
const double* mat,
    46 void cudaF_add_row_sum_mat(
int Gr, 
int Bl, 
float* result, 
const float* mat,
    49 void cudaD_add_col_sum_mat(
int Gr, 
int Bl, 
double* result, 
const double* mat,
    52 void cudaF_add_col_sum_mat(
int Gr, 
int Bl, 
float* result, 
const float* mat,
    55 void cudaD_add_cols(dim3 Gr, dim3 Bl, 
double* dst, 
const double* src,
    58 void cudaF_add_cols(dim3 Gr, dim3 Bl, 
float* dst, 
const float* src,
    61 void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, 
const double alpha,
    62                                const double* M, 
const int strid_M,
    64                                const double beta, 
double* v);
    65 void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, 
const float alpha,
    66                                const float* M, 
const int strid_M,
    68                                const float beta, 
float* v);
    69 void cudaD_add_diag_mat_mat_MNT(
int Gr, 
int Bl, 
const double alpha,
    71                                 const double* N, 
const int stride_N,
    72                                 const double beta, 
double* v);
    73 void cudaF_add_diag_mat_mat_MNT(
int Gr, 
int Bl, 
const float alpha,
    75                                 const float* N, 
const int stride_N,
    76                                 const float beta, 
float* v);
    77 void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, 
const double alpha,
    78                                 const double* M, 
const int strid_M,
    80                                 const double beta, 
double* v,
    82 void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, 
const float alpha,
    83                                 const float* M, 
const int strid_M,
    85                                 const float beta, 
float* v, 
const int stride_v);
    86 void cudaD_add_diag_packed(
int Gr, 
int Bl, 
double* mat, 
double value, 
int dim);
    87 void cudaF_add_diag_packed(
int Gr, 
int Bl, 
float* mat, 
float value, 
int dim);
    88 void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, 
double alpha, 
double *mat,
    90                             const double *mat2, 
int mat2_row_stride,
    91                             int mat2_col_stride, 
double beta);
    92 void cudaF_add_diag_vec_mat(dim3 Gr, dim3 Bl, 
float alpha, 
float *mat,
    94                             const float *mat2, 
int mat2_row_stride,
    95                             int mat2_col_stride, 
float beta);
    96 void cudaD_add(dim3 Gr, dim3 Bl, 
double *mat, 
double value, 
MatrixDim d);
    97 void cudaF_add(dim3 Gr, dim3 Bl, 
float *mat, 
float value, 
MatrixDim d);
    98 void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, 
double *data, 
MatrixDim d,
    99                             const double *Adata, 
int A_num_rows, 
int A_num_cols,
   100                             int A_row_stride, 
int A_col_stride,
   102                             int B_num_blocks, 
double alpha, 
double beta,
   104 void cudaF_add_mat_blockmat(dim3 Gr, dim3 Bl, 
float *data, 
MatrixDim d,
   105                             const float *Adata, 
int A_num_rows, 
int A_num_cols,
   106                             int A_row_stride, 
int A_col_stride,
   108                             int B_num_blocks, 
float alpha, 
float beta,
   110 void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, 
double alpha, 
const double *src,
   114 void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, 
float alpha, 
const float *src,
   116                           float *dst, 
MatrixDim d, 
int src_stride, 
int A_trans);
   117 void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, 
double alpha, 
const double *src,
   119 void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, 
float alpha, 
const float *src,
   121 void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, 
double alpha, 
double *mat,
   123                             int mat2_row_stride, 
int mat2_col_stride,
   124                             const double *vec, 
double beta);
   125 void cudaF_add_mat_diag_vec(dim3 Gr, dim3 Bl, 
float alpha, 
float *mat,
   127                             int mat2_row_stride, 
int mat2_col_stride,
   128                             const float *vec, 
float beta);
   129 void cudaD_add_mat(dim3 Gr, dim3 Bl, 
double alpha, 
const double *src,
   130                    double *dst, 
MatrixDim d, 
int src_stride, 
int A_trans);
   131 void cudaF_add_mat(dim3 Gr, dim3 Bl, 
float alpha, 
const float *src, 
float *dst,
   133 void cudaD_add_mat_mat_elements(dim3 Gr, dim3 Bl, 
double *data,
   134                                 const double *srcA_data,
   136                                 int srcA_stride, 
int srcB_stride, 
double alpha,
   138 void cudaF_add_mat_mat_elements(dim3 Gr, dim3 Bl, 
float *data,
   139                                 const float *srcA_data, 
const float *srcB_data,
   140                                 MatrixDim dim, 
int srcA_stride, 
int srcB_stride,
   141                                 float alpha, 
float beta);
   142 void cudaD_add_row_ranges(dim3 Gr, dim3 Bl, 
double *data, 
MatrixDim dim,
   143                           const double *src_data, 
MatrixDim src_dim,
   145 void cudaF_add_row_ranges(dim3 Gr, dim3 Bl, 
float *data, 
MatrixDim dim,
   146                           const float *src_data, 
MatrixDim src_dim,
   148 void cudaD_add_rows(dim3 Gr, dim3 Bl, 
double alpha, 
double* dst,
   151 void cudaF_add_rows(dim3 Gr, dim3 Bl, 
float alpha, 
float* dst, 
const float* src,
   154 void cudaD_mul_rows(dim3 Gr, dim3 Bl, 
double* dst,
   157 void cudaF_mul_rows(dim3 Gr, dim3 Bl, 
float* dst, 
const float* src,
   160 void cudaD_add_rows_direct(dim3 Gr, dim3 Bl, 
double alpha, 
double* dst,
   161                            const double* 
const * src, 
MatrixDim dst_dim);
   162 void cudaF_add_rows_direct(dim3 Gr, dim3 Bl, 
float alpha, 
float* dst,
   163                            const float* 
const * src, 
MatrixDim dst_dim);
   164 void cudaD_add_smat(dim3 Gr, dim3 Bl, 
double* mat, 
MatrixDim mat_dim,
   165                     double alpha, 
const int* smat_row_ptr,
   166                     const int* smat_col_idx, 
const double* smat_val);
   167 void cudaF_add_smat(dim3 Gr, dim3 Bl, 
float* mat, 
MatrixDim mat_dim,
   168                     float alpha, 
const int* smat_row_ptr,
   169                     const int* smat_col_idx, 
const float* smat_val);
   170 void cudaD_add_smat_trans(dim3 Gr, dim3 Bl, 
double* mat, 
MatrixDim mat_dim,
   171                           double alpha, 
const int* smat_row_ptr,
   172                           const int* smat_col_idx, 
const double* smat_val);
   173 void cudaF_add_smat_trans(dim3 Gr, dim3 Bl, 
float* mat, 
MatrixDim mat_dim,
   174                           float alpha, 
const int* smat_row_ptr,
   175                           const int* smat_col_idx, 
const float* smat_val);
   176 void cudaD_add_to_rows_direct(dim3 Gr, dim3 Bl, 
double alpha,
   177                               double* 
const * dst, 
const double* src,
   179 void cudaF_add_to_rows_direct(dim3 Gr, dim3 Bl, 
float alpha, 
float* 
const * dst,
   181 void cudaD_add_to_rows(dim3 Gr, dim3 Bl, 
double alpha,
   182                        double* dst, 
const double* src,
   185 void cudaF_add_to_rows(dim3 Gr, dim3 Bl, 
float alpha,
   186                        float* dst, 
const float* src,
   189 void cudaD_add_vec2(dim3 Gr, dim3 Bl, 
double *mat, 
const double *vec,
   190                     const double alpha, 
int dim);
   191 void cudaF_add_vec2(dim3 Gr, dim3 Bl, 
float* mat, 
const float* vec,
   192                     const float alpha, 
int dim);
   193 void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, 
double alpha, 
const double *col,
   195 void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, 
float alpha, 
const float *col,
   197 void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, 
double alpha, 
const double *row,
   199 void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, 
float alpha, 
const float *row,
   201 void cudaD_add_vec_vec(
int Gr, 
int Bl, 
double alpha, 
double* v, 
const double* x,
   202                        const double* y, 
double beta, 
int dim);
   203 void cudaF_add_vec_vec(
int Gr, 
int Bl, 
float alpha, 
float* v, 
const float* x,
   204                        const float* y, 
float beta, 
int dim);
   206                              int num_blocks, 
const double *C_data,
   207                              int C_num_cols, 
int C_row_stride, 
int C_col_stride,
   208                              const double *D_data, 
int D_row_stride,
   209                              int D_col_stride, 
double alpha, 
double beta);
   211                              int num_blocks, 
const float *C_data,
   212                              int C_num_cols, 
int C_row_stride, 
int C_col_stride,
   213                              const float *D_data, 
int D_row_stride,
   214                              int D_col_stride, 
float alpha, 
float beta);
   215 void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, 
double *y, 
const double *x1,
   217                                 int x1_stride, 
int x2_stride, 
int group_size);
   218 void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, 
float *y, 
const float *x1,
   219                                 const float *x2, 
MatrixDim y_dim, 
int x1_stride,
   220                                 int x2_stride, 
int group_size);
   227 void cudaD_copy_col_from_mat_df(
int Gr, 
int Bl, 
double* v, 
int col,
   228                                 const double* mat, 
MatrixDim dmat, 
int dim);
   229 void cudaF_copy_col_from_mat_df(
int Gr, 
int Bl, 
double* v, 
int col,
   230                                 const float* mat, 
MatrixDim dmat, 
int dim);
   231 void cudaD_copy_col_from_mat_fd(
int Gr, 
int Bl, 
float* v, 
int col,
   232                                 const double* mat, 
MatrixDim dmat, 
int dim);
   233 void cudaF_copy_col_from_mat_fd(
int Gr, 
int Bl, 
float* v, 
int col,
   234                                 const float* mat, 
MatrixDim dmat, 
int dim);
   235 void cudaD_copy_cols(dim3 Gr, dim3 Bl, 
double* dst, 
const double* src,
   238 void cudaF_copy_cols(dim3 Gr, dim3 Bl, 
float* dst, 
const float* src,
   241 void cudaD_copy_cols_from_vec(dim3 Gr, dim3 Bl, 
double *mat_out,
   243 void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, 
float *mat_out, 
MatrixDim d_out,
   245 void cudaD_copy(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   247 void cudaF_copy(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
   249 void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, 
double *mat_out,
   252 void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, 
double *mat_out,
   255 void cuda_copy_from_mat_df(dim3 Gr, dim3 Bl, 
double* mat_out,
   258 void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, 
double* mat_out,
   261 void cuda_copy_from_mat_fd(dim3 Gr, dim3 Bl, 
float *mat_out,
   264 void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, 
float *mat_out,
   267 void cuda_copy_from_mat_ff(dim3 Gr, dim3 Bl, 
float* mat_out,
   270 void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, 
float* mat_out,
   273 void cuda_copy_from_smat_dd(dim3 Gr, dim3 Bl, 
double* mat, 
MatrixDim mat_dim,
   274                             const int* smat_row_ptr, 
const int* smat_col_idx,
   275                             const double* smat_val);
   276 void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, 
double* mat,
   277                                   MatrixDim mat_dim, 
const int* smat_row_ptr,
   278                                   const int* smat_col_idx,
   279                                   const double* smat_val);
   280 void cuda_copy_from_smat_df(dim3 Gr, dim3 Bl, 
double* mat, 
MatrixDim mat_dim,
   281                             const int* smat_row_ptr, 
const int* smat_col_idx,
   282                             const float* smat_val);
   283 void cuda_copy_from_smat_df_trans(dim3 Gr, dim3 Bl, 
double* mat,
   284                                   MatrixDim mat_dim, 
const int* smat_row_ptr,
   285                                   const int* smat_col_idx,
   286                                   const float* smat_val);
   287 void cuda_copy_from_smat_fd(dim3 Gr, dim3 Bl, 
float* mat, 
MatrixDim mat_dim,
   288                             const int* smat_row_ptr, 
const int* smat_col_idx,
   289                             const double* smat_val);
   290 void cuda_copy_from_smat_fd_trans(dim3 Gr, dim3 Bl, 
float* mat,
   291                                   MatrixDim mat_dim, 
const int* smat_row_ptr,
   292                                   const int* smat_col_idx,
   293                                   const double* smat_val);
   294 void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, 
float* mat, 
MatrixDim mat_dim,
   295                             const int* smat_row_ptr, 
const int* smat_col_idx,
   296                             const float* smat_val);
   297 void cuda_copy_from_smat_ff_trans(dim3 Gr, dim3 Bl, 
float* mat,
   298                                   MatrixDim mat_dim, 
const int* smat_row_ptr,
   299                                   const int* smat_col_idx,
   300                                   const float* smat_val);
   301 void cudaD_copy_from_sp(dim3 Gr, dim3 Bl, 
const double* x, 
double* y,
   303 void cudaF_copy_from_sp(dim3 Gr, dim3 Bl, 
const float* x, 
float* y,
   305 void cudaD_copy_from_tp(dim3 Gr, dim3 Bl, 
double* A, 
const double* B,
   307 void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl, 
double* A, 
const float* B,
   309 void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl, 
float* A, 
const double* B,
   311 void cudaF_copy_from_tp(dim3 Gr, dim3 Bl, 
float* A, 
const float* B,
   313 void cudaD_copy_from_tp_trans(dim3 Gr, dim3 Bl, 
double* A, 
const double* B,
   315 void cudaDF_copy_from_tp_trans(dim3 Gr, dim3 Bl, 
double* A, 
const float* B,
   317 void cudaFD_copy_from_tp_trans(dim3 Gr, dim3 Bl, 
float* A, 
const double* B,
   319 void cudaF_copy_from_tp_trans(dim3 Gr, dim3 Bl, 
float* A, 
const float* B,
   321 void cublas_copy_kaldi_df(
int Gr, 
int Bl, 
int n, 
const double* x, 
int incx,
   323 void cublas_copy_kaldi_fd(
int Gr, 
int Bl, 
int n, 
const float* x, 
int incx,
   324                           double* y, 
int incy);
   325 void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, 
double* A, 
MatrixDim dimA);
   326 void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, 
float* A, 
MatrixDim dimA);
   327 void cudaD_copy_rows(dim3 Gr, dim3 Bl, 
double* dst, 
const double* src,
   330 void cudaF_copy_rows(dim3 Gr, dim3 Bl, 
float* dst, 
const float* src,
   333 void cudaD_copy_rows_direct(dim3 Gr, dim3 Bl, 
double* dst,
   334                             const double* 
const * src, 
MatrixDim dst_dim);
   335 void cudaF_copy_rows_direct(dim3 Gr, dim3 Bl, 
float* dst,
   336                             const float* 
const * src, 
MatrixDim dst_dim);
   337 void cudaD_copy_rows_from_vec(dim3 Gr, dim3 Bl, 
double *mat_out,
   339 void cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, 
float *mat_out, 
MatrixDim d_out,
   341 void cudaD_copy_to_rows_direct(dim3 Gr, dim3 Bl, 
double* 
const * dst,
   343 void cudaF_copy_to_rows_direct(dim3 Gr, dim3 Bl, 
float* 
const * dst,
   345 void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, 
double* A, 
MatrixDim dimB);
   346 void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, 
float* A, 
MatrixDim dimA);
   347 void cudaD_diff_group_pnorm(dim3 Gr, dim3 Bl, 
double *
id, 
const double *iv,
   348                             const double *ov, 
const double* od,
   349                             MatrixDim id_dim, 
int iv_stride, 
int ov_stride,
   350                             int od_stride, 
int group_size, 
double power);
   351 void cudaF_diff_group_pnorm(dim3 Gr, dim3 Bl, 
float *
id, 
const float *iv,
   352                             const float *ov, 
const float* od, 
MatrixDim id_dim,
   353                             int iv_stride, 
int ov_stride, 
int od_stride,
   354                             int group_size, 
float power);
   355 void cudaD_diff_log_softmax(dim3 Gr, dim3 Bl, 
const MatrixDim in_deriv_dim,
   356                             const double* out_value, 
const int out_value_stride,
   357                             const double* out_deriv, 
const int out_deriv_stride,
   359 void cudaF_diff_log_softmax(dim3 Gr, dim3 Bl, 
const MatrixDim in_deriv_dim,
   360                             const float* out_value, 
const int out_value_stride,
   361                             const float* out_deriv, 
const int out_deriv_stride,
   363 void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, 
const int cell_dim,
   364                                   const int have_dropout_mask,
   365                                   const int num_rows, 
const double* input,
   366                                   const int in_stride, 
const double* params,
   367                                   const int params_stride,
   368                                   const double* output_deriv,
   369                                   const int output_deriv_stride,
   370                                   const double* deriv_sum_in,
   371                                   const int deriv_sum_in_stride,
   372                                   const double* self_repair_config,
   373                                   double count, 
double* input_deriv,
   374                                   const int input_deriv_stride,
   375                                   double* params_deriv,
   376                                   const int params_deriv_stride,
   377                                   double* value_sum_out,
   378                                   const int value_sum_out_stride,
   379                                   double* deriv_sum_out,
   380                                   const int deriv_sum_out_stride,
   381                                   double* self_repair_sum_out,
   382                                   const int self_repair_sum_out_stride);
   383 void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, 
const int cell_dim,
   384                                   const int have_dropout_mask,
   385                                   const int num_rows, 
const float* input,
   386                                   const int in_stride, 
const float* params,
   387                                   const int params_stride,
   388                                   const float* output_deriv,
   389                                   const int output_deriv_stride,
   390                                   const double* deriv_sum_in,
   391                                   const int deriv_sum_in_stride,
   392                                   const float* self_repair_config, 
double count,
   394                                   const int input_deriv_stride,
   396                                   const int params_deriv_stride,
   397                                   double* value_sum_out,
   398                                   const int value_sum_out_stride,
   399                                   double* deriv_sum_out,
   400                                   const int deriv_sum_out_stride,
   401                                   float* self_repair_sum_out,
   402                                   const int self_repair_sum_out_stride);
   403 void cudaD_diff_normalize_per_row(
size_t Gr, 
size_t Bl, 
double *
id,
   404                                   int id_stride, 
const double *iv,
   406                                   int od_stride, 
double target_rms,
   407                                   bool add_log_stddev);
   408 void cudaF_diff_normalize_per_row(
size_t Gr, 
size_t Bl, 
float *
id,
   409                                   int id_stride, 
const float *iv,
   411                                   int od_stride, 
float target_rms,
   412                                   bool add_log_stddev);
   413 void cudaD_diff_parametric_relu(dim3 Gr, dim3 Bl, 
double *eout, 
const double *e,
   415                                 int y_stride, 
const double *a, 
const double *b);
   416 void cudaF_diff_parametric_relu(dim3 Gr, dim3 Bl, 
float *eout, 
const float *e,
   418                                 int y_stride, 
const float *a, 
const float *b);
   419 void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, 
double *eout, 
const double *e,
   422 void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, 
float *eout, 
const float *e,
   425 void cudaD_diff_softmax(dim3 Gr, dim3 Bl, 
double* x, 
const MatrixDim dim,
   426                         const double* value, 
const int value_stride,
   427                         const double* diff, 
const int diff_stride);
   428 void cudaF_diff_softmax(dim3 Gr, dim3 Bl, 
float* x, 
const MatrixDim dim,
   429                         const float* value, 
const int value_stride,
   430                         const float* diff, 
const int diff_stride);
   431 void cudaD_diff_tanh(dim3 Gr, dim3 Bl, 
double *eout, 
const double *e,
   432                      const double *y, 
MatrixDim d, 
int e_stride, 
int y_stride);
   433 void cudaF_diff_tanh(dim3 Gr, dim3 Bl, 
float *eout, 
const float *e,
   434                      const float *y, 
MatrixDim d, 
int e_stride, 
int y_stride);
   435 void cudaD_ensure_nonzero(dim3 Gr, dim3 Bl, 
const double *x, 
MatrixDim d,
   436                           double epsilon, 
int y_stride, 
double *y);
   437 void cudaF_ensure_nonzero(dim3 Gr, dim3 Bl, 
const float *x, 
MatrixDim d,
   438                           float epsilon, 
int y_stride, 
float *y);
   439 void cudaD_diff_xent(dim3 Gr, dim3 Bl, 
const int32_cuda *vec_tgt,
   440                      double *mat_net_out, 
double *vec_log_post, 
MatrixDim d);
   441 void cudaF_diff_xent(dim3 Gr, dim3 Bl, 
const int32_cuda *vec_tgt,
   442                      float *mat_net_out, 
float *vec_log_post, 
MatrixDim d);
   443 void cudaD_div_elements(dim3 Gr, dim3 Bl, 
double *mat, 
const double *A,
   445 void cudaF_div_elements(dim3 Gr, dim3 Bl, 
float *mat, 
const float *A,
   447 void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, 
double *mat, 
const double *vec_div,
   449 void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, 
float *mat, 
const float *vec_div,
   451 void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, 
const double *mat1,
   452                               const double *mat2, 
double *mask,
   455 void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, 
const float *mat1,
   456                               const float *mat2, 
float *mask,
   459 void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, 
const double *mat, 
double *vec_val,
   461 void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, 
const float *mat, 
float *vec_val,
   463 void cudaD_group_max(dim3 Gr, dim3 Bl, 
double *y, 
const double *x, 
MatrixDim d,
   464                      int src_stride, 
int group_size);
   465 void cudaF_group_max(dim3 Gr, dim3 Bl, 
float *y, 
const float *x, 
MatrixDim d,
   466                      int src_stride, 
int group_size);
   467 void cudaD_group_pnorm(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   470 void cudaF_group_pnorm(dim3 Gr, dim3 Bl, 
float *y, 
const float *x, 
MatrixDim d,
   471                        int src_stride, 
int group_size, 
float power);
   472 void cudaD_group_spec_pnorm(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   475 void cudaF_group_spec_pnorm(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
   478 void cudaD_heaviside(dim3 Gr, dim3 Bl, 
double *y, 
const double *x, 
MatrixDim d,
   480 void cudaF_heaviside(dim3 Gr, dim3 Bl, 
float *y, 
const float *x, 
MatrixDim d,
   482 void cudaD_exp(dim3 Gr, dim3 Bl, 
double *y, 
const double *x, 
MatrixDim d,
   484 void cudaF_exp(dim3 Gr, dim3 Bl, 
float *y, 
const float *x, 
MatrixDim d,
   486 void cudaD_pow(dim3 Gr, dim3 Bl, 
double *y, 
const double *x, 
double power, 
MatrixDim d,
   488 void cudaF_pow(dim3 Gr, dim3 Bl, 
float *y, 
const float *x, 
float power, 
MatrixDim d,
   490 void cudaD_ceiling(dim3 Gr, dim3 Bl, 
double* y, 
const double* x, 
double ceiling_val,
   492 void cudaF_ceiling(dim3 Gr, dim3 Bl, 
float* y, 
const float* x, 
float ceiling_val,
   494 void cudaD_floor(dim3 Gr, dim3 Bl, 
double* y, 
const double* x, 
double floor_val,
   496 void cudaF_floor(dim3 Gr, dim3 Bl, 
float* y, 
const float* x, 
float floor_val,
   498 void cudaD_exp_limited(dim3 Gr, dim3 Bl, 
double* y, 
const double* x,
   499                        double lower_limit, 
double upper_limit, 
MatrixDim d, 
int src_stride);
   500 void cudaF_exp_limited(dim3 Gr, dim3 Bl, 
float* y, 
const float* x,
   501                        float lower_limit, 
float upper_limit, 
MatrixDim d, 
int src_stride);
   502 void cudaD_exp_special(dim3 Gr, dim3 Bl, 
double* y, 
const double* x,
   504 void cudaF_exp_special(dim3 Gr, dim3 Bl, 
float* y, 
const float* x,
   506 void cudaD_log(dim3 Gr, dim3 Bl, 
double* y, 
const double* x, 
MatrixDim d, 
int src_stride);
   507 void cudaF_log(dim3 Gr, dim3 Bl, 
float* y, 
const float* x, 
MatrixDim d, 
int src_stride);
   508 void cudaD_pow_abs(dim3 Gr, dim3 Bl, 
double* y, 
const double* x, 
double power,
   509                    bool include_sign, 
MatrixDim dim, 
int src_stride);
   510 void cudaF_pow_abs(dim3 Gr, dim3 Bl, 
float* y, 
const float* x, 
float power,
   511                    bool include_sign, 
MatrixDim dim, 
int src_stride);  
   516 void cuda_int32_sequence(dim3 Gr, dim3 Bl, 
int32_cuda* data, 
int length,
   518 void cudaD_invert_elements(dim3 Gr, dim3 Bl, 
double *data, 
MatrixDim d);
   519 void cudaF_invert_elements(dim3 Gr, dim3 Bl, 
float *data, 
MatrixDim d);
   520 void cudaD_log_softmax_reduce(
size_t Gr, 
size_t Bl, 
double *y, 
const double *x,
   522 void cudaF_log_softmax_reduce(
size_t Gr, 
size_t Bl, 
float *y, 
const float *x,
   524 void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, 
const double* in,
   525                              const int in_stride, 
const double* params,
   526                              const int params_stride, 
const int out_stride,
   527                              const int cell_dim, 
const int have_dropout_mask,
   530 void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, 
const float* in,
   531                              const int in_stride, 
const float* params,
   532                              const int params_stride, 
const int out_stride,
   533                              const int cell_dim, 
const int have_dropout_mask,
   536 void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, 
double *data, 
MatrixDim dim,
   539 void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, 
float *data, 
MatrixDim dim,
   542 void cudaD_matrix_add_indexed_values(dim3 Gr, dim3 Bl, 
MatrixDim dim,
   544                                      const double* x, 
int s, 
double* data);
   545 void cudaF_matrix_add_indexed_values(dim3 Gr, dim3 Bl, 
MatrixDim dim,
   547                                      const float* x, 
int s, 
float* data);
   548 void cudaD_matrix_add_to_elements(dim3 Gr, dim3 Bl, 
double alpha,
   551 void cudaF_matrix_add_to_elements(dim3 Gr, dim3 Bl, 
float alpha,
   554 void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, 
const double *data, 
MatrixDim dim,
   555                          const Int32Pair *indices, 
int indices_size,
   557 void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, 
const float *data, 
MatrixDim dim,
   558                          const Int32Pair *indices, 
int indices_size,
   560 void cudaD_vector_copy_elements(dim3 Gr, dim3 Bl, 
double *data, 
int dim,
   561                                 const double *src_mat, 
int mat_stride,
   564 void cudaF_vector_copy_elements(dim3 Gr, dim3 Bl, 
float *data, 
int dim,
   565                                 const float *src_mat, 
int mat_stride,
   568 void cudaD_max(dim3 Gr, dim3 Bl, 
double *mat, 
const double *A, 
MatrixDim dst_d,
   570 void cudaF_max(dim3 Gr, dim3 Bl, 
float *mat, 
const float *A, 
MatrixDim dst_d,
   572 void cudaD_max_mat_cols(
int Gr, 
int Bl, 
double* result, 
const double* mat,
   574 void cudaF_max_mat_cols(
int Gr, 
int Bl, 
float* result, 
const float* mat,
   576 void cudaD_min(dim3 Gr, dim3 Bl, 
double *mat, 
const double *other,
   578 void cudaF_min(dim3 Gr, dim3 Bl, 
float *mat, 
const float *other,
   580 void cudaD_min_mat_cols(
int Gr, 
int Bl, 
double* result, 
const double* mat,
   582 void cudaF_min_mat_cols(
int Gr, 
int Bl, 
float* result, 
const float* mat,
   584 void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, 
double *mat, 
const double *scale,
   586 void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, 
float *mat, 
const float *scale,
   588 void cudaD_mul_elements(dim3 Gr, dim3 Bl, 
double *mat, 
const double *A,
   590 void cudaF_mul_elements(dim3 Gr, dim3 Bl, 
float *mat, 
const float *A,
   592 void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   594 void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
   596 void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, 
double *mat, 
const double *scale,
   598 void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, 
float *mat, 
const float *scale,
   600 void cudaD_normalize_per_row(
size_t Gr, 
size_t Bl, 
double *y, 
int y_stride,
   601                              const double *x, 
MatrixDim x_d, 
double tartget_rms,
   602                              bool add_log_stddev);
   603 void cudaF_normalize_per_row(
size_t Gr, 
size_t Bl, 
float *y, 
int y_stride,
   604                              const float *x, 
MatrixDim x_d, 
float tartget_rms,
   605                              bool add_log_stddev);
   606 void cudaD_one(
int Gr, 
int Bl, 
double* x, 
int dim);
   607 void cudaF_one(
int Gr, 
int Bl, 
float* x, 
int dim);
   608 void cudaD_parametric_relu(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   611 void cudaF_parametric_relu(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
   614 void cudaD_randomize(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   617 void cudaF_randomize(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
   620 void cudaD_regularize_l1(dim3 Gr, dim3 Bl, 
double *wei, 
double *grad, 
double l1,
   622 void cudaF_regularize_l1(dim3 Gr, dim3 Bl, 
float *wei, 
float *grad, 
float l1,
   624 void cudaD_replace_value(
int Gr, 
int Bl, 
double *v, 
int dim, 
double orig,
   626 void cudaF_replace_value(
int Gr, 
int Bl, 
float *v, 
int dim, 
float orig,
   628 void cudaD_scale_diag_packed(
int Gr, 
int Bl, 
double* mat, 
double value,
   630 void cudaF_scale_diag_packed(
int Gr, 
int Bl, 
float* mat, 
float value, 
int dim);
   631 void cudaD_scale(dim3 Gr, dim3 Bl, 
double *mat, 
double value, 
MatrixDim d);
   632 void cudaF_scale(dim3 Gr, dim3 Bl, 
float *mat, 
float value, 
MatrixDim d);
   633 void cudaD_select_rows(dim3 Gr, dim3 Bl, 
const int* out_row_ptr,
   634                        int* out_col_idx, 
double* out_val,
   635                        const int* row_indexes, 
const int num_selected_rows,
   636                        const int* in_row_ptr, 
const int* in_col_idx,
   637                        const double* in_val);
   638 void cudaF_select_rows(dim3 Gr, dim3 Bl, 
const int* out_row_ptr,
   639                        int* out_col_idx, 
float* out_val, 
const int* row_indexes,
   640                        const int num_selected_rows, 
const int* in_row_ptr,
   641                        const int* in_col_idx, 
const float* in_val);
   642 void cudaD_set_bias_params(
int Gr, 
int Bl, 
double* v, 
const double* a,
   643                            double param_1, 
double param_2, 
double param_3,
   645 void cudaF_set_bias_params(
int Gr, 
int Bl, 
float* v, 
const float* a,
   646                            float param_1, 
float param_2, 
float param_3,
   648 void cudaD_set_const(dim3 Gr, dim3 Bl, 
double *mat, 
double value, 
MatrixDim d);
   649 void cudaF_set_const(dim3 Gr, dim3 Bl, 
float *mat, 
float value, 
MatrixDim d);
   650 void cudaD_set_diag(
int Gr, 
int Bl, 
double* mat, 
double value, 
MatrixDim d);
   651 void cudaF_set_diag(
int Gr, 
int Bl, 
float* mat, 
float value, 
MatrixDim d);
   652 void cudaD_set_diag_packed(
int Gr, 
int Bl, 
double* mat, 
double value, 
int dim);
   653 void cudaF_set_diag_packed(
int Gr, 
int Bl, 
float* mat, 
float value, 
int dim);
   654 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, 
const double *A,
   655                                const double *B, 
const double *C, 
double *dst,
   658 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, 
const float *A, 
const float *B,
   660                                int stride_a, 
int stride_b, 
int stride_c);
   661 void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, 
double* mat, 
MatrixDim d);
   662 void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, 
float* mat, 
MatrixDim d);
   663 void cudaD_sigmoid(dim3 Gr, dim3 Bl, 
double *y, 
const double *x, 
MatrixDim d,
   665 void cudaF_sigmoid(dim3 Gr, dim3 Bl, 
float *y, 
const float *x, 
MatrixDim d,
   667 void cudaD_soft_hinge(dim3 Gr, dim3 Bl, 
double *y, 
const double *x, 
MatrixDim d,
   669 void cudaF_soft_hinge(dim3 Gr, dim3 Bl, 
float *y, 
const float *x, 
MatrixDim d,
   671 void cudaD_softmax_reduce(
size_t Gr, 
size_t Bl, 
double *y, 
const double *x,
   673 void cudaF_softmax_reduce(
size_t Gr, 
size_t Bl, 
float *y, 
const float *x,
   675 void cudaD_splice(dim3 Gr, dim3 Bl, 
double *y, 
const double *x,
   677 void cudaF_splice(dim3 Gr, dim3 Bl, 
float *y, 
const float *x,
   679 void cudaD_sum_column_ranges(dim3 Gr, dim3 Bl, 
double *data, 
MatrixDim dim,
   680                              const double *src_data, 
MatrixDim src_dim,
   682 void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, 
float *data, 
MatrixDim dim,
   683                              const float *src_data, 
MatrixDim src_dim,
   685 void cudaD_sum_mat_cols(
int Gr, 
int Bl, 
double* result, 
const double* mat,
   687 void cudaF_sum_mat_cols(
int Gr, 
int Bl, 
float* result, 
const float* mat,
   689 void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, 
double alpha, 
double beta,
   690                       const double* T, 
MatrixDim tdim, 
double *S,
   692 void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, 
float alpha, 
float beta, 
const float* T,
   694 void cudaD_take_lower(dim3 Gr, dim3 Bl, 
const double* x, 
double* y,
   696 void cudaF_take_lower(dim3 Gr, dim3 Bl, 
const float* x, 
float* y,
   698 void cudaD_take_mean(dim3 Gr, dim3 Bl, 
const double* x, 
double* y,
   700 void cudaF_take_mean(dim3 Gr, dim3 Bl, 
const float* x, 
float* y,
   702 void cudaD_take_upper(dim3 Gr, dim3 Bl, 
const double* x, 
double* y,
   704 void cudaF_take_upper(dim3 Gr, dim3 Bl, 
const float* x, 
float* y,
   706 void cudaD_tanh(dim3 Gr, dim3 Bl, 
double *y, 
const double *x, 
MatrixDim d,
   708 void cudaF_tanh(dim3 Gr, dim3 Bl, 
float *y, 
const float *x, 
MatrixDim d,
   710 void cudaD_trace(
int Gr, 
int Bl, 
double* mat, 
double* value, 
int dim);
   711 void cudaF_trace(
int Gr, 
int Bl, 
float* mat, 
float* value, 
int dim);
   712 void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, 
const double* A, 
const double* B,
   713                          MatrixDim dA, 
int B_stride, 
double* value);
   714 void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, 
const float* A, 
const float* B,
   715                          MatrixDim dA, 
int B_stride, 
float* value);
   716 void cudaD_trace_mat_mat_trans(dim3 Gr, dim3 Bl, 
const double* A,
   717                                const double* B, 
MatrixDim dA, 
int B_stride,
   719 void cudaF_trace_mat_mat_trans(dim3 Gr, dim3 Bl, 
const float* A, 
const float* B,
   720                                MatrixDim dA, 
int B_stride, 
float* value);
   721 void cudaD_trace_mat_smat(dim3 Gr, dim3 Bl, 
const double* mat,
   722                           MatrixDim mat_dim, 
const int* smat_row_ptr,
   723                           const int* smat_col_idx, 
const double* smat_val,
   725 void cudaF_trace_mat_smat(dim3 Gr, dim3 Bl, 
const float* mat, 
MatrixDim mat_dim,
   726                           const int* smat_row_ptr, 
const int* smat_col_idx,
   727                           const float* smat_val, 
float* trace_vec);
   728 void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, 
const double* mat,
   729                                 MatrixDim mat_dim, 
const int* smat_row_ptr,
   730                                 const int* smat_col_idx, 
const double* smat_val,
   732 void cudaF_trace_mat_smat_trans(dim3 Gr, dim3 Bl, 
const float* mat,
   733                                 MatrixDim mat_dim, 
const int* smat_row_ptr,
   734                                 const int* smat_col_idx, 
const float* smat_val,
   736 void cudaD_vec_apply_ceiling(
int Gr, 
int Bl, 
double* v, 
double ceiling_val,
   737                              float* num, 
int dim);
   738 void cudaF_vec_apply_ceiling(
int Gr, 
int Bl, 
float* v, 
float ceiling_val,
   739                              float* num, 
int dim);
   740 void cudaD_vec_apply_exp(
int Gr, 
int Bl, 
double* v, 
int dim);
   741 void cudaF_vec_apply_exp(
int Gr, 
int Bl, 
float* v, 
int dim);
   742 void cudaD_vec_apply_floor(
int Gr, 
int Bl, 
double* v, 
double floor_val,
   743                            float* num, 
int dim);
   744 void cudaF_vec_apply_floor(
int Gr, 
int Bl, 
float* v, 
float floor_val,
   745                            float* num, 
int dim);
   746 void cudaD_vec_apply_log(
int Gr, 
int Bl, 
double* v, 
double* flag, 
int dim);
   747 void cudaF_vec_apply_log(
int Gr, 
int Bl, 
float* v, 
float* flag, 
int dim);
   748 void cudaD_vec_copy_diag_from_packed(
int Gr, 
int Bl, 
double *dst,
   749                                      const double *src, 
int dim);
   750 void cudaF_vec_copy_diag_from_packed(
int Gr, 
int Bl, 
float *dst,
   751                                      const float *src, 
int dim);
   752 void cudaD_vec_max(
int Gr, 
int Bl, 
const double* v, 
double* value, 
int dim,
   754 void cudaF_vec_max(
int Gr, 
int Bl, 
const float* v, 
float* value, 
int dim,
   756 void cudaD_vec_min(
int Gr, 
int Bl, 
const double* v, 
double* value, 
int dim,
   758 void cudaF_vec_min(
int Gr, 
int Bl, 
const float* v, 
float* value, 
int dim,
   760 void cudaD_vec_mul_elements(
int Gr, 
int Bl, 
double* v, 
const double* a,
   762 void cudaF_vec_mul_elements(
int Gr, 
int Bl, 
float* v, 
const float* a, 
int dim);
   763 void cudaD_vec_soft_max(
int Gr, 
int Bl, 
double* v, 
int dim);
   764 void cudaF_vec_soft_max(
int Gr, 
int Bl, 
float* v, 
int dim);
   765 void cudaD_vec_sum(
int Gr, 
int Bl, 
double* v, 
double* value, 
int dim, 
int inc);
   766 void cudaF_vec_sum(
int Gr, 
int Bl, 
float* v, 
float* value, 
int dim, 
int inc);
   769 void cuda_compress_int16(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
   771                           int dest_stride, 
float inv_scale,
   773 void cuda_compress_uint16(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
   775                           int dest_stride, 
float inv_scale,
   777 void cuda_compress_uint8(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
   779                           int dest_stride, 
float inv_scale,
   781 void cuda_compress_int8(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
   783                          int dest_stride, 
float inv_scale,
   786 void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, 
const BaseFloat *src,
   787                               MatrixDim dim, uint8_t *dest, 
int dest_stride);
   789 void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest,
   791                            int src_stride, 
float scale);
   792 void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest,
   794                             int src_stride, 
float scale);
   795 void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest,
   797                           int src_stride, 
float scale);
   798 void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest,
   800                           int src_stride, 
float scale);
   805 void cudaF_mat_copy_range_clamped(
   806    int32_t row_start, int32_t row_end, int32_t num_cols,
   807    const float *src, int32_t lds, 
   808    int32_t clamp_low, int32_t clamp_high,
   809    float *dst, int32_t ldd);
   810 void cudaD_mat_copy_range_clamped(
   811    int32_t row_start, int32_t row_end, int32_t num_cols,
   812    const double *src, int32_t lds, 
   813    int32_t clamp_low, int32_t clamp_high,
   814    double *dst, int32_t ldd);
   819 void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
   820     int32_t *num_cols, 
const float **inputs, int32_t *ldi, 
float **outputs,
   822 void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
   823     int32_t *num_cols, 
const double **inputs, int32_t *ldi, 
double **outputs,
   829 void cuda_legacy_noop();
 Structure containing size of the matrix plus stride. 
 
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
 
int32_t MatrixIndexT_cuda