26 #ifndef KALDI_CUDAMATRIX_CU_KERNELS_ANSI_H_ 27 #define KALDI_CUDAMATRIX_CU_KERNELS_ANSI_H_ 36 #if (KALDI_DOUBLEPRECISION != 0) 43 void cudaD_add_row_sum_mat(
int Gr,
int Bl,
double* result,
const double* mat,
46 void cudaF_add_row_sum_mat(
int Gr,
int Bl,
float* result,
const float* mat,
49 void cudaD_add_col_sum_mat(
int Gr,
int Bl,
double* result,
const double* mat,
52 void cudaF_add_col_sum_mat(
int Gr,
int Bl,
float* result,
const float* mat,
55 void cudaD_add_cols(dim3 Gr, dim3 Bl,
double* dst,
const double* src,
58 void cudaF_add_cols(dim3 Gr, dim3 Bl,
float* dst,
const float* src,
61 void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl,
const double alpha,
62 const double* M,
const int strid_M,
64 const double beta,
double* v);
65 void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl,
const float alpha,
66 const float* M,
const int strid_M,
68 const float beta,
float* v);
69 void cudaD_add_diag_mat_mat_MNT(
int Gr,
int Bl,
const double alpha,
71 const double* N,
const int stride_N,
72 const double beta,
double* v);
73 void cudaF_add_diag_mat_mat_MNT(
int Gr,
int Bl,
const float alpha,
75 const float* N,
const int stride_N,
76 const float beta,
float* v);
77 void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl,
const double alpha,
78 const double* M,
const int strid_M,
80 const double beta,
double* v,
82 void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl,
const float alpha,
83 const float* M,
const int strid_M,
85 const float beta,
float* v,
const int stride_v);
/*
 * Prototypes of the add_diag_packed launchers for double (cudaD_) and
 * float (cudaF_) data. Each takes a matrix pointer, a scalar value and an
 * integer dim.
 * NOTE(review): Gr/Bl are plain ints here, presumably 1-D grid and block
 * sizes; only the prototypes are visible -- confirm against the definitions.
 */
void cudaD_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
void cudaF_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
88 void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl,
double alpha,
double *mat,
90 const double *mat2,
int mat2_row_stride,
91 int mat2_col_stride,
double beta);
92 void cudaF_add_diag_vec_mat(dim3 Gr, dim3 Bl,
float alpha,
float *mat,
94 const float *mat2,
int mat2_row_stride,
95 int mat2_col_stride,
float beta);
96 void cudaD_add(dim3 Gr, dim3 Bl,
double *mat,
double value,
MatrixDim d);
97 void cudaF_add(dim3 Gr, dim3 Bl,
float *mat,
float value,
MatrixDim d);
98 void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl,
double *data,
MatrixDim d,
99 const double *Adata,
int A_num_rows,
int A_num_cols,
100 int A_row_stride,
int A_col_stride,
102 int B_num_blocks,
double alpha,
double beta,
104 void cudaF_add_mat_blockmat(dim3 Gr, dim3 Bl,
float *data,
MatrixDim d,
105 const float *Adata,
int A_num_rows,
int A_num_cols,
106 int A_row_stride,
int A_col_stride,
108 int B_num_blocks,
float alpha,
float beta,
110 void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl,
double alpha,
const double *src,
114 void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl,
float alpha,
const float *src,
116 float *dst,
MatrixDim d,
int src_stride,
int A_trans);
117 void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl,
double alpha,
const double *src,
119 void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl,
float alpha,
const float *src,
121 void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl,
double alpha,
double *mat,
123 int mat2_row_stride,
int mat2_col_stride,
124 const double *vec,
double beta);
125 void cudaF_add_mat_diag_vec(dim3 Gr, dim3 Bl,
float alpha,
float *mat,
127 int mat2_row_stride,
int mat2_col_stride,
128 const float *vec,
float beta);
129 void cudaD_add_mat(dim3 Gr, dim3 Bl,
double alpha,
const double *src,
130 double *dst,
MatrixDim d,
int src_stride,
int A_trans);
131 void cudaF_add_mat(dim3 Gr, dim3 Bl,
float alpha,
const float *src,
float *dst,
133 void cudaD_add_mat_mat_elements(dim3 Gr, dim3 Bl,
double *data,
134 const double *srcA_data,
136 int srcA_stride,
int srcB_stride,
double alpha,
138 void cudaF_add_mat_mat_elements(dim3 Gr, dim3 Bl,
float *data,
139 const float *srcA_data,
const float *srcB_data,
140 MatrixDim dim,
int srcA_stride,
int srcB_stride,
141 float alpha,
float beta);
142 void cudaD_add_row_ranges(dim3 Gr, dim3 Bl,
double *data,
MatrixDim dim,
143 const double *src_data,
MatrixDim src_dim,
145 void cudaF_add_row_ranges(dim3 Gr, dim3 Bl,
float *data,
MatrixDim dim,
146 const float *src_data,
MatrixDim src_dim,
148 void cudaD_add_rows(dim3 Gr, dim3 Bl,
double alpha,
double* dst,
151 void cudaF_add_rows(dim3 Gr, dim3 Bl,
float alpha,
float* dst,
const float* src,
154 void cudaD_mul_rows(dim3 Gr, dim3 Bl,
double* dst,
157 void cudaF_mul_rows(dim3 Gr, dim3 Bl,
float* dst,
const float* src,
160 void cudaD_add_rows_direct(dim3 Gr, dim3 Bl,
double alpha,
double* dst,
161 const double*
const * src,
MatrixDim dst_dim);
162 void cudaF_add_rows_direct(dim3 Gr, dim3 Bl,
float alpha,
float* dst,
163 const float*
const * src,
MatrixDim dst_dim);
164 void cudaD_add_smat(dim3 Gr, dim3 Bl,
double* mat,
MatrixDim mat_dim,
165 double alpha,
const int* smat_row_ptr,
166 const int* smat_col_idx,
const double* smat_val);
167 void cudaF_add_smat(dim3 Gr, dim3 Bl,
float* mat,
MatrixDim mat_dim,
168 float alpha,
const int* smat_row_ptr,
169 const int* smat_col_idx,
const float* smat_val);
170 void cudaD_add_smat_trans(dim3 Gr, dim3 Bl,
double* mat,
MatrixDim mat_dim,
171 double alpha,
const int* smat_row_ptr,
172 const int* smat_col_idx,
const double* smat_val);
173 void cudaF_add_smat_trans(dim3 Gr, dim3 Bl,
float* mat,
MatrixDim mat_dim,
174 float alpha,
const int* smat_row_ptr,
175 const int* smat_col_idx,
const float* smat_val);
176 void cudaD_add_to_rows_direct(dim3 Gr, dim3 Bl,
double alpha,
177 double*
const * dst,
const double* src,
179 void cudaF_add_to_rows_direct(dim3 Gr, dim3 Bl,
float alpha,
float*
const * dst,
181 void cudaD_add_to_rows(dim3 Gr, dim3 Bl,
double alpha,
182 double* dst,
const double* src,
185 void cudaF_add_to_rows(dim3 Gr, dim3 Bl,
float alpha,
186 float* dst,
const float* src,
189 void cudaD_add_vec2(dim3 Gr, dim3 Bl,
double *mat,
const double *vec,
190 const double alpha,
int dim);
191 void cudaF_add_vec2(dim3 Gr, dim3 Bl,
float* mat,
const float* vec,
192 const float alpha,
int dim);
193 void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl,
double alpha,
const double *col,
195 void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl,
float alpha,
const float *col,
197 void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl,
double alpha,
const double *row,
199 void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl,
float alpha,
const float *row,
/*
 * Prototypes of the add_vec_vec launchers (double and float). v is the
 * only writable vector; x and y are read-only; alpha and beta are scalars
 * and dim is an integer length argument.
 * NOTE(review): Gr/Bl are ints, presumably 1-D grid/block sizes; the exact
 * formula combining v, x and y is not visible here -- confirm.
 */
void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v,
                       const double* x, const double* y, double beta,
                       int dim);
void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v,
                       const float* x, const float* y, float beta,
                       int dim);
206 int num_blocks,
const double *C_data,
207 int C_num_cols,
int C_row_stride,
int C_col_stride,
208 const double *D_data,
int D_row_stride,
209 int D_col_stride,
double alpha,
double beta);
211 int num_blocks,
const float *C_data,
212 int C_num_cols,
int C_row_stride,
int C_col_stride,
213 const float *D_data,
int D_row_stride,
214 int D_col_stride,
float alpha,
float beta);
215 void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl,
double *y,
const double *x1,
217 int x1_stride,
int x2_stride,
int group_size);
218 void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl,
float *y,
const float *x1,
219 const float *x2,
MatrixDim y_dim,
int x1_stride,
220 int x2_stride,
int group_size);
227 void cudaD_copy_col_from_mat_df(
int Gr,
int Bl,
double* v,
int col,
228 const double* mat,
MatrixDim dmat,
int dim);
229 void cudaF_copy_col_from_mat_df(
int Gr,
int Bl,
double* v,
int col,
230 const float* mat,
MatrixDim dmat,
int dim);
231 void cudaD_copy_col_from_mat_fd(
int Gr,
int Bl,
float* v,
int col,
232 const double* mat,
MatrixDim dmat,
int dim);
233 void cudaF_copy_col_from_mat_fd(
int Gr,
int Bl,
float* v,
int col,
234 const float* mat,
MatrixDim dmat,
int dim);
235 void cudaD_copy_cols(dim3 Gr, dim3 Bl,
double* dst,
const double* src,
238 void cudaF_copy_cols(dim3 Gr, dim3 Bl,
float* dst,
const float* src,
241 void cudaD_copy_cols_from_vec(dim3 Gr, dim3 Bl,
double *mat_out,
243 void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl,
float *mat_out,
MatrixDim d_out,
245 void cudaD_copy(dim3 Gr, dim3 Bl,
double *y,
const double *x,
247 void cudaF_copy(dim3 Gr, dim3 Bl,
float *y,
const float *x,
249 void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl,
double *mat_out,
252 void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl,
double *mat_out,
255 void cuda_copy_from_mat_df(dim3 Gr, dim3 Bl,
double* mat_out,
258 void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl,
double* mat_out,
261 void cuda_copy_from_mat_fd(dim3 Gr, dim3 Bl,
float *mat_out,
264 void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl,
float *mat_out,
267 void cuda_copy_from_mat_ff(dim3 Gr, dim3 Bl,
float* mat_out,
270 void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl,
float* mat_out,
273 void cuda_copy_from_smat_dd(dim3 Gr, dim3 Bl,
double* mat,
MatrixDim mat_dim,
274 const int* smat_row_ptr,
const int* smat_col_idx,
275 const double* smat_val);
276 void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl,
double* mat,
277 MatrixDim mat_dim,
const int* smat_row_ptr,
278 const int* smat_col_idx,
279 const double* smat_val);
280 void cuda_copy_from_smat_df(dim3 Gr, dim3 Bl,
double* mat,
MatrixDim mat_dim,
281 const int* smat_row_ptr,
const int* smat_col_idx,
282 const float* smat_val);
283 void cuda_copy_from_smat_df_trans(dim3 Gr, dim3 Bl,
double* mat,
284 MatrixDim mat_dim,
const int* smat_row_ptr,
285 const int* smat_col_idx,
286 const float* smat_val);
287 void cuda_copy_from_smat_fd(dim3 Gr, dim3 Bl,
float* mat,
MatrixDim mat_dim,
288 const int* smat_row_ptr,
const int* smat_col_idx,
289 const double* smat_val);
290 void cuda_copy_from_smat_fd_trans(dim3 Gr, dim3 Bl,
float* mat,
291 MatrixDim mat_dim,
const int* smat_row_ptr,
292 const int* smat_col_idx,
293 const double* smat_val);
294 void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl,
float* mat,
MatrixDim mat_dim,
295 const int* smat_row_ptr,
const int* smat_col_idx,
296 const float* smat_val);
297 void cuda_copy_from_smat_ff_trans(dim3 Gr, dim3 Bl,
float* mat,
298 MatrixDim mat_dim,
const int* smat_row_ptr,
299 const int* smat_col_idx,
300 const float* smat_val);
301 void cudaD_copy_from_sp(dim3 Gr, dim3 Bl,
const double* x,
double* y,
303 void cudaF_copy_from_sp(dim3 Gr, dim3 Bl,
const float* x,
float* y,
305 void cudaD_copy_from_tp(dim3 Gr, dim3 Bl,
double* A,
const double* B,
307 void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl,
double* A,
const float* B,
309 void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl,
float* A,
const double* B,
311 void cudaF_copy_from_tp(dim3 Gr, dim3 Bl,
float* A,
const float* B,
313 void cudaD_copy_from_tp_trans(dim3 Gr, dim3 Bl,
double* A,
const double* B,
315 void cudaDF_copy_from_tp_trans(dim3 Gr, dim3 Bl,
double* A,
const float* B,
317 void cudaFD_copy_from_tp_trans(dim3 Gr, dim3 Bl,
float* A,
const double* B,
319 void cudaF_copy_from_tp_trans(dim3 Gr, dim3 Bl,
float* A,
const float* B,
321 void cublas_copy_kaldi_df(
int Gr,
int Bl,
int n,
const double* x,
int incx,
323 void cublas_copy_kaldi_fd(
int Gr,
int Bl,
int n,
const float* x,
int incx,
324 double* y,
int incy);
325 void cudaD_copy_low_upp(dim3 Gr, dim3 Bl,
double* A,
MatrixDim dimA);
326 void cudaF_copy_low_upp(dim3 Gr, dim3 Bl,
float* A,
MatrixDim dimA);
327 void cudaD_copy_rows(dim3 Gr, dim3 Bl,
double* dst,
const double* src,
330 void cudaF_copy_rows(dim3 Gr, dim3 Bl,
float* dst,
const float* src,
333 void cudaD_copy_rows_direct(dim3 Gr, dim3 Bl,
double* dst,
334 const double*
const * src,
MatrixDim dst_dim);
335 void cudaF_copy_rows_direct(dim3 Gr, dim3 Bl,
float* dst,
336 const float*
const * src,
MatrixDim dst_dim);
337 void cudaD_copy_rows_from_vec(dim3 Gr, dim3 Bl,
double *mat_out,
339 void cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl,
float *mat_out,
MatrixDim d_out,
341 void cudaD_copy_to_rows_direct(dim3 Gr, dim3 Bl,
double*
const * dst,
343 void cudaF_copy_to_rows_direct(dim3 Gr, dim3 Bl,
float*
const * dst,
345 void cudaD_copy_upp_low(dim3 Gr, dim3 Bl,
double* A,
MatrixDim dimB);
346 void cudaF_copy_upp_low(dim3 Gr, dim3 Bl,
float* A,
MatrixDim dimA);
347 void cudaD_diff_group_pnorm(dim3 Gr, dim3 Bl,
double *
id,
const double *iv,
348 const double *ov,
const double* od,
349 MatrixDim id_dim,
int iv_stride,
int ov_stride,
350 int od_stride,
int group_size,
double power);
351 void cudaF_diff_group_pnorm(dim3 Gr, dim3 Bl,
float *
id,
const float *iv,
352 const float *ov,
const float* od,
MatrixDim id_dim,
353 int iv_stride,
int ov_stride,
int od_stride,
354 int group_size,
float power);
355 void cudaD_diff_log_softmax(dim3 Gr, dim3 Bl,
const MatrixDim in_deriv_dim,
356 const double* out_value,
const int out_value_stride,
357 const double* out_deriv,
const int out_deriv_stride,
359 void cudaF_diff_log_softmax(dim3 Gr, dim3 Bl,
const MatrixDim in_deriv_dim,
360 const float* out_value,
const int out_value_stride,
361 const float* out_deriv,
const int out_deriv_stride,
363 void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl,
const int cell_dim,
364 const int have_dropout_mask,
365 const int num_rows,
const double* input,
366 const int in_stride,
const double* params,
367 const int params_stride,
368 const double* output_deriv,
369 const int output_deriv_stride,
370 const double* deriv_sum_in,
371 const int deriv_sum_in_stride,
372 const double* self_repair_config,
373 double count,
double* input_deriv,
374 const int input_deriv_stride,
375 double* params_deriv,
376 const int params_deriv_stride,
377 double* value_sum_out,
378 const int value_sum_out_stride,
379 double* deriv_sum_out,
380 const int deriv_sum_out_stride,
381 double* self_repair_sum_out,
382 const int self_repair_sum_out_stride);
383 void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl,
const int cell_dim,
384 const int have_dropout_mask,
385 const int num_rows,
const float* input,
386 const int in_stride,
const float* params,
387 const int params_stride,
388 const float* output_deriv,
389 const int output_deriv_stride,
390 const double* deriv_sum_in,
391 const int deriv_sum_in_stride,
392 const float* self_repair_config,
double count,
394 const int input_deriv_stride,
396 const int params_deriv_stride,
397 double* value_sum_out,
398 const int value_sum_out_stride,
399 double* deriv_sum_out,
400 const int deriv_sum_out_stride,
401 float* self_repair_sum_out,
402 const int self_repair_sum_out_stride);
403 void cudaD_diff_normalize_per_row(
size_t Gr,
size_t Bl,
double *
id,
404 int id_stride,
const double *iv,
406 int od_stride,
double target_rms,
407 bool add_log_stddev);
408 void cudaF_diff_normalize_per_row(
size_t Gr,
size_t Bl,
float *
id,
409 int id_stride,
const float *iv,
411 int od_stride,
float target_rms,
412 bool add_log_stddev);
413 void cudaD_diff_parametric_relu(dim3 Gr, dim3 Bl,
double *eout,
const double *e,
415 int y_stride,
const double *a,
const double *b);
416 void cudaF_diff_parametric_relu(dim3 Gr, dim3 Bl,
float *eout,
const float *e,
418 int y_stride,
const float *a,
const float *b);
419 void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl,
double *eout,
const double *e,
422 void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl,
float *eout,
const float *e,
425 void cudaD_diff_softmax(dim3 Gr, dim3 Bl,
double* x,
const MatrixDim dim,
426 const double* value,
const int value_stride,
427 const double* diff,
const int diff_stride);
428 void cudaF_diff_softmax(dim3 Gr, dim3 Bl,
float* x,
const MatrixDim dim,
429 const float* value,
const int value_stride,
430 const float* diff,
const int diff_stride);
431 void cudaD_diff_tanh(dim3 Gr, dim3 Bl,
double *eout,
const double *e,
432 const double *y,
MatrixDim d,
int e_stride,
int y_stride);
433 void cudaF_diff_tanh(dim3 Gr, dim3 Bl,
float *eout,
const float *e,
434 const float *y,
MatrixDim d,
int e_stride,
int y_stride);
435 void cudaD_ensure_nonzero(dim3 Gr, dim3 Bl,
const double *x,
MatrixDim d,
436 double epsilon,
int y_stride,
double *y);
437 void cudaF_ensure_nonzero(dim3 Gr, dim3 Bl,
const float *x,
MatrixDim d,
438 float epsilon,
int y_stride,
float *y);
439 void cudaD_diff_xent(dim3 Gr, dim3 Bl,
const int32_cuda *vec_tgt,
440 double *mat_net_out,
double *vec_log_post,
MatrixDim d);
441 void cudaF_diff_xent(dim3 Gr, dim3 Bl,
const int32_cuda *vec_tgt,
442 float *mat_net_out,
float *vec_log_post,
MatrixDim d);
443 void cudaD_div_elements(dim3 Gr, dim3 Bl,
double *mat,
const double *A,
445 void cudaF_div_elements(dim3 Gr, dim3 Bl,
float *mat,
const float *A,
447 void cudaD_div_rows_vec(dim3 Gr, dim3 Bl,
double *mat,
const double *vec_div,
449 void cudaF_div_rows_vec(dim3 Gr, dim3 Bl,
float *mat,
const float *vec_div,
451 void cudaD_equal_element_mask(dim3 Gr, dim3 Bl,
const double *mat1,
452 const double *mat2,
double *mask,
455 void cudaF_equal_element_mask(dim3 Gr, dim3 Bl,
const float *mat1,
456 const float *mat2,
float *mask,
459 void cudaD_find_row_max_id(dim3 Gr, dim3 Bl,
const double *mat,
double *vec_val,
461 void cudaF_find_row_max_id(dim3 Gr, dim3 Bl,
const float *mat,
float *vec_val,
463 void cudaD_group_max(dim3 Gr, dim3 Bl,
double *y,
const double *x,
MatrixDim d,
464 int src_stride,
int group_size);
465 void cudaF_group_max(dim3 Gr, dim3 Bl,
float *y,
const float *x,
MatrixDim d,
466 int src_stride,
int group_size);
467 void cudaD_group_pnorm(dim3 Gr, dim3 Bl,
double *y,
const double *x,
470 void cudaF_group_pnorm(dim3 Gr, dim3 Bl,
float *y,
const float *x,
MatrixDim d,
471 int src_stride,
int group_size,
float power);
472 void cudaD_group_spec_pnorm(dim3 Gr, dim3 Bl,
double *y,
const double *x,
475 void cudaF_group_spec_pnorm(dim3 Gr, dim3 Bl,
float *y,
const float *x,
478 void cudaD_heaviside(dim3 Gr, dim3 Bl,
double *y,
const double *x,
MatrixDim d,
480 void cudaF_heaviside(dim3 Gr, dim3 Bl,
float *y,
const float *x,
MatrixDim d,
482 void cudaD_exp(dim3 Gr, dim3 Bl,
double *y,
const double *x,
MatrixDim d,
484 void cudaF_exp(dim3 Gr, dim3 Bl,
float *y,
const float *x,
MatrixDim d,
486 void cudaD_pow(dim3 Gr, dim3 Bl,
double *y,
const double *x,
double power,
MatrixDim d,
488 void cudaF_pow(dim3 Gr, dim3 Bl,
float *y,
const float *x,
float power,
MatrixDim d,
490 void cudaD_ceiling(dim3 Gr, dim3 Bl,
double* y,
const double* x,
double ceiling_val,
492 void cudaF_ceiling(dim3 Gr, dim3 Bl,
float* y,
const float* x,
float ceiling_val,
494 void cudaD_floor(dim3 Gr, dim3 Bl,
double* y,
const double* x,
double floor_val,
496 void cudaF_floor(dim3 Gr, dim3 Bl,
float* y,
const float* x,
float floor_val,
498 void cudaD_exp_limited(dim3 Gr, dim3 Bl,
double* y,
const double* x,
499 double lower_limit,
double upper_limit,
MatrixDim d,
int src_stride);
500 void cudaF_exp_limited(dim3 Gr, dim3 Bl,
float* y,
const float* x,
501 float lower_limit,
float upper_limit,
MatrixDim d,
int src_stride);
502 void cudaD_exp_special(dim3 Gr, dim3 Bl,
double* y,
const double* x,
504 void cudaF_exp_special(dim3 Gr, dim3 Bl,
float* y,
const float* x,
506 void cudaD_log(dim3 Gr, dim3 Bl,
double* y,
const double* x,
MatrixDim d,
int src_stride);
507 void cudaF_log(dim3 Gr, dim3 Bl,
float* y,
const float* x,
MatrixDim d,
int src_stride);
508 void cudaD_pow_abs(dim3 Gr, dim3 Bl,
double* y,
const double* x,
double power,
509 bool include_sign,
MatrixDim dim,
int src_stride);
510 void cudaF_pow_abs(dim3 Gr, dim3 Bl,
float* y,
const float* x,
float power,
511 bool include_sign,
MatrixDim dim,
int src_stride);
516 void cuda_int32_sequence(dim3 Gr, dim3 Bl,
int32_cuda* data,
int length,
518 void cudaD_invert_elements(dim3 Gr, dim3 Bl,
double *data,
MatrixDim d);
519 void cudaF_invert_elements(dim3 Gr, dim3 Bl,
float *data,
MatrixDim d);
520 void cudaD_log_softmax_reduce(
size_t Gr,
size_t Bl,
double *y,
const double *x,
522 void cudaF_log_softmax_reduce(
size_t Gr,
size_t Bl,
float *y,
const float *x,
524 void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl,
const double* in,
525 const int in_stride,
const double* params,
526 const int params_stride,
const int out_stride,
527 const int cell_dim,
const int have_dropout_mask,
530 void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl,
const float* in,
531 const int in_stride,
const float* params,
532 const int params_stride,
const int out_stride,
533 const int cell_dim,
const int have_dropout_mask,
536 void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl,
double *data,
MatrixDim dim,
539 void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl,
float *data,
MatrixDim dim,
542 void cudaD_matrix_add_indexed_values(dim3 Gr, dim3 Bl,
MatrixDim dim,
544 const double* x,
int s,
double* data);
545 void cudaF_matrix_add_indexed_values(dim3 Gr, dim3 Bl,
MatrixDim dim,
547 const float* x,
int s,
float* data);
548 void cudaD_matrix_add_to_elements(dim3 Gr, dim3 Bl,
double alpha,
551 void cudaF_matrix_add_to_elements(dim3 Gr, dim3 Bl,
float alpha,
554 void cudaD_matrix_lookup(dim3 Gr, dim3 Bl,
const double *data,
MatrixDim dim,
555 const Int32Pair *indices,
int indices_size,
557 void cudaF_matrix_lookup(dim3 Gr, dim3 Bl,
const float *data,
MatrixDim dim,
558 const Int32Pair *indices,
int indices_size,
560 void cudaD_vector_copy_elements(dim3 Gr, dim3 Bl,
double *data,
int dim,
561 const double *src_mat,
int mat_stride,
564 void cudaF_vector_copy_elements(dim3 Gr, dim3 Bl,
float *data,
int dim,
565 const float *src_mat,
int mat_stride,
568 void cudaD_max(dim3 Gr, dim3 Bl,
double *mat,
const double *A,
MatrixDim dst_d,
570 void cudaF_max(dim3 Gr, dim3 Bl,
float *mat,
const float *A,
MatrixDim dst_d,
572 void cudaD_max_mat_cols(
int Gr,
int Bl,
double* result,
const double* mat,
574 void cudaF_max_mat_cols(
int Gr,
int Bl,
float* result,
const float* mat,
576 void cudaD_min(dim3 Gr, dim3 Bl,
double *mat,
const double *other,
578 void cudaF_min(dim3 Gr, dim3 Bl,
float *mat,
const float *other,
580 void cudaD_min_mat_cols(
int Gr,
int Bl,
double* result,
const double* mat,
582 void cudaF_min_mat_cols(
int Gr,
int Bl,
float* result,
const float* mat,
584 void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl,
double *mat,
const double *scale,
586 void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl,
float *mat,
const float *scale,
588 void cudaD_mul_elements(dim3 Gr, dim3 Bl,
double *mat,
const double *A,
590 void cudaF_mul_elements(dim3 Gr, dim3 Bl,
float *mat,
const float *A,
592 void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl,
double *y,
const double *x,
594 void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl,
float *y,
const float *x,
596 void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl,
double *mat,
const double *scale,
598 void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl,
float *mat,
const float *scale,
600 void cudaD_normalize_per_row(
size_t Gr,
size_t Bl,
double *y,
int y_stride,
601 const double *x,
MatrixDim x_d,
double tartget_rms,
602 bool add_log_stddev);
603 void cudaF_normalize_per_row(
size_t Gr,
size_t Bl,
float *y,
int y_stride,
604 const float *x,
MatrixDim x_d,
float tartget_rms,
605 bool add_log_stddev);
/*
 * Prototypes of the cudaD_one / cudaF_one launchers; each takes a writable
 * array x and an integer dim.
 * NOTE(review): presumably sets the dim elements of x to one; confirm
 * against the definitions.
 */
void cudaD_one(int Gr, int Bl, double* x, int dim);
void cudaF_one(int Gr, int Bl, float* x, int dim);
608 void cudaD_parametric_relu(dim3 Gr, dim3 Bl,
double *y,
const double *x,
611 void cudaF_parametric_relu(dim3 Gr, dim3 Bl,
float *y,
const float *x,
614 void cudaD_randomize(dim3 Gr, dim3 Bl,
double *y,
const double *x,
617 void cudaF_randomize(dim3 Gr, dim3 Bl,
float *y,
const float *x,
620 void cudaD_regularize_l1(dim3 Gr, dim3 Bl,
double *wei,
double *grad,
double l1,
622 void cudaF_regularize_l1(dim3 Gr, dim3 Bl,
float *wei,
float *grad,
float l1,
624 void cudaD_replace_value(
int Gr,
int Bl,
double *v,
int dim,
double orig,
626 void cudaF_replace_value(
int Gr,
int Bl,
float *v,
int dim,
float orig,
628 void cudaD_scale_diag_packed(
int Gr,
int Bl,
double* mat,
double value,
630 void cudaF_scale_diag_packed(
int Gr,
int Bl,
float* mat,
float value,
int dim);
631 void cudaD_scale(dim3 Gr, dim3 Bl,
double *mat,
double value,
MatrixDim d);
632 void cudaF_scale(dim3 Gr, dim3 Bl,
float *mat,
float value,
MatrixDim d);
633 void cudaD_select_rows(dim3 Gr, dim3 Bl,
const int* out_row_ptr,
634 int* out_col_idx,
double* out_val,
635 const int* row_indexes,
const int num_selected_rows,
636 const int* in_row_ptr,
const int* in_col_idx,
637 const double* in_val);
638 void cudaF_select_rows(dim3 Gr, dim3 Bl,
const int* out_row_ptr,
639 int* out_col_idx,
float* out_val,
const int* row_indexes,
640 const int num_selected_rows,
const int* in_row_ptr,
641 const int* in_col_idx,
const float* in_val);
642 void cudaD_set_bias_params(
int Gr,
int Bl,
double* v,
const double* a,
643 double param_1,
double param_2,
double param_3,
645 void cudaF_set_bias_params(
int Gr,
int Bl,
float* v,
const float* a,
646 float param_1,
float param_2,
float param_3,
648 void cudaD_set_const(dim3 Gr, dim3 Bl,
double *mat,
double value,
MatrixDim d);
649 void cudaF_set_const(dim3 Gr, dim3 Bl,
float *mat,
float value,
MatrixDim d);
650 void cudaD_set_diag(
int Gr,
int Bl,
double* mat,
double value,
MatrixDim d);
651 void cudaF_set_diag(
int Gr,
int Bl,
float* mat,
float value,
MatrixDim d);
652 void cudaD_set_diag_packed(
int Gr,
int Bl,
double* mat,
double value,
int dim);
653 void cudaF_set_diag_packed(
int Gr,
int Bl,
float* mat,
float value,
int dim);
654 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl,
const double *A,
655 const double *B,
const double *C,
double *dst,
658 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl,
const float *A,
const float *B,
660 int stride_a,
int stride_b,
int stride_c);
661 void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl,
double* mat,
MatrixDim d);
662 void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl,
float* mat,
MatrixDim d);
663 void cudaD_sigmoid(dim3 Gr, dim3 Bl,
double *y,
const double *x,
MatrixDim d,
665 void cudaF_sigmoid(dim3 Gr, dim3 Bl,
float *y,
const float *x,
MatrixDim d,
667 void cudaD_soft_hinge(dim3 Gr, dim3 Bl,
double *y,
const double *x,
MatrixDim d,
669 void cudaF_soft_hinge(dim3 Gr, dim3 Bl,
float *y,
const float *x,
MatrixDim d,
671 void cudaD_softmax_reduce(
size_t Gr,
size_t Bl,
double *y,
const double *x,
673 void cudaF_softmax_reduce(
size_t Gr,
size_t Bl,
float *y,
const float *x,
675 void cudaD_splice(dim3 Gr, dim3 Bl,
double *y,
const double *x,
677 void cudaF_splice(dim3 Gr, dim3 Bl,
float *y,
const float *x,
679 void cudaD_sum_column_ranges(dim3 Gr, dim3 Bl,
double *data,
MatrixDim dim,
680 const double *src_data,
MatrixDim src_dim,
682 void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl,
float *data,
MatrixDim dim,
683 const float *src_data,
MatrixDim src_dim,
685 void cudaD_sum_mat_cols(
int Gr,
int Bl,
double* result,
const double* mat,
687 void cudaF_sum_mat_cols(
int Gr,
int Bl,
float* result,
const float* mat,
689 void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl,
double alpha,
double beta,
690 const double* T,
MatrixDim tdim,
double *S,
692 void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl,
float alpha,
float beta,
const float* T,
694 void cudaD_take_lower(dim3 Gr, dim3 Bl,
const double* x,
double* y,
696 void cudaF_take_lower(dim3 Gr, dim3 Bl,
const float* x,
float* y,
698 void cudaD_take_mean(dim3 Gr, dim3 Bl,
const double* x,
double* y,
700 void cudaF_take_mean(dim3 Gr, dim3 Bl,
const float* x,
float* y,
702 void cudaD_take_upper(dim3 Gr, dim3 Bl,
const double* x,
double* y,
704 void cudaF_take_upper(dim3 Gr, dim3 Bl,
const float* x,
float* y,
706 void cudaD_tanh(dim3 Gr, dim3 Bl,
double *y,
const double *x,
MatrixDim d,
708 void cudaF_tanh(dim3 Gr, dim3 Bl,
float *y,
const float *x,
MatrixDim d,
710 void cudaD_trace(
int Gr,
int Bl,
double* mat,
double* value,
int dim);
711 void cudaF_trace(
int Gr,
int Bl,
float* mat,
float* value,
int dim);
712 void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl,
const double* A,
const double* B,
713 MatrixDim dA,
int B_stride,
double* value);
714 void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl,
const float* A,
const float* B,
715 MatrixDim dA,
int B_stride,
float* value);
716 void cudaD_trace_mat_mat_trans(dim3 Gr, dim3 Bl,
const double* A,
717 const double* B,
MatrixDim dA,
int B_stride,
719 void cudaF_trace_mat_mat_trans(dim3 Gr, dim3 Bl,
const float* A,
const float* B,
720 MatrixDim dA,
int B_stride,
float* value);
721 void cudaD_trace_mat_smat(dim3 Gr, dim3 Bl,
const double* mat,
722 MatrixDim mat_dim,
const int* smat_row_ptr,
723 const int* smat_col_idx,
const double* smat_val,
725 void cudaF_trace_mat_smat(dim3 Gr, dim3 Bl,
const float* mat,
MatrixDim mat_dim,
726 const int* smat_row_ptr,
const int* smat_col_idx,
727 const float* smat_val,
float* trace_vec);
728 void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl,
const double* mat,
729 MatrixDim mat_dim,
const int* smat_row_ptr,
730 const int* smat_col_idx,
const double* smat_val,
732 void cudaF_trace_mat_smat_trans(dim3 Gr, dim3 Bl,
const float* mat,
733 MatrixDim mat_dim,
const int* smat_row_ptr,
734 const int* smat_col_idx,
const float* smat_val,
/*
 * Prototypes of the vector apply_* launchers and vec_copy_diag_from_packed,
 * each in double (cudaD_) and float (cudaF_) flavours. All use int Gr/Bl
 * and an integer dim; v (or dst) is the writable vector.
 * NOTE(review): `num` is declared float* in both the double and the float
 * variants of apply_ceiling / apply_floor -- confirm this is intentional.
 * NOTE(review): num and flag are presumably auxiliary outputs; their exact
 * meaning is not visible from the prototypes.
 */
void cudaD_vec_apply_ceiling(int Gr, int Bl, double* v, double ceiling_val,
                             float* num, int dim);
void cudaF_vec_apply_ceiling(int Gr, int Bl, float* v, float ceiling_val,
                             float* num, int dim);

void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim);
void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim);

void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val,
                           float* num, int dim);
void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val,
                           float* num, int dim);

void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim);
void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim);

void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double* dst,
                                     const double* src, int dim);
void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float* dst,
                                     const float* src, int dim);
752 void cudaD_vec_max(
int Gr,
int Bl,
const double* v,
double* value,
int dim,
754 void cudaF_vec_max(
int Gr,
int Bl,
const float* v,
float* value,
int dim,
756 void cudaD_vec_min(
int Gr,
int Bl,
const double* v,
double* value,
int dim,
758 void cudaF_vec_min(
int Gr,
int Bl,
const float* v,
float* value,
int dim,
760 void cudaD_vec_mul_elements(
int Gr,
int Bl,
double* v,
const double* a,
/*
 * Prototype of the single-precision vec_mul_elements launcher: writable v,
 * read-only a, integer dim.
 * NOTE(review): presumably v is multiplied element-wise by a; confirm
 * against the definition.
 */
void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a,
                            int dim);

/*
 * Prototypes of the vec_soft_max launchers (double and float): a writable
 * vector v and an integer dim.
 */
void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim);
void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim);

/*
 * Prototypes of the vec_sum launchers (double and float). v and value are
 * both declared as non-const pointers; dim and inc are integers.
 * NOTE(review): value is presumably the output and inc the element
 * increment of v -- confirm against the definitions.
 */
void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim,
                   int inc);
void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc);
769 void cuda_compress_int16(dim3 Gr, dim3 Bl,
const BaseFloat *src,
771 int dest_stride,
float inv_scale,
773 void cuda_compress_uint16(dim3 Gr, dim3 Bl,
const BaseFloat *src,
775 int dest_stride,
float inv_scale,
777 void cuda_compress_uint8(dim3 Gr, dim3 Bl,
const BaseFloat *src,
779 int dest_stride,
float inv_scale,
781 void cuda_compress_int8(dim3 Gr, dim3 Bl,
const BaseFloat *src,
783 int dest_stride,
float inv_scale,
786 void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl,
const BaseFloat *src,
787 MatrixDim dim, uint8_t *dest,
int dest_stride);
789 void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest,
791 int src_stride,
float scale);
792 void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest,
794 int src_stride,
float scale);
795 void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest,
797 int src_stride,
float scale);
798 void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest,
800 int src_stride,
float scale);
/*
 * Prototypes of the mat_copy_range_clamped entry points (float and
 * double). Unlike most prototypes in this header they take no Gr/Bl
 * arguments. src (read-only, leading dimension lds) and dst (writable,
 * leading dimension ldd) are accompanied by a row range, a column count
 * and two clamp bounds, all int32_t.
 * NOTE(review): presumably rows [row_start, row_end) are copied with the
 * source row index clamped to [clamp_low, clamp_high]; whether the bounds
 * are inclusive is not visible here -- confirm against the definitions.
 */
void cudaF_mat_copy_range_clamped(int32_t row_start, int32_t row_end,
                                  int32_t num_cols, const float* src,
                                  int32_t lds, int32_t clamp_low,
                                  int32_t clamp_high, float* dst,
                                  int32_t ldd);
void cudaD_mat_copy_range_clamped(int32_t row_start, int32_t row_end,
                                  int32_t num_cols, const double* src,
                                  int32_t lds, int32_t clamp_low,
                                  int32_t clamp_high, double* dst,
                                  int32_t ldd);
819 void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
820 int32_t *num_cols,
const float **inputs, int32_t *ldi,
float **outputs,
822 void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
823 int32_t *num_cols,
const double **inputs, int32_t *ldi,
double **outputs,
/*
 * Declares cuda_legacy_noop, which is declared with an empty parameter
 * list and returns nothing.
 * NOTE(review): the name suggests a no-op launch on the legacy stream; its
 * purpose is not visible from this prototype -- confirm at the definition.
 */
829 void cuda_legacy_noop();
Structure containing the size of the matrix plus its stride.
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
int32_t MatrixIndexT_cuda