27 #ifndef KALDI_CUDAMATRIX_CU_KERNELS_H_ 28 #define KALDI_CUDAMATRIX_CU_KERNELS_H_ 42 inline void cuda_add_row_sum_mat(
int Gr,
int Bl,
double* result,
44 const double alpha,
const double beta) {
45 cudaD_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
47 inline void cuda_add_row_sum_mat(
int Gr,
int Bl,
float* result,
49 const float alpha,
const float beta) {
50 cudaF_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
52 inline void cuda_add_col_sum_mat(
int Gr,
int Bl,
double* result,
54 const double alpha,
const double beta) {
55 cudaD_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
57 inline void cuda_add_col_sum_mat(
int Gr,
int Bl,
float* result,
59 const float alpha,
const float beta) {
60 cudaF_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
62 inline void cuda_add_cols(dim3 Gr, dim3 Bl,
double* dst,
const double* src,
65 cudaD_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
67 inline void cuda_add_cols(dim3 Gr, dim3 Bl,
float* dst,
const float* src,
70 cudaF_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
72 inline void cuda_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl,
const double alpha,
73 const double* M,
const int stride_M,
75 const double beta,
double* v) {
76 cudaD_add_diag_mat_mat_MN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v);
78 inline void cuda_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl,
const float alpha,
79 const float* M,
const int stride_M,
81 const float beta,
float* v) {
82 cudaF_add_diag_mat_mat_MN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v);
84 inline void cuda_add_diag_mat_mat_MNT(
int Gr,
int Bl,
const double alpha,
86 const double* N,
const int stride_N,
87 const double beta,
double* v) {
88 cudaD_add_diag_mat_mat_MNT(Gr, Bl, alpha, M, dim_M, N, stride_N, beta, v);
90 inline void cuda_add_diag_mat_mat_MNT(
int Gr,
int Bl,
const float alpha,
92 const float* N,
const int stride_N,
93 const float beta,
float* v) {
94 cudaF_add_diag_mat_mat_MNT(Gr, Bl, alpha, M, dim_M, N, stride_N, beta, v);
96 inline void cuda_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl,
const double alpha,
97 const double* M,
const int stride_M,
99 const double beta,
double* v,
100 const int stride_v) {
101 cudaD_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v,
104 inline void cuda_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl,
const float alpha,
105 const float* M,
const int stride_M,
107 const float beta,
float* v,
108 const int stride_v) {
109 cudaF_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v,
112 inline void cuda_add_diag_packed(
int Gr,
int Bl,
double* mat,
double value,
114 cudaD_add_diag_packed(Gr, Bl, mat, value, dim);
116 inline void cuda_add_diag_packed(
int Gr,
int Bl,
float* mat,
float value,
118 cudaF_add_diag_packed(Gr, Bl, mat, value, dim);
120 inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl,
double alpha,
double *mat,
122 const double *mat2,
int mat2_row_stride,
123 int mat2_col_stride,
double beta) {
124 cudaD_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
125 mat2_row_stride, mat2_col_stride, beta);
127 inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl,
float alpha,
float *mat,
129 const float *mat2,
int mat2_row_stride,
130 int mat2_col_stride,
float beta) {
131 cudaF_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
132 mat2_row_stride, mat2_col_stride, beta);
134 inline void cuda_add(dim3 Gr, dim3 Bl,
double *mat,
double value,
MatrixDim d) {
135 cudaD_add(Gr, Bl, mat, value, d);
137 inline void cuda_add(dim3 Gr, dim3 Bl,
float *mat,
float value,
MatrixDim d) {
138 cudaF_add(Gr, Bl, mat, value, d);
140 inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl,
double *data,
MatrixDim d,
141 const double *Adata,
int A_num_rows,
142 int A_num_cols,
int A_row_stride,
145 int B_num_blocks,
double alpha,
double beta,
147 cudaD_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols,
148 A_row_stride, A_col_stride, B_cu_data, B_num_blocks,
149 alpha, beta, B_trans);
151 inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl,
float *data,
MatrixDim d,
152 const float *Adata,
int A_num_rows,
153 int A_num_cols,
int A_row_stride,
156 int B_num_blocks,
float alpha,
float beta,
158 cudaF_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols,
159 A_row_stride, A_col_stride, B_cu_data, B_num_blocks,
160 alpha, beta, B_trans);
162 inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl,
double alpha,
166 cudaD_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
167 d, src_stride, A_trans);
169 inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl,
float alpha,
const float *src,
173 cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
174 d, src_stride, A_trans);
176 inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl,
double alpha,
179 cudaD_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
181 inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl,
float alpha,
184 cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
186 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl,
double alpha,
double *mat,
188 int mat2_row_stride,
int mat2_col_stride,
189 const double *vec,
double beta) {
190 cudaD_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride,
191 mat2_col_stride, vec, beta);
193 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl,
float alpha,
float *mat,
195 int mat2_row_stride,
int mat2_col_stride,
196 const float *vec,
float beta) {
197 cudaF_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride,
198 mat2_col_stride, vec, beta);
200 inline void cuda_add_mat(dim3 Gr, dim3 Bl,
double alpha,
const double *src,
203 cudaD_add_mat(Gr, Bl, alpha, src, dst, d, src_stride, A_trans);
205 inline void cuda_add_mat(dim3 Gr, dim3 Bl,
float alpha,
const float *src,
206 float *dst,
MatrixDim d,
int src_stride,
int A_trans) {
207 cudaF_add_mat(Gr, Bl, alpha, src, dst, d, src_stride, A_trans);
209 inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl,
double *data,
210 const double *srcA_data,
212 int srcA_stride,
int srcB_stride,
213 double alpha,
double beta) {
214 cudaD_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim,
215 srcA_stride, srcB_stride, alpha, beta);
217 inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl,
float *data,
218 const float *srcA_data,
220 int srcA_stride,
int srcB_stride,
221 float alpha,
float beta) {
222 cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim,
223 srcA_stride, srcB_stride, alpha, beta);
225 inline void cuda_add_row_ranges(dim3 Gr, dim3 Bl,
double *data,
MatrixDim dim,
226 const double *src_data,
MatrixDim src_dim,
228 cudaD_add_row_ranges(Gr, Bl, data, dim, src_data, src_dim, indexes);
230 inline void cuda_add_row_ranges(dim3 Gr, dim3 Bl,
float *data,
MatrixDim dim,
231 const float *src_data,
MatrixDim src_dim,
233 cudaF_add_row_ranges(Gr, Bl, data, dim, src_data, src_dim, indexes);
235 inline void cuda_add_rows(dim3 Gr, dim3 Bl,
double alpha,
double* dst,
236 const double*
const * src,
MatrixDim dst_dim) {
237 cudaD_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim);
239 inline void cuda_add_rows(dim3 Gr, dim3 Bl,
float alpha,
float* dst,
240 const float*
const * src,
MatrixDim dst_dim) {
241 cudaF_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim);
243 inline void cuda_add_rows(dim3 Gr, dim3 Bl,
double alpha,
double* dst,
246 cudaD_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride);
248 inline void cuda_add_rows(dim3 Gr, dim3 Bl,
float alpha,
float* dst,
251 cudaF_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride);
253 inline void cuda_mul_rows(dim3 Gr, dim3 Bl,
double* dst,
256 cudaD_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
258 inline void cuda_mul_rows(dim3 Gr, dim3 Bl,
float* dst,
261 cudaF_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
263 inline void cuda_add_smat(dim3 Gr, dim3 Bl,
double* mat,
MatrixDim mat_dim,
264 double alpha,
const int* smat_row_ptr,
265 const int* smat_col_idx,
const double* smat_val) {
266 cudaD_add_smat(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
269 inline void cuda_add_smat(dim3 Gr, dim3 Bl,
float* mat,
MatrixDim mat_dim,
270 float alpha,
const int* smat_row_ptr,
271 const int* smat_col_idx,
const float* smat_val) {
272 cudaF_add_smat(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
275 inline void cuda_add_smat_trans(dim3 Gr, dim3 Bl,
double* mat,
277 const int* smat_row_ptr,
278 const int* smat_col_idx,
279 const double* smat_val) {
280 cudaD_add_smat_trans(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
283 inline void cuda_add_smat_trans(dim3 Gr, dim3 Bl,
float* mat,
MatrixDim mat_dim,
284 float alpha,
const int* smat_row_ptr,
285 const int* smat_col_idx,
286 const float* smat_val) {
287 cudaF_add_smat_trans(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
290 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl,
double alpha,
291 double*
const * dst,
const double* src,
293 cudaD_add_to_rows_direct(Gr, Bl, alpha, dst, src, src_dim);
295 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl,
float alpha,
float*
const * dst,
297 cudaF_add_to_rows_direct(Gr, Bl, alpha, dst, src, src_dim);
299 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl,
double alpha,
300 double* dst,
const double* src,
303 cudaD_add_to_rows(Gr, Bl, alpha, dst, src, reorder, src_dim, dst_stride);
305 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl,
float alpha,
306 float* dst,
const float* src,
309 cudaF_add_to_rows(Gr, Bl, alpha, dst, src, reorder, src_dim, dst_stride);
311 inline void cuda_add_vec2(dim3 Gr, dim3 Bl,
double *mat,
const double *vec,
312 const double alpha,
int dim) {
313 cudaD_add_vec2(Gr, Bl, mat, vec, alpha, dim);
315 inline void cuda_add_vec2(dim3 Gr, dim3 Bl,
float *mat,
const float *vec,
316 const float alpha,
int dim) {
317 cudaF_add_vec2(Gr, Bl, mat, vec, alpha, dim);
319 inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl,
double alpha,
320 const double *col,
double beta,
double *dst,
322 cudaD_add_vec_to_cols(Gr, Bl, alpha, col, beta, dst, d);
324 inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl,
float alpha,
325 const float *col,
float beta,
float *dst,
327 cudaF_add_vec_to_cols(Gr, Bl, alpha, col, beta, dst, d);
329 inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl,
double alpha,
330 const double *row,
double beta,
double *dst,
332 cudaD_add_vec_to_rows(Gr, Bl, alpha, row, beta, dst, d);
334 inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl,
float alpha,
335 const float *row,
float beta,
float *dst,
337 cudaF_add_vec_to_rows(Gr, Bl, alpha, row, beta, dst, d);
339 inline void cuda_add_vec_vec(
int Gr,
int Bl,
double alpha,
double* v,
340 const double* x,
const double* y,
double beta,
342 cudaD_add_vec_vec(Gr, Bl, alpha, v, x, y, beta, dim);
344 inline void cuda_add_vec_vec(
int Gr,
int Bl,
float alpha,
float* v,
345 const float* x,
const float* y,
float beta,
347 cudaF_add_vec_vec(Gr, Bl, alpha, v, x, y, beta, dim);
349 inline cublasStatus_t cuda_axpy(cublasHandle_t handle,
int n,
double alpha,
350 const double *x,
int incx,
double *y,
352 return cublasDaxpy_v2(handle, n, &alpha, x, incx, y, incy);
354 inline cublasStatus_t cuda_axpy(cublasHandle_t handle,
int n,
float alpha,
355 const float *x,
int incx,
float *y,
int incy) {
356 return cublasSaxpy_v2(handle, n, &alpha, x, incx, y, incy);
358 inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl,
360 const double *C_data,
int C_num_cols,
361 int C_row_stride,
int C_col_stride,
362 const double *D_data,
int D_row_stride,
363 int D_col_stride,
double alpha,
365 cudaD_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
366 C_row_stride, C_col_stride, D_data, D_row_stride,
367 D_col_stride, alpha, beta);
369 inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl,
371 const float *C_data,
int C_num_cols,
372 int C_row_stride,
int C_col_stride,
373 const float *D_data,
int D_row_stride,
374 int D_col_stride,
float alpha,
float beta) {
375 cudaF_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
376 C_row_stride, C_col_stride, D_data, D_row_stride,
377 D_col_stride, alpha, beta);
379 inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl,
double *y,
380 const double *x1,
const double *x2,
382 int x2_stride,
int group_size) {
383 cudaD_calc_group_max_deriv(Gr, Bl, y, x1, x2, y_dim, x1_stride, x2_stride,
386 inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl,
float *y,
387 const float *x1,
const float *x2,
389 int x2_stride,
int group_size) {
390 cudaF_calc_group_max_deriv(Gr, Bl, y, x1, x2, y_dim, x1_stride, x2_stride,
396 cudaD_comp_obj_deriv(Gr, Bl, x, size, z, d, z2, d2, t);
401 cudaF_comp_obj_deriv(Gr, Bl, x, size, z, d, z2, d2, t);
403 inline void cuda_copy_col_from_mat_df(
int Gr,
int Bl,
double* v,
int col,
406 cudaD_copy_col_from_mat_df(Gr, Bl, v, col, mat, dmat, dim);
408 inline void cuda_copy_col_from_mat_df(
int Gr,
int Bl,
double* v,
int col,
411 cudaF_copy_col_from_mat_df(Gr, Bl, v, col, mat, dmat, dim);
413 inline void cuda_copy_col_from_mat_fd(
int Gr,
int Bl,
float* v,
int col,
416 cudaD_copy_col_from_mat_fd(Gr, Bl, v, col, mat, dmat, dim);
418 inline void cuda_copy_col_from_mat_fd(
int Gr,
int Bl,
float* v,
int col,
421 cudaF_copy_col_from_mat_fd(Gr, Bl, v, col, mat, dmat, dim);
423 inline void cuda_copy_cols(dim3 Gr, dim3 Bl,
double* dst,
const double* src,
426 cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
428 inline void cuda_copy_cols(dim3 Gr, dim3 Bl,
float* dst,
const float* src,
431 cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
433 inline void cuda_copy_cols_from_vec(dim3 Gr, dim3 Bl,
double *mat_out,
435 cudaD_copy_cols_from_vec(Gr, Bl, mat_out, d_out, v_in);
437 inline void cuda_copy_cols_from_vec(dim3 Gr, dim3 Bl,
float *mat_out,
439 cudaF_copy_cols_from_vec(Gr, Bl, mat_out, d_out, v_in);
441 inline void cuda_copy(dim3 Gr, dim3 Bl,
double *y,
const double *x,
444 cudaD_copy(Gr, Bl, y, x, copy_from, d_out, d_in);
446 inline void cuda_copy(dim3 Gr, dim3 Bl,
float *y,
const float *x,
449 cudaF_copy(Gr, Bl, y, x, copy_from, d_out, d_in);
451 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
double* mat_out,
454 cuda_copy_from_mat_dd(Gr, Bl, mat_out, mat_in, d_out, d_in);
456 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
double* mat_out,
459 cuda_copy_from_mat_df(Gr, Bl, mat_out, mat_in, d_out, d_in);
461 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
float* mat_out,
464 cuda_copy_from_mat_fd(Gr, Bl, mat_out, mat_in, d_out, d_in);
466 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
float* mat_out,
469 cuda_copy_from_mat_ff(Gr, Bl, mat_out, mat_in, d_out, d_in);
471 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
double* mat_out,
474 cuda_copy_from_mat_dd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
476 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
double* mat_out,
479 cuda_copy_from_mat_df_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
481 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
float* mat_out,
484 cuda_copy_from_mat_fd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
486 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
float* mat_out,
489 cuda_copy_from_mat_ff_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
491 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl,
double* mat,
492 MatrixDim mat_dim,
const int* smat_row_ptr,
493 const int* smat_col_idx,
494 const double* smat_val) {
495 cuda_copy_from_smat_dd(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
498 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl,
double* mat,
499 MatrixDim mat_dim,
const int* smat_row_ptr,
500 const int* smat_col_idx,
501 const float* smat_val) {
502 cuda_copy_from_smat_df(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
505 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl,
float* mat,
MatrixDim mat_dim,
506 const int* smat_row_ptr,
507 const int* smat_col_idx,
508 const double* smat_val) {
509 cuda_copy_from_smat_fd(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
512 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl,
float* mat,
MatrixDim mat_dim,
513 const int* smat_row_ptr,
514 const int* smat_col_idx,
515 const float* smat_val) {
516 cuda_copy_from_smat_ff(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
519 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl,
double* mat,
521 const int* smat_row_ptr,
522 const int* smat_col_idx,
523 const double* smat_val) {
524 cuda_copy_from_smat_dd_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
527 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl,
double* mat,
529 const int* smat_row_ptr,
530 const int* smat_col_idx,
531 const float* smat_val) {
532 cuda_copy_from_smat_df_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
535 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl,
float* mat,
537 const int* smat_row_ptr,
538 const int* smat_col_idx,
539 const double* smat_val) {
540 cuda_copy_from_smat_fd_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
543 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl,
float* mat,
545 const int* smat_row_ptr,
546 const int* smat_col_idx,
547 const float* smat_val) {
548 cuda_copy_from_smat_ff_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
551 inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl,
const double* x,
double* y,
553 cudaD_copy_from_sp(Gr, Bl, x, y, d_out);
555 inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl,
const float* x,
float* y,
557 cudaF_copy_from_sp(Gr, Bl, x, y, d_out);
559 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl,
double* A,
const double* B,
561 cudaD_copy_from_tp(Gr, Bl, A, B, dmat);
563 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl,
double* A,
const float* B,
565 cudaDF_copy_from_tp(Gr, Bl, A, B, dmat);
567 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl,
float* A,
const double* B,
569 cudaFD_copy_from_tp(Gr, Bl, A, B, dmat);
571 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl,
float* A,
const float* B,
573 cudaF_copy_from_tp(Gr, Bl, A, B, dmat);
575 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl,
double* A,
577 cudaD_copy_from_tp_trans(Gr, Bl, A, B, dmat);
579 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl,
double* A,
const float* B,
581 cudaDF_copy_from_tp_trans(Gr, Bl, A, B, dmat);
583 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl,
float* A,
const double* B,
585 cudaFD_copy_from_tp_trans(Gr, Bl, A, B, dmat);
587 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl,
float* A,
const float* B,
589 cudaF_copy_from_tp_trans(Gr, Bl, A, B, dmat);
591 inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl,
double* A,
MatrixDim dimA) {
592 cudaD_copy_low_upp(Gr, Bl, A, dimA);
594 inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl,
float* A,
MatrixDim dimA) {
595 cudaF_copy_low_upp(Gr, Bl, A, dimA);
597 inline void cuda_copy_rows(dim3 Gr, dim3 Bl,
double* dst,
598 const double*
const * src,
MatrixDim dst_dim) {
599 cudaD_copy_rows_direct(Gr, Bl, dst, src, dst_dim);
601 inline void cuda_copy_rows(dim3 Gr, dim3 Bl,
double* dst,
const double* src,
604 cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
606 inline void cuda_copy_rows(dim3 Gr, dim3 Bl,
float* dst,
607 const float*
const * src,
MatrixDim dst_dim) {
608 cudaF_copy_rows_direct(Gr, Bl, dst, src, dst_dim);
610 inline void cuda_copy_rows(dim3 Gr, dim3 Bl,
float* dst,
const float* src,
613 cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
615 inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl,
double *mat_out,
617 cudaD_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
619 inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl,
float *mat_out,
621 cudaF_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
623 inline void cuda_copy_to_rows(dim3 Gr, dim3 Bl,
double*
const * dst,
625 cudaD_copy_to_rows_direct(Gr, Bl, dst, src, src_dim);
627 inline void cuda_copy_to_rows(dim3 Gr, dim3 Bl,
float*
const * dst,
629 cudaF_copy_to_rows_direct(Gr, Bl, dst, src, src_dim);
631 inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl,
double* A,
MatrixDim dimA) {
632 cudaD_copy_upp_low(Gr, Bl, A, dimA);
634 inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl,
float* A,
MatrixDim dimA) {
635 cudaF_copy_upp_low(Gr, Bl, A, dimA);
637 inline void cuda_diff_group_pnorm(dim3 Gr, dim3 Bl,
double *
id,
638 const double *iv,
const double *ov,
640 int iv_stride,
int ov_stride,
int od_stride,
641 int group_size,
double power) {
642 cudaD_diff_group_pnorm(Gr, Bl,
id, iv, ov, od, id_dim, iv_stride, ov_stride,
643 od_stride, group_size, power);
645 inline void cuda_diff_group_pnorm(dim3 Gr, dim3 Bl,
float *
id,
const float *iv,
646 const float *ov,
const float* od,
648 int ov_stride,
int od_stride,
int group_size,
650 cudaF_diff_group_pnorm(Gr, Bl,
id, iv, ov, od, id_dim, iv_stride, ov_stride,
651 od_stride, group_size, power);
653 inline void cuda_diff_log_softmax(dim3 Gr, dim3 Bl,
655 const double* out_value,
656 const int out_value_stride,
657 const double* out_deriv,
658 const int out_deriv_stride,
660 cudaD_diff_log_softmax(Gr, Bl, in_deriv_dim, out_value, out_value_stride,
661 out_deriv, out_deriv_stride, in_deriv);
663 inline void cuda_diff_log_softmax(dim3 Gr, dim3 Bl,
665 const float* out_value,
666 const int out_value_stride,
667 const float* out_deriv,
668 const int out_deriv_stride,
float* in_deriv) {
669 cudaF_diff_log_softmax(Gr, Bl, in_deriv_dim, out_value, out_value_stride,
670 out_deriv, out_deriv_stride, in_deriv);
672 inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl,
const int cell_dim,
673 const int have_dropout_mask,
674 const int num_rows,
const double* input,
675 const int input_stride,
676 const double* params,
677 const int params_stride,
678 const double* output_deriv,
679 const int output_deriv_stride,
680 const double* deriv_sum_in,
681 const int deriv_sum_in_stride,
682 const double* self_repair_config,
683 double count,
double* input_deriv,
684 const int input_deriv_stride,
685 double* params_deriv,
686 const int params_deriv_stride,
687 double* value_sum_out,
688 const int value_sum_out_stride,
689 double* deriv_sum_out,
690 const int deriv_sum_out_stride,
691 double* self_repair_sum_out,
692 const int self_repair_sum_out_stride) {
693 cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows,
695 params, params_stride, output_deriv,
696 output_deriv_stride, deriv_sum_in,
697 deriv_sum_in_stride, self_repair_config, count,
698 input_deriv, input_deriv_stride, params_deriv,
699 params_deriv_stride, value_sum_out,
700 value_sum_out_stride, deriv_sum_out,
701 deriv_sum_out_stride, self_repair_sum_out,
702 self_repair_sum_out_stride);
704 inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl,
const int cell_dim,
705 const int have_dropout_mask,
706 const int num_rows,
const float* input,
707 const int input_stride,
709 const int params_stride,
710 const float* output_deriv,
711 const int output_deriv_stride,
712 const double* deriv_sum_in,
713 const int deriv_sum_in_stride,
714 const float* self_repair_config,
715 double count,
float* input_deriv,
716 const int input_deriv_stride,
718 const int params_deriv_stride,
719 double* value_sum_out,
720 const int value_sum_out_stride,
721 double* deriv_sum_out,
722 const int deriv_sum_out_stride,
723 float* self_repair_sum_out,
724 const int self_repair_sum_out_stride) {
725 cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask,
726 num_rows, input, input_stride,
727 params, params_stride, output_deriv,
728 output_deriv_stride, deriv_sum_in,
729 deriv_sum_in_stride, self_repair_config, count,
730 input_deriv, input_deriv_stride, params_deriv,
731 params_deriv_stride, value_sum_out,
732 value_sum_out_stride, deriv_sum_out,
733 deriv_sum_out_stride, self_repair_sum_out,
734 self_repair_sum_out_stride);
736 inline void cuda_diff_normalize_per_row(
size_t Gr,
size_t Bl,
double *
id,
737 int id_stride,
const double *iv,
739 int od_stride,
double target_rms,
740 bool add_log_stddev) {
741 cudaD_diff_normalize_per_row(Gr, Bl,
id, id_stride, iv, iv_dim, od, od_stride,
742 target_rms, add_log_stddev);
744 inline void cuda_diff_normalize_per_row(
size_t Gr,
size_t Bl,
float *
id,
745 int id_stride,
const float *iv,
747 int od_stride,
float target_rms,
748 bool add_log_stddev) {
749 cudaF_diff_normalize_per_row(Gr, Bl,
id, id_stride, iv, iv_dim, od, od_stride,
750 target_rms, add_log_stddev);
752 inline void cuda_diff_parametric_relu(dim3 Gr, dim3 Bl,
double *eout,
753 const double *e,
const double *y,
755 const double *a,
const double *b) {
756 cudaD_diff_parametric_relu(Gr, Bl, eout, e, y, d, e_stride, y_stride, a, b);
758 inline void cuda_diff_parametric_relu(dim3 Gr, dim3 Bl,
float *eout,
759 const float *e,
const float *y,
761 const float *a,
const float *b) {
762 cudaF_diff_parametric_relu(Gr, Bl, eout, e, y, d, e_stride, y_stride, a, b);
764 inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl,
double *eout,
const double *e,
767 cudaD_diff_sigmoid(Gr, Bl, eout, e, y, d, e_stride, y_stride);
769 inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl,
float *eout,
const float *e,
772 cudaF_diff_sigmoid(Gr, Bl, eout, e, y, d, e_stride, y_stride);
774 inline void cuda_diff_softmax(dim3 Gr, dim3 Bl,
double* x,
const MatrixDim dim,
775 const double* value,
const int value_stride,
776 const double* diff,
const int diff_stride) {
777 cudaD_diff_softmax(Gr, Bl, x, dim, value, value_stride, diff, diff_stride);
779 inline void cuda_diff_softmax(dim3 Gr, dim3 Bl,
float* x,
const MatrixDim dim,
780 const float* value,
const int value_stride,
781 const float* diff,
const int diff_stride) {
782 cudaF_diff_softmax(Gr, Bl, x, dim, value, value_stride, diff, diff_stride);
784 inline void cuda_diff_tanh(dim3 Gr, dim3 Bl,
double *eout,
const double *e,
787 cudaD_diff_tanh(Gr, Bl, eout, e, y, d, e_stride, y_stride);
789 inline void cuda_diff_tanh(dim3 Gr, dim3 Bl,
float *eout,
const float *e,
792 cudaF_diff_tanh(Gr, Bl, eout, e, y, d, e_stride, y_stride);
794 inline void cuda_ensure_nonzero(dim3 Gr, dim3 Bl,
const double *x,
MatrixDim d,
795 double epsilon,
int y_stride,
double *y) {
796 cudaD_ensure_nonzero(Gr, Bl, x, d, epsilon, y_stride, y);
798 inline void cuda_ensure_nonzero(dim3 Gr, dim3 Bl,
const float *x,
MatrixDim d,
799 float epsilon,
int y_stride,
float *y) {
800 cudaF_ensure_nonzero(Gr, Bl, x, d, epsilon, y_stride, y);
802 inline void cuda_diff_xent(dim3 Gr, dim3 Bl,
const int32_cuda *vec_tgt,
803 double *mat_net_out,
double *vec_log_post,
805 cudaD_diff_xent(Gr, Bl, vec_tgt, mat_net_out, vec_log_post, d);
807 inline void cuda_diff_xent(dim3 Gr, dim3 Bl,
const int32_cuda *vec_tgt,
808 float *mat_net_out,
float *vec_log_post,
810 cudaF_diff_xent(Gr, Bl, vec_tgt, mat_net_out, vec_log_post, d);
812 inline void cuda_div_elements(dim3 Gr, dim3 Bl,
double *mat,
const double *A,
814 cudaD_div_elements(Gr, Bl, mat, A, dst_d, src_stride);
816 inline void cuda_div_elements(dim3 Gr, dim3 Bl,
float *mat,
const float *A,
818 cudaF_div_elements(Gr, Bl, mat, A, dst_d, src_stride);
820 inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl,
double *mat,
822 cudaD_div_rows_vec(Gr, Bl, mat, vec_div, d);
824 inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl,
float *mat,
826 cudaF_div_rows_vec(Gr, Bl, mat, vec_div, d);
828 inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl,
const double *mat1,
829 const double *mat2,
double *mask,
832 cudaD_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride,
835 inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl,
const float *mat1,
836 const float *mat2,
float *mask,
839 cudaF_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride,
842 inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl,
const double *mat,
845 cudaD_find_row_max_id(Gr, Bl, mat, vec_val, vec_id, d);
847 inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl,
const float *mat,
850 cudaF_find_row_max_id(Gr, Bl, mat, vec_val, vec_id, d);
852 inline void cuda_group_max(dim3 Gr, dim3 Bl,
double *y,
const double *x,
853 MatrixDim d,
int src_stride,
int group_size) {
854 cudaD_group_max(Gr, Bl, y, x, d, src_stride, group_size);
856 inline void cuda_group_max(dim3 Gr, dim3 Bl,
float *y,
const float *x,
857 MatrixDim d,
int src_stride,
int group_size) {
858 cudaF_group_max(Gr, Bl, y, x, d, src_stride, group_size);
860 inline void cuda_group_pnorm(dim3 Gr, dim3 Bl,
double *y,
const double *x,
863 cudaD_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
865 inline void cuda_group_pnorm(dim3 Gr, dim3 Bl,
float *y,
const float *x,
868 cudaF_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
870 inline void cuda_group_spec_pnorm(dim3 Gr, dim3 Bl,
double *y,
const double *x,
873 cudaD_group_spec_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
875 inline void cuda_group_spec_pnorm(dim3 Gr, dim3 Bl,
float* y,
const float* x,
878 cudaF_group_spec_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
880 inline void cuda_heaviside(dim3 Gr, dim3 Bl,
double* y,
const double* x,
882 cudaD_heaviside(Gr, Bl, y, x, d, src_stride);
884 inline void cuda_heaviside(dim3 Gr, dim3 Bl,
float* y,
const float* x,
886 cudaF_heaviside(Gr, Bl, y, x, d, src_stride);
888 inline void cuda_exp(dim3 Gr, dim3 Bl,
double* y,
const double* x,
890 cudaD_exp(Gr, Bl, y, x, d, src_stride);
892 inline void cuda_exp(dim3 Gr, dim3 Bl,
float* y,
const float* x,
894 cudaF_exp(Gr, Bl, y, x, d, src_stride);
896 inline void cuda_pow(dim3 Gr, dim3 Bl,
double* y,
const double* x,
double power,
898 cudaD_pow(Gr, Bl, y, x, power, d, src_stride);
900 inline void cuda_pow(dim3 Gr, dim3 Bl,
float* y,
const float* x,
float power,
902 cudaF_pow(Gr, Bl, y, x, power, d, src_stride);
904 inline void cuda_ceiling(dim3 Gr, dim3 Bl,
double* y,
const double* x,
double ceiling_val,
906 cudaD_ceiling(Gr, Bl, y, x, ceiling_val, dim, src_stride);
908 inline void cuda_ceiling(dim3 Gr, dim3 Bl,
float* y,
const float* x,
float ceiling_val,
910 cudaF_ceiling(Gr, Bl, y, x, ceiling_val, dim, src_stride);
912 inline void cuda_floor(dim3 Gr, dim3 Bl,
double* y,
const double* x,
double floor_val,
914 cudaD_floor(Gr, Bl, y, x, floor_val, dim, src_stride);
916 inline void cuda_floor(dim3 Gr, dim3 Bl,
float* y,
const float* x,
float floor_val,
918 cudaF_floor(Gr, Bl, y, x, floor_val, dim, src_stride);
920 inline void cuda_exp_limited(dim3 Gr, dim3 Bl,
double* y,
const double* x,
921 double lower_limit,
double upper_limit,
MatrixDim d,
int src_stride) {
922 cudaD_exp_limited(Gr, Bl, y, x, lower_limit, upper_limit, d, src_stride);
924 inline void cuda_exp_limited(dim3 Gr, dim3 Bl,
float* y,
const float* x,
925 float lower_limit,
float upper_limit,
MatrixDim d,
int src_stride) {
926 cudaF_exp_limited(Gr, Bl, y, x, lower_limit, upper_limit, d, src_stride);
928 inline void cuda_exp_special(dim3 Gr, dim3 Bl,
double* y,
const double* x,
930 cudaD_exp_special(Gr, Bl, y, x, d, src_stride);
932 inline void cuda_exp_special(dim3 Gr, dim3 Bl,
float* y,
const float* x,
934 cudaF_exp_special(Gr, Bl, y, x, d, src_stride);
936 inline void cuda_log(dim3 Gr, dim3 Bl,
double* y,
const double* x,
MatrixDim d,
int src_stride) {
937 cudaD_log(Gr, Bl, y, x, d, src_stride);
939 inline void cuda_log(dim3 Gr, dim3 Bl,
float* y,
const float* x,
MatrixDim d,
int src_stride) {
940 cudaF_log(Gr, Bl, y, x, d, src_stride);
942 inline void cuda_pow_abs(dim3 Gr, dim3 Bl,
double* y,
const double* x,
double power,
943 bool include_sign,
MatrixDim dim,
int src_stride) {
944 cudaD_pow_abs(Gr, Bl, y, x, power, include_sign, dim, src_stride);
946 inline void cuda_pow_abs(dim3 Gr, dim3 Bl,
float* y,
const float* x,
float power,
947 bool include_sign,
MatrixDim dim,
int src_stride) {
948 cudaF_pow_abs(Gr, Bl, y, x, power, include_sign, dim, src_stride);
950 inline void cuda_invert_elements(dim3 Gr, dim3 Bl,
double *data,
MatrixDim d) {
951 cudaD_invert_elements(Gr, Bl, data, d);
953 inline void cuda_invert_elements(dim3 Gr, dim3 Bl,
float *data,
MatrixDim d) {
954 cudaF_invert_elements(Gr, Bl, data, d);
956 inline void cuda_log_softmax_reduce(
size_t Gr,
size_t Bl,
double *y,
959 cudaD_log_softmax_reduce(Gr, Bl, y, x, y_dim, x_stride);
961 inline void cuda_log_softmax_reduce(
size_t Gr,
size_t Bl,
float *y,
964 cudaF_log_softmax_reduce(Gr, Bl, y, x, y_dim, x_stride);
966 inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl,
const double* in,
967 const int in_stride,
const double* params,
968 const int params_stride,
969 const int out_stride,
const int cell_dim,
970 const int have_dropout_mask,
971 const int num_rows,
double* out) {
972 cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride,
973 out_stride, cell_dim, have_dropout_mask,
976 inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl,
const float* in,
977 const int in_stride,
const float* params,
978 const int params_stride,
979 const int out_stride,
const int cell_dim,
980 const int have_dropout_mask,
981 const int num_rows,
float* out) {
982 cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride,
983 out_stride, cell_dim, have_dropout_mask,
986 inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl,
double *data,
990 cudaD_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements);
992 inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl,
float *data,
996 cudaF_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements);
998 inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl,
MatrixDim dim,
1001 const double* x,
int s,
1003 cudaD_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data);
1005 inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl,
MatrixDim dim,
1008 const float* x,
int s,
float* data) {
1009 cudaF_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data);
1011 inline void cuda_matrix_add_to_elements(dim3 Gr, dim3 Bl,
double alpha,
1014 cudaD_matrix_add_to_elements(Gr, Bl, alpha, mat, dim, elements);
1016 inline void cuda_matrix_add_to_elements(dim3 Gr, dim3 Bl,
float alpha,
1019 cudaF_matrix_add_to_elements(Gr, Bl, alpha, mat, dim, elements);
1021 inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl,
const double *data,
1023 int indices_size,
double *output) {
1024 cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
1026 inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl,
const float *data,
1028 int indices_size,
float *output) {
1029 cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
1031 inline void cuda_vector_copy_elements(dim3 Gr, dim3 Bl,
double *data,
int dim,
1032 const double *src_mat,
int mat_stride,
1035 cudaD_vector_copy_elements(Gr, Bl, data, dim, src_mat, mat_stride,
1036 transpose, elements);
1038 inline void cuda_vector_copy_elements(dim3 Gr, dim3 Bl,
float *data,
int dim,
1039 const float *src_mat,
int mat_stride,
1042 cudaF_vector_copy_elements(Gr, Bl, data, dim, src_mat, mat_stride,
1043 transpose, elements);
1045 inline void cuda_max(dim3 Gr, dim3 Bl,
double *mat,
const double *A,
1047 cudaD_max(Gr, Bl, mat, A, dst_d, src_stride);
1049 inline void cuda_max(dim3 Gr, dim3 Bl,
float *mat,
const float *A,
1051 cudaF_max(Gr, Bl, mat, A, dst_d, src_stride);
1053 inline void cuda_max_mat_cols(
int Gr,
int Bl,
double* result,
const double* mat,
1055 cudaD_max_mat_cols(Gr, Bl, result, mat, d);
1057 inline void cuda_max_mat_cols(
int Gr,
int Bl,
float* result,
const float* mat,
1059 cudaF_max_mat_cols(Gr, Bl, result, mat, d);
1061 inline void cuda_min(dim3 Gr, dim3 Bl,
double *mat,
const double *other,
1063 cudaD_min(Gr, Bl, mat, other, mat_d, other_stride);
1065 inline void cuda_min(dim3 Gr, dim3 Bl,
float *mat,
const float *other,
1067 cudaF_min(Gr, Bl, mat, other, mat_d, other_stride);
1069 inline void cuda_min_mat_cols(
int Gr,
int Bl,
double* result,
const double* mat,
1071 cudaD_min_mat_cols(Gr, Bl, result, mat, d);
1073 inline void cuda_min_mat_cols(
int Gr,
int Bl,
float* result,
const float* mat,
1075 cudaF_min_mat_cols(Gr, Bl, result, mat, d);
1077 inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl,
double *mat,
1079 cudaD_mul_cols_vec(Gr, Bl, mat, scale, d);
1081 inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl,
float *mat,
const float *scale,
1083 cudaF_mul_cols_vec(Gr, Bl, mat, scale, d);
1085 inline void cuda_mul_elements(dim3 Gr, dim3 Bl,
double *mat,
const double *A,
1087 cudaD_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
1089 inline void cuda_mul_elements(dim3 Gr, dim3 Bl,
float *mat,
const float *A,
1091 cudaF_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
1093 inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl,
double *y,
1095 int src_stride,
int group_size) {
1096 cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size);
1098 inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl,
float *y,
const float *x,
1101 cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size);
1103 inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl,
double *mat,
1105 cudaD_mul_rows_vec(Gr, Bl, mat, scale, d);
1107 inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl,
float *mat,
const float *scale,
1109 cudaF_mul_rows_vec(Gr, Bl, mat, scale, d);
1111 inline void cuda_normalize_per_row(
size_t Gr,
size_t Bl,
double *y,
1112 int y_stride,
const double *x,
MatrixDim x_d,
1113 double target_rms,
bool add_log_stddev) {
1114 cudaD_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms,
1117 inline void cuda_normalize_per_row(
size_t Gr,
size_t Bl,
float *y,
int y_stride,
1119 float target_rms,
bool add_log_stddev) {
1120 cudaF_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms,
1123 inline void cuda_one(
int Gr,
int Bl,
double* x,
int dim) {
1124 cudaD_one(Gr, Bl, x, dim);
1126 inline void cuda_one(
int Gr,
int Bl,
float* x,
int dim) {
1127 cudaF_one(Gr, Bl, x, dim);
1129 inline void cuda_parametric_relu(dim3 Gr, dim3 Bl,
double *y,
const double *x,
1130 MatrixDim d,
int src_stride,
const double *a,
1132 cudaD_parametric_relu(Gr, Bl, y, x, d, src_stride, a, b);
1134 inline void cuda_parametric_relu(dim3 Gr, dim3 Bl,
float *y,
const float *x,
1137 cudaF_parametric_relu(Gr, Bl, y, x, d, src_stride, a, b);
1139 inline void cuda_randomize(dim3 Gr, dim3 Bl,
double *y,
const double *x,
1142 cudaD_randomize(Gr, Bl, y, x, copy_from, d_out, d_in);
1144 inline void cuda_randomize(dim3 Gr, dim3 Bl,
float *y,
const float *x,
1147 cudaF_randomize(Gr, Bl, y, x, copy_from, d_out, d_in);
1149 inline void cuda_regularize_l1(dim3 Gr, dim3 Bl,
double *wei,
double *grad,
1152 cudaD_regularize_l1(Gr, Bl, wei, grad, l1, lr, d, stride_grad);
1154 inline void cuda_regularize_l1(dim3 Gr, dim3 Bl,
float *wei,
float *grad,
1157 cudaF_regularize_l1(Gr, Bl, wei, grad, l1, lr, d, stride_grad);
1159 inline void cuda_replace_value(
int Gr,
int Bl,
double *v,
int dim,
double orig,
1161 cudaD_replace_value(Gr, Bl, v, dim, orig, changed);
1163 inline void cuda_replace_value(
int Gr,
int Bl,
float *v,
int dim,
float orig,
1165 cudaF_replace_value(Gr, Bl, v, dim, orig, changed);
1167 inline cublasStatus_t cuda_scal(cublasHandle_t handle,
int n,
double alpha,
1168 double *x,
int incx) {
1169 return cublasDscal_v2(handle, n, &alpha, x, incx);
1171 inline cublasStatus_t cuda_scal(cublasHandle_t handle,
int n,
float alpha,
1172 float *x,
int incx) {
1173 return cublasSscal_v2(handle, n, &alpha, x, incx);
1175 inline void cuda_scale_diag_packed(
int Gr,
int Bl,
double* mat,
double value,
1177 cudaD_scale_diag_packed(Gr, Bl, mat, value, dim);
1179 inline void cuda_scale_diag_packed(
int Gr,
int Bl,
float* mat,
float value,
1181 cudaF_scale_diag_packed(Gr, Bl, mat, value, dim);
1183 inline void cuda_scale(dim3 Gr, dim3 Bl,
double *mat,
double value,
1185 cudaD_scale(Gr, Bl, mat, value, d);
1187 inline void cuda_scale(dim3 Gr, dim3 Bl,
float *mat,
float value,
MatrixDim d) {
1188 cudaF_scale(Gr, Bl, mat, value, d);
1190 inline void cuda_select_rows(dim3 Gr, dim3 Bl,
const int* out_row_ptr,
1191 int* out_col_idx,
double* out_val,
1192 const int* row_indexes,
1193 const int num_selected_rows,
const int* in_row_ptr,
1194 const int* in_col_idx,
const double* in_val) {
1195 cudaD_select_rows(Gr, Bl, out_row_ptr, out_col_idx, out_val, row_indexes,
1196 num_selected_rows, in_row_ptr, in_col_idx, in_val);
1198 inline void cuda_select_rows(dim3 Gr, dim3 Bl,
const int* out_row_ptr,
1199 int* out_col_idx,
float* out_val,
1200 const int* row_indexes,
1201 const int num_selected_rows,
const int* in_row_ptr,
1202 const int* in_col_idx,
const float* in_val) {
1203 cudaF_select_rows(Gr, Bl, out_row_ptr, out_col_idx, out_val, row_indexes,
1204 num_selected_rows, in_row_ptr, in_col_idx, in_val);
1206 inline void cuda_set_bias_params(
int Gr,
int Bl,
double* v,
const double* a,
1207 double param_1,
double param_2,
double param_3,
1208 int* flag,
int dim) {
1209 cudaD_set_bias_params(Gr, Bl, v, a, param_1, param_2, param_3, flag, dim);
1211 inline void cuda_set_bias_params(
int Gr,
int Bl,
float* v,
const float* a,
1212 float param_1,
float param_2,
float param_3,
1213 int* flag,
int dim) {
1214 cudaF_set_bias_params(Gr, Bl, v, a, param_1, param_2, param_3, flag, dim);
1216 inline void cuda_set_const(dim3 Gr, dim3 Bl,
double *mat,
double value,
1218 cudaD_set_const(Gr, Bl, mat, value, d);
1220 inline void cuda_set_const(dim3 Gr, dim3 Bl,
float *mat,
float value,
1222 cudaF_set_const(Gr, Bl, mat, value, d);
1224 inline void cuda_set_diag(
int Gr,
int Bl,
double* mat,
double value,
1226 cudaD_set_diag(Gr, Bl, mat, value, d);
1228 inline void cuda_set_diag(
int Gr,
int Bl,
float* mat,
float value,
1230 cudaF_set_diag(Gr, Bl, mat, value, d);
1232 inline void cuda_set_diag_packed(
int Gr,
int Bl,
double* mat,
double value,
1234 cudaD_set_diag_packed(Gr, Bl, mat, value, dim);
1236 inline void cuda_set_diag_packed(
int Gr,
int Bl,
float* mat,
float value,
1238 cudaF_set_diag_packed(Gr, Bl, mat, value, dim);
1240 inline void cuda_set_mat_mat_div_mat(dim3 Gr, dim3 Bl,
const double *A,
1241 const double *B,
const double *C,
1243 int stride_b,
int stride_c) {
1244 cudaD_set_mat_mat_div_mat(Gr, Bl, A, B, C, dst, d, stride_a, stride_b,
1247 inline void cuda_set_mat_mat_div_mat(dim3 Gr, dim3 Bl,
const float *A,
1248 const float *B,
const float *C,
float *dst,
1251 cudaF_set_mat_mat_div_mat(Gr, Bl, A, B, C, dst, d, stride_a, stride_b,
1254 inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl,
double* mat,
1256 cudaD_set_zero_above_diag(Gr, Bl, mat, d);
1258 inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl,
float* mat,
1260 cudaF_set_zero_above_diag(Gr, Bl, mat, d);
1262 inline void cuda_sequence(dim3 Gr, dim3 Bl,
int32_cuda* data,
int length,
1264 cuda_int32_sequence(Gr, Bl, data, length, base);
1266 inline void cuda_sigmoid(dim3 Gr, dim3 Bl,
double *y,
const double *x,
1268 cudaD_sigmoid(Gr, Bl, y, x, d, src_stride);
1270 inline void cuda_sigmoid(dim3 Gr, dim3 Bl,
float *y,
const float *x,
1272 cudaF_sigmoid(Gr, Bl, y, x, d, src_stride);
1274 inline void cuda_soft_hinge(dim3 Gr, dim3 Bl,
double *y,
const double *x,
1276 cudaD_soft_hinge(Gr, Bl, y, x, d, src_stride);
1278 inline void cuda_soft_hinge(dim3 Gr, dim3 Bl,
float *y,
const float *x,
1280 cudaF_soft_hinge(Gr, Bl, y, x, d, src_stride);
1282 inline void cuda_softmax_reduce(
size_t Gr,
size_t Bl,
double *y,
1283 const double *x,
MatrixDim d,
int src_stride) {
1284 cudaD_softmax_reduce(Gr, Bl, y, x, d, src_stride);
1286 inline void cuda_softmax_reduce(
size_t Gr,
size_t Bl,
float *y,
const float *x,
1288 cudaF_softmax_reduce(Gr, Bl, y, x, d, src_stride);
1290 inline void cuda_splice(dim3 Gr, dim3 Bl,
double *y,
const double *x,
1293 cudaD_splice(Gr, Bl, y, x, off, d_out, d_in);
1295 inline void cuda_splice(dim3 Gr, dim3 Bl,
float *y,
const float *x,
1298 cudaF_splice(Gr, Bl, y, x, off, d_out, d_in);
1300 inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl,
double *data,
1304 cudaD_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
1306 inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl,
float *data,
MatrixDim dim,
1307 const float *src_data,
MatrixDim src_dim,
1309 cudaF_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
1311 inline void cuda_sum_mat_cols(
int Gr,
int Bl,
double* result,
const double* mat,
1313 cudaD_sum_mat_cols(Gr, Bl, result, mat, d);
1315 inline void cuda_sum_mat_cols(
int Gr,
int Bl,
float* result,
const float* mat,
1317 cudaF_sum_mat_cols(Gr, Bl, result, mat, d);
1319 inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl,
double alpha,
double beta,
1320 const double* T,
MatrixDim tdim,
double *S,
1322 cudaD_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
1324 inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl,
float alpha,
float beta,
1325 const float* T,
MatrixDim tdim,
float *S,
1327 cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
1329 inline void cuda_take_lower(dim3 Gr, dim3 Bl,
const double* x,
double* y,
1331 cudaD_take_lower(Gr, Bl, x, y, d_in);
1333 inline void cuda_take_lower(dim3 Gr, dim3 Bl,
const float* x,
float* y,
1335 cudaF_take_lower(Gr, Bl, x, y, d_in);
1337 inline void cuda_take_mean(dim3 Gr, dim3 Bl,
const double* x,
double* y,
1339 cudaD_take_mean(Gr, Bl, x, y, d_in);
1341 inline void cuda_take_mean(dim3 Gr, dim3 Bl,
const float* x,
float* y,
1343 cudaF_take_mean(Gr, Bl, x, y, d_in);
1345 inline void cuda_take_upper(dim3 Gr, dim3 Bl,
const double* x,
double* y,
1347 cudaD_take_upper(Gr, Bl, x, y, d_in);
1349 inline void cuda_take_upper(dim3 Gr, dim3 Bl,
const float* x,
float* y,
1351 cudaF_take_upper(Gr, Bl, x, y, d_in);
1353 inline void cuda_tanh(dim3 Gr, dim3 Bl,
double *y,
const double *x,
MatrixDim d,
1355 cudaD_tanh(Gr, Bl, y, x, d, src_stride);
1357 inline void cuda_tanh(dim3 Gr, dim3 Bl,
float *y,
const float *x,
MatrixDim d,
1359 cudaF_tanh(Gr, Bl, y, x, d, src_stride);
1361 inline void cuda_trace(
int Gr,
int Bl,
double* mat,
double* value,
int dim) {
1362 cudaD_trace(Gr, Bl, mat, value, dim);
1364 inline void cuda_trace(
int Gr,
int Bl,
float* mat,
float* value,
int dim) {
1365 cudaF_trace(Gr, Bl, mat, value, dim);
1367 inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl,
const double* A,
1368 const double* B,
MatrixDim dA,
int B_stride,
1370 cudaD_trace_mat_mat(Gr, Bl, A, B, dA, B_stride, value);
1372 inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl,
const float* A,
const float* B,
1373 MatrixDim dA,
int B_stride,
float* value) {
1374 cudaF_trace_mat_mat(Gr, Bl, A, B, dA, B_stride, value);
1376 inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl,
const double* A,
1378 int B_stride,
double* value) {
1379 cudaD_trace_mat_mat_trans(Gr, Bl, A, B, dA, B_stride, value);
1381 inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl,
const float* A,
1382 const float* B,
MatrixDim dA,
int B_stride,
1384 cudaF_trace_mat_mat_trans(Gr, Bl, A, B, dA, B_stride, value);
1386 inline void cuda_trace_mat_smat(dim3 Gr, dim3 Bl,
const double* mat,
1387 MatrixDim mat_dim,
const int* smat_row_ptr,
1388 const int* smat_col_idx,
const double* smat_val,
1389 double* trace_vec) {
1390 cudaD_trace_mat_smat(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
1391 smat_val, trace_vec);
1393 inline void cuda_trace_mat_smat(dim3 Gr, dim3 Bl,
const float* mat,
1394 MatrixDim mat_dim,
const int* smat_row_ptr,
1395 const int* smat_col_idx,
const float* smat_val,
1397 cudaF_trace_mat_smat(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
1398 smat_val, trace_vec);
1400 inline void cuda_trace_mat_smat_trans(dim3 Gr, dim3 Bl,
const double* mat,
1402 const int* smat_row_ptr,
1403 const int* smat_col_idx,
1404 const double* smat_val,
1405 double* trace_vec) {
1406 cudaD_trace_mat_smat_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
1407 smat_val, trace_vec);
1409 inline void cuda_trace_mat_smat_trans(dim3 Gr, dim3 Bl,
const float* mat,
1411 const int* smat_row_ptr,
1412 const int* smat_col_idx,
1413 const float* smat_val,
float* trace_vec) {
1414 cudaF_trace_mat_smat_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
1415 smat_val, trace_vec);
1417 inline void cuda_vec_apply_ceiling(
int Gr,
int Bl,
double* v,
double floor_val,
1418 float* num,
int dim) {
1419 cudaD_vec_apply_ceiling(Gr, Bl, v, floor_val, num, dim);
1421 inline void cuda_vec_apply_ceiling(
int Gr,
int Bl,
float* v,
float floor_val,
1422 float* num,
int dim) {
1423 cudaF_vec_apply_ceiling(Gr, Bl, v, floor_val, num, dim);
1425 inline void cuda_vec_apply_exp(
int Gr,
int Bl,
double* v,
int dim) {
1426 cudaD_vec_apply_exp(Gr, Bl, v, dim);
1428 inline void cuda_vec_apply_exp(
int Gr,
int Bl,
float* v,
int dim) {
1429 cudaF_vec_apply_exp(Gr, Bl, v, dim);
1431 inline void cuda_vec_apply_floor(
int Gr,
int Bl,
double* v,
double floor_val,
1432 float* num,
int dim) {
1433 cudaD_vec_apply_floor(Gr, Bl, v, floor_val, num, dim);
1435 inline void cuda_vec_apply_floor(
int Gr,
int Bl,
float* v,
float floor_val,
1436 float* num,
int dim) {
1437 cudaF_vec_apply_floor(Gr, Bl, v, floor_val, num, dim);
1439 inline void cuda_vec_apply_log(
int Gr,
int Bl,
double* v,
double* flag,
1441 cudaD_vec_apply_log(Gr, Bl, v, flag, dim);
1443 inline void cuda_vec_apply_log(
int Gr,
int Bl,
float* v,
float* flag,
int dim) {
1444 cudaF_vec_apply_log(Gr, Bl, v, flag, dim);
1446 inline void cuda_vec_copy_diag_from_packed(
int Gr,
int Bl,
double *dst,
1447 const double *src,
int dim) {
1448 cudaD_vec_copy_diag_from_packed(Gr, Bl, dst, src, dim);
1450 inline void cuda_vec_copy_diag_from_packed(
int Gr,
int Bl,
float *dst,
1451 const float *src,
int dim) {
1452 cudaF_vec_copy_diag_from_packed(Gr, Bl, dst, src, dim);
1454 inline void cuda_vec_max(
int Gr,
int Bl,
const double* v,
double* value,
1456 cudaD_vec_max(Gr, Bl, v, value, dim, inc);
1458 inline void cuda_vec_max(
int Gr,
int Bl,
const float* v,
float* value,
int dim,
1460 cudaF_vec_max(Gr, Bl, v, value, dim, inc);
1462 inline void cuda_vec_min(
int Gr,
int Bl,
const double* v,
double* value,
1464 cudaD_vec_min(Gr, Bl, v, value, dim, inc);
1466 inline void cuda_vec_min(
int Gr,
int Bl,
const float* v,
float* value,
int dim,
1468 cudaF_vec_min(Gr, Bl, v, value, dim, inc);
1470 inline void cuda_vec_mul_elements(
int Gr,
int Bl,
double* v,
const double* a,
1472 cudaD_vec_mul_elements(Gr, Bl, v, a, dim);
1474 inline void cuda_vec_mul_elements(
int Gr,
int Bl,
float* v,
const float* a,
1476 cudaF_vec_mul_elements(Gr, Bl, v, a, dim);
1478 inline void cuda_vec_soft_max(
int Gr,
int Bl,
double* v,
int dim) {
1479 cudaD_vec_soft_max(Gr, Bl, v, dim);
1481 inline void cuda_vec_soft_max(
int Gr,
int Bl,
float* v,
int dim) {
1482 cudaF_vec_soft_max(Gr, Bl, v, dim);
1484 inline void cuda_vec_sum(
int Gr,
int Bl,
double* v,
double* value,
int dim,
1486 cudaD_vec_sum(Gr, Bl, v, value, dim, inc);
1488 inline void cuda_vec_sum(
int Gr,
int Bl,
float* v,
float* value,
int dim,
1490 cudaF_vec_sum(Gr, Bl, v, value, dim, inc);
1495 inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl,
const BaseFloat *src,
1498 cuda_compress_uint8_sign(Gr, Bl, src, dim, dest, dest_stride);
1502 template <
typename I>
1503 inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl,
const BaseFloat *src,
1506 KALDI_ERR <<
"Not implemented for this type.";
1509 inline void cuda_mat_compress(dim3 Gr, dim3 Bl,
const BaseFloat *src,
1511 int dest_stride,
float inv_scale,
1512 bool bounds_check) {
1513 cuda_compress_int16(Gr, Bl, src, dim, dest, dest_stride,
1514 inv_scale, bounds_check);
1516 inline void cuda_mat_compress(dim3 Gr, dim3 Bl,
const BaseFloat *src,
1518 int dest_stride,
float inv_scale,
1519 bool bounds_check) {
1520 cuda_compress_uint16(Gr, Bl, src, dim, dest, dest_stride,
1521 inv_scale, bounds_check);
1523 inline void cuda_mat_compress(dim3 Gr, dim3 Bl,
const BaseFloat *src,
1525 int dest_stride,
float inv_scale,
1526 bool bounds_check) {
1527 cuda_compress_uint8(Gr, Bl, src, dim, dest, dest_stride,
1528 inv_scale, bounds_check);
1530 inline void cuda_mat_compress(dim3 Gr, dim3 Bl,
const BaseFloat *src,
1532 int dest_stride,
float inv_scale,
1533 bool bounds_check) {
1534 cuda_compress_int8(Gr, Bl, src, dim, dest, dest_stride,
1535 inv_scale, bounds_check);
1538 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl,
BaseFloat *dest,
1540 int src_stride,
float scale) {
1541 cuda_uncompress_int8(Gr, Bl, dest, dim, src, src_stride, scale);
1543 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl,
BaseFloat *dest,
1545 int src_stride,
float scale) {
1546 cuda_uncompress_uint8(Gr, Bl, dest, dim, src, src_stride, scale);
1548 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl,
BaseFloat *dest,
1550 int src_stride,
float scale) {
1551 cuda_uncompress_int16(Gr, Bl, dest, dim, src, src_stride, scale);
1553 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl,
BaseFloat *dest,
1555 int src_stride,
float scale) {
1556 cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale);
1559 inline void cuda_mat_copy_range_clamped(
1560 int32_t row_start, int32_t row_end, int32_t num_cols,
1561 const double *src, int32_t lds,
1562 int32_t clamp_low, int32_t clamp_high,
1563 double *dst, int32_t ldd) {
1564 cudaD_mat_copy_range_clamped(row_start, row_end, num_cols,
1565 src, lds, clamp_low, clamp_high, dst, ldd);
1568 inline void cuda_mat_copy_range_clamped(
1569 int32_t row_start, int32_t row_end, int32_t num_cols,
1570 const float *src, int32_t lds,
1571 int32_t clamp_low, int32_t clamp_high,
1572 float *dst, int32_t ldd) {
1573 cudaF_mat_copy_range_clamped(row_start, row_end, num_cols,
1574 src, lds, clamp_low, clamp_high, dst, ldd);
1577 inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
1578 int32_t *num_cols,
const float **inputs, int32_t *ldi,
float **outputs,
1580 cudaF_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi,
1584 inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
1585 int32_t *num_cols,
const double **inputs, int32_t *ldi,
double **outputs,
1587 cudaD_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi,
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Structure containing size of the matrix plus stride.
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
int32_t MatrixIndexT_cuda