cu-kernels.h
Go to the documentation of this file.
1 // cudamatrix/cu-kernels.h
2 
3 // Copyright 2009-2012 Karel Vesely
4 // 2013 Ehsan Variani
5 // 2014 Johns Hopkins University (author: Daniel Povey)
6 // 2013 Hainan Xu
7 // 2013 Xiaohui Zhang
8 // 2013-2015 Guoguo Chen
9 // 2016-2018 Shiyin Kang
10 // 2019 Yiwen Shao
11 
12 // See ../../COPYING for clarification regarding multiple authors
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License");
15 // you may not use this file except in compliance with the License.
16 // You may obtain a copy of the License at
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
21 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
22 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
23 // MERCHANTABLITY OR NON-INFRINGEMENT.
24 // See the Apache 2 License for the specific language governing permissions and
25 // limitations under the License.
26 
27 #ifndef KALDI_CUDAMATRIX_CU_KERNELS_H_
28 #define KALDI_CUDAMATRIX_CU_KERNELS_H_
29 
30 #if HAVE_CUDA == 1
31 
32 #include "base/kaldi-error.h"
34 
35 /*
36  * In this file are C++ templated wrappers
37  * of the ANSI-C CUDA kernels
38  */
39 
40 namespace kaldi {
41 
42 inline void cuda_add_row_sum_mat(int Gr, int Bl, double* result,
43  const double* mat, const MatrixDim d,
44  const double alpha, const double beta) {
45  cudaD_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
46 }
47 inline void cuda_add_row_sum_mat(int Gr, int Bl, float* result,
48  const float* mat, const MatrixDim d,
49  const float alpha, const float beta) {
50  cudaF_add_row_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
51 }
52 inline void cuda_add_col_sum_mat(int Gr, int Bl, double* result,
53  const double* mat, const MatrixDim d,
54  const double alpha, const double beta) {
55  cudaD_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
56 }
57 inline void cuda_add_col_sum_mat(int Gr, int Bl, float* result,
58  const float* mat, const MatrixDim d,
59  const float alpha, const float beta) {
60  cudaF_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
61 }
62 inline void cuda_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
63  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
64  int src_stride) {
65  cudaD_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
66 }
67 inline void cuda_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
68  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
69  int src_stride) {
70  cudaF_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
71 }
72 inline void cuda_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha,
73  const double* M, const int stride_M,
74  const double* N, const MatrixDim dim_N,
75  const double beta, double* v) {
76  cudaD_add_diag_mat_mat_MN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v);
77 }
78 inline void cuda_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha,
79  const float* M, const int stride_M,
80  const float* N, const MatrixDim dim_N,
81  const float beta, float* v) {
82  cudaF_add_diag_mat_mat_MN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v);
83 }
84 inline void cuda_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha,
85  const double* M, const MatrixDim dim_M,
86  const double* N, const int stride_N,
87  const double beta, double* v) {
88  cudaD_add_diag_mat_mat_MNT(Gr, Bl, alpha, M, dim_M, N, stride_N, beta, v);
89 }
90 inline void cuda_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha,
91  const float* M, const MatrixDim dim_M,
92  const float* N, const int stride_N,
93  const float beta, float* v) {
94  cudaF_add_diag_mat_mat_MNT(Gr, Bl, alpha, M, dim_M, N, stride_N, beta, v);
95 }
96 inline void cuda_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha,
97  const double* M, const int stride_M,
98  const double* N, const MatrixDim dim_N,
99  const double beta, double* v,
100  const int stride_v) {
101  cudaD_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v,
102  stride_v);
103 }
104 inline void cuda_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha,
105  const float* M, const int stride_M,
106  const float* N, const MatrixDim dim_N,
107  const float beta, float* v,
108  const int stride_v) {
109  cudaF_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v,
110  stride_v);
111 }
112 inline void cuda_add_diag_packed(int Gr, int Bl, double* mat, double value,
113  int dim) {
114  cudaD_add_diag_packed(Gr, Bl, mat, value, dim);
115 }
116 inline void cuda_add_diag_packed(int Gr, int Bl, float* mat, float value,
117  int dim) {
118  cudaF_add_diag_packed(Gr, Bl, mat, value, dim);
119 }
120 inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat,
121  MatrixDim mat_dim, const double *vec,
122  const double *mat2, int mat2_row_stride,
123  int mat2_col_stride, double beta) {
124  cudaD_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
125  mat2_row_stride, mat2_col_stride, beta);
126 }
127 inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl, float alpha, float *mat,
128  MatrixDim mat_dim, const float *vec,
129  const float *mat2, int mat2_row_stride,
130  int mat2_col_stride, float beta) {
131  cudaF_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
132  mat2_row_stride, mat2_col_stride, beta);
133 }
134 inline void cuda_add(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) {
135  cudaD_add(Gr, Bl, mat, value, d);
136 }
137 inline void cuda_add(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) {
138  cudaF_add(Gr, Bl, mat, value, d);
139 }
140 inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d,
141  const double *Adata, int A_num_rows,
142  int A_num_cols, int A_row_stride,
143  int A_col_stride,
144  const CuBlockMatrixData *B_cu_data,
145  int B_num_blocks, double alpha, double beta,
146  int B_trans) {
147  cudaD_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols,
148  A_row_stride, A_col_stride, B_cu_data, B_num_blocks,
149  alpha, beta, B_trans);
150 }
151 inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl, float *data, MatrixDim d,
152  const float *Adata, int A_num_rows,
153  int A_num_cols, int A_row_stride,
154  int A_col_stride,
155  const CuBlockMatrixData *B_cu_data,
156  int B_num_blocks, float alpha, float beta,
157  int B_trans) {
158  cudaF_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols,
159  A_row_stride, A_col_stride, B_cu_data, B_num_blocks,
160  alpha, beta, B_trans);
161 }
162 inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha,
163  const double *src, int32_cuda num_row_blocks,
164  int32_cuda num_col_blocks, double *dst,
165  MatrixDim d, int src_stride, int A_trans) {
166  cudaD_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
167  d, src_stride, A_trans);
168 }
169 inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src,
170  int32_cuda num_row_blocks,
171  int32_cuda num_col_blocks, float *dst,
172  MatrixDim d, int src_stride, int A_trans) {
173  cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst,
174  d, src_stride, A_trans);
175 }
176 inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha,
177  const double *src, MatrixDim src_dim,
178  double *dst, MatrixDim dst_dim) {
179  cudaD_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
180 }
181 inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
182  const float *src, MatrixDim src_dim,
183  float *dst, MatrixDim dst_dim) {
184  cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
185 }
186 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
187  MatrixDim mat_dim, const double *mat2,
188  int mat2_row_stride, int mat2_col_stride,
189  const double *vec, double beta) {
190  cudaD_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride,
191  mat2_col_stride, vec, beta);
192 }
193 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, float alpha, float *mat,
194  MatrixDim mat_dim, const float *mat2,
195  int mat2_row_stride, int mat2_col_stride,
196  const float *vec, float beta) {
197  cudaF_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride,
198  mat2_col_stride, vec, beta);
199 }
200 inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src,
201  double *dst, MatrixDim d, int src_stride,
202  int A_trans) {
203  cudaD_add_mat(Gr, Bl, alpha, src, dst, d, src_stride, A_trans);
204 }
205 inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src,
206  float *dst, MatrixDim d, int src_stride, int A_trans) {
207  cudaF_add_mat(Gr, Bl, alpha, src, dst, d, src_stride, A_trans);
208 }
209 inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, double *data,
210  const double *srcA_data,
211  const double *srcB_data, MatrixDim dim,
212  int srcA_stride, int srcB_stride,
213  double alpha, double beta) {
214  cudaD_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim,
215  srcA_stride, srcB_stride, alpha, beta);
216 }
217 inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data,
218  const float *srcA_data,
219  const float *srcB_data, MatrixDim dim,
220  int srcA_stride, int srcB_stride,
221  float alpha, float beta) {
222  cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim,
223  srcA_stride, srcB_stride, alpha, beta);
224 }
225 inline void cuda_add_row_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
226  const double *src_data, MatrixDim src_dim,
227  const Int32Pair *indexes) {
228  cudaD_add_row_ranges(Gr, Bl, data, dim, src_data, src_dim, indexes);
229 }
230 inline void cuda_add_row_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
231  const float *src_data, MatrixDim src_dim,
232  const Int32Pair *indexes) {
233  cudaF_add_row_ranges(Gr, Bl, data, dim, src_data, src_dim, indexes);
234 }
235 inline void cuda_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst,
236  const double* const * src, MatrixDim dst_dim) {
237  cudaD_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim);
238 }
239 inline void cuda_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst,
240  const float* const * src, MatrixDim dst_dim) {
241  cudaF_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim);
242 }
243 inline void cuda_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst,
244  const double* src, const MatrixIndexT_cuda* reorder,
245  MatrixDim dst_dim, int src_stride) {
246  cudaD_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride);
247 }
248 inline void cuda_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst,
249  const float* src, const MatrixIndexT_cuda* reorder,
250  MatrixDim dst_dim, int src_stride) {
251  cudaF_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride);
252 }
253 inline void cuda_mul_rows(dim3 Gr, dim3 Bl, double* dst,
254  const double* src, const MatrixIndexT_cuda* reorder,
255  MatrixDim dst_dim, int src_stride) {
256  cudaD_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
257 }
258 inline void cuda_mul_rows(dim3 Gr, dim3 Bl, float* dst,
259  const float* src, const MatrixIndexT_cuda* reorder,
260  MatrixDim dst_dim, int src_stride) {
261  cudaF_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
262 }
263 inline void cuda_add_smat(dim3 Gr, dim3 Bl, double* mat, MatrixDim mat_dim,
264  double alpha, const int* smat_row_ptr,
265  const int* smat_col_idx, const double* smat_val) {
266  cudaD_add_smat(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
267  smat_val);
268 }
269 inline void cuda_add_smat(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
270  float alpha, const int* smat_row_ptr,
271  const int* smat_col_idx, const float* smat_val) {
272  cudaF_add_smat(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
273  smat_val);
274 }
275 inline void cuda_add_smat_trans(dim3 Gr, dim3 Bl, double* mat,
276  MatrixDim mat_dim, double alpha,
277  const int* smat_row_ptr,
278  const int* smat_col_idx,
279  const double* smat_val) {
280  cudaD_add_smat_trans(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
281  smat_val);
282 }
283 inline void cuda_add_smat_trans(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
284  float alpha, const int* smat_row_ptr,
285  const int* smat_col_idx,
286  const float* smat_val) {
287  cudaF_add_smat_trans(Gr, Bl, mat, mat_dim, alpha, smat_row_ptr, smat_col_idx,
288  smat_val);
289 }
290 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl, double alpha,
291  double* const * dst, const double* src,
292  MatrixDim src_dim) {
293  cudaD_add_to_rows_direct(Gr, Bl, alpha, dst, src, src_dim);
294 }
295 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl, float alpha, float* const * dst,
296  const float* src, MatrixDim src_dim) {
297  cudaF_add_to_rows_direct(Gr, Bl, alpha, dst, src, src_dim);
298 }
299 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl, double alpha,
300  double* dst, const double* src,
301  const MatrixIndexT_cuda* reorder,
302  MatrixDim src_dim, int dst_stride) {
303  cudaD_add_to_rows(Gr, Bl, alpha, dst, src, reorder, src_dim, dst_stride);
304 }
305 inline void cuda_add_to_rows(dim3 Gr, dim3 Bl, float alpha,
306  float* dst, const float* src,
307  const MatrixIndexT_cuda* reorder,
308  MatrixDim src_dim, int dst_stride) {
309  cudaF_add_to_rows(Gr, Bl, alpha, dst, src, reorder, src_dim, dst_stride);
310 }
311 inline void cuda_add_vec2(dim3 Gr, dim3 Bl, double *mat, const double *vec,
312  const double alpha, int dim) {
313  cudaD_add_vec2(Gr, Bl, mat, vec, alpha, dim);
314 }
315 inline void cuda_add_vec2(dim3 Gr, dim3 Bl, float *mat, const float *vec,
316  const float alpha, int dim) {
317  cudaF_add_vec2(Gr, Bl, mat, vec, alpha, dim);
318 }
319 inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha,
320  const double *col, double beta, double *dst,
321  MatrixDim d) {
322  cudaD_add_vec_to_cols(Gr, Bl, alpha, col, beta, dst, d);
323 }
324 inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha,
325  const float *col, float beta, float *dst,
326  MatrixDim d) {
327  cudaF_add_vec_to_cols(Gr, Bl, alpha, col, beta, dst, d);
328 }
329 inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha,
330  const double *row, double beta, double *dst,
331  MatrixDim d) {
332  cudaD_add_vec_to_rows(Gr, Bl, alpha, row, beta, dst, d);
333 }
334 inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha,
335  const float *row, float beta, float *dst,
336  MatrixDim d) {
337  cudaF_add_vec_to_rows(Gr, Bl, alpha, row, beta, dst, d);
338 }
339 inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v,
340  const double* x, const double* y, double beta,
341  int dim) {
342  cudaD_add_vec_vec(Gr, Bl, alpha, v, x, y, beta, dim);
343 }
344 inline void cuda_add_vec_vec(int Gr, int Bl, float alpha, float* v,
345  const float* x, const float* y, float beta,
346  int dim) {
347  cudaF_add_vec_vec(Gr, Bl, alpha, v, x, y, beta, dim);
348 }
349 inline cublasStatus_t cuda_axpy(cublasHandle_t handle, int n, double alpha,
350  const double *x, int incx, double *y,
351  int incy) {
352  return cublasDaxpy_v2(handle, n, &alpha, x, incx, y, incy);
353 }
354 inline cublasStatus_t cuda_axpy(cublasHandle_t handle, int n, float alpha,
355  const float *x, int incx, float *y, int incy) {
356  return cublasSaxpy_v2(handle, n, &alpha, x, incx, y, incy);
357 }
358 inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl,
359  CuBlockMatrixData *B_cu_data, int num_blocks,
360  const double *C_data, int C_num_cols,
361  int C_row_stride, int C_col_stride,
362  const double *D_data, int D_row_stride,
363  int D_col_stride, double alpha,
364  double beta) {
365  cudaD_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
366  C_row_stride, C_col_stride, D_data, D_row_stride,
367  D_col_stride, alpha, beta);
368 }
369 inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl,
370  CuBlockMatrixData *B_cu_data, int num_blocks,
371  const float *C_data, int C_num_cols,
372  int C_row_stride, int C_col_stride,
373  const float *D_data, int D_row_stride,
374  int D_col_stride, float alpha, float beta) {
375  cudaF_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
376  C_row_stride, C_col_stride, D_data, D_row_stride,
377  D_col_stride, alpha, beta);
378 }
379 inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y,
380  const double *x1, const double *x2,
381  MatrixDim y_dim, int x1_stride,
382  int x2_stride, int group_size) {
383  cudaD_calc_group_max_deriv(Gr, Bl, y, x1, x2, y_dim, x1_stride, x2_stride,
384  group_size);
385 }
386 inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y,
387  const float *x1, const float *x2,
388  MatrixDim y_dim, int x1_stride,
389  int x2_stride, int group_size) {
390  cudaF_calc_group_max_deriv(Gr, Bl, y, x1, x2, y_dim, x1_stride, x2_stride,
391  group_size);
392 }
393 inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<double>* x,
394  int32 size, const double* z, MatrixDim d,
395  double* z2, MatrixDim d2, double* t) {
396  cudaD_comp_obj_deriv(Gr, Bl, x, size, z, d, z2, d2, t);
397 }
398 inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<float>* x,
399  int32 size, const float* z, MatrixDim d,
400  float* z2, MatrixDim d2, float* t) {
401  cudaF_comp_obj_deriv(Gr, Bl, x, size, z, d, z2, d2, t);
402 }
403 inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col,
404  const double* mat, MatrixDim dmat,
405  int dim) {
406  cudaD_copy_col_from_mat_df(Gr, Bl, v, col, mat, dmat, dim);
407 }
408 inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col,
409  const float* mat, MatrixDim dmat,
410  int dim) {
411  cudaF_copy_col_from_mat_df(Gr, Bl, v, col, mat, dmat, dim);
412 }
413 inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col,
414  const double* mat, MatrixDim dmat,
415  int dim) {
416  cudaD_copy_col_from_mat_fd(Gr, Bl, v, col, mat, dmat, dim);
417 }
418 inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col,
419  const float* mat, MatrixDim dmat,
420  int dim) {
421  cudaF_copy_col_from_mat_fd(Gr, Bl, v, col, mat, dmat, dim);
422 }
423 inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
424  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
425  int src_stride) {
426  cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
427 }
428 inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
429  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
430  int src_stride) {
431  cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
432 }
433 inline void cuda_copy_cols_from_vec(dim3 Gr, dim3 Bl, double *mat_out,
434  MatrixDim d_out, const double *v_in) {
435  cudaD_copy_cols_from_vec(Gr, Bl, mat_out, d_out, v_in);
436 }
437 inline void cuda_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out,
438  MatrixDim d_out, const float *v_in) {
439  cudaF_copy_cols_from_vec(Gr, Bl, mat_out, d_out, v_in);
440 }
441 inline void cuda_copy(dim3 Gr, dim3 Bl, double *y, const double *x,
442  const int32_cuda *copy_from, MatrixDim d_out,
443  MatrixDim d_in) {
444  cudaD_copy(Gr, Bl, y, x, copy_from, d_out, d_in);
445 }
446 inline void cuda_copy(dim3 Gr, dim3 Bl, float *y, const float *x,
447  const int32_cuda *copy_from, MatrixDim d_out,
448  MatrixDim d_in) {
449  cudaF_copy(Gr, Bl, y, x, copy_from, d_out, d_in);
450 }
451 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, double* mat_out,
452  const double* mat_in, MatrixDim d_out,
453  MatrixDim d_in) {
454  cuda_copy_from_mat_dd(Gr, Bl, mat_out, mat_in, d_out, d_in);
455 }
456 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, double* mat_out,
457  const float* mat_in, MatrixDim d_out,
458  MatrixDim d_in) {
459  cuda_copy_from_mat_df(Gr, Bl, mat_out, mat_in, d_out, d_in);
460 }
461 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, float* mat_out,
462  const double* mat_in, MatrixDim d_out,
463  MatrixDim d_in) {
464  cuda_copy_from_mat_fd(Gr, Bl, mat_out, mat_in, d_out, d_in);
465 }
466 inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, float* mat_out,
467  const float* mat_in, MatrixDim d_out,
468  MatrixDim d_in) {
469  cuda_copy_from_mat_ff(Gr, Bl, mat_out, mat_in, d_out, d_in);
470 }
471 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, double* mat_out,
472  const double* mat_in, MatrixDim d_out,
473  MatrixDim d_in) {
474  cuda_copy_from_mat_dd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
475 }
476 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, double* mat_out,
477  const float* mat_in, MatrixDim d_out,
478  MatrixDim d_in) {
479  cuda_copy_from_mat_df_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
480 }
481 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, float* mat_out,
482  const double* mat_in, MatrixDim d_out,
483  MatrixDim d_in) {
484  cuda_copy_from_mat_fd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
485 }
486 inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, float* mat_out,
487  const float* mat_in, MatrixDim d_out,
488  MatrixDim d_in) {
489  cuda_copy_from_mat_ff_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
490 }
491 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, double* mat,
492  MatrixDim mat_dim, const int* smat_row_ptr,
493  const int* smat_col_idx,
494  const double* smat_val) {
495  cuda_copy_from_smat_dd(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
496  smat_val);
497 }
498 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, double* mat,
499  MatrixDim mat_dim, const int* smat_row_ptr,
500  const int* smat_col_idx,
501  const float* smat_val) {
502  cuda_copy_from_smat_df(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
503  smat_val);
504 }
505 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
506  const int* smat_row_ptr,
507  const int* smat_col_idx,
508  const double* smat_val) {
509  cuda_copy_from_smat_fd(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
510  smat_val);
511 }
512 inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
513  const int* smat_row_ptr,
514  const int* smat_col_idx,
515  const float* smat_val) {
516  cuda_copy_from_smat_ff(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
517  smat_val);
518 }
519 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, double* mat,
520  MatrixDim mat_dim,
521  const int* smat_row_ptr,
522  const int* smat_col_idx,
523  const double* smat_val) {
524  cuda_copy_from_smat_dd_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
525  smat_val);
526 }
527 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, double* mat,
528  MatrixDim mat_dim,
529  const int* smat_row_ptr,
530  const int* smat_col_idx,
531  const float* smat_val) {
532  cuda_copy_from_smat_df_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
533  smat_val);
534 }
535 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, float* mat,
536  MatrixDim mat_dim,
537  const int* smat_row_ptr,
538  const int* smat_col_idx,
539  const double* smat_val) {
540  cuda_copy_from_smat_fd_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
541  smat_val);
542 }
543 inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, float* mat,
544  MatrixDim mat_dim,
545  const int* smat_row_ptr,
546  const int* smat_col_idx,
547  const float* smat_val) {
548  cuda_copy_from_smat_ff_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
549  smat_val);
550 }
551 inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl, const double* x, double* y,
552  MatrixDim d_out) {
553  cudaD_copy_from_sp(Gr, Bl, x, y, d_out);
554 }
555 inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl, const float* x, float* y,
556  MatrixDim d_out) {
557  cudaF_copy_from_sp(Gr, Bl, x, y, d_out);
558 }
559 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const double* B,
560  MatrixDim dmat) {
561  cudaD_copy_from_tp(Gr, Bl, A, B, dmat);
562 }
563 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B,
564  MatrixDim dmat) {
565  cudaDF_copy_from_tp(Gr, Bl, A, B, dmat);
566 }
567 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const double* B,
568  MatrixDim dmat) {
569  cudaFD_copy_from_tp(Gr, Bl, A, B, dmat);
570 }
571 inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const float* B,
572  MatrixDim dmat) {
573  cudaF_copy_from_tp(Gr, Bl, A, B, dmat);
574 }
575 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A,
576  const double* B, MatrixDim dmat) {
577  cudaD_copy_from_tp_trans(Gr, Bl, A, B, dmat);
578 }
579 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const float* B,
580  MatrixDim dmat) {
581  cudaDF_copy_from_tp_trans(Gr, Bl, A, B, dmat);
582 }
583 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, float* A, const double* B,
584  MatrixDim dmat) {
585  cudaFD_copy_from_tp_trans(Gr, Bl, A, B, dmat);
586 }
587 inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, float* A, const float* B,
588  MatrixDim dmat) {
589  cudaF_copy_from_tp_trans(Gr, Bl, A, B, dmat);
590 }
591 inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) {
592  cudaD_copy_low_upp(Gr, Bl, A, dimA);
593 }
594 inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) {
595  cudaF_copy_low_upp(Gr, Bl, A, dimA);
596 }
597 inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst,
598  const double* const * src, MatrixDim dst_dim) {
599  cudaD_copy_rows_direct(Gr, Bl, dst, src, dst_dim);
600 }
601 inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src,
602  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
603  int src_stride) {
604  cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
605 }
606 inline void cuda_copy_rows(dim3 Gr, dim3 Bl, float* dst,
607  const float* const * src, MatrixDim dst_dim) {
608  cudaF_copy_rows_direct(Gr, Bl, dst, src, dst_dim);
609 }
610 inline void cuda_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src,
611  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
612  int src_stride) {
613  cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
614 }
615 inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out,
616  MatrixDim d_out, const double *v_in) {
617  cudaD_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
618 }
619 inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out,
620  MatrixDim d_out, const float *v_in) {
621  cudaF_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
622 }
623 inline void cuda_copy_to_rows(dim3 Gr, dim3 Bl, double* const * dst,
624  const double* src, MatrixDim src_dim) {
625  cudaD_copy_to_rows_direct(Gr, Bl, dst, src, src_dim);
626 }
627 inline void cuda_copy_to_rows(dim3 Gr, dim3 Bl, float* const * dst,
628  const float* src, MatrixDim src_dim) {
629  cudaF_copy_to_rows_direct(Gr, Bl, dst, src, src_dim);
630 }
631 inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) {
632  cudaD_copy_upp_low(Gr, Bl, A, dimA);
633 }
634 inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) {
635  cudaF_copy_upp_low(Gr, Bl, A, dimA);
636 }
637 inline void cuda_diff_group_pnorm(dim3 Gr, dim3 Bl, double *id,
638  const double *iv, const double *ov,
639  const double* od, MatrixDim id_dim,
640  int iv_stride, int ov_stride, int od_stride,
641  int group_size, double power) {
642  cudaD_diff_group_pnorm(Gr, Bl, id, iv, ov, od, id_dim, iv_stride, ov_stride,
643  od_stride, group_size, power);
644 }
645 inline void cuda_diff_group_pnorm(dim3 Gr, dim3 Bl, float *id, const float *iv,
646  const float *ov, const float* od,
647  MatrixDim id_dim, int iv_stride,
648  int ov_stride, int od_stride, int group_size,
649  float power) {
650  cudaF_diff_group_pnorm(Gr, Bl, id, iv, ov, od, id_dim, iv_stride, ov_stride,
651  od_stride, group_size, power);
652 }
653 inline void cuda_diff_log_softmax(dim3 Gr, dim3 Bl,
654  const MatrixDim in_deriv_dim,
655  const double* out_value,
656  const int out_value_stride,
657  const double* out_deriv,
658  const int out_deriv_stride,
659  double* in_deriv) {
660  cudaD_diff_log_softmax(Gr, Bl, in_deriv_dim, out_value, out_value_stride,
661  out_deriv, out_deriv_stride, in_deriv);
662 }
663 inline void cuda_diff_log_softmax(dim3 Gr, dim3 Bl,
664  const MatrixDim in_deriv_dim,
665  const float* out_value,
666  const int out_value_stride,
667  const float* out_deriv,
668  const int out_deriv_stride, float* in_deriv) {
669  cudaF_diff_log_softmax(Gr, Bl, in_deriv_dim, out_value, out_value_stride,
670  out_deriv, out_deriv_stride, in_deriv);
671 }
672 inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim,
673  const int have_dropout_mask,
674  const int num_rows, const double* input,
675  const int input_stride,
676  const double* params,
677  const int params_stride,
678  const double* output_deriv,
679  const int output_deriv_stride,
680  const double* deriv_sum_in,
681  const int deriv_sum_in_stride,
682  const double* self_repair_config,
683  double count, double* input_deriv,
684  const int input_deriv_stride,
685  double* params_deriv,
686  const int params_deriv_stride,
687  double* value_sum_out,
688  const int value_sum_out_stride,
689  double* deriv_sum_out,
690  const int deriv_sum_out_stride,
691  double* self_repair_sum_out,
692  const int self_repair_sum_out_stride) {
693  cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows,
694  input, input_stride,
695  params, params_stride, output_deriv,
696  output_deriv_stride, deriv_sum_in,
697  deriv_sum_in_stride, self_repair_config, count,
698  input_deriv, input_deriv_stride, params_deriv,
699  params_deriv_stride, value_sum_out,
700  value_sum_out_stride, deriv_sum_out,
701  deriv_sum_out_stride, self_repair_sum_out,
702  self_repair_sum_out_stride);
703 }
704 inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim,
705  const int have_dropout_mask,
706  const int num_rows, const float* input,
707  const int input_stride,
708  const float* params,
709  const int params_stride,
710  const float* output_deriv,
711  const int output_deriv_stride,
712  const double* deriv_sum_in,
713  const int deriv_sum_in_stride,
714  const float* self_repair_config,
715  double count, float* input_deriv,
716  const int input_deriv_stride,
717  float* params_deriv,
718  const int params_deriv_stride,
719  double* value_sum_out,
720  const int value_sum_out_stride,
721  double* deriv_sum_out,
722  const int deriv_sum_out_stride,
723  float* self_repair_sum_out,
724  const int self_repair_sum_out_stride) {
725  cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask,
726  num_rows, input, input_stride,
727  params, params_stride, output_deriv,
728  output_deriv_stride, deriv_sum_in,
729  deriv_sum_in_stride, self_repair_config, count,
730  input_deriv, input_deriv_stride, params_deriv,
731  params_deriv_stride, value_sum_out,
732  value_sum_out_stride, deriv_sum_out,
733  deriv_sum_out_stride, self_repair_sum_out,
734  self_repair_sum_out_stride);
735 }
736 inline void cuda_diff_normalize_per_row(size_t Gr, size_t Bl, double *id,
737  int id_stride, const double *iv,
738  MatrixDim iv_dim, const double* od,
739  int od_stride, double target_rms,
740  bool add_log_stddev) {
741  cudaD_diff_normalize_per_row(Gr, Bl, id, id_stride, iv, iv_dim, od, od_stride,
742  target_rms, add_log_stddev);
743 }
744 inline void cuda_diff_normalize_per_row(size_t Gr, size_t Bl, float *id,
745  int id_stride, const float *iv,
746  MatrixDim iv_dim, const float* od,
747  int od_stride, float target_rms,
748  bool add_log_stddev) {
749  cudaF_diff_normalize_per_row(Gr, Bl, id, id_stride, iv, iv_dim, od, od_stride,
750  target_rms, add_log_stddev);
751 }
752 inline void cuda_diff_parametric_relu(dim3 Gr, dim3 Bl, double *eout,
753  const double *e, const double *y,
754  MatrixDim d, int e_stride, int y_stride,
755  const double *a, const double *b) {
756  cudaD_diff_parametric_relu(Gr, Bl, eout, e, y, d, e_stride, y_stride, a, b);
757 }
758 inline void cuda_diff_parametric_relu(dim3 Gr, dim3 Bl, float *eout,
759  const float *e, const float *y,
760  MatrixDim d, int e_stride, int y_stride,
761  const float *a, const float *b) {
762  cudaF_diff_parametric_relu(Gr, Bl, eout, e, y, d, e_stride, y_stride, a, b);
763 }
764 inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e,
765  const double *y, MatrixDim d, int e_stride,
766  int y_stride) {
767  cudaD_diff_sigmoid(Gr, Bl, eout, e, y, d, e_stride, y_stride);
768 }
769 inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e,
770  const float *y, MatrixDim d, int e_stride,
771  int y_stride) {
772  cudaF_diff_sigmoid(Gr, Bl, eout, e, y, d, e_stride, y_stride);
773 }
774 inline void cuda_diff_softmax(dim3 Gr, dim3 Bl, double* x, const MatrixDim dim,
775  const double* value, const int value_stride,
776  const double* diff, const int diff_stride) {
777  cudaD_diff_softmax(Gr, Bl, x, dim, value, value_stride, diff, diff_stride);
778 }
779 inline void cuda_diff_softmax(dim3 Gr, dim3 Bl, float* x, const MatrixDim dim,
780  const float* value, const int value_stride,
781  const float* diff, const int diff_stride) {
782  cudaF_diff_softmax(Gr, Bl, x, dim, value, value_stride, diff, diff_stride);
783 }
784 inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e,
785  const double *y, MatrixDim d, int e_stride,
786  int y_stride) {
787  cudaD_diff_tanh(Gr, Bl, eout, e, y, d, e_stride, y_stride);
788 }
789 inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e,
790  const float *y, MatrixDim d, int e_stride,
791  int y_stride) {
792  cudaF_diff_tanh(Gr, Bl, eout, e, y, d, e_stride, y_stride);
793 }
794 inline void cuda_ensure_nonzero(dim3 Gr, dim3 Bl, const double *x, MatrixDim d,
795  double epsilon, int y_stride, double *y) {
796  cudaD_ensure_nonzero(Gr, Bl, x, d, epsilon, y_stride, y);
797 }
798 inline void cuda_ensure_nonzero(dim3 Gr, dim3 Bl, const float *x, MatrixDim d,
799  float epsilon, int y_stride, float *y) {
800  cudaF_ensure_nonzero(Gr, Bl, x, d, epsilon, y_stride, y);
801 }
802 inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt,
803  double *mat_net_out, double *vec_log_post,
804  MatrixDim d) {
805  cudaD_diff_xent(Gr, Bl, vec_tgt, mat_net_out, vec_log_post, d);
806 }
807 inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt,
808  float *mat_net_out, float *vec_log_post,
809  MatrixDim d) {
810  cudaF_diff_xent(Gr, Bl, vec_tgt, mat_net_out, vec_log_post, d);
811 }
812 inline void cuda_div_elements(dim3 Gr, dim3 Bl, double *mat, const double *A,
813  MatrixDim dst_d, int src_stride) {
814  cudaD_div_elements(Gr, Bl, mat, A, dst_d, src_stride);
815 }
816 inline void cuda_div_elements(dim3 Gr, dim3 Bl, float *mat, const float *A,
817  MatrixDim dst_d, int src_stride) {
818  cudaF_div_elements(Gr, Bl, mat, A, dst_d, src_stride);
819 }
820 inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, double *mat,
821  const double *vec_div, MatrixDim d) {
822  cudaD_div_rows_vec(Gr, Bl, mat, vec_div, d);
823 }
824 inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, float *mat,
825  const float *vec_div, MatrixDim d) {
826  cudaF_div_rows_vec(Gr, Bl, mat, vec_div, d);
827 }
828 inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1,
829  const double *mat2, double *mask,
830  MatrixDim mat1_dim, int mat2_stride,
831  int mask_stride) {
832  cudaD_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride,
833  mask_stride);
834 }
835 inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1,
836  const float *mat2, float *mask,
837  MatrixDim mat1_dim, int mat2_stride,
838  int mask_stride) {
839  cudaF_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride,
840  mask_stride);
841 }
842 inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat,
843  double *vec_val, int32_cuda *vec_id,
844  MatrixDim d) {
845  cudaD_find_row_max_id(Gr, Bl, mat, vec_val, vec_id, d);
846 }
847 inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat,
848  float *vec_val, int32_cuda *vec_id,
849  MatrixDim d) {
850  cudaF_find_row_max_id(Gr, Bl, mat, vec_val, vec_id, d);
851 }
852 inline void cuda_group_max(dim3 Gr, dim3 Bl, double *y, const double *x,
853  MatrixDim d, int src_stride, int group_size) {
854  cudaD_group_max(Gr, Bl, y, x, d, src_stride, group_size);
855 }
856 inline void cuda_group_max(dim3 Gr, dim3 Bl, float *y, const float *x,
857  MatrixDim d, int src_stride, int group_size) {
858  cudaF_group_max(Gr, Bl, y, x, d, src_stride, group_size);
859 }
860 inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x,
861  MatrixDim d, int src_stride, int group_size,
862  double power) {
863  cudaD_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
864 }
865 inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x,
866  MatrixDim d, int src_stride, int group_size,
867  float power) {
868  cudaF_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
869 }
870 inline void cuda_group_spec_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x,
871  MatrixDim d, int src_stride, int group_size,
872  double power) {
873  cudaD_group_spec_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
874 }
875 inline void cuda_group_spec_pnorm(dim3 Gr, dim3 Bl, float* y, const float* x,
876  MatrixDim d, int src_stride, int group_size,
877  float power) {
878  cudaF_group_spec_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
879 }
880 inline void cuda_heaviside(dim3 Gr, dim3 Bl, double* y, const double* x,
881  MatrixDim d, int src_stride) {
882  cudaD_heaviside(Gr, Bl, y, x, d, src_stride);
883 }
884 inline void cuda_heaviside(dim3 Gr, dim3 Bl, float* y, const float* x,
885  MatrixDim d, int src_stride) {
886  cudaF_heaviside(Gr, Bl, y, x, d, src_stride);
887 }
888 inline void cuda_exp(dim3 Gr, dim3 Bl, double* y, const double* x,
889  MatrixDim d, int src_stride) {
890  cudaD_exp(Gr, Bl, y, x, d, src_stride);
891 }
892 inline void cuda_exp(dim3 Gr, dim3 Bl, float* y, const float* x,
893  MatrixDim d, int src_stride) {
894  cudaF_exp(Gr, Bl, y, x, d, src_stride);
895 }
896 inline void cuda_pow(dim3 Gr, dim3 Bl, double* y, const double* x, double power,
897  MatrixDim d, int src_stride) {
898  cudaD_pow(Gr, Bl, y, x, power, d, src_stride);
899 }
900 inline void cuda_pow(dim3 Gr, dim3 Bl, float* y, const float* x, float power,
901  MatrixDim d, int src_stride) {
902  cudaF_pow(Gr, Bl, y, x, power, d, src_stride);
903 }
904 inline void cuda_ceiling(dim3 Gr, dim3 Bl, double* y, const double* x, double ceiling_val,
905  MatrixDim dim, int src_stride) {
906  cudaD_ceiling(Gr, Bl, y, x, ceiling_val, dim, src_stride);
907 }
908 inline void cuda_ceiling(dim3 Gr, dim3 Bl, float* y, const float* x, float ceiling_val,
909  MatrixDim dim, int src_stride) {
910  cudaF_ceiling(Gr, Bl, y, x, ceiling_val, dim, src_stride);
911 }
912 inline void cuda_floor(dim3 Gr, dim3 Bl, double* y, const double* x, double floor_val,
913  MatrixDim dim, int src_stride) {
914  cudaD_floor(Gr, Bl, y, x, floor_val, dim, src_stride);
915 }
916 inline void cuda_floor(dim3 Gr, dim3 Bl, float* y, const float* x, float floor_val,
917  MatrixDim dim, int src_stride) {
918  cudaF_floor(Gr, Bl, y, x, floor_val, dim, src_stride);
919 }
920 inline void cuda_exp_limited(dim3 Gr, dim3 Bl, double* y, const double* x,
921  double lower_limit, double upper_limit, MatrixDim d, int src_stride) {
922  cudaD_exp_limited(Gr, Bl, y, x, lower_limit, upper_limit, d, src_stride);
923 }
924 inline void cuda_exp_limited(dim3 Gr, dim3 Bl, float* y, const float* x,
925  float lower_limit, float upper_limit, MatrixDim d, int src_stride) {
926  cudaF_exp_limited(Gr, Bl, y, x, lower_limit, upper_limit, d, src_stride);
927 }
928 inline void cuda_exp_special(dim3 Gr, dim3 Bl, double* y, const double* x,
929  MatrixDim d, int src_stride) {
930  cudaD_exp_special(Gr, Bl, y, x, d, src_stride);
931 }
932 inline void cuda_exp_special(dim3 Gr, dim3 Bl, float* y, const float* x,
933  MatrixDim d, int src_stride) {
934  cudaF_exp_special(Gr, Bl, y, x, d, src_stride);
935 }
936 inline void cuda_log(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) {
937  cudaD_log(Gr, Bl, y, x, d, src_stride);
938 }
939 inline void cuda_log(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) {
940  cudaF_log(Gr, Bl, y, x, d, src_stride);
941 }
942 inline void cuda_pow_abs(dim3 Gr, dim3 Bl, double* y, const double* x, double power,
943  bool include_sign, MatrixDim dim, int src_stride) {
944  cudaD_pow_abs(Gr, Bl, y, x, power, include_sign, dim, src_stride);
945 }
946 inline void cuda_pow_abs(dim3 Gr, dim3 Bl, float* y, const float* x, float power,
947  bool include_sign, MatrixDim dim, int src_stride) {
948  cudaF_pow_abs(Gr, Bl, y, x, power, include_sign, dim, src_stride);
949 }
950 inline void cuda_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d) {
951  cudaD_invert_elements(Gr, Bl, data, d);
952 }
953 inline void cuda_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d) {
954  cudaF_invert_elements(Gr, Bl, data, d);
955 }
956 inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, double *y,
957  const double *x, MatrixDim y_dim,
958  int x_stride) {
959  cudaD_log_softmax_reduce(Gr, Bl, y, x, y_dim, x_stride);
960 }
961 inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, float *y,
962  const float *x, MatrixDim y_dim,
963  int x_stride) {
964  cudaF_log_softmax_reduce(Gr, Bl, y, x, y_dim, x_stride);
965 }
966 inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in,
967  const int in_stride, const double* params,
968  const int params_stride,
969  const int out_stride, const int cell_dim,
970  const int have_dropout_mask,
971  const int num_rows, double* out) {
972  cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride,
973  out_stride, cell_dim, have_dropout_mask,
974  num_rows, out);
975 }
976 inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in,
977  const int in_stride, const float* params,
978  const int params_stride,
979  const int out_stride, const int cell_dim,
980  const int have_dropout_mask,
981  const int num_rows, float* out) {
982  cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride,
983  out_stride, cell_dim, have_dropout_mask,
984  num_rows, out);
985 }
986 inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, double *data,
987  MatrixDim dim, double alpha,
989  int num_elements) {
990  cudaD_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements);
991 }
992 inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, float *data,
993  MatrixDim dim, float alpha,
995  int num_elements) {
996  cudaF_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements);
997 }
998 inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim,
999  double alpha,
1000  const Int32Pair* indices,
1001  const double* x, int s,
1002  double* data) {
1003  cudaD_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data);
1004 }
1005 inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim,
1006  float alpha,
1007  const Int32Pair* indices,
1008  const float* x, int s, float* data) {
1009  cudaF_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data);
1010 }
1011 inline void cuda_matrix_add_to_elements(dim3 Gr, dim3 Bl, double alpha,
1012  double* mat, MatrixDim dim,
1013  const MatrixIndexT_cuda* elements) {
1014  cudaD_matrix_add_to_elements(Gr, Bl, alpha, mat, dim, elements);
1015 }
1016 inline void cuda_matrix_add_to_elements(dim3 Gr, dim3 Bl, float alpha,
1017  float* mat, MatrixDim dim,
1018  const MatrixIndexT_cuda* elements) {
1019  cudaF_matrix_add_to_elements(Gr, Bl, alpha, mat, dim, elements);
1020 }
1021 inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const double *data,
1022  MatrixDim dim, const Int32Pair *indices,
1023  int indices_size, double *output) {
1024  cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
1025 }
1026 inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const float *data,
1027  MatrixDim dim, const Int32Pair *indices,
1028  int indices_size, float *output) {
1029  cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
1030 }
1031 inline void cuda_vector_copy_elements(dim3 Gr, dim3 Bl, double *data, int dim,
1032  const double *src_mat, int mat_stride,
1033  bool transpose,
1034  const MatrixIndexT_cuda* elements) {
1035  cudaD_vector_copy_elements(Gr, Bl, data, dim, src_mat, mat_stride,
1036  transpose, elements);
1037 }
1038 inline void cuda_vector_copy_elements(dim3 Gr, dim3 Bl, float *data, int dim,
1039  const float *src_mat, int mat_stride,
1040  bool transpose,
1041  const MatrixIndexT_cuda* elements) {
1042  cudaF_vector_copy_elements(Gr, Bl, data, dim, src_mat, mat_stride,
1043  transpose, elements);
1044 }
1045 inline void cuda_max(dim3 Gr, dim3 Bl, double *mat, const double *A,
1046  MatrixDim dst_d, int src_stride) {
1047  cudaD_max(Gr, Bl, mat, A, dst_d, src_stride);
1048 }
1049 inline void cuda_max(dim3 Gr, dim3 Bl, float *mat, const float *A,
1050  MatrixDim dst_d, int src_stride) {
1051  cudaF_max(Gr, Bl, mat, A, dst_d, src_stride);
1052 }
1053 inline void cuda_max_mat_cols(int Gr, int Bl, double* result, const double* mat,
1054  const MatrixDim d) {
1055  cudaD_max_mat_cols(Gr, Bl, result, mat, d);
1056 }
1057 inline void cuda_max_mat_cols(int Gr, int Bl, float* result, const float* mat,
1058  const MatrixDim d) {
1059  cudaF_max_mat_cols(Gr, Bl, result, mat, d);
1060 }
1061 inline void cuda_min(dim3 Gr, dim3 Bl, double *mat, const double *other,
1062  MatrixDim mat_d, int other_stride) {
1063  cudaD_min(Gr, Bl, mat, other, mat_d, other_stride);
1064 }
1065 inline void cuda_min(dim3 Gr, dim3 Bl, float *mat, const float *other,
1066  MatrixDim mat_d, int other_stride) {
1067  cudaF_min(Gr, Bl, mat, other, mat_d, other_stride);
1068 }
1069 inline void cuda_min_mat_cols(int Gr, int Bl, double* result, const double* mat,
1070  const MatrixDim d) {
1071  cudaD_min_mat_cols(Gr, Bl, result, mat, d);
1072 }
1073 inline void cuda_min_mat_cols(int Gr, int Bl, float* result, const float* mat,
1074  const MatrixDim d) {
1075  cudaF_min_mat_cols(Gr, Bl, result, mat, d);
1076 }
1077 inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat,
1078  const double *scale, MatrixDim d) {
1079  cudaD_mul_cols_vec(Gr, Bl, mat, scale, d);
1080 }
1081 inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale,
1082  MatrixDim d) {
1083  cudaF_mul_cols_vec(Gr, Bl, mat, scale, d);
1084 }
1085 inline void cuda_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A,
1086  MatrixDim dst_d, int src_stride) {
1087  cudaD_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
1088 }
1089 inline void cuda_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A,
1090  MatrixDim dst_d, int src_stride) {
1091  cudaF_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
1092 }
1093 inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y,
1094  const double *x, MatrixDim d,
1095  int src_stride, int group_size) {
1096  cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size);
1097 }
1098 inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x,
1099  MatrixDim d, int src_stride,
1100  int group_size) {
1101  cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size);
1102 }
1103 inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat,
1104  const double *scale, MatrixDim d) {
1105  cudaD_mul_rows_vec(Gr, Bl, mat, scale, d);
1106 }
1107 inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale,
1108  MatrixDim d) {
1109  cudaF_mul_rows_vec(Gr, Bl, mat, scale, d);
1110 }
1111 inline void cuda_normalize_per_row(size_t Gr, size_t Bl, double *y,
1112  int y_stride, const double *x, MatrixDim x_d,
1113  double target_rms, bool add_log_stddev) {
1114  cudaD_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms,
1115  add_log_stddev);
1116 }
1117 inline void cuda_normalize_per_row(size_t Gr, size_t Bl, float *y, int y_stride,
1118  const float *x, MatrixDim x_d,
1119  float target_rms, bool add_log_stddev) {
1120  cudaF_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms,
1121  add_log_stddev);
1122 }
1123 inline void cuda_one(int Gr, int Bl, double* x, int dim) {
1124  cudaD_one(Gr, Bl, x, dim);
1125 }
1126 inline void cuda_one(int Gr, int Bl, float* x, int dim) {
1127  cudaF_one(Gr, Bl, x, dim);
1128 }
1129 inline void cuda_parametric_relu(dim3 Gr, dim3 Bl, double *y, const double *x,
1130  MatrixDim d, int src_stride, const double *a,
1131  const double *b) {
1132  cudaD_parametric_relu(Gr, Bl, y, x, d, src_stride, a, b);
1133 }
1134 inline void cuda_parametric_relu(dim3 Gr, dim3 Bl, float *y, const float *x,
1135  MatrixDim d, int src_stride, const float *a,
1136  const float *b) {
1137  cudaF_parametric_relu(Gr, Bl, y, x, d, src_stride, a, b);
1138 }
1139 inline void cuda_randomize(dim3 Gr, dim3 Bl, double *y, const double *x,
1140  const int32_cuda *copy_from, MatrixDim d_out,
1141  MatrixDim d_in) {
1142  cudaD_randomize(Gr, Bl, y, x, copy_from, d_out, d_in);
1143 }
1144 inline void cuda_randomize(dim3 Gr, dim3 Bl, float *y, const float *x,
1145  const int32_cuda *copy_from, MatrixDim d_out,
1146  MatrixDim d_in) {
1147  cudaF_randomize(Gr, Bl, y, x, copy_from, d_out, d_in);
1148 }
1149 inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad,
1150  double l1, double lr, MatrixDim d,
1151  int stride_grad) {
1152  cudaD_regularize_l1(Gr, Bl, wei, grad, l1, lr, d, stride_grad);
1153 }
1154 inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad,
1155  float l1, float lr, MatrixDim d,
1156  int stride_grad) {
1157  cudaF_regularize_l1(Gr, Bl, wei, grad, l1, lr, d, stride_grad);
1158 }
1159 inline void cuda_replace_value(int Gr, int Bl, double *v, int dim, double orig,
1160  double changed) {
1161  cudaD_replace_value(Gr, Bl, v, dim, orig, changed);
1162 }
1163 inline void cuda_replace_value(int Gr, int Bl, float *v, int dim, float orig,
1164  float changed) {
1165  cudaF_replace_value(Gr, Bl, v, dim, orig, changed);
1166 }
1167 inline cublasStatus_t cuda_scal(cublasHandle_t handle, int n, double alpha,
1168  double *x, int incx) {
1169  return cublasDscal_v2(handle, n, &alpha, x, incx);
1170 }
1171 inline cublasStatus_t cuda_scal(cublasHandle_t handle, int n, float alpha,
1172  float *x, int incx) {
1173  return cublasSscal_v2(handle, n, &alpha, x, incx);
1174 }
1175 inline void cuda_scale_diag_packed(int Gr, int Bl, double* mat, double value,
1176  int dim) {
1177  cudaD_scale_diag_packed(Gr, Bl, mat, value, dim);
1178 }
1179 inline void cuda_scale_diag_packed(int Gr, int Bl, float* mat, float value,
1180  int dim) {
1181  cudaF_scale_diag_packed(Gr, Bl, mat, value, dim);
1182 }
1183 inline void cuda_scale(dim3 Gr, dim3 Bl, double *mat, double value,
1184  MatrixDim d) {
1185  cudaD_scale(Gr, Bl, mat, value, d);
1186 }
1187 inline void cuda_scale(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) {
1188  cudaF_scale(Gr, Bl, mat, value, d);
1189 }
1190 inline void cuda_select_rows(dim3 Gr, dim3 Bl, const int* out_row_ptr,
1191  int* out_col_idx, double* out_val,
1192  const int* row_indexes,
1193  const int num_selected_rows, const int* in_row_ptr,
1194  const int* in_col_idx, const double* in_val) {
1195  cudaD_select_rows(Gr, Bl, out_row_ptr, out_col_idx, out_val, row_indexes,
1196  num_selected_rows, in_row_ptr, in_col_idx, in_val);
1197 }
1198 inline void cuda_select_rows(dim3 Gr, dim3 Bl, const int* out_row_ptr,
1199  int* out_col_idx, float* out_val,
1200  const int* row_indexes,
1201  const int num_selected_rows, const int* in_row_ptr,
1202  const int* in_col_idx, const float* in_val) {
1203  cudaF_select_rows(Gr, Bl, out_row_ptr, out_col_idx, out_val, row_indexes,
1204  num_selected_rows, in_row_ptr, in_col_idx, in_val);
1205 }
1206 inline void cuda_set_bias_params(int Gr, int Bl, double* v, const double* a,
1207  double param_1, double param_2, double param_3,
1208  int* flag, int dim) {
1209  cudaD_set_bias_params(Gr, Bl, v, a, param_1, param_2, param_3, flag, dim);
1210 }
1211 inline void cuda_set_bias_params(int Gr, int Bl, float* v, const float* a,
1212  float param_1, float param_2, float param_3,
1213  int* flag, int dim) {
1214  cudaF_set_bias_params(Gr, Bl, v, a, param_1, param_2, param_3, flag, dim);
1215 }
1216 inline void cuda_set_const(dim3 Gr, dim3 Bl, double *mat, double value,
1217  MatrixDim d) {
1218  cudaD_set_const(Gr, Bl, mat, value, d);
1219 }
1220 inline void cuda_set_const(dim3 Gr, dim3 Bl, float *mat, float value,
1221  MatrixDim d) {
1222  cudaF_set_const(Gr, Bl, mat, value, d);
1223 }
1224 inline void cuda_set_diag(int Gr, int Bl, double* mat, double value,
1225  MatrixDim d) {
1226  cudaD_set_diag(Gr, Bl, mat, value, d);
1227 }
1228 inline void cuda_set_diag(int Gr, int Bl, float* mat, float value,
1229  MatrixDim d) {
1230  cudaF_set_diag(Gr, Bl, mat, value, d);
1231 }
1232 inline void cuda_set_diag_packed(int Gr, int Bl, double* mat, double value,
1233  int dim) {
1234  cudaD_set_diag_packed(Gr, Bl, mat, value, dim);
1235 }
1236 inline void cuda_set_diag_packed(int Gr, int Bl, float* mat, float value,
1237  int dim) {
1238  cudaF_set_diag_packed(Gr, Bl, mat, value, dim);
1239 }
1240 inline void cuda_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A,
1241  const double *B, const double *C,
1242  double *dst, MatrixDim d, int stride_a,
1243  int stride_b, int stride_c) {
1244  cudaD_set_mat_mat_div_mat(Gr, Bl, A, B, C, dst, d, stride_a, stride_b,
1245  stride_c);
1246 }
1247 inline void cuda_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A,
1248  const float *B, const float *C, float *dst,
1249  MatrixDim d, int stride_a, int stride_b,
1250  int stride_c) {
1251  cudaF_set_mat_mat_div_mat(Gr, Bl, A, B, C, dst, d, stride_a, stride_b,
1252  stride_c);
1253 }
1254 inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat,
1255  MatrixDim d) {
1256  cudaD_set_zero_above_diag(Gr, Bl, mat, d);
1257 }
1258 inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat,
1259  MatrixDim d) {
1260  cudaF_set_zero_above_diag(Gr, Bl, mat, d);
1261 }
1262 inline void cuda_sequence(dim3 Gr, dim3 Bl, int32_cuda* data, int length,
1263  int32_cuda base) {
1264  cuda_int32_sequence(Gr, Bl, data, length, base);
1265 }
1266 inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x,
1267  MatrixDim d, int src_stride) {
1268  cudaD_sigmoid(Gr, Bl, y, x, d, src_stride);
1269 }
1270 inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x,
1271  MatrixDim d, int src_stride) {
1272  cudaF_sigmoid(Gr, Bl, y, x, d, src_stride);
1273 }
1274 inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x,
1275  MatrixDim d, int src_stride) {
1276  cudaD_soft_hinge(Gr, Bl, y, x, d, src_stride);
1277 }
1278 inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x,
1279  MatrixDim d, int src_stride) {
1280  cudaF_soft_hinge(Gr, Bl, y, x, d, src_stride);
1281 }
1282 inline void cuda_softmax_reduce(size_t Gr, size_t Bl, double *y,
1283  const double *x, MatrixDim d, int src_stride) {
1284  cudaD_softmax_reduce(Gr, Bl, y, x, d, src_stride);
1285 }
1286 inline void cuda_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x,
1287  MatrixDim d, int src_stride) {
1288  cudaF_softmax_reduce(Gr, Bl, y, x, d, src_stride);
1289 }
1290 inline void cuda_splice(dim3 Gr, dim3 Bl, double *y, const double *x,
1291  const int32_cuda *off, MatrixDim d_out,
1292  MatrixDim d_in) {
1293  cudaD_splice(Gr, Bl, y, x, off, d_out, d_in);
1294 }
1295 inline void cuda_splice(dim3 Gr, dim3 Bl, float *y, const float *x,
1296  const int32_cuda *off, MatrixDim d_out,
1297  MatrixDim d_in) {
1298  cudaF_splice(Gr, Bl, y, x, off, d_out, d_in);
1299 }
1300 inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, double *data,
1301  MatrixDim dim, const double *src_data,
1302  MatrixDim src_dim,
1303  const Int32Pair *indices) {
1304  cudaD_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
1305 }
1306 inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
1307  const float *src_data, MatrixDim src_dim,
1308  const Int32Pair *indices) {
1309  cudaF_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
1310 }
1311 inline void cuda_sum_mat_cols(int Gr, int Bl, double* result, const double* mat,
1312  const MatrixDim d) {
1313  cudaD_sum_mat_cols(Gr, Bl, result, mat, d);
1314 }
1315 inline void cuda_sum_mat_cols(int Gr, int Bl, float* result, const float* mat,
1316  const MatrixDim d) {
1317  cudaF_sum_mat_cols(Gr, Bl, result, mat, d);
1318 }
1319 inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta,
1320  const double* T, MatrixDim tdim, double *S,
1321  MatrixDim sdim) {
1322  cudaD_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
1323 }
1324 inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta,
1325  const float* T, MatrixDim tdim, float *S,
1326  MatrixDim sdim) {
1327  cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
1328 }
1329 inline void cuda_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y,
1330  MatrixDim d_in) {
1331  cudaD_take_lower(Gr, Bl, x, y, d_in);
1332 }
1333 inline void cuda_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y,
1334  MatrixDim d_in) {
1335  cudaF_take_lower(Gr, Bl, x, y, d_in);
1336 }
1337 inline void cuda_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y,
1338  MatrixDim d_in) {
1339  cudaD_take_mean(Gr, Bl, x, y, d_in);
1340 }
1341 inline void cuda_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y,
1342  MatrixDim d_in) {
1343  cudaF_take_mean(Gr, Bl, x, y, d_in);
1344 }
1345 inline void cuda_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y,
1346  MatrixDim d_in) {
1347  cudaD_take_upper(Gr, Bl, x, y, d_in);
1348 }
1349 inline void cuda_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y,
1350  MatrixDim d_in) {
1351  cudaF_take_upper(Gr, Bl, x, y, d_in);
1352 }
1353 inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d,
1354  int src_stride) {
1355  cudaD_tanh(Gr, Bl, y, x, d, src_stride);
1356 }
1357 inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d,
1358  int src_stride) {
1359  cudaF_tanh(Gr, Bl, y, x, d, src_stride);
1360 }
1361 inline void cuda_trace(int Gr, int Bl, double* mat, double* value, int dim) {
1362  cudaD_trace(Gr, Bl, mat, value, dim);
1363 }
1364 inline void cuda_trace(int Gr, int Bl, float* mat, float* value, int dim) {
1365  cudaF_trace(Gr, Bl, mat, value, dim);
1366 }
1367 inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A,
1368  const double* B, MatrixDim dA, int B_stride,
1369  double* value) {
1370  cudaD_trace_mat_mat(Gr, Bl, A, B, dA, B_stride, value);
1371 }
1372 inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B,
1373  MatrixDim dA, int B_stride, float* value) {
1374  cudaF_trace_mat_mat(Gr, Bl, A, B, dA, B_stride, value);
1375 }
1376 inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const double* A,
1377  const double* B, MatrixDim dA,
1378  int B_stride, double* value) {
1379  cudaD_trace_mat_mat_trans(Gr, Bl, A, B, dA, B_stride, value);
1380 }
1381 inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const float* A,
1382  const float* B, MatrixDim dA, int B_stride,
1383  float* value) {
1384  cudaF_trace_mat_mat_trans(Gr, Bl, A, B, dA, B_stride, value);
1385 }
1386 inline void cuda_trace_mat_smat(dim3 Gr, dim3 Bl, const double* mat,
1387  MatrixDim mat_dim, const int* smat_row_ptr,
1388  const int* smat_col_idx, const double* smat_val,
1389  double* trace_vec) {
1390  cudaD_trace_mat_smat(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
1391  smat_val, trace_vec);
1392 }
1393 inline void cuda_trace_mat_smat(dim3 Gr, dim3 Bl, const float* mat,
1394  MatrixDim mat_dim, const int* smat_row_ptr,
1395  const int* smat_col_idx, const float* smat_val,
1396  float* trace_vec) {
1397  cudaF_trace_mat_smat(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
1398  smat_val, trace_vec);
1399 }
1400 inline void cuda_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat,
1401  MatrixDim mat_dim,
1402  const int* smat_row_ptr,
1403  const int* smat_col_idx,
1404  const double* smat_val,
1405  double* trace_vec) {
1406  cudaD_trace_mat_smat_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
1407  smat_val, trace_vec);
1408 }
1409 inline void cuda_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const float* mat,
1410  MatrixDim mat_dim,
1411  const int* smat_row_ptr,
1412  const int* smat_col_idx,
1413  const float* smat_val, float* trace_vec) {
1414  cudaF_trace_mat_smat_trans(Gr, Bl, mat, mat_dim, smat_row_ptr, smat_col_idx,
1415  smat_val, trace_vec);
1416 }
1417 inline void cuda_vec_apply_ceiling(int Gr, int Bl, double* v, double floor_val,
1418  float* num, int dim) {
1419  cudaD_vec_apply_ceiling(Gr, Bl, v, floor_val, num, dim);
1420 }
1421 inline void cuda_vec_apply_ceiling(int Gr, int Bl, float* v, float floor_val,
1422  float* num, int dim) {
1423  cudaF_vec_apply_ceiling(Gr, Bl, v, floor_val, num, dim);
1424 }
1425 inline void cuda_vec_apply_exp(int Gr, int Bl, double* v, int dim) {
1426  cudaD_vec_apply_exp(Gr, Bl, v, dim);
1427 }
1428 inline void cuda_vec_apply_exp(int Gr, int Bl, float* v, int dim) {
1429  cudaF_vec_apply_exp(Gr, Bl, v, dim);
1430 }
1431 inline void cuda_vec_apply_floor(int Gr, int Bl, double* v, double floor_val,
1432  float* num, int dim) {
1433  cudaD_vec_apply_floor(Gr, Bl, v, floor_val, num, dim);
1434 }
1435 inline void cuda_vec_apply_floor(int Gr, int Bl, float* v, float floor_val,
1436  float* num, int dim) {
1437  cudaF_vec_apply_floor(Gr, Bl, v, floor_val, num, dim);
1438 }
1439 inline void cuda_vec_apply_log(int Gr, int Bl, double* v, double* flag,
1440  int dim) {
1441  cudaD_vec_apply_log(Gr, Bl, v, flag, dim);
1442 }
1443 inline void cuda_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim) {
1444  cudaF_vec_apply_log(Gr, Bl, v, flag, dim);
1445 }
1446 inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, double *dst,
1447  const double *src, int dim) {
1448  cudaD_vec_copy_diag_from_packed(Gr, Bl, dst, src, dim);
1449 }
1450 inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, float *dst,
1451  const float *src, int dim) {
1452  cudaF_vec_copy_diag_from_packed(Gr, Bl, dst, src, dim);
1453 }
1454 inline void cuda_vec_max(int Gr, int Bl, const double* v, double* value,
1455  int dim, int inc) {
1456  cudaD_vec_max(Gr, Bl, v, value, dim, inc);
1457 }
1458 inline void cuda_vec_max(int Gr, int Bl, const float* v, float* value, int dim,
1459  int inc) {
1460  cudaF_vec_max(Gr, Bl, v, value, dim, inc);
1461 }
1462 inline void cuda_vec_min(int Gr, int Bl, const double* v, double* value,
1463  int dim, int inc) {
1464  cudaD_vec_min(Gr, Bl, v, value, dim, inc);
1465 }
1466 inline void cuda_vec_min(int Gr, int Bl, const float* v, float* value, int dim,
1467  int inc) {
1468  cudaF_vec_min(Gr, Bl, v, value, dim, inc);
1469 }
1470 inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a,
1471  int dim) {
1472  cudaD_vec_mul_elements(Gr, Bl, v, a, dim);
1473 }
1474 inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a,
1475  int dim) {
1476  cudaF_vec_mul_elements(Gr, Bl, v, a, dim);
1477 }
1478 inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) {
1479  cudaD_vec_soft_max(Gr, Bl, v, dim);
1480 }
1481 inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) {
1482  cudaF_vec_soft_max(Gr, Bl, v, dim);
1483 }
1484 inline void cuda_vec_sum(int Gr, int Bl, double* v, double* value, int dim,
1485  int inc) {
1486  cudaD_vec_sum(Gr, Bl, v, value, dim, inc);
1487 }
1488 inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim,
1489  int inc) {
1490  cudaF_vec_sum(Gr, Bl, v, value, dim, inc);
1491 }
1492 
1493 // Compresses the matrix in 'src' to 'dest', retaining only zero-one
1494 // information (1 if the value is >0, 0 otherwise)
1495 inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src,
1496  MatrixDim dim, uint8 *dest,
1497  int dest_stride) {
1498  cuda_compress_uint8_sign(Gr, Bl, src, dim, dest, dest_stride);
1499 }
1500 // this template handles the other types that are not instantiated yet,
1501 // to avoid compilation errors.
1502 template <typename I>
1503 inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src,
1504  MatrixDim dim, I *dest,
1505  int dest_stride) {
1506  KALDI_ERR << "Not implemented for this type.";
1507 }
1508 
1509 inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src,
1510  MatrixDim dim, int16_t *dest,
1511  int dest_stride, float inv_scale,
1512  bool bounds_check) {
1513  cuda_compress_int16(Gr, Bl, src, dim, dest, dest_stride,
1514  inv_scale, bounds_check);
1515 }
1516 inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src,
1517  MatrixDim dim, uint16_t *dest,
1518  int dest_stride, float inv_scale,
1519  bool bounds_check) {
1520  cuda_compress_uint16(Gr, Bl, src, dim, dest, dest_stride,
1521  inv_scale, bounds_check);
1522 }
1523 inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src,
1524  MatrixDim dim, uint8_t *dest,
1525  int dest_stride, float inv_scale,
1526  bool bounds_check) {
1527  cuda_compress_uint8(Gr, Bl, src, dim, dest, dest_stride,
1528  inv_scale, bounds_check);
1529 }
1530 inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src,
1531  MatrixDim dim, int8_t *dest,
1532  int dest_stride, float inv_scale,
1533  bool bounds_check) {
1534  cuda_compress_int8(Gr, Bl, src, dim, dest, dest_stride,
1535  inv_scale, bounds_check);
1536 }
1537 
1538 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest,
1539  MatrixDim dim, const int8_t *src,
1540  int src_stride, float scale) {
1541  cuda_uncompress_int8(Gr, Bl, dest, dim, src, src_stride, scale);
1542 }
1543 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest,
1544  MatrixDim dim, const uint8_t *src,
1545  int src_stride, float scale) {
1546  cuda_uncompress_uint8(Gr, Bl, dest, dim, src, src_stride, scale);
1547 }
1548 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest,
1549  MatrixDim dim, const int16_t *src,
1550  int src_stride, float scale) {
1551  cuda_uncompress_int16(Gr, Bl, dest, dim, src, src_stride, scale);
1552 }
1553 inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest,
1554  MatrixDim dim, const uint16_t *src,
1555  int src_stride, float scale) {
1556  cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale);
1557 }
1558 
1559 inline void cuda_mat_copy_range_clamped(
1560  int32_t row_start, int32_t row_end, int32_t num_cols,
1561  const double *src, int32_t lds,
1562  int32_t clamp_low, int32_t clamp_high,
1563  double *dst, int32_t ldd) {
1564  cudaD_mat_copy_range_clamped(row_start, row_end, num_cols,
1565  src, lds, clamp_low, clamp_high, dst, ldd);
1566 }
1567 
1568 inline void cuda_mat_copy_range_clamped(
1569  int32_t row_start, int32_t row_end, int32_t num_cols,
1570  const float *src, int32_t lds,
1571  int32_t clamp_low, int32_t clamp_high,
1572  float *dst, int32_t ldd) {
1573  cudaF_mat_copy_range_clamped(row_start, row_end, num_cols,
1574  src, lds, clamp_low, clamp_high, dst, ldd);
1575 }
1576 
1577 inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
1578  int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs,
1579  int32_t *ldo) {
1580  cudaF_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi,
1581  outputs, ldo);
1582 }
1583 
1584 inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
1585  int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs,
1586  int32_t *ldo) {
1587  cudaD_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi,
1588  outputs, ldo);
1589 }
1590 
1591 
1592 } // namespace kaldi
1593 
1594 #endif // HAVE_CUDA
1595 
1596 #endif
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
Structure containing size of the matrix plus stride.
Definition: cu-matrixdim.h:46
kaldi::int32 int32
int32_t int32_cuda
Definition: cu-matrixdim.h:31
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
Definition: cu-matrixdim.h:68
const size_t count
float BaseFloat
Definition: kaldi-types.h:29
struct rnnlm::@11::@12 n
#define KALDI_ERR
Definition: kaldi-error.h:147
int32_t MatrixIndexT_cuda
Definition: cu-matrixdim.h:32