cu-kernels-ansi.h
Go to the documentation of this file.
1 // cudamatrix/cu-kernels-ansi.h
2 
3 // Copyright 2009-2012 Karel Vesely
4 // 2013 Johns Hopkins University (author: Daniel Povey)
5 // 2013 Hainan Xu
6 // 2013 Xiaohui Zhang
7 // 2013-2015 Guoguo Chen
8 // 2016-2018 Shiyin Kang
9 // 2019 Yiwen Shao
10 
11 // See ../../COPYING for clarification regarding multiple authors
12 //
13 // Licensed under the Apache License, Version 2.0 (the "License");
14 // you may not use this file except in compliance with the License.
15 // You may obtain a copy of the License at
16 //
17 // http://www.apache.org/licenses/LICENSE-2.0
18 //
19 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
20 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
21 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
22 // MERCHANTABLITY OR NON-INFRINGEMENT.
23 // See the Apache 2 License for the specific language governing permissions and
24 // limitations under the License.
25 
26 #ifndef KALDI_CUDAMATRIX_CU_KERNELS_ANSI_H_
27 #define KALDI_CUDAMATRIX_CU_KERNELS_ANSI_H_
28 
30 
31 #if HAVE_CUDA == 1
32 extern "C" {
33 
34 // "C" version of the BaseFloat typedef-- this saves us having to write
35 // multiple versions of these kernels.
36 #if (KALDI_DOUBLEPRECISION != 0)
37 typedef double BaseFloat;
38 #else
39 typedef float BaseFloat;
40 #endif
41 
42 
43 void cudaD_add_row_sum_mat(int Gr, int Bl, double* result, const double* mat,
44  const MatrixDim d, const double alpha,
45  const double beta);
46 void cudaF_add_row_sum_mat(int Gr, int Bl, float* result, const float* mat,
47  const MatrixDim d, const float alpha,
48  const float beta);
49 void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat,
50  const MatrixDim d, const double alpha,
51  const double beta);
52 void cudaF_add_col_sum_mat(int Gr, int Bl, float* result, const float* mat,
53  const MatrixDim d, const float alpha,
54  const float beta);
55 void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
56  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
57  int src_stride);
58 void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
59  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
60  int src_stride);
61 void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha,
62  const double* M, const int strid_M,
63  const double* N, const MatrixDim dim_N,
64  const double beta, double* v);
65 void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha,
66  const float* M, const int strid_M,
67  const float* N, const MatrixDim dim_N,
68  const float beta, float* v);
69 void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha,
70  const double* M, const MatrixDim dim_M,
71  const double* N, const int stride_N,
72  const double beta, double* v);
73 void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha,
74  const float* M, const MatrixDim dim_M,
75  const float* N, const int stride_N,
76  const float beta, float* v);
77 void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha,
78  const double* M, const int strid_M,
79  const double* N, const MatrixDim dim_N,
80  const double beta, double* v,
81  const int stride_v);
82 void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha,
83  const float* M, const int strid_M,
84  const float* N, const MatrixDim dim_N,
85  const float beta, float* v, const int stride_v);
86 void cudaD_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
87 void cudaF_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
88 void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat,
89  MatrixDim mat_dim, const double *vec,
90  const double *mat2, int mat2_row_stride,
91  int mat2_col_stride, double beta);
92 void cudaF_add_diag_vec_mat(dim3 Gr, dim3 Bl, float alpha, float *mat,
93  MatrixDim mat_dim, const float *vec,
94  const float *mat2, int mat2_row_stride,
95  int mat2_col_stride, float beta);
96 void cudaD_add(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
97 void cudaF_add(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
98 void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d,
99  const double *Adata, int A_num_rows, int A_num_cols,
100  int A_row_stride, int A_col_stride,
101  const CuBlockMatrixData *B_cu_data,
102  int B_num_blocks, double alpha, double beta,
103  int B_trans);
104 void cudaF_add_mat_blockmat(dim3 Gr, dim3 Bl, float *data, MatrixDim d,
105  const float *Adata, int A_num_rows, int A_num_cols,
106  int A_row_stride, int A_col_stride,
107  const CuBlockMatrixData *B_cu_data,
108  int B_num_blocks, float alpha, float beta,
109  int B_trans);
110 void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double *src,
111  int32_cuda num_row_blocks, int32_cuda num_col_blocks,
112  double *dst, MatrixDim d, int src_stride,
113  int A_trans);
114 void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src,
115  int32_cuda num_row_blocks, int32_cuda num_col_blocks,
116  float *dst, MatrixDim d, int src_stride, int A_trans);
117 void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src,
118  MatrixDim src_dim, double *dst, MatrixDim dst_dim);
119 void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src,
120  MatrixDim src_dim, float *dst, MatrixDim dst_dim);
121 void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
122  MatrixDim mat_dim, const double *mat2,
123  int mat2_row_stride, int mat2_col_stride,
124  const double *vec, double beta);
125 void cudaF_add_mat_diag_vec(dim3 Gr, dim3 Bl, float alpha, float *mat,
126  MatrixDim mat_dim, const float *mat2,
127  int mat2_row_stride, int mat2_col_stride,
128  const float *vec, float beta);
129 void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src,
130  double *dst, MatrixDim d, int src_stride, int A_trans);
131 void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst,
132  MatrixDim d, int src_stride, int A_trans);
133 void cudaD_add_mat_mat_elements(dim3 Gr, dim3 Bl, double *data,
134  const double *srcA_data,
135  const double *srcB_data, MatrixDim dim,
136  int srcA_stride, int srcB_stride, double alpha,
137  double beta);
138 void cudaF_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data,
139  const float *srcA_data, const float *srcB_data,
140  MatrixDim dim, int srcA_stride, int srcB_stride,
141  float alpha, float beta);
142 void cudaD_add_row_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
143  const double *src_data, MatrixDim src_dim,
144  const Int32Pair *indexes);
145 void cudaF_add_row_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
146  const float *src_data, MatrixDim src_dim,
147  const Int32Pair *indexes);
148 void cudaD_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst,
149  const double* src, const MatrixIndexT_cuda* reorder,
150  MatrixDim dst_dim, int src_stride);
151 void cudaF_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, const float* src,
152  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
153  int src_stride);
154 void cudaD_mul_rows(dim3 Gr, dim3 Bl, double* dst,
155  const double* src, const MatrixIndexT_cuda* reorder,
156  MatrixDim dst_dim, int src_stride);
157 void cudaF_mul_rows(dim3 Gr, dim3 Bl, float* dst, const float* src,
158  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
159  int src_stride);
160 void cudaD_add_rows_direct(dim3 Gr, dim3 Bl, double alpha, double* dst,
161  const double* const * src, MatrixDim dst_dim);
162 void cudaF_add_rows_direct(dim3 Gr, dim3 Bl, float alpha, float* dst,
163  const float* const * src, MatrixDim dst_dim);
164 void cudaD_add_smat(dim3 Gr, dim3 Bl, double* mat, MatrixDim mat_dim,
165  double alpha, const int* smat_row_ptr,
166  const int* smat_col_idx, const double* smat_val);
167 void cudaF_add_smat(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
168  float alpha, const int* smat_row_ptr,
169  const int* smat_col_idx, const float* smat_val);
170 void cudaD_add_smat_trans(dim3 Gr, dim3 Bl, double* mat, MatrixDim mat_dim,
171  double alpha, const int* smat_row_ptr,
172  const int* smat_col_idx, const double* smat_val);
173 void cudaF_add_smat_trans(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
174  float alpha, const int* smat_row_ptr,
175  const int* smat_col_idx, const float* smat_val);
176 void cudaD_add_to_rows_direct(dim3 Gr, dim3 Bl, double alpha,
177  double* const * dst, const double* src,
178  MatrixDim src_dim);
179 void cudaF_add_to_rows_direct(dim3 Gr, dim3 Bl, float alpha, float* const * dst,
180  const float* src, MatrixDim src_dim);
181 void cudaD_add_to_rows(dim3 Gr, dim3 Bl, double alpha,
182  double* dst, const double* src,
183  const MatrixIndexT_cuda* reorder,
184  MatrixDim src_dim, int dst_stride);
185 void cudaF_add_to_rows(dim3 Gr, dim3 Bl, float alpha,
186  float* dst, const float* src,
187  const MatrixIndexT_cuda* reorder,
188  MatrixDim src_dim, int dst_stride);
189 void cudaD_add_vec2(dim3 Gr, dim3 Bl, double *mat, const double *vec,
190  const double alpha, int dim);
191 void cudaF_add_vec2(dim3 Gr, dim3 Bl, float* mat, const float* vec,
192  const float alpha, int dim);
193 void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col,
194  double beta, double *dst, MatrixDim d);
195 void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col,
196  float beta, float *dst, MatrixDim d);
197 void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row,
198  double beta, double *dst, MatrixDim d);
199 void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row,
200  float beta, float *dst, MatrixDim d);
201 void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x,
202  const double* y, double beta, int dim);
203 void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x,
204  const float* y, float beta, int dim);
205 void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data,
206  int num_blocks, const double *C_data,
207  int C_num_cols, int C_row_stride, int C_col_stride,
208  const double *D_data, int D_row_stride,
209  int D_col_stride, double alpha, double beta);
210 void cudaF_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data,
211  int num_blocks, const float *C_data,
212  int C_num_cols, int C_row_stride, int C_col_stride,
213  const float *D_data, int D_row_stride,
214  int D_col_stride, float alpha, float beta);
215 void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1,
216  const double *x2, MatrixDim y_dim,
217  int x1_stride, int x2_stride, int group_size);
218 void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1,
219  const float *x2, MatrixDim y_dim, int x1_stride,
220  int x2_stride, int group_size);
221 void cudaD_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<double>* x, int s,
222  const double* z, MatrixDim d, double* z2,
223  MatrixDim d2, double* t);
224 void cudaF_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<float>* x, int s,
225  const float* z, MatrixDim d, float* z2, MatrixDim d2,
226  float* t);
227 void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col,
228  const double* mat, MatrixDim dmat, int dim);
229 void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col,
230  const float* mat, MatrixDim dmat, int dim);
231 void cudaD_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col,
232  const double* mat, MatrixDim dmat, int dim);
233 void cudaF_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col,
234  const float* mat, MatrixDim dmat, int dim);
235 void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
236  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
237  int src_stride);
238 void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
239  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
240  int src_stride);
241 void cudaD_copy_cols_from_vec(dim3 Gr, dim3 Bl, double *mat_out,
242  MatrixDim d_out, const double *v_in);
243 void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out,
244  const float *v_in);
245 void cudaD_copy(dim3 Gr, dim3 Bl, double *y, const double *x,
246  const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
247 void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x,
248  const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
249 void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out,
250  const double* mat_in, MatrixDim d_out,
251  MatrixDim d_in);
252 void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out,
253  const double* mat_in, MatrixDim d_out,
254  MatrixDim d_in);
255 void cuda_copy_from_mat_df(dim3 Gr, dim3 Bl, double* mat_out,
256  const float* mat_in, MatrixDim d_out,
257  MatrixDim d_in);
258 void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out,
259  const float* mat_in, MatrixDim d_out,
260  MatrixDim d_in);
261 void cuda_copy_from_mat_fd(dim3 Gr, dim3 Bl, float *mat_out,
262  const double* mat_in, MatrixDim d_out,
263  MatrixDim d_in);
264 void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out,
265  const double* mat_in, MatrixDim d_out,
266  MatrixDim d_in);
267 void cuda_copy_from_mat_ff(dim3 Gr, dim3 Bl, float* mat_out,
268  const float* mat_in, MatrixDim d_out,
269  MatrixDim d_in);
270 void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out,
271  const float* mat_in, MatrixDim d_out,
272  MatrixDim d_in);
273 void cuda_copy_from_smat_dd(dim3 Gr, dim3 Bl, double* mat, MatrixDim mat_dim,
274  const int* smat_row_ptr, const int* smat_col_idx,
275  const double* smat_val);
276 void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, double* mat,
277  MatrixDim mat_dim, const int* smat_row_ptr,
278  const int* smat_col_idx,
279  const double* smat_val);
280 void cuda_copy_from_smat_df(dim3 Gr, dim3 Bl, double* mat, MatrixDim mat_dim,
281  const int* smat_row_ptr, const int* smat_col_idx,
282  const float* smat_val);
283 void cuda_copy_from_smat_df_trans(dim3 Gr, dim3 Bl, double* mat,
284  MatrixDim mat_dim, const int* smat_row_ptr,
285  const int* smat_col_idx,
286  const float* smat_val);
287 void cuda_copy_from_smat_fd(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
288  const int* smat_row_ptr, const int* smat_col_idx,
289  const double* smat_val);
290 void cuda_copy_from_smat_fd_trans(dim3 Gr, dim3 Bl, float* mat,
291  MatrixDim mat_dim, const int* smat_row_ptr,
292  const int* smat_col_idx,
293  const double* smat_val);
294 void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
295  const int* smat_row_ptr, const int* smat_col_idx,
296  const float* smat_val);
297 void cuda_copy_from_smat_ff_trans(dim3 Gr, dim3 Bl, float* mat,
298  MatrixDim mat_dim, const int* smat_row_ptr,
299  const int* smat_col_idx,
300  const float* smat_val);
301 void cudaD_copy_from_sp(dim3 Gr, dim3 Bl, const double* x, double* y,
302  MatrixDim d_out);
303 void cudaF_copy_from_sp(dim3 Gr, dim3 Bl, const float* x, float* y,
304  MatrixDim d_out);
305 void cudaD_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const double* B,
306  MatrixDim dmat);
307 void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B,
308  MatrixDim dmat);
309 void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const double* B,
310  MatrixDim dmat);
311 void cudaF_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const float* B,
312  MatrixDim dmat);
313 void cudaD_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const double* B,
314  MatrixDim dmat);
315 void cudaDF_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const float* B,
316  MatrixDim dmat);
317 void cudaFD_copy_from_tp_trans(dim3 Gr, dim3 Bl, float* A, const double* B,
318  MatrixDim dmat);
319 void cudaF_copy_from_tp_trans(dim3 Gr, dim3 Bl, float* A, const float* B,
320  MatrixDim dmat);
321 void cublas_copy_kaldi_df(int Gr, int Bl, int n, const double* x, int incx,
322  float* y, int incy);
323 void cublas_copy_kaldi_fd(int Gr, int Bl, int n, const float* x, int incx,
324  double* y, int incy);
325 void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA);
326 void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
327 void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src,
328  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
329  int src_stride);
330 void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src,
331  const MatrixIndexT_cuda* reorder, MatrixDim dst_dim,
332  int src_stride);
333 void cudaD_copy_rows_direct(dim3 Gr, dim3 Bl, double* dst,
334  const double* const * src, MatrixDim dst_dim);
335 void cudaF_copy_rows_direct(dim3 Gr, dim3 Bl, float* dst,
336  const float* const * src, MatrixDim dst_dim);
337 void cudaD_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out,
338  MatrixDim d_out, const double *v_in);
339 void cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out,
340  const float *v_in);
341 void cudaD_copy_to_rows_direct(dim3 Gr, dim3 Bl, double* const * dst,
342  const double* src, MatrixDim src_dim);
343 void cudaF_copy_to_rows_direct(dim3 Gr, dim3 Bl, float* const * dst,
344  const float* src, MatrixDim src_dim);
345 void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimB);
346 void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
347 void cudaD_diff_group_pnorm(dim3 Gr, dim3 Bl, double *id, const double *iv,
348  const double *ov, const double* od,
349  MatrixDim id_dim, int iv_stride, int ov_stride,
350  int od_stride, int group_size, double power);
351 void cudaF_diff_group_pnorm(dim3 Gr, dim3 Bl, float *id, const float *iv,
352  const float *ov, const float* od, MatrixDim id_dim,
353  int iv_stride, int ov_stride, int od_stride,
354  int group_size, float power);
355 void cudaD_diff_log_softmax(dim3 Gr, dim3 Bl, const MatrixDim in_deriv_dim,
356  const double* out_value, const int out_value_stride,
357  const double* out_deriv, const int out_deriv_stride,
358  double* in_deriv);
359 void cudaF_diff_log_softmax(dim3 Gr, dim3 Bl, const MatrixDim in_deriv_dim,
360  const float* out_value, const int out_value_stride,
361  const float* out_deriv, const int out_deriv_stride,
362  float* in_deriv);
363 void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim,
364  const int have_dropout_mask,
365  const int num_rows, const double* input,
366  const int in_stride, const double* params,
367  const int params_stride,
368  const double* output_deriv,
369  const int output_deriv_stride,
370  const double* deriv_sum_in,
371  const int deriv_sum_in_stride,
372  const double* self_repair_config,
373  double count, double* input_deriv,
374  const int input_deriv_stride,
375  double* params_deriv,
376  const int params_deriv_stride,
377  double* value_sum_out,
378  const int value_sum_out_stride,
379  double* deriv_sum_out,
380  const int deriv_sum_out_stride,
381  double* self_repair_sum_out,
382  const int self_repair_sum_out_stride);
383 void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim,
384  const int have_dropout_mask,
385  const int num_rows, const float* input,
386  const int in_stride, const float* params,
387  const int params_stride,
388  const float* output_deriv,
389  const int output_deriv_stride,
390  const double* deriv_sum_in,
391  const int deriv_sum_in_stride,
392  const float* self_repair_config, double count,
393  float* input_deriv,
394  const int input_deriv_stride,
395  float* params_deriv,
396  const int params_deriv_stride,
397  double* value_sum_out,
398  const int value_sum_out_stride,
399  double* deriv_sum_out,
400  const int deriv_sum_out_stride,
401  float* self_repair_sum_out,
402  const int self_repair_sum_out_stride);
403 void cudaD_diff_normalize_per_row(size_t Gr, size_t Bl, double *id,
404  int id_stride, const double *iv,
405  MatrixDim iv_dim, const double* od,
406  int od_stride, double target_rms,
407  bool add_log_stddev);
408 void cudaF_diff_normalize_per_row(size_t Gr, size_t Bl, float *id,
409  int id_stride, const float *iv,
410  MatrixDim iv_dim, const float* od,
411  int od_stride, float target_rms,
412  bool add_log_stddev);
413 void cudaD_diff_parametric_relu(dim3 Gr, dim3 Bl, double *eout, const double *e,
414  const double *y, MatrixDim d, int e_stride,
415  int y_stride, const double *a, const double *b);
416 void cudaF_diff_parametric_relu(dim3 Gr, dim3 Bl, float *eout, const float *e,
417  const float *y, MatrixDim d, int e_stride,
418  int y_stride, const float *a, const float *b);
419 void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e,
420  const double *y, MatrixDim d, int e_stride,
421  int y_stride);
422 void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e,
423  const float *y, MatrixDim d, int e_stride,
424  int y_stride);
425 void cudaD_diff_softmax(dim3 Gr, dim3 Bl, double* x, const MatrixDim dim,
426  const double* value, const int value_stride,
427  const double* diff, const int diff_stride);
428 void cudaF_diff_softmax(dim3 Gr, dim3 Bl, float* x, const MatrixDim dim,
429  const float* value, const int value_stride,
430  const float* diff, const int diff_stride);
431 void cudaD_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e,
432  const double *y, MatrixDim d, int e_stride, int y_stride);
433 void cudaF_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e,
434  const float *y, MatrixDim d, int e_stride, int y_stride);
435 void cudaD_ensure_nonzero(dim3 Gr, dim3 Bl, const double *x, MatrixDim d,
436  double epsilon, int y_stride, double *y);
437 void cudaF_ensure_nonzero(dim3 Gr, dim3 Bl, const float *x, MatrixDim d,
438  float epsilon, int y_stride, float *y);
439 void cudaD_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt,
440  double *mat_net_out, double *vec_log_post, MatrixDim d);
441 void cudaF_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt,
442  float *mat_net_out, float *vec_log_post, MatrixDim d);
443 void cudaD_div_elements(dim3 Gr, dim3 Bl, double *mat, const double *A,
444  MatrixDim dst_d, int src_stride);
445 void cudaF_div_elements(dim3 Gr, dim3 Bl, float *mat, const float *A,
446  MatrixDim dst_d, int src_stride);
447 void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div,
448  MatrixDim d);
449 void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div,
450  MatrixDim d);
451 void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1,
452  const double *mat2, double *mask,
453  MatrixDim mat1_dim, int mat2_stride,
454  int mask_stride);
455 void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1,
456  const float *mat2, float *mask,
457  MatrixDim mat1_dim, int mat2_stride,
458  int mask_stride);
459 void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val,
460  int32_cuda *vec_id, MatrixDim d);
461 void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val,
462  int32_cuda *vec_id, MatrixDim d);
463 void cudaD_group_max(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d,
464  int src_stride, int group_size);
465 void cudaF_group_max(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d,
466  int src_stride, int group_size);
467 void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x,
468  MatrixDim d, int src_stride, int group_size,
469  double power);
470 void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d,
471  int src_stride, int group_size, float power);
472 void cudaD_group_spec_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x,
473  MatrixDim d, int src_stride, int group_size,
474  double power);
475 void cudaF_group_spec_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x,
476  MatrixDim d, int src_stride, int group_size,
477  float power);
478 void cudaD_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d,
479  int src_stride);
480 void cudaF_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d,
481  int src_stride);
482 void cudaD_exp(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d,
483  int src_stride);
484 void cudaF_exp(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d,
485  int src_stride);
486 void cudaD_pow(dim3 Gr, dim3 Bl, double *y, const double *x, double power, MatrixDim d,
487  int src_stride);
488 void cudaF_pow(dim3 Gr, dim3 Bl, float *y, const float *x, float power, MatrixDim d,
489  int src_stride);
490 void cudaD_ceiling(dim3 Gr, dim3 Bl, double* y, const double* x, double ceiling_val,
491  MatrixDim dim, int src_stride);
492 void cudaF_ceiling(dim3 Gr, dim3 Bl, float* y, const float* x, float ceiling_val,
493  MatrixDim dim, int src_stride);
494 void cudaD_floor(dim3 Gr, dim3 Bl, double* y, const double* x, double floor_val,
495  MatrixDim dim, int src_stride);
496 void cudaF_floor(dim3 Gr, dim3 Bl, float* y, const float* x, float floor_val,
497  MatrixDim dim, int src_stride);
498 void cudaD_exp_limited(dim3 Gr, dim3 Bl, double* y, const double* x,
499  double lower_limit, double upper_limit, MatrixDim d, int src_stride);
500 void cudaF_exp_limited(dim3 Gr, dim3 Bl, float* y, const float* x,
501  float lower_limit, float upper_limit, MatrixDim d, int src_stride);
502 void cudaD_exp_special(dim3 Gr, dim3 Bl, double* y, const double* x,
503  MatrixDim d, int src_stride);
504 void cudaF_exp_special(dim3 Gr, dim3 Bl, float* y, const float* x,
505  MatrixDim d, int src_stride);
506 void cudaD_log(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride);
507 void cudaF_log(dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride);
508 void cudaD_pow_abs(dim3 Gr, dim3 Bl, double* y, const double* x, double power,
509  bool include_sign, MatrixDim dim, int src_stride);
510 void cudaF_pow_abs(dim3 Gr, dim3 Bl, float* y, const float* x, float power,
511  bool include_sign, MatrixDim dim, int src_stride);
512 void cuda_int32_add(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value,
513  MatrixDim d);
514 void cuda_int32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value,
515  MatrixDim d);
516 void cuda_int32_sequence(dim3 Gr, dim3 Bl, int32_cuda* data, int length,
517  int32_cuda base);
518 void cudaD_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d);
519 void cudaF_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d);
520 void cudaD_log_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x,
521  MatrixDim y_dim, int x_stride);
522 void cudaF_log_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x,
523  MatrixDim y_dim, int x_stride);
524 void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in,
525  const int in_stride, const double* params,
526  const int params_stride, const int out_stride,
527  const int cell_dim, const int have_dropout_mask,
528  const int num_rows,
529  double* out);
530 void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in,
531  const int in_stride, const float* params,
532  const int params_stride, const int out_stride,
533  const int cell_dim, const int have_dropout_mask,
534  const int num_rows,
535  float* out);
536 void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
537  double alpha, MatrixElement<double>* x,
538  int num_elements);
539 void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
540  float alpha, MatrixElement<float>* x,
541  int num_elements);
542 void cudaD_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim,
543  double alpha, const Int32Pair* indices,
544  const double* x, int s, double* data);
545 void cudaF_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim,
546  float alpha, const Int32Pair* indices,
547  const float* x, int s, float* data);
548 void cudaD_matrix_add_to_elements(dim3 Gr, dim3 Bl, double alpha,
549  double* mat, MatrixDim dim,
550  const MatrixIndexT_cuda* elements);
551 void cudaF_matrix_add_to_elements(dim3 Gr, dim3 Bl, float alpha,
552  float* mat, MatrixDim dim,
553  const MatrixIndexT_cuda* elements);
554 void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim,
555  const Int32Pair *indices, int indices_size,
556  double *output);
557 void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim,
558  const Int32Pair *indices, int indices_size,
559  float *output);
560 void cudaD_vector_copy_elements(dim3 Gr, dim3 Bl, double *data, int dim,
561  const double *src_mat, int mat_stride,
562  bool transpose,
563  const MatrixIndexT_cuda* elements);
564 void cudaF_vector_copy_elements(dim3 Gr, dim3 Bl, float *data, int dim,
565  const float *src_mat, int mat_stride,
566  bool transpose,
567  const MatrixIndexT_cuda* elements);
568 void cudaD_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d,
569  int src_stride);
570 void cudaF_max(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d,
571  int src_stride);
572 void cudaD_max_mat_cols(int Gr, int Bl, double* result, const double* mat,
573  const MatrixDim d);
574 void cudaF_max_mat_cols(int Gr, int Bl, float* result, const float* mat,
575  const MatrixDim d);
576 void cudaD_min(dim3 Gr, dim3 Bl, double *mat, const double *other,
577  MatrixDim mat_d, int other_stride);
578 void cudaF_min(dim3 Gr, dim3 Bl, float *mat, const float *other,
579  MatrixDim mat_d, int other_stride);
580 void cudaD_min_mat_cols(int Gr, int Bl, double* result, const double* mat,
581  const MatrixDim d);
582 void cudaF_min_mat_cols(int Gr, int Bl, float* result, const float* mat,
583  const MatrixDim d);
584 void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale,
585  MatrixDim d);
586 void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale,
587  MatrixDim d);
588 void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A,
589  MatrixDim dst_d, int src_stride);
590 void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A,
591  MatrixDim dst_d, int src_stride);
592 void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x,
593  MatrixDim d, int src_stride, int group_size);
594 void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x,
595  MatrixDim d, int src_stride, int group_size);
596 void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale,
597  MatrixDim d);
598 void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale,
599  MatrixDim d);
600 void cudaD_normalize_per_row(size_t Gr, size_t Bl, double *y, int y_stride,
601  const double *x, MatrixDim x_d, double tartget_rms,
602  bool add_log_stddev);
603 void cudaF_normalize_per_row(size_t Gr, size_t Bl, float *y, int y_stride,
604  const float *x, MatrixDim x_d, float tartget_rms,
605  bool add_log_stddev);
606 void cudaD_one(int Gr, int Bl, double* x, int dim);
607 void cudaF_one(int Gr, int Bl, float* x, int dim);
608 void cudaD_parametric_relu(dim3 Gr, dim3 Bl, double *y, const double *x,
609  MatrixDim d, int src_stride, const double *a,
610  const double *b);
611 void cudaF_parametric_relu(dim3 Gr, dim3 Bl, float *y, const float *x,
612  MatrixDim d, int src_stride, const float *a,
613  const float *b);
614 void cudaD_randomize(dim3 Gr, dim3 Bl, double *y, const double *x,
615  const int32_cuda *copy_from, MatrixDim d_out,
616  MatrixDim d_in);
617 void cudaF_randomize(dim3 Gr, dim3 Bl, float *y, const float *x,
618  const int32_cuda *copy_from, MatrixDim d_out,
619  MatrixDim d_in);
620 void cudaD_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1,
621  double lr, MatrixDim d, int stride_grad);
622 void cudaF_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1,
623  float lr, MatrixDim d, int stride_grad);
624 void cudaD_replace_value(int Gr, int Bl, double *v, int dim, double orig,
625  double changed);
626 void cudaF_replace_value(int Gr, int Bl, float *v, int dim, float orig,
627  float changed);
628 void cudaD_scale_diag_packed(int Gr, int Bl, double* mat, double value,
629  int dim);
630 void cudaF_scale_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
631 void cudaD_scale(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
632 void cudaF_scale(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
633 void cudaD_select_rows(dim3 Gr, dim3 Bl, const int* out_row_ptr,
634  int* out_col_idx, double* out_val,
635  const int* row_indexes, const int num_selected_rows,
636  const int* in_row_ptr, const int* in_col_idx,
637  const double* in_val);
638 void cudaF_select_rows(dim3 Gr, dim3 Bl, const int* out_row_ptr,
639  int* out_col_idx, float* out_val, const int* row_indexes,
640  const int num_selected_rows, const int* in_row_ptr,
641  const int* in_col_idx, const float* in_val);
642 void cudaD_set_bias_params(int Gr, int Bl, double* v, const double* a,
643  double param_1, double param_2, double param_3,
644  int* flag, int dim);
645 void cudaF_set_bias_params(int Gr, int Bl, float* v, const float* a,
646  float param_1, float param_2, float param_3,
647  int* flag, int dim);
648 void cudaD_set_const(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
649 void cudaF_set_const(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
650 void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d);
651 void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d);
652 void cudaD_set_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
653 void cudaF_set_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
654 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A,
655  const double *B, const double *C, double *dst,
656  MatrixDim d, int stride_a, int stride_b,
657  int stride_c);
658 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B,
659  const float *C, float *dst, MatrixDim d,
660  int stride_a, int stride_b, int stride_c);
661 void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
662 void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
663 void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d,
664  int src_stride);
665 void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d,
666  int src_stride);
667 void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d,
668  int src_stride);
669 void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d,
670  int src_stride);
671 void cudaD_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x,
672  MatrixDim d, int src_stride);
673 void cudaF_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x,
674  MatrixDim d, int src_stride);
675 void cudaD_splice(dim3 Gr, dim3 Bl, double *y, const double *x,
676  const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
677 void cudaF_splice(dim3 Gr, dim3 Bl, float *y, const float *x,
678  const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
679 void cudaD_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
680  const double *src_data, MatrixDim src_dim,
681  const Int32Pair *indices);
682 void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
683  const float *src_data, MatrixDim src_dim,
684  const Int32Pair *indices);
685 void cudaD_sum_mat_cols(int Gr, int Bl, double* result, const double* mat,
686  const MatrixDim d);
687 void cudaF_sum_mat_cols(int Gr, int Bl, float* result, const float* mat,
688  const MatrixDim d);
689 void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta,
690  const double* T, MatrixDim tdim, double *S,
691  MatrixDim sdim);
692 void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T,
693  MatrixDim tdim, float *S, MatrixDim sdim);
694 void cudaD_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y,
695  MatrixDim d_in);
696 void cudaF_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y,
697  MatrixDim d_in);
698 void cudaD_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y,
699  MatrixDim d_in);
700 void cudaF_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y,
701  MatrixDim d_in);
702 void cudaD_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y,
703  MatrixDim d_in);
704 void cudaF_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y,
705  MatrixDim d_in);
706 void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d,
707  int src_stride);
708 void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d,
709  int src_stride);
710 void cudaD_trace(int Gr, int Bl, double* mat, double* value, int dim);
711 void cudaF_trace(int Gr, int Bl, float* mat, float* value, int dim);
712 void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B,
713  MatrixDim dA, int B_stride, double* value);
714 void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B,
715  MatrixDim dA, int B_stride, float* value);
716 void cudaD_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const double* A,
717  const double* B, MatrixDim dA, int B_stride,
718  double* value);
719 void cudaF_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const float* A, const float* B,
720  MatrixDim dA, int B_stride, float* value);
721 void cudaD_trace_mat_smat(dim3 Gr, dim3 Bl, const double* mat,
722  MatrixDim mat_dim, const int* smat_row_ptr,
723  const int* smat_col_idx, const double* smat_val,
724  double* trace_vec);
725 void cudaF_trace_mat_smat(dim3 Gr, dim3 Bl, const float* mat, MatrixDim mat_dim,
726  const int* smat_row_ptr, const int* smat_col_idx,
727  const float* smat_val, float* trace_vec);
728 void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat,
729  MatrixDim mat_dim, const int* smat_row_ptr,
730  const int* smat_col_idx, const double* smat_val,
731  double* trace_vec);
732 void cudaF_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const float* mat,
733  MatrixDim mat_dim, const int* smat_row_ptr,
734  const int* smat_col_idx, const float* smat_val,
735  float* trace_vec);
736 void cudaD_vec_apply_ceiling(int Gr, int Bl, double* v, double ceiling_val,
737  float* num, int dim);
738 void cudaF_vec_apply_ceiling(int Gr, int Bl, float* v, float ceiling_val,
739  float* num, int dim);
740 void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim);
741 void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim);
742 void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val,
743  float* num, int dim);
744 void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val,
745  float* num, int dim);
746 void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim);
747 void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim);
748 void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst,
749  const double *src, int dim);
750 void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float *dst,
751  const float *src, int dim);
752 void cudaD_vec_max(int Gr, int Bl, const double* v, double* value, int dim,
753  int inc);
754 void cudaF_vec_max(int Gr, int Bl, const float* v, float* value, int dim,
755  int inc);
756 void cudaD_vec_min(int Gr, int Bl, const double* v, double* value, int dim,
757  int inc);
758 void cudaF_vec_min(int Gr, int Bl, const float* v, float* value, int dim,
759  int inc);
760 void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a,
761  int dim);
762 void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim);
763 void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim);
764 void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim);
765 void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc);
766 void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc);
767 
768 
769 void cuda_compress_int16(dim3 Gr, dim3 Bl, const BaseFloat *src,
770  MatrixDim dim, int16_t *dest,
771  int dest_stride, float inv_scale,
772  bool bounds_check);
773 void cuda_compress_uint16(dim3 Gr, dim3 Bl, const BaseFloat *src,
774  MatrixDim dim, uint16_t *dest,
775  int dest_stride, float inv_scale,
776  bool bounds_check);
777 void cuda_compress_uint8(dim3 Gr, dim3 Bl, const BaseFloat *src,
778  MatrixDim dim, uint8_t *dest,
779  int dest_stride, float inv_scale,
780  bool bounds_check);
781 void cuda_compress_int8(dim3 Gr, dim3 Bl, const BaseFloat *src,
782  MatrixDim dim, int8_t *dest,
783  int dest_stride, float inv_scale,
784  bool bounds_check);
785 
786 void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src,
787  MatrixDim dim, uint8_t *dest, int dest_stride);
788 
789 void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest,
790  MatrixDim dim, const int16_t *src,
791  int src_stride, float scale);
792 void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest,
793  MatrixDim dim, const uint16_t *src,
794  int src_stride, float scale);
795 void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest,
796  MatrixDim dim, const int8_t *src,
797  int src_stride, float scale);
798 void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest,
799  MatrixDim dim, const uint8_t *src,
800  int src_stride, float scale);
801 
802 // copies the sub matrix in src[range_start, range_end] to the matrix in dst
803 // if src row is outside of the clamped range it will clamp to the specified
804 // rows. src and dst cannot overlap.
805 void cudaF_mat_copy_range_clamped(
806  int32_t row_start, int32_t row_end, int32_t num_cols,
807  const float *src, int32_t lds,
808  int32_t clamp_low, int32_t clamp_high,
809  float *dst, int32_t ldd);
810 void cudaD_mat_copy_range_clamped(
811  int32_t row_start, int32_t row_end, int32_t num_cols,
812  const double *src, int32_t lds,
813  int32_t clamp_low, int32_t clamp_high,
814  double *dst, int32_t ldd);
815 
816 // for i=[0,num_mats) perform the matrix copy outputs[i] = inputs[i] where
817 // the matrices are of size num_rows[i] x num_cols[i] and have a leading
818 // dimension of ldo[i] for the output and ldi[i] for the input.
819 void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
820  int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs,
821  int32_t *ldo);
822 void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
823  int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs,
824  int32_t *ldo);
825 
826 // Launches a kernel that does nothing, explicitly using the legacy default stream;
827 // this will synchronize all CUDA streams (except for non-blocking streams) on the
828 // device.
829 void cuda_legacy_noop();
830 
831 
832 } // extern "C"
833 
834 #endif // HAVE_CUDA
835 
836 #endif
Structure containing size of the matrix plus stride.
Definition: cu-matrixdim.h:46
int32_t int32_cuda
Definition: cu-matrixdim.h:31
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
Definition: cu-matrixdim.h:68
const size_t count
float BaseFloat
Definition: kaldi-types.h:29
struct rnnlm::@11::@12 n
int32_t MatrixIndexT_cuda
Definition: cu-matrixdim.h:32