30 #include <cuda_runtime_api.h> 31 #include <cublas_v2.h> 49 template<
typename Real>
55 if (rows * cols == 0)
KALDI_ASSERT(rows == 0 && cols == 0);
56 if (this->num_rows_ == rows && this->num_cols_ == cols) {
57 if (resize_type ==
kSetZero) this->SetZero();
60 if (this->num_rows_ != 0)
62 if (rows == 0)
return;
64 if (CuDevice::Instantiate().Enabled()) {
69 this->
data_ =
static_cast<Real*
>(CuDevice::Instantiate().MallocPitch(
70 row_bytes, rows, &pitch));
71 this->num_rows_ = rows;
72 this->num_cols_ = cols;
73 this->stride_ = pitch /
sizeof(Real);
75 size_t bytes = rows * cols *
sizeof(Real);
76 this->
data_ =
static_cast<Real*
>(CuDevice::Instantiate().Malloc(bytes));
77 this->num_rows_ = rows;
78 this->num_cols_ = cols;
81 if (resize_type ==
kSetZero) this->SetZero();
82 CuDevice::Instantiate().AccuProfile(
"CuMatrix::Resize", tim);
93 template<
typename Real>
96 if (CuDevice::Instantiate().Enabled()) {
97 if (this->
data_ != NULL) {
99 CuDevice::Instantiate().Free(this->
data_);
100 CuDevice::Instantiate().AccuProfile(__func__, tim);
113 template<
typename Real>
122 template<
typename Real>
125 if (CuDevice::Instantiate().Enabled()) {
126 if (this->num_rows_ == 0) {
130 this->CopyFromMat(*mat);
146 this->CopyToMat(mat);
160 template <
typename Real>
166 int32 row_offset = 0, col_offset = 0;
171 col_offset, num_cols);
173 row_offset += num_rows;
174 col_offset += num_cols;
176 KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
179 int32 row_offset = 0, col_offset = 0;
184 col_offset, num_cols);
186 row_offset += num_rows;
187 col_offset += num_cols;
189 KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
194 template <
typename Real>
199 this->CopyFromBlock(B);
202 this->CopyFromBlock(B,
kTrans);
207 template<
class OtherReal>
211 static_cast<const void*>(M.
Data()) ==
212 static_cast<const void*>(this->Data())) {
213 if (M.
Data() == NULL)
221 if (CuDevice::Instantiate().Enabled()) {
234 cudaMemcpy2DAsync(
data_, dst_pitch, M.
data_, src_pitch,
235 width, M.
num_rows_, cudaMemcpyDeviceToDevice,
236 cudaStreamPerThread));
239 dim3 dimGrid, dimBlock;
240 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
241 &dimGrid, &dimBlock);
242 cuda_copy_from_mat(dimGrid, dimBlock,
data_, M.
data_, Dim(), M.
Dim());
246 const int32 warpSize = 32;
247 dim3 dimBlock(warpSize,
CU1DBLOCK / warpSize);
248 dim3 dimGrid(n_blocks(M.
NumCols(), warpSize),
249 n_blocks(M.
NumRows(), warpSize));
250 cuda_copy_from_mat_trans(dimGrid, dimBlock,
data_, M.
data_, Dim(),
253 CU_SAFE_CALL(cudaGetLastError());
255 CuDevice::Instantiate().AccuProfile(
"CuMatrixBase::CopyFromMat(from other CuMatrixBase)", tim);
259 Mat().CopyFromMat(M.
Mat(), trans);
278 template<
typename Real>
279 template<
typename OtherReal>
286 if (CuDevice::Instantiate().Enabled()) {
289 dim3 dimGrid(n_blocks(num_rows_,
CU2DBLOCK),
292 cuda_copy_from_tp(dimGrid, dimBlock,
data_, M.
Data(), Dim());
294 cuda_copy_from_tp_trans(dimGrid, dimBlock,
data_, M.
Data(), Dim());
296 CuDevice::Instantiate().AccuProfile(__func__, tim);
300 Mat().CopyFromTp(M.
Mat(), trans);
313 template<
typename Real>
317 if (CuDevice::Instantiate().Enabled()) {
325 CU_SAFE_CALL(cudaMemcpy2DAsync(
data_, dst_pitch, src.
Data(), src_pitch,
326 width, src.
NumRows(), cudaMemcpyHostToDevice,
327 cudaStreamPerThread));
328 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
330 CuDevice::Instantiate().AccuProfile(
"CuMatrixBase::CopyFromMat(from CPU)", tim);
333 this->CopyFromMat(trans_mat,
kTrans);
338 Mat().CopyFromMat(src, trans);
342 template<
typename Real>
343 template<
typename OtherReal>
347 this->CopyFromMat(temp, trans);
359 template<
typename Real>
365 if (CuDevice::Instantiate().Enabled()) {
368 dim3 dimGrid(n_blocks(NumRows(),
CU2DBLOCK),
370 cuda_copy_from_sp(dimGrid, dimBlock, M.
Data(),
data_, Dim());
371 CuDevice::Instantiate().AccuProfile(
"CuMatrix::CopyFromSp", tim);
375 Mat().CopyFromSp(M.
Mat());
379 template<
typename Real>
385 this->CopyFromMat(other, trans);
388 template<
typename Real>
394 this->CopyFromMat(other, trans);
398 template<
typename Real>
399 template<
typename OtherReal>
405 this->CopyFromMat(other, trans);
418 template <
typename Real>
420 int32_t start_range, int32_t end_range,
421 int32_t clamp_low, int32_t clamp_high) {
427 if (CuDevice::Instantiate().Enabled()) {
428 cuda_mat_copy_range_clamped(start_range, end_range, NumCols(),
429 src.
Data(), src.
Stride(), clamp_low, clamp_high,
434 for (
int32 t = start_range; t < end_range; t++) {
436 if (t_clamped < clamp_low) t_clamped = clamp_low;
437 if (t_clamped >= clamp_high) t_clamped = clamp_high;
445 template<
typename Real>
446 template<
typename OtherReal>
450 if (CuDevice::Instantiate().Enabled()) {
456 if (num_rows_ == 0)
return;
462 CU_SAFE_CALL(cudaMemcpy2DAsync(dst->
Data(), dst_pitch, this->
data_,
463 src_pitch, width, this->num_rows_,
464 cudaMemcpyDeviceToHost, cudaStreamPerThread));
465 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
466 CuDevice::Instantiate().AccuProfile(
"CuMatrix::CopyToMatD2H", tim);
493 template<
typename Real>
496 temp.
Read(is, binary);
501 template<
typename Real>
504 this->CopyToMat(&temp);
505 temp.
Write(os, binary);
508 template<
typename Real>
511 if (CuDevice::Instantiate().Enabled()) {
513 CU_SAFE_CALL(cudaMemset2DAsync(
data_, stride_ *
sizeof(Real), 0,
514 num_cols_ *
sizeof(Real), num_rows_ ,
515 cudaStreamPerThread));
516 CuDevice::Instantiate().AccuProfile(
"CuMatrix::SetZero", tim);
530 template<
typename Real>
533 if (CuDevice::Instantiate().Enabled()) {
534 if (num_rows_ == 0)
return;
537 dim3 dimGrid, dimBlock;
538 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
539 &dimGrid, &dimBlock);
541 cuda_set_const(dimGrid, dimBlock,
data_, value, Dim());
542 CU_SAFE_CALL(cudaGetLastError());
544 CuDevice::Instantiate().AccuProfile(__func__, tim);
553 template<
typename Real>
556 if (CuDevice::Instantiate().Enabled()) {
557 if (num_rows_ == 0)
return;
560 dim3 dimGrid, dimBlock;
561 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
562 &dimGrid, &dimBlock);
564 cuda_set_zero_above_diag(dimGrid, dimBlock,
data_, Dim());
565 CU_SAFE_CALL(cudaGetLastError());
567 CuDevice::Instantiate().AccuProfile(__func__, tim);
573 for (
int32 r = 0; r + 1 < num_rows; r++) {
575 vec_part(vec, r + 1, num_cols - (r + 1));
581 template<
typename Real>
584 if (CuDevice::Instantiate().Enabled()) {
585 if (num_rows_ == 0)
return;
588 dim3 dimGrid, dimBlock;
589 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
590 &dimGrid, &dimBlock);
592 cuda_add(dimGrid, dimBlock,
data_, value, Dim());
593 CU_SAFE_CALL(cudaGetLastError());
595 CuDevice::Instantiate().AccuProfile(__func__, tim);
603 template<
typename Real>
606 if (CuDevice::Instantiate().Enabled()) {
607 if (num_rows_ == 0)
return;
613 this_stride = stride_ + 1;
615 dim3 dimGrid(1, n_blocks(num_diag,
CU1DBLOCK));
617 cuda_add(dimGrid, dimBlock,
data_, value, d);
618 CU_SAFE_CALL(cudaGetLastError());
620 CuDevice::Instantiate().AccuProfile(__func__, tim);
624 Mat().AddToDiag(value);
628 template<
typename Real>
637 return (
TraceMatMat(*
this, *
this,
kTrans) + this->NumRows() - 2.0 * this->Trace() <=
638 tol * this->NumRows());
643 template<
typename Real>
646 if (CuDevice::Instantiate().Enabled()) {
647 if (num_rows_ == 0)
return;
650 dim3 dimGrid, dimBlock;
651 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
652 &dimGrid, &dimBlock);
654 cuda_scale(dimGrid, dimBlock,
data_, value, Dim());
655 CU_SAFE_CALL(cudaGetLastError());
657 CuDevice::Instantiate().AccuProfile(__func__, tim);
666 template<
typename Real>
669 if (CuDevice::Instantiate().Enabled()) {
675 dim3 dimGrid, dimBlock;
676 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
677 &dimGrid, &dimBlock);
680 CU_SAFE_CALL(cudaGetLastError());
682 CuDevice::Instantiate().AccuProfile(__func__, tim);
686 Mat().MulElements(A.
Mat());
690 template<
typename Real>
693 if (CuDevice::Instantiate().Enabled()) {
699 dim3 dimGrid, dimBlock;
700 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
701 &dimGrid, &dimBlock);
704 CU_SAFE_CALL(cudaGetLastError());
706 CuDevice::Instantiate().AccuProfile(__func__, tim);
710 Mat().DivElements(A.
Mat());
714 template<
typename Real>
717 if (CuDevice::Instantiate().Enabled()) {
723 dim3 dimGrid, dimBlock;
724 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
725 &dimGrid, &dimBlock);
728 CU_SAFE_CALL(cudaGetLastError());
730 CuDevice::Instantiate().AccuProfile(__func__, tim);
739 template<
typename Real>
742 if (CuDevice::Instantiate().Enabled()) {
748 dim3 dimGrid, dimBlock;
749 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
750 &dimGrid, &dimBlock);
753 CU_SAFE_CALL(cudaGetLastError());
755 CuDevice::Instantiate().AccuProfile(__func__, tim);
764 template<
typename Real>
767 if (CuDevice::Instantiate().Enabled()) {
773 dim3 dimGrid, dimBlock;
774 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
775 &dimGrid, &dimBlock);
777 cuda_mul_cols_vec(dimGrid, dimBlock,
data_, scale.
data_, Dim());
778 CU_SAFE_CALL(cudaGetLastError());
781 CuDevice::Instantiate().AccuProfile(__func__, tim);
785 Mat().MulColsVec(scale.
Vec());
791 template<
typename Real>
794 if (CuDevice::Instantiate().Enabled()) {
799 dim3 dimGrid, dimBlock;
800 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
801 &dimGrid, &dimBlock);
803 cuda_mul_rows_vec(dimGrid, dimBlock,
data_, scale.
data_, Dim());
804 CU_SAFE_CALL(cudaGetLastError());
807 CuDevice::Instantiate().AccuProfile(__func__, tim);
811 Mat().MulRowsVec(scale.
Vec());
815 template<
typename Real>
819 if (CuDevice::Instantiate().Enabled()) {
821 int group_size = this->NumCols() / src.
NumCols();
823 dim3 dimGrid, dimBlock;
824 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
825 &dimGrid, &dimBlock);
827 cuda_mul_rows_group_mat(dimGrid, dimBlock, this->
data_, src.
data_,
828 this->Dim(), src.
Stride(), group_size);
829 CU_SAFE_CALL(cudaGetLastError());
831 CuDevice::Instantiate().AccuProfile(__func__, tim);
835 Mat().MulRowsGroupMat(src.
Mat());
840 template<
typename Real>
847 int group_size = this->NumCols() / out_value.
NumCols();
850 if (CuDevice::Instantiate().Enabled()) {
852 const int kWarpSize = 32;
853 dim3 dimBlock(kWarpSize,
CU1DBLOCK / kWarpSize);
854 dim3 dimGrid(n_blocks(NumCols(), dimBlock.x),
855 n_blocks(NumRows(), dimBlock.y));
856 if (dimGrid.x * dimGrid.y > 1024) {
857 dimGrid.y = std::max(1024 / dimGrid.x,
unsigned(1));
859 cuda_diff_group_pnorm(dimGrid, dimBlock, this->
data_, in_value.
Data(),
860 out_value.
Data(), out_deriv.
Data(), Dim(),
862 out_deriv.
Stride(), group_size, power);
863 CU_SAFE_CALL(cudaGetLastError());
864 CuDevice::Instantiate().AccuProfile(__func__, tim);
868 Mat().GroupPnormDeriv(in_value.
Mat(), out_value.
Mat(), power);
869 MulRowsGroupMat(out_deriv);
873 template<
typename Real>
877 int group_size = this->NumCols() / src2.
NumCols();
880 if (CuDevice::Instantiate().Enabled()) {
883 dim3 dimGrid(n_blocks(NumCols(),
CU2DBLOCK),
885 cuda_calc_group_max_deriv(dimGrid, dimBlock, this->
data_, src1.
Data(),
888 CU_SAFE_CALL(cudaGetLastError());
890 CuDevice::Instantiate().AccuProfile(__func__, tim);
894 Mat().GroupMaxDeriv(src1.
Mat(), src2.
Mat());
898 template<
typename Real>
901 if (CuDevice::Instantiate().Enabled()) {
906 dim3 dimGrid, dimBlock;
907 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
908 &dimGrid, &dimBlock);
911 if (dimGrid.x * dimGrid.y > 1024) {
912 dimGrid.x = 1024 / dimGrid.y;
913 if (dimGrid.x == 0) {
917 cuda_div_rows_vec(dimGrid, dimBlock,
data_, div.
data_, Dim());
918 CU_SAFE_CALL(cudaGetLastError());
920 CuDevice::Instantiate().AccuProfile(__func__, tim);
926 Mat().MulRowsVec(temp);
931 template<
typename Real>
934 if (CuDevice::Instantiate().Enabled()) {
937 dim3 dimGrid, dimBlock;
938 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
939 &dimGrid, &dimBlock);
941 cuda_invert_elements(dimGrid, dimBlock,
data_, Dim());
942 CU_SAFE_CALL(cudaGetLastError());
944 CuDevice::Instantiate().AccuProfile(__func__, tim);
948 Mat().InvertElements();
953 template<
typename Real>
958 if (CuDevice::Instantiate().Enabled()) {
964 if (num_rows_ == 0)
return;
969 dim3 dimGrid(n_blocks(NumCols(),
CU2DBLOCK),
971 cuda_add_mat(dimGrid, dimBlock, alpha, A.
data_,
973 (transA ==
kTrans ? 1 : 0));
974 CU_SAFE_CALL(cudaGetLastError());
976 CuDevice::Instantiate().AccuProfile(__func__, tim);
980 Mat().AddMat(alpha, A.
Mat(), transA);
984 template<
typename Real>
988 if (CuDevice::Instantiate().Enabled()) {
1002 const int warpSize = 32;
1003 dim3 dimBlock(warpSize,
CU1DBLOCK / warpSize);
1004 dim3 dimGrid(n_blocks(A.
NumRows(), dimBlock.y));
1007 cuda_add_smat(dimGrid, dimBlock, Data(), Dim(), alpha, A.
CsrRowPtr(),
1010 cuda_add_smat_trans(dimGrid, dimBlock, Data(), Dim(), alpha,
1014 CU_SAFE_CALL(cudaGetLastError());
1015 CuDevice::Instantiate().AccuProfile(__func__, tim);
1019 Mat().AddSmat(alpha, A.
Smat(), trans);
1023 template<
typename Real>
1028 if (CuDevice::Instantiate().Enabled()) {
1047 cusparseMatDescr_t descr;
1048 CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descr));
1054 cusparse_csrmm2(GetCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE,
1055 CUSPARSE_OPERATION_TRANSPOSE, AT.
NumRows(),
1061 cusparse_csrmm2(GetCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE,
1062 CUSPARSE_OPERATION_TRANSPOSE, A.
NumRows(),
1067 CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descr));
1069 this->CopyFromMat(CT,
kTrans);
1071 CuDevice::Instantiate().AccuProfile(__func__, tim);
1075 Mat().AddSmatMat(alpha, A.
Smat(), transA, B.
Mat(), beta);
1079 template<
typename Real>
1084 if (CuDevice::Instantiate().Enabled()) {
1097 cusparseMatDescr_t descr;
1098 CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descr));
1101 GetCusparseHandle(),
1103 CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
1106 &beta, Data(), Stride()));
1107 CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descr));
1109 CuDevice::Instantiate().AccuProfile(__func__, tim);
1113 Mat().AddMatSmat(alpha, A.
Mat(), B.
Smat(), transB, beta);
1118 template<
typename Real>
1121 if (num_rows_ == 0 || num_cols_ == 0)
return;
1127 int32 num_row_blocks, num_col_blocks;
1130 num_row_blocks = A.
Mat().NumRows() / num_rows_;
1131 num_col_blocks = A.
Mat().NumCols() / num_cols_;
1134 num_row_blocks = A.
Mat().NumRows() / num_cols_;
1135 num_col_blocks = A.
Mat().NumCols() / num_rows_;
1138 if (CuDevice::Instantiate().Enabled()) {
1140 dim3 dimGrid, dimBlock;
1141 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1142 &dimGrid, &dimBlock);
1143 cuda_add_mat_blocks(dimGrid, dimBlock, alpha, A.
data_, num_row_blocks,
1145 (transA ==
kTrans ? 1 : 0));
1146 CU_SAFE_CALL(cudaGetLastError());
1148 CuDevice::Instantiate().AccuProfile(__func__, tim);
1160 for (
int32 i = 0;
i < num_row_blocks;
i++) {
1161 for (
int32 j = 0;
j < num_col_blocks;
j++) {
1171 KALDI_ERR <<
"Transposed operation not supported currently.";
1172 if (!(num_rows_ % A.
NumRows() == 0 && num_cols_ % A.
NumCols() == 0))
1173 KALDI_ERR <<
"Invalid sizes of arguments";
1175 if (CuDevice::Instantiate().Enabled()) {
1177 dim3 dimGrid, dimBlock;
1178 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1179 &dimGrid, &dimBlock);
1180 cuda_add_mat_repeated(dimGrid, dimBlock, alpha,
1182 CU_SAFE_CALL(cudaGetLastError());
1183 CuDevice::Instantiate().AccuProfile(__func__, tim);
1188 &this_mat = this->Mat();
1189 for (
int32 row_offset = 0; row_offset < NumRows();
1190 row_offset += src_mat.
NumRows()) {
1191 for (
int32 col_offset = 0; col_offset < NumCols();
1192 col_offset += src_mat.
NumCols()) {
1194 row_offset, src_mat.
NumRows(),
1195 col_offset, src_mat.
NumCols());
1196 this_part.
AddMat(alpha, src_mat);
1205 template<
typename Real>
1209 if (CuDevice::Instantiate().Enabled()) {
1215 if (num_rows_ == 0)
return;
1216 dim3 dimGrid, dimBlock;
1217 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1218 &dimGrid, &dimBlock);
1219 cuda_set_mat_mat_div_mat(dimGrid, dimBlock, A.
data_, B.
data_, C.
data_,
1221 CU_SAFE_CALL(cudaGetLastError());
1223 CuDevice::Instantiate().AccuProfile(__func__, tim);
1227 Mat().SetMatMatDivMat(A.
Mat(), B.
Mat(), C.
Mat());
1231 template<
typename Real>
1235 if (col.
Dim() != NumRows()) {
1236 KALDI_ERR <<
"Non matching dimensions: Rows:" << NumRows() <<
" VectorDim:" << col.
Dim();
1240 if (CuDevice::Instantiate().Enabled()) {
1242 dim3 dimGrid, dimBlock;
1243 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1244 &dimGrid, &dimBlock);
1245 cuda_add_vec_to_cols(dimGrid, dimBlock, alpha, col.
data_, beta,
1247 CU_SAFE_CALL(cudaGetLastError());
1249 CuDevice::Instantiate().AccuProfile(__func__, tim);
1253 if (beta != 1.0) Mat().Scale(beta);
1254 Mat().AddVecToCols(alpha, col.
Vec());
1260 template<
typename Real>
1264 if (row.
Dim() != NumCols()) {
1265 KALDI_ERR <<
"Non matching dimensions: Cols:" << NumCols() <<
" VectorDim:" << row.
Dim();
1268 if (CuDevice::Instantiate().Enabled()) {
1270 dim3 dimGrid, dimBlock;
1271 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1272 &dimGrid, &dimBlock);
1273 cuda_add_vec_to_rows(dimGrid, dimBlock, alpha, row.
data_, beta,
data_, Dim());
1274 CU_SAFE_CALL(cudaGetLastError());
1276 CuDevice::Instantiate().AccuProfile(__func__, tim);
1280 if (beta != 1.0) Mat().Scale(beta);
1281 Mat().AddVecToRows(alpha, row.
Vec());
1290 template<
typename Real>
1311 if (CuDevice::Instantiate().Enabled()) {
1313 CUBLAS_SAFE_CALL(cublas_gemm(GetCublasHandle(),
1314 (transB==
kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
1315 (transA==
kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
1319 CuDevice::Instantiate().AccuProfile(__func__, tim);
1323 Mat().AddMatMat(alpha, A.
Mat(), transA, B.
Mat(), transB, beta);
1328 template<
typename Real>
1338 if (CuDevice::Instantiate().Enabled()) {
1340 CUBLAS_SAFE_CALL(cublas_ger(GetCublasHandle(), m, n, alpha,
1343 CuDevice::Instantiate().AccuProfile(__func__, tim);
1347 Mat().AddVecVec(alpha, x.
Vec(), y.
Vec());
1352 template<
typename Real>
1359 if (num_rows_ == 0)
return;
1363 if (CuDevice::Instantiate().Enabled()) {
1365 cublasOperation_t trans = (transA ==
kTrans ? CUBLAS_OP_N : CUBLAS_OP_T);
1367 CUBLAS_SAFE_CALL(cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER,
1368 trans, num_rows_, A_other_dim,
1370 beta, this->
data_, this->stride_));
1372 CuDevice::Instantiate().AccuProfile(__func__, tim);
1376 Mat().SymAddMat2(alpha, A.
Mat(), transA, beta);
1381 template<
typename Real>
1387 if (CuDevice::Instantiate().Enabled()) {
1397 dim3 dimGrid(n_blocks(num_cols_,
CU2DBLOCK),
1402 cuda_add_diag_vec_mat(dimGrid, dimBlock, alpha,
data_, Dim(),
1403 v.
Data(), M.
Data(), M_row_stride, M_col_stride, beta);
1404 CU_SAFE_CALL(cudaGetLastError());
1405 CuDevice::Instantiate().AccuProfile(__func__, tim);
1409 Mat().AddDiagVecMat(alpha, v.
Vec(), M.
Mat(), transM, beta);
1414 template<
typename Real>
1421 if (CuDevice::Instantiate().Enabled()) {
1430 dim3 dimGrid, dimBlock;
1431 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1432 &dimGrid, &dimBlock);
1435 cuda_add_mat_diag_vec(dimGrid, dimBlock, alpha,
data_, Dim(),
1436 M.
Data(), M_row_stride, M_col_stride, v.
Data(), beta);
1437 CU_SAFE_CALL(cudaGetLastError());
1438 CuDevice::Instantiate().AccuProfile(__func__, tim);
1442 Mat().AddMatDiagVec(alpha, M.
Mat(), transM, v.
Vec(), beta);
1446 template<
typename Real>
1450 if (CuDevice::Instantiate().Enabled()) {
1453 dim3 dimGrid, dimBlock;
1454 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1455 &dimGrid, &dimBlock);
1456 cuda_add_mat_mat_elements(dimGrid, dimBlock, this->
data_, A.
Data(),
1458 CuDevice::Instantiate().AccuProfile(__func__, tim);
1462 Mat().AddMatMatElements(alpha, A.
Mat(), B.
Mat(), beta);
1466 template<
typename Real>
1476 if (CuDevice::Instantiate().Enabled()) {
1482 cuda_parametric_relu(dimGrid, dimBlock, this->
data_, src.
data_, this->Dim(),
1484 CU_SAFE_CALL(cudaGetLastError());
1486 CuDevice::Instantiate().AccuProfile(__func__, tim);
1493 Real src_elem = src.
Mat()(r,c);
1494 this->Mat()(r,c) = src_elem * (src_elem >= 0.0 ? alpha.
Vec()(c) : beta.
Vec()(c));
1500 template<
typename Real>
1507 if (CuDevice::Instantiate().Enabled()) {
1513 cuda_diff_parametric_relu(dimGrid, dimBlock,
data_, diff.
data_, value.
data_,
1516 CU_SAFE_CALL(cudaGetLastError());
1518 CuDevice::Instantiate().AccuProfile(__func__, tim);
1525 Real value_elem = value.
Mat()(r,c);
1526 this->Mat()(r,c) = diff.
Mat()(r,c) *
1527 (value_elem >= 0.0 ? alpha.
Vec()(c) : beta.
Vec()(c));
1533 template<
typename Real>
1537 if (CuDevice::Instantiate().Enabled()) {
1539 dim3 dimGrid, dimBlock;
1540 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1541 &dimGrid, &dimBlock);
1542 cuda_sigmoid(dimGrid, dimBlock, this->
data_, src.
data_, this->Dim(),
1544 CU_SAFE_CALL(cudaGetLastError());
1546 CuDevice::Instantiate().AccuProfile(__func__, tim);
1550 Mat().Sigmoid(src.
Mat());
1554 template<
typename Real>
1558 if (CuDevice::Instantiate().Enabled()) {
1560 dim3 dimGrid, dimBlock;
1561 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1562 &dimGrid, &dimBlock);
1563 cuda_soft_hinge(dimGrid, dimBlock, this->
data_, src.
data_, this->Dim(),
1565 CU_SAFE_CALL(cudaGetLastError());
1567 CuDevice::Instantiate().AccuProfile(__func__, tim);
1571 Mat().SoftHinge(src.
Mat());
1575 template<
typename Real>
1577 int group_size = src.
NumCols() / this->NumCols();
1579 this->NumRows() == src.
NumRows());
1581 if (CuDevice::Instantiate().Enabled()) {
1583 if (power == Real(0) || power == Real(1) || power == Real(2)
1584 || power == std::numeric_limits<Real>::infinity()) {
1590 while (threads_per_group * 3 / 2 >= group_size) {
1591 threads_per_group >>= 1;
1593 if (group_size == 1) {
1594 threads_per_group = 1;
1596 dim3 dimBlock(threads_per_group,
CU1DBLOCK / threads_per_group);
1597 dim3 dimGrid(NumRows());
1598 cuda_group_spec_pnorm(dimGrid, dimBlock, this->
data_, src.
data_,
1599 this->Dim(), src.
Stride(), group_size, power);
1602 dim3 dimGrid(n_blocks(NumCols(),
CU2DBLOCK),
1604 cuda_group_pnorm(dimGrid, dimBlock, this->
data_, src.
data_, this->Dim(),
1605 src.
Stride(), group_size, power);
1607 CU_SAFE_CALL(cudaGetLastError());
1608 CuDevice::Instantiate().AccuProfile(__func__, tim);
1612 Mat().GroupPnorm(src.
Mat(), power);
1616 template<
typename Real>
1618 int group_size = src.
NumCols() / this->NumCols();
1620 this->NumRows() == src.
NumRows());
1622 if (CuDevice::Instantiate().Enabled()) {
1631 while (threads_per_group * 3 / 2 >= group_size) {
1632 threads_per_group >>= 1;
1634 if (group_size == 1) {
1635 threads_per_group = 1;
1637 dim3 dimBlock(threads_per_group,
CU1DBLOCK / threads_per_group);
1638 dim3 dimGrid(NumRows());
1639 cuda_group_max(dimGrid, dimBlock, this->
data_, src.
data_, this->Dim(),
1640 src.
Stride(), group_size);
1641 CU_SAFE_CALL(cudaGetLastError());
1642 CuDevice::Instantiate().AccuProfile(__func__, tim);
1646 Mat().GroupMax(src.
Mat());
1660 template<
typename Real>
1663 Real *tot_objf, Real* tot_weight) {
1665 typedef typename std::vector<MatrixElement<Real> >::const_iterator Iter;
1666 MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
1667 for (Iter iter = sv_labels.begin(); iter != sv_labels.end(); ++iter) {
1669 iter->column < num_cols && iter->column >= 0);
1674 if (CuDevice::Instantiate().Enabled()) {
1675 if (sv_labels.empty()) {
1681 void *addr = CuDevice::Instantiate().Malloc(sv_labels.size() *
sizeof(
MatrixElement<Real>));
1682 CU_SAFE_CALL(cudaMemcpyAsync(addr, sv_labels.data(), sv_labels.size() *
1684 cudaMemcpyHostToDevice,
1685 cudaStreamPerThread));
1691 sv_labels.size(), output.
Data(), output.
Dim(),
1692 this->Data(), this->Dim(), tmp.
Data());
1694 *tot_objf = tmp_cpu(0);
1695 *tot_weight = tmp_cpu(1);
1696 CuDevice::Instantiate().Free(addr);
1697 CuDevice::Instantiate().AccuProfile(__func__, tim);
1703 for(
int32 i = 0;
i<sv_labels.size();
i++) {
1704 int32 m = sv_labels[
i].row, label = sv_labels[
i].column;
1705 Real weight = sv_labels[
i].weight;
1707 Real this_prob = output(m, label);
1710 *tot_weight += weight;
1711 (*this)(m, label) += weight / this_prob;
1716 template<
typename Real>
1720 if (CuDevice::Instantiate().Enabled()) {
1724 cuda_softmax_reduce(dimGrid, dimBlock,
data_, src.
data_, Dim(), src.
Stride());
1725 CU_SAFE_CALL(cudaGetLastError());
1727 CuDevice::Instantiate().AccuProfile(__func__, tim);
1734 mat.
Row(r).ApplySoftMax();
1739 template<
typename Real>
1743 if (CuDevice::Instantiate().Enabled()) {
1747 cuda_log_softmax_reduce(dimGrid, dimBlock,
1749 CU_SAFE_CALL(cudaGetLastError());
1751 CuDevice::Instantiate().AccuProfile(__func__, tim);
1758 mat.
Row(r).ApplyLogSoftMax();
1763 template<
typename Real>
1768 if (CuDevice::Instantiate().Enabled()) {
1770 dim3 dimGrid, dimBlock;
1771 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1772 &dimGrid, &dimBlock);
1774 CU_SAFE_CALL(cudaGetLastError());
1776 CuDevice::Instantiate().AccuProfile(__func__, tim);
1780 Mat().DiffSigmoid(value.
Mat(), diff.
Mat());
1785 template<
typename Real>
1789 if (CuDevice::Instantiate().Enabled()) {
1791 dim3 dimGrid, dimBlock;
1792 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1793 &dimGrid, &dimBlock);
1795 cuda_tanh(dimGrid, dimBlock, this->
data_, src.
data_, this->Dim(), src.
Stride());
1796 CU_SAFE_CALL(cudaGetLastError());
1798 CuDevice::Instantiate().AccuProfile(__func__, tim);
1802 Mat().Tanh(src.
Mat());
1808 template<
typename Real>
1812 if (CuDevice::Instantiate().Enabled()) {
1814 dim3 dimGrid, dimBlock;
1815 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1816 &dimGrid, &dimBlock);
1818 CU_SAFE_CALL(cudaGetLastError());
1820 CuDevice::Instantiate().AccuProfile(__func__, tim);
1824 Mat().DiffTanh(value.
Mat(), diff.
Mat());
1828 template<
typename Real>
1831 if (CuDevice::Instantiate().Enabled()) {
1833 id->Resize(num_rows_);
1838 dim3 dimGrid(num_rows_);
1839 cuda_find_row_max_id(dimGrid, dimBlock,
data_, NULL, id->
Data(),
d);
1840 CU_SAFE_CALL(cudaGetLastError());
1843 CuDevice::Instantiate().AccuProfile(__func__, tim);
1848 id->Resize(num_rows_);
1851 MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
1855 const Real *row_data = Mat().RowData(r);
1857 if (max < row_data[c]) {
1862 id->Data()[r] = max_id;
1867 template<
typename Real>
1875 if (CuDevice::Instantiate().Enabled()) {
1880 dim3 dimGrid(num_rows_);
1881 cuda_diff_softmax(dimGrid, dimBlock,
data_, this->Dim(), value.
Data(),
1883 CU_SAFE_CALL(cudaGetLastError());
1885 CuDevice::Instantiate().AccuProfile(__func__, tim);
1902 template<
typename Real>
1907 this != &out_value);
1910 if (CuDevice::Instantiate().Enabled()) {
1915 dim3 dimGrid(num_rows_);
1916 cuda_diff_log_softmax(dimGrid, dimBlock, this->Dim(), out_value.
Data(),
1919 CU_SAFE_CALL(cudaGetLastError());
1921 CuDevice::Instantiate().AccuProfile(__func__, tim);
1925 if (
this == &out_deriv) {
1956 template<
typename Real>
1964 if (CuDevice::Instantiate().Enabled()) {
1968 cuda_diff_xent(dimGrid, dimBlock, tgt.
Data(),
data_,
1969 log_post_tgt->
data_, Dim());
1971 CuDevice::Instantiate().AccuProfile(__func__, tim);
1976 for(
int32 r = 0; r < num_rows; r++) {
1978 Real &value = Mat()(r, col_tgt);
1986 template<
typename Real>
1989 const int32 block_size = 64;
1991 bool have_gpu = CuDevice::Instantiate().Enabled();
1993 bool have_gpu =
false;
1995 if (this->NumRows() == 0) {
1998 if (inv_cholesky == NULL && this->NumRows() >= block_size * 2 && have_gpu) {
2006 if (this->NumRows() <= block_size || inv_cholesky == NULL || !have_gpu) {
2009 int32 dim = this->NumRows();
2016 this->CopyFromTp(C);
2017 if (inv_cholesky != NULL) {
2027 int32 tot_dim = this->NumRows();
2033 dim1 = block_size * std::max<int32>(1, tot_dim / (2 * block_size));
2035 int32 dim2 = tot_dim - dim1;
2037 this_12(*
this, 0, dim1, dim1, dim2),
2038 this_21(*
this, dim1, dim2, 0, dim1),
2039 this_22(*
this, dim1, dim2, dim1, dim2);
2041 inv_12(*inv_cholesky, 0, dim1, dim1, dim2),
2042 inv_21(*inv_cholesky, dim1, dim2, 0, dim1),
2043 inv_22(*inv_cholesky, dim1, dim2, dim1, dim2);
2087 this_11.Cholesky(&inv_11);
2097 this_12.AddMatMat(1.0, inv_11,
kTrans, inv_21,
kTrans, 0.0);
2099 this_21.CopyFromMat(inv_21);
2110 template<
typename Real>
2113 if (num_rows_ == 0)
return;
2115 if (CuDevice::Instantiate().Enabled()) {
2118 this->Cholesky(&inv_cholesky);
2120 this->SymAddMat2(1.0, inv_cholesky,
kTrans, 0.0);
2121 this->CopyLowerToUpper();
2122 CuDevice::Instantiate().AccuProfile(__func__, tim);
2131 this->Mat().CopyFromSp(temp_sp);
2136 template<
typename Real>
2140 diff.
AddMat(-1.0, other);
2141 return (diff.
FrobeniusNorm() <= tol * (*this).FrobeniusNorm());
2144 template<
typename Real>
2154 if (CuDevice::Instantiate().Enabled()) {
2167 const int kWarpSize = 32;
2168 dim3 dimBlock(kWarpSize,
CU1DBLOCK / kWarpSize);
2169 dim3 dimGrid(n_blocks(A.
NumCols(), kWarpSize),
2170 n_blocks(A.
NumRows(), kWarpSize));
2171 if (dimGrid.x * dimGrid.y > 256) {
2172 dimGrid.y = 256 / dimGrid.x;
2173 if (dimGrid.y == 0) {
2179 cuda_trace_mat_mat(dimGrid, dimBlock, A.
Data(), B.
Data(), A.
Dim(),
2182 cuda_trace_mat_mat_trans(dimGrid, dimBlock, A.
Data(), B.
Data(), A.
Dim(),
2185 CU_SAFE_CALL(cudaGetLastError());
2187 result = result_cpu.
Sum();
2188 CuDevice::Instantiate().AccuProfile(__func__, tim);
2206 template<
typename Real>
2212 KALDI_ASSERT(A.size() == B.size() && B.size() == C.size());
2213 int32 size = A.size();
2215 if (size == 0)
return;
2218 for (
int32 i = 0;
i + 1 < size;
i++) {
2243 if (CuDevice::Instantiate().Enabled()) {
2245 Real **device_abc_array =
2246 static_cast<Real**
>(CuDevice::Instantiate().Malloc(3 * size *
sizeof(Real*)));
2247 const Real **device_a_array =
const_cast<const Real**
>(device_abc_array);
2248 const Real **device_b_array =
const_cast<const Real**
>(device_abc_array) + size;
2249 Real **device_c_array = device_abc_array + 2 * size;
2250 const Real **host_abc_array =
new const Real*[3*size];
2251 const Real **host_a_array = host_abc_array;
2252 const Real **host_b_array = host_abc_array + size;
2253 const Real **host_c_array = host_abc_array + 2 * size;
2256 host_a_array[
i] = A[
i]->
data_;
2257 host_b_array[
i] = B[
i]->
data_;
2258 host_c_array[
i] = C[
i]->data_;
2261 CU_SAFE_CALL(cudaMemcpyAsync(device_abc_array, host_abc_array,
2262 3*size*
sizeof(Real*), cudaMemcpyHostToDevice,
2263 cudaStreamPerThread));
2265 CUBLAS_SAFE_CALL(cublas_gemmBatched(GetCublasHandle(),
2266 (transB==
kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
2267 (transA==
kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
2268 m, n, k, alpha, device_b_array,
2269 B[0]->Stride(), device_a_array,
2270 A[0]->Stride(), beta, device_c_array,
2271 C[0]->Stride(), size));
2273 CuDevice::Instantiate().Free(device_abc_array);
2274 delete[] host_abc_array;
2276 CuDevice::Instantiate().AccuProfile(__func__, tim);
2281 C[
i]->Mat().AddMatMat(alpha, A[
i]->Mat(), transA, B[
i]->Mat(), transB, beta);
2300 template<
typename Real>
2303 if (CuDevice::Instantiate().Enabled()) {
2305 if (v.
Dim() == num_rows_*num_cols_) {
2306 if (stride_ == num_cols_) {
2307 const Real* v_data = v.
Data();
2309 cudaMemcpyAsync(
data_, v_data,
sizeof(Real)*num_rows_*num_cols_,
2310 cudaMemcpyDeviceToDevice, cudaStreamPerThread));
2313 cudaMemcpy2DAsync(
data_, stride_ *
sizeof(Real), v.
Data(),
2314 num_cols_*
sizeof(Real), num_cols_*
sizeof(Real),
2315 num_rows_, cudaMemcpyDeviceToDevice,
2316 cudaStreamPerThread));
2318 }
else if (v.
Dim() == num_cols_) {
2319 dim3 dimGrid, dimBlock;
2320 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2321 &dimGrid, &dimBlock);
2322 cuda_copy_rows_from_vec(dimGrid, dimBlock,
data_, this->Dim(), v.
Data());
2323 CU_SAFE_CALL(cudaGetLastError());
2327 CuDevice::Instantiate().AccuProfile(__func__, tim);
2331 Mat().CopyRowsFromVec(v.
Vec());
2335 template<
typename Real>
2338 if (CuDevice::Instantiate().Enabled()) {
2340 if (v.
Dim() == num_rows_*num_cols_) {
2341 if (stride_ == num_cols_) {
2342 const Real* v_data = v.
Data();
2343 CU_SAFE_CALL(cudaMemcpyAsync(
data_, v_data,
2344 sizeof(Real)*num_rows_*num_cols_,
2345 cudaMemcpyHostToDevice,
2346 cudaStreamPerThread));
2348 const Real *v_data = v.
Data();
2350 Real *row_data = RowData(r);
2351 CU_SAFE_CALL(cudaMemcpyAsync(row_data, v_data,
sizeof(Real)*num_cols_,
2352 cudaMemcpyHostToDevice,
2353 cudaStreamPerThread));
2354 v_data += num_cols_;
2357 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
2358 }
else if (v.
Dim() == num_cols_) {
2359 dim3 dimGrid, dimBlock;
2360 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2361 &dimGrid, &dimBlock);
2362 cuda_copy_rows_from_vec(dimGrid, dimBlock, this->
data_, this->Dim(), v.
Data());
2363 CU_SAFE_CALL(cudaGetLastError());
2367 CuDevice::Instantiate().AccuProfile(__func__, tim);
2371 Mat().CopyRowsFromVec(v);
2375 template<
typename Real>
2378 if (CuDevice::Instantiate().Enabled()) {
2380 if (rv.
Dim() == num_rows_ * num_cols_) {
2384 MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ };
2385 const int32 warpSize = 32;
2386 dim3 dimBlock(warpSize,
CU1DBLOCK / warpSize);
2387 dim3 dimGrid(n_blocks(rv_dim.
cols, warpSize),
2388 n_blocks(rv_dim.
rows, warpSize));
2389 cuda_copy_from_mat_trans(dimGrid, dimBlock,
data_, rv.
Data(), Dim(),
2391 CU_SAFE_CALL(cudaGetLastError());
2392 }
else if (rv.
Dim() == num_rows_) {
2395 const int32 warpSize = 32;
2396 dim3 dimBlock(warpSize,
CU1DBLOCK / warpSize);
2397 dim3 dimGrid(n_blocks(num_cols_, dimBlock.x),
2398 n_blocks(num_rows_, dimBlock.y));
2399 cuda_copy_cols_from_vec(dimGrid, dimBlock, Data(), Dim(), rv.
Data());
2400 CU_SAFE_CALL(cudaGetLastError());
2404 CuDevice::Instantiate().AccuProfile(__func__, tim);
2408 Mat().CopyColsFromVec(rv.
Vec());
2413 template<
typename Real>
2418 static_cast<UnsignedMatrixIndexT>(num_cols_));
2420 if (CuDevice::Instantiate().Enabled()) {
2422 cublas_copy(GetCublasHandle(),
2424 this->
data_ + col, this->stride_);
2425 CU_SAFE_CALL(cudaGetLastError());
2426 CuDevice::Instantiate().AccuProfile(__func__, tim);
2430 Mat().CopyColFromVec(v.
Vec(), col);
2434 template<
typename Real>
2438 if (CuDevice::Instantiate().Enabled()) {
2440 dim3 dimGrid, dimBlock;
2441 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2442 &dimGrid, &dimBlock);
2443 cuda_heaviside(dimGrid, dimBlock, this->
data_, src.
data_, this->Dim(),
2445 CU_SAFE_CALL(cudaGetLastError());
2447 CuDevice::Instantiate().AccuProfile(__func__, tim);
2451 Mat().Heaviside(src.
Mat());
2455 template<
typename Real>
2459 if (CuDevice::Instantiate().Enabled()) {
2461 dim3 dimGrid, dimBlock;
2462 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2463 &dimGrid, &dimBlock);
2464 cuda_exp(dimGrid, dimBlock, this->
data_, src.
data_, this->Dim(),
2466 CU_SAFE_CALL(cudaGetLastError());
2468 CuDevice::Instantiate().AccuProfile(__func__, tim);
2472 Mat().Exp(src.
Mat());
2476 template<
typename Real>
2480 if (CuDevice::Instantiate().Enabled()) {
2481 if (num_rows_ == 0)
return;
2483 dim3 dimGrid, dimBlock;
2484 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2485 &dimGrid, &dimBlock);
2487 cuda_log(dimGrid, dimBlock, this->
data_, src.
data_, this->Dim(),
2489 CU_SAFE_CALL(cudaGetLastError());
2491 CuDevice::Instantiate().AccuProfile(__func__, tim);
2495 Mat().Log(src.
Mat());
2499 template<
typename Real>
2503 if (CuDevice::Instantiate().Enabled()) {
2505 dim3 dimGrid, dimBlock;
2506 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2507 &dimGrid, &dimBlock);
2508 cuda_pow(dimGrid, dimBlock, this->
data_, src.
data_, power, this->Dim(),
2510 CU_SAFE_CALL(cudaGetLastError());
2512 CuDevice::Instantiate().AccuProfile(__func__, tim);
2516 Mat().Pow(src.
Mat(), power);
2520 template<
typename Real>
2524 if (CuDevice::Instantiate().Enabled()) {
2526 dim3 dimGrid, dimBlock;
2527 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2528 &dimGrid, &dimBlock);
2529 cuda_pow_abs(dimGrid, dimBlock, this->
data_, src.
data_, power, include_sign,
2530 this->Dim(), src.
Stride());
2531 CU_SAFE_CALL(cudaGetLastError());
2532 CuDevice::Instantiate().AccuProfile(__func__, tim);
2536 Mat().PowAbs(src.
Mat(), power, include_sign);
2540 template<
typename Real>
2545 if (CuDevice::Instantiate().Enabled()) {
2547 dim3 dimGrid, dimBlock;
2548 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2549 &dimGrid, &dimBlock);
2550 cuda_exp_limited(dimGrid, dimBlock, this->
data_, src.
data_, lower_limit, upper_limit,
2551 this->Dim(), src.
Stride());
2552 CU_SAFE_CALL(cudaGetLastError());
2553 CuDevice::Instantiate().AccuProfile(__func__, tim);
2557 Mat().ExpLimited(src.
Mat(), lower_limit, upper_limit);
2562 template<
typename Real>
2566 if (CuDevice::Instantiate().Enabled()) {
2568 dim3 dimGrid, dimBlock;
2569 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2570 &dimGrid, &dimBlock);
2571 cuda_exp_special(dimGrid, dimBlock, this->
data_, src.
data_, Dim(), src.
Stride());
2572 CU_SAFE_CALL(cudaGetLastError());
2573 CuDevice::Instantiate().AccuProfile(__func__, tim);
2577 Mat().ExpSpecial(src.
Mat());
2581 template<
typename Real>
2585 if (CuDevice::Instantiate().Enabled()) {
2587 dim3 dimGrid, dimBlock;
2588 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2589 &dimGrid, &dimBlock);
2590 cuda_floor(dimGrid, dimBlock,
data_, src.
data_, floor_val, this->Dim(), src.
Stride());
2591 CU_SAFE_CALL(cudaGetLastError());
2592 CuDevice::Instantiate().AccuProfile(__func__, tim);
2596 Mat().Floor(src.
Mat(), floor_val);
2600 template<
typename Real>
2604 if (CuDevice::Instantiate().Enabled()) {
2606 dim3 dimGrid, dimBlock;
2607 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2608 &dimGrid, &dimBlock);
2609 cuda_ceiling(dimGrid, dimBlock, this->
data_, src.
data_, ceiling_val, this->Dim(), src.
Stride());
2610 CU_SAFE_CALL(cudaGetLastError());
2611 CuDevice::Instantiate().AccuProfile(__func__, tim);
2615 Mat().Ceiling(src.
Mat(), ceiling_val);
2620 template<
typename Real>
2624 if (CuDevice::Instantiate().Enabled()) {
2627 CU_SAFE_CALL(cudaMemcpyAsync(
data_, mat.
Data(),
sizeof(Real)*dim_,
2628 cudaMemcpyDeviceToHost, cudaStreamPerThread));
2631 Real* vec_data =
data_;
2633 CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.
RowData(r),
2634 sizeof(Real) * mat.
NumCols(), cudaMemcpyDeviceToHost,
2635 cudaStreamPerThread));
2639 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
2640 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::CopyRowsFromMat", tim);
2644 CopyRowsFromMat(mat.
Mat());
2655 template<
typename Real>
2659 if (CuDevice::Instantiate().Enabled()) {
2663 dim3 dimGrid, dimBlock;
2664 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2665 &dimGrid, &dimBlock);
2666 cuda_copy_cols(dimGrid, dimBlock,
data_, src.
Data(), indices.
Data(), Dim(), src.
Stride());
2667 CU_SAFE_CALL(cudaGetLastError());
2668 CuDevice::Instantiate().AccuProfile(__func__, tim);
2672 Mat().CopyCols(src.
Mat(), indices.
Data());
2677 template<
typename Real>
2681 if (CuDevice::Instantiate().Enabled()) {
2686 dim3 dimGrid, dimBlock;
2687 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2688 &dimGrid, &dimBlock);
2689 cuda_copy_rows(dimGrid, dimBlock,
data_, src.
Data(), indices.
Data(),
2691 CU_SAFE_CALL(cudaGetLastError());
2692 CuDevice::Instantiate().AccuProfile(__func__, tim);
2696 Mat().CopyRows(src.
Mat(), indices.
Data());
2700 template<
typename Real>
2704 if (CuDevice::Instantiate().Enabled()) {
2708 dim3 dimGrid, dimBlock;
2709 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2710 &dimGrid, &dimBlock);
2711 cuda_add_cols(dimGrid, dimBlock,
data_, src.
Data(), indices.
Data(),
2713 CU_SAFE_CALL(cudaGetLastError());
2714 CuDevice::Instantiate().AccuProfile(__func__, tim);
2718 Mat().AddCols(src.
Mat(), indices.
Data());
2722 template<
typename Real>
2724 if (NumRows() == 0)
return;
2726 if (CuDevice::Instantiate().Enabled()) {
2730 dim3 dimGrid(n_blocks(num_cols_,
CU2DBLOCK),
2732 cuda_copy_rows(dimGrid, dimBlock,
data_, src.
Data(), Dim());
2733 CU_SAFE_CALL(cudaGetLastError());
2734 CuDevice::Instantiate().AccuProfile(__func__, tim);
2738 Mat().CopyRows(src.
Data());
2743 template<
typename Real>
2745 if (NumRows() == 0)
return;
2747 if (CuDevice::Instantiate().Enabled()) {
2752 dim3 dimGrid(n_blocks(num_cols_,
CU2DBLOCK),
2754 cuda_copy_to_rows(dimGrid, dimBlock, dst.
Data(),
data_, Dim());
2755 CU_SAFE_CALL(cudaGetLastError());
2756 CuDevice::Instantiate().AccuProfile(__func__, tim);
2760 Mat().CopyToRows(dst.
Data());
2765 template<
typename Real>
2769 if (NumRows() == 0)
return;
2771 if (CuDevice::Instantiate().Enabled()) {
2775 dim3 dimGrid, dimBlock;
2776 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2777 &dimGrid, &dimBlock);
2778 cuda_add_rows(dimGrid, dimBlock, alpha,
2780 CU_SAFE_CALL(cudaGetLastError());
2781 CuDevice::Instantiate().AccuProfile(__func__, tim);
2785 Mat().AddRows(alpha, src.
Mat(), indexes.
Data());
2789 template<
typename Real>
2792 if (NumRows() == 0)
return;
2795 if (CuDevice::Instantiate().Enabled()) {
2798 dim3 dimGrid, dimBlock;
2799 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2800 &dimGrid, &dimBlock);
2801 cuda_mul_rows(dimGrid, dimBlock,
2803 CU_SAFE_CALL(cudaGetLastError());
2804 CuDevice::Instantiate().AccuProfile(__func__, tim);
2810 int32 num_rows = NumRows();
2812 for (
int32 r = 0; r < num_rows; r++) {
2813 int32 src_r = index_ptr[r];
2817 src_row(src_mat, src_r);
2818 this_row.MulElements(src_row);
2825 template<
typename Real>
2827 if (NumRows() == 0)
return;
2829 if (CuDevice::Instantiate().Enabled()) {
2832 dim3 dimGrid, dimBlock;
2833 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2834 &dimGrid, &dimBlock);
2835 cuda_add_rows(dimGrid, dimBlock, alpha,
data_, src.
Data(), Dim());
2836 CU_SAFE_CALL(cudaGetLastError());
2837 CuDevice::Instantiate().AccuProfile(__func__, tim);
2841 Mat().AddRows(alpha, src.
Data());
2846 template<
typename Real>
2849 if (NumRows() == 0)
return;
2851 if (CuDevice::Instantiate().Enabled()) {
2854 dim3 dimGrid, dimBlock;
2855 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2856 &dimGrid, &dimBlock);
2857 cuda_add_to_rows(dimGrid, dimBlock, alpha, dst.
Data(),
data_, Dim());
2858 CU_SAFE_CALL(cudaGetLastError());
2859 CuDevice::Instantiate().AccuProfile(__func__, tim);
2863 Mat().AddToRows(alpha, dst.
Data());
2868 template<
typename Real>
2872 if (NumRows() == 0)
return;
2874 if (CuDevice::Instantiate().Enabled()) {
2878 dim3 dimGrid, dimBlock;
2879 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2880 &dimGrid, &dimBlock);
2881 cuda_add_to_rows(dimGrid, dimBlock, alpha, dst->
Data(),
data_, indexes.
Data(), Dim(), dst->
Stride());
2882 CU_SAFE_CALL(cudaGetLastError());
2883 CuDevice::Instantiate().AccuProfile(__func__, tim);
2887 Mat().AddToRows(alpha, indexes.
Data(), &(dst->
Mat()));
2892 template<
typename Real>
2897 if (NumRows() == 0)
return;
2899 if (CuDevice::Instantiate().Enabled()) {
2901 dim3 dimGrid, dimBlock;
2902 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2903 &dimGrid, &dimBlock);
2904 cuda_sum_column_ranges(dimGrid, dimBlock,
data_, Dim(), src.
Data(),
2906 CU_SAFE_CALL(cudaGetLastError());
2907 CuDevice::Instantiate().AccuProfile(__func__, tim);
2911 int32 num_rows = this->num_rows_, num_cols = this->num_cols_,
2912 this_stride = this->stride_, src_stride = src.
stride_;
2913 Real *data = this->
data_;
2914 const Real *src_data = src.
data_;
2916 for (
int32 row = 0; row < num_rows; row++) {
2917 for (
int32 col = 0; col < num_cols; col++) {
2919 end_col = indices_data[col].
second;
2921 for (
int32 src_col = start_col; src_col < end_col; src_col++)
2922 sum += src_data[row * src_stride + src_col];
2923 data[row * this_stride + col] = sum;
2930 template<
typename Real>
2935 if (NumRows() == 0)
return;
2937 if (CuDevice::Instantiate().Enabled()) {
2939 dim3 dimGrid, dimBlock;
2940 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2941 &dimGrid, &dimBlock);
2942 cuda_add_row_ranges(dimGrid, dimBlock,
2944 CU_SAFE_CALL(cudaGetLastError());
2945 CuDevice::Instantiate().AccuProfile(__func__, tim);
2949 int32 num_rows = this->num_rows_, num_cols = this->num_cols_,
2950 this_stride = this->stride_, src_stride = src.
stride_;
2951 Real *data = this->
data_;
2952 const Real *src_data = src.
data_;
2954 for (
int32 row = 0; row < num_rows; row++) {
2956 end_row = indexes_data[row].
second;
2957 for (
int32 col = 0; col < num_cols; col++) {
2959 for (
int32 src_row = start_row; src_row < end_row; src_row++)
2960 sum += src_data[src_row * src_stride + col];
2961 data[row * this_stride + col] += sum;
2968 template<
typename Real>
2971 if (num_rows_ == 0)
return;
2973 if (CuDevice::Instantiate().Enabled()) {
2976 int32 dim = num_rows_;
2979 cuda_copy_low_upp(dimGrid, dimBlock,
data_, Dim());
2980 CU_SAFE_CALL(cudaGetLastError());
2981 CuDevice::Instantiate().AccuProfile(__func__, tim);
2985 Mat().CopyLowerToUpper();
2989 template<
typename Real>
2992 if (num_rows_ == 0)
return;
2994 if (CuDevice::Instantiate().Enabled()) {
2996 int32 dim = this->num_rows_;
3000 cuda_copy_upp_low(dimGrid, dimBlock,
data_, Dim());
3001 CU_SAFE_CALL(cudaGetLastError());
3002 CuDevice::Instantiate().AccuProfile(__func__, tim);
3006 Mat().CopyUpperToLower();
3011 template<
typename Real>
3014 if (CuDevice::Instantiate().Enabled()) {
3020 Real ans = col_sum.
Sum();
3022 CuDevice::Instantiate().AccuProfile(__func__, tim);
3032 template<
typename Real>
3035 if (CuDevice::Instantiate().Enabled()) {
3041 Real ans = col_max.
Max();
3043 CuDevice::Instantiate().AccuProfile(__func__, tim);
3053 template<
typename Real>
3056 if (CuDevice::Instantiate().Enabled()) {
3062 Real ans = col_min.
Min();
3064 CuDevice::Instantiate().AccuProfile(__func__, tim);
3074 template<
typename Real>
3077 if (CuDevice::Instantiate().Enabled()) {
3079 if (check_square)
KALDI_ASSERT(this->num_rows_ == this->num_cols_);
3080 MatrixIndexT dim = std::min(this->num_rows_, this->num_cols_);
3084 cuda_vec_sum(dimGrid, dimBlock,
data_, tmp.
Data(), dim, Stride() + 1);
3085 CU_SAFE_CALL(cudaGetLastError());
3086 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::Sum", tim);
3091 return Mat().Trace(check_square);
3095 template <
typename Real>
3098 switch (src.
Type()) {
3101 this->CopyFromMat(src_full_mat, trans);
3107 this->CopyFromMat(mat, trans);
3113 if (CuDevice::Instantiate().Enabled()) {
3125 KALDI_ERR <<
"Invalid GeneralMatrix type.";
3131 template<
typename Real>
3133 if (num_rows_ == 0)
return;
3135 if (CuDevice::Instantiate().Enabled()) {
3145 template<
typename Real>
3147 if (num_rows_ == 0)
return;
3149 if (CuDevice::Instantiate().Enabled()) {
3155 Mat().SetRandUniform();
3159 template<
typename Real>
3166 template<
typename Real>
3167 template<
typename OtherReal>
3173 this->CopyFromMat(M);
3176 this->CopyFromMat(M,
kTrans);
3189 template<
typename Real>
3191 if (this->num_rows_ == 0)
3204 template<
typename Real>
3212 A_row_stride = A.
Stride(), A_col_stride = 1,
3223 KALDI_ASSERT(NumRows() == A_num_rows && NumCols() == B_num_cols);
3227 if (num_rows_ == 0)
return;
3229 if (CuDevice::Instantiate().Enabled()) {
3235 dim3 dimGrid(n_blocks(num_rows_,
CU2DBLOCK),
3240 cuda_add_mat_blockmat(dimGrid, dimBlock,
data_, this_dim, A.
Data(),
3241 A_num_rows, A_num_cols, A_row_stride, A_col_stride,
3242 B.CuData(), B_num_blocks, alpha, beta,
3243 (transB ==
kTrans ? 1 : 0));
3245 CU_SAFE_CALL(cudaGetLastError());
3247 CuDevice::Instantiate().AccuProfile(__func__, tim);
3253 int32 row_offset = 0, col_offset = 0;
3254 for (
int32 b = 0; b < B_num_blocks; b++) {
3257 this_num_cols = this_block.
NumCols();
3260 col_offset, this_num_cols);
3263 row_offset, this_num_rows) :
3266 this_part.
AddMatMat(alpha, A_part, transA, this_block, transB, beta);
3267 row_offset += this_num_rows;
3268 col_offset += this_num_cols;
3272 KALDI_ASSERT(row_offset == B_num_rows && col_offset == B_num_cols);
3276 template<
typename Real>
3280 MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
3281 for (
int32 i = 0;
i < input.size(); ++
i) {
3283 input[
i].column < num_cols && input[
i].column >= 0);
3286 if (CuDevice::Instantiate().Enabled()) {
3288 CU_SAFE_CALL(cudaMemcpyAsync(addr, input.data(),
3290 cudaMemcpyHostToDevice, cudaStreamPerThread));
3294 int dimGrid(n_blocks(input.size(),
CU1DBLOCK));
3296 cuda_matrix_add_elements(dimGrid, dimBlock, this->
data_, this->Dim(),
3298 CU_SAFE_CALL(cudaGetLastError());
3299 CuDevice::Instantiate().Free(addr);
3300 CuDevice::Instantiate().AccuProfile(__func__, tim);
3304 for (
int32 i = 0;
i < input.size();
i++) {
3305 (*this)(input[
i].row, input[
i].column) += alpha * input[
i].weight;
3310 template<
typename Real>
3312 const Real *input) {
3313 if (indexes.
Dim() == 0)
return;
3317 if (CuDevice::Instantiate().Enabled()) {
3320 CU_SAFE_CALL(cudaMemcpyAsync(tmp_vec.Data(), input,
3321 indexes.
Dim() *
sizeof(Real),
3322 cudaMemcpyHostToDevice, cudaStreamPerThread));
3326 cuda_matrix_add_indexed_values(dimGrid, dimBlock, this->Dim(), alpha,
3327 indexes.
Data(), tmp_vec.Data(), indexes.
Dim(), this->
data_);
3328 CU_SAFE_CALL(cudaGetLastError());
3329 CuDevice::Instantiate().AccuProfile(__func__, tim);
3333 MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
3337 index[
i].second < num_cols && index[
i].second >= 0);
3343 template<
typename Real>
3347 if (CuDevice::Instantiate().Enabled()) {
3351 dim3 dimGrid(n_blocks(NumRows(),
CU1DBLOCK));
3353 cuda_matrix_add_to_elements(dimGrid, dimBlock, alpha,
data_, Dim(), elements.
Data());
3354 CU_SAFE_CALL(cudaGetLastError());
3355 CuDevice::Instantiate().AccuProfile(__func__, tim);
3360 const int32* row_to_col = elements.
Data();
3363 if (row_to_col[r] >= 0)
3364 this_mat(r, row_to_col[r]) += alpha;
3369 template<
typename Real>
3371 Real *output)
const {
3373 MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
3374 for (
int32 i = 0;
i < indices.size(); ++
i) {
3375 KALDI_ASSERT(indices[
i].first < num_rows && indices[
i].first >= 0 &&
3376 indices[
i].second < num_cols && indices[
i].second >= 0);
3378 if (indices.size() == 0)
return;
3382 if (CuDevice::Instantiate().Enabled()) {
3384 Lookup(cuda_indices, output);
3388 for (
int32 i = 0;
i < indices.size();
i++) {
3389 output[
i] = (*this)(indices[
i].first, indices[
i].second);
3394 template<
typename Real>
3396 Real *output)
const {
3397 int32 num_elements = indices.
Dim();
3398 if (num_elements == 0)
return;
3402 if (CuDevice::Instantiate().Enabled()) {
3406 dim3 dimGrid(n_blocks(num_elements,
CU1DBLOCK), 1);
3408 cuda_matrix_lookup(dimGrid, dimBlock, this->
data_, this->Dim(),
3409 indices.
Data(), num_elements, cuda_output.
Data());
3410 CU_SAFE_CALL(cudaGetLastError());
3413 CuDevice::Instantiate().AccuProfile(__func__, tim);
3417 MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
3419 for (
int32 i = 0;
i < num_elements;
i++) {
3421 index[
i].second < num_cols && index[
i].second >= 0);
3428 template<
typename Real>
3437 if (CuDevice::Instantiate().Enabled()) {
3439 dim3 dimGrid, dimBlock;
3440 GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
3441 &dimGrid, &dimBlock);
3442 cuda_equal_element_mask(dimGrid, dimBlock, this->
data_, mat.
Data(),
3445 CU_SAFE_CALL(cudaGetLastError());
3447 CuDevice::Instantiate().AccuProfile(__func__, tim);
3451 for (
int32 r = 0; r < NumRows(); r++) {
3452 for (
int32 c = 0; c < NumCols(); c++) {
3453 (*mask)(r,c) = ((*
this)(r,c) == mat(r,c) ? 1.0 : 0.0);
3463 template<
typename Real>
3464 std::ostream &operator << (std::ostream &out, const CuMatrixBase<Real> &mat) {
3472 std::ostream &operator << (std::ostream &out, const CuMatrixBase<float> &mat);
3474 std::ostream &operator << (std::ostream &out, const CuMatrixBase<double> &mat);
const MatrixBase< Real > & Mat() const
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
MatrixIndexT Stride() const
MatrixIndexT NumRows() const
This class provides a way for switching between double and float types.
Packed symetric matrix class.
void Write(std::ostream &out, bool binary) const
write to stream.
This class is a wrapper that enables you to store a matrix in one of three forms: either as a Matrix<...
const Real * CsrVal() const
Returns pointer to the data array of length nnz_ that holds all nonzero values in zero-based CSR form...
void CopyToMat(CuMatrixBase< OtherReal > *dest, MatrixTransposeType trans=kNoTrans) const
void CopyFromTp(const CuTpMatrix< Real > &other)
const CuSubVector< Real > Row(MatrixIndexT i) const
MatrixIndexT stride_
< Number of rows
void RandUniform(CuMatrixBase< Real > *tgt)
Fill with uniform [0..1] floats,.
void CopyToMat(MatrixBase< OtherReal > *dst, MatrixTransposeType trans=kNoTrans) const
void GetMatrix(Matrix< BaseFloat > *mat) const
Outputs the contents as a matrix.
MatrixIndexT NumRows() const
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Base class which provides matrix operations not involving resizing or allocation. ...
const Real * Data() const
Gives pointer to raw data (const).
Structure containing size of the matrix plus stride.
Real * data_
GPU data pointer (or regular data pointer if CUDA is not compiled in or we have no GPU)...
Real * data_
data memory area
void AddMat(const Real alpha, const MatrixBase< Real > &M, MatrixTransposeType transA=kNoTrans)
*this += alpha * M [or M^T]
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
The class CuBlockMatrix holds a vector of objects of type CuMatrix, say, M_1, M_2, .
void AddMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType trans=kNoTrans)
*this += alpha * A
A class for storing matrices.
const T * Data() const
Get raw pointer.
const Matrix< BaseFloat > & GetFullMatrix() const
Returns the contents as a Matrix<BaseFloat>.
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
Real * data_
GPU data pointer (or regular matrix data pointer,.
void Swap(Matrix< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
void CopyFromMat(const MatrixBase< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Copy given matrix. (no resize is done).
MatrixIndexT NumRows() const
uint32 UnsignedMatrixIndexT
template void AddMatMatBatched(const double alpha, std::vector< CuSubMatrix< double > * > &C, const std::vector< CuSubMatrix< double > * > &A, MatrixTransposeType transA, const std::vector< CuSubMatrix< double > * > &B, MatrixTransposeType transB, const double beta)
MatrixIndexT NumRows() const
void AddDiagMatMat(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const CuMatrixBase< Real > &N, MatrixTransposeType transN, Real beta=1.0)
Add the diagonal of a matrix product: *this = diag(M N), assuming the "trans" arguments are both kNoT...
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
void CopyToMat(MatrixBase< OtherReal > *other, MatrixTransposeType t=kNoTrans) const
Copy to matrix. It must already have the correct size.
const SpMatrix< Real > & Mat() const
void RandGaussian(CuMatrixBase< Real > *tgt)
Fill with Normal random numbers,.
void Read(std::istream &in, bool binary, bool add=false)
read from stream.
void Cholesky(const SpMatrix< Real > &orig)
MatrixIndexT Stride() const
Stride (distance in memory between each row). Will be >= NumCols.
void CopyFromVec(const CuVectorBase< Real > &src)
Copy functions; these will crash if the dimension do not match.
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
const int * CsrColIdx() const
Returns pointer to the integer array of length nnz_ that contains the column indices of the correspon...
GeneralMatrixType Type() const
Returns the type of the matrix: kSparseMatrix, kCompressedMatrix or kFullMatrix.
void AddColSumMat(Real alpha, const CuMatrixBase< Real > &mat, Real beta=1.0)
Sum the columns of the matrix, add to vector.
void Swap(Matrix< Real > *mat)
MatrixIndexT num_rows_
< Number of columns
const SparseMatrix< Real > & Smat() const
void MulElements(const CuMatrixBase< Real > &A)
Multiply two matrices elementwise: C = C .* A.
void CopyFromBlock(const CuBlockMatrix< Real > &B, MatrixTransposeType trans=kNoTrans)
void AddTp2(const Real alpha, const TpMatrix< Real > &T, MatrixTransposeType transM, const Real beta=0.0)
The following function does: this <– beta*this + alpha * T * T^T.
void SymAddMat2(const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transA, Real beta)
*this = beta * *this + alpha * M M^T, for symmetric matrices.
void Resize(MatrixIndexT dim, MatrixResizeType t=kSetZero)
Allocate the memory.
#define KALDI_MEMALIGN_FREE(x)
MatrixIndexT NumElements() const
Packed symetric matrix class.
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C.
MatrixIndexT num_cols_
these attributes store the real matrix size as it is stored in memory including memalignment ...
void Cholesky(CuMatrixBase< Real > *inv_cholesky=NULL)
This function does sets *this to the Cholesky factor of *this (i.e.
This class is used for a piece of a CuMatrix.
Real * Data()
Returns a pointer to the start of the vector's data.
void CopyToHost(T *dst) const
Version of the above function that copies contents to a host array (i.e.
MatrixIndexT Dim() const
Returns the dimension of the vector.
Real Sum() const
Returns sum of the elements.
void CopyFromMat(const CuMatrixBase< Real > &orig, SpCopyType copy_type=kTakeLower)
Real Max() const
Returns the maximum value of any element, or -infinity for the empty vector.
const Real * Data() const
Return data pointer (const).
const int * CsrRowPtr() const
Returns pointer to the integer array of length NumRows()+1 that holds indices of the first nonzero el...
Matrix for CUDA computing.
MatrixIndexT NumCols() const
MatrixIndexT NumBlocks() const
void DiffLogSoftmaxPerRow(const CuMatrixBase< Real > &out_value, const CuMatrixBase< Real > &out_deriv)
Differentiate backward through the log softmax function.
A class representing a vector.
void InvertElements()
Invert all elements.
const VectorBase< Real > & Vec() const
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
void CopyFromTp(const CuTpMatrix< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Real * Data()
Returns a pointer to the start of the vector's data.
Real FrobeniusNorm() const
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
const SparseMatrix< BaseFloat > & GetSparseMatrix() const
Returns the contents as a SparseMatrix.
MatrixIndexT NumRows() const
Dimensions.
Provides a vector abstraction class.
MatrixIndexT Dim() const
Return the vector dimension.
MatrixIndexT NumCols() const
void SetZero()
Set vector to all zeros.
void MulRowsVec(const CuVectorBase< Real > &scale)
scale i'th row by scale[i]
Sub-matrix representation.
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
const TpMatrix< Real > & Mat() const
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
void Resize(MatrixIndexT rows, MatrixIndexT cols, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Allocate the memory.
Real Min() const
Returns the minimum value of any element, or +infinity for the empty vector.
MatrixIndexT NumCols() const
const CuSubMatrix< Real > Block(MatrixIndexT b) const
template double TraceMatMat(const CuMatrixBase< double > &A, const CuMatrixBase< double > &B, MatrixTransposeType trans)
MatrixIndexT Dim() const
Dimensions.
Vector for CUDA computing.
void AddDiagVecMat(const Real alpha, const CuVectorBase< Real > &v, const CuMatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T].
const Real * RowData(MatrixIndexT r) const
Get raw row pointer (const).