25 #include <cuda_runtime_api.h> 26 #include <cublas_v2.h> 46 template<
typename Real>
53 if (CuDevice::Instantiate().Enabled()) {
55 CUBLAS_SAFE_CALL(cublas_dot(GetCublasHandle(), a.
Dim(), a.
Data(), 1, b.
Data(),
57 CuDevice::Instantiate().AccuProfile(__func__, tim);
72 template<
typename Real,
typename OtherReal>
82 template<
typename Real>
102 template<
typename Real>
107 if (CuDevice::Instantiate().Enabled()) {
109 cublas_copy(GetCublasHandle(),
111 CU_SAFE_CALL(cudaGetLastError());
112 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::CopyColFromMat", tim);
116 Vec().CopyColFromMat(mat.
Mat(),col);
126 if (CuDevice::Instantiate().Enabled()) {
131 cuda_copy_col_from_mat_df(dimGrid, dimBlock,
data_, col, mat.
Data(), mat.
Dim(), dim_);
132 CU_SAFE_CALL(cudaGetLastError());
133 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::CopyColFromMat", tim);
137 Vec().CopyColFromMat(mat.
Mat(), col);
148 if (CuDevice::Instantiate().Enabled()) {
153 cuda_copy_col_from_mat_fd(dimGrid, dimBlock,
data_, col, mat.
Data(), mat.
Dim(), dim_);
154 CU_SAFE_CALL(cudaGetLastError());
155 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::CopyColFromMat", tim);
159 Vec().CopyColFromMat(mat.
Mat(), col);
163 template<
typename Real>
167 if (CuDevice::Instantiate().Enabled()) {
168 if (dim_ == 0)
return;
172 cudaMemcpyAsync(
data_, mat.
Data(),
sizeof(Real)*dim_,
173 cudaMemcpyDeviceToDevice, cudaStreamPerThread));
175 Real* vec_data =
data_;
177 CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.
RowData(r),
179 cudaMemcpyDeviceToDevice,
180 cudaStreamPerThread));
184 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::CopyRowsFromMat", tim);
188 Vec().CopyRowsFromMat(mat.
Mat());
192 template<
typename Real>
195 if (CuDevice::Instantiate().Enabled()) {
199 if (dim_ == 0)
return 0.0;
201 cublas_asum(GetCublasHandle(), dim_,
data_, 1, &ans);
203 cublas_nrm2(GetCublasHandle(), dim_,
data_, 1, &ans);
205 CuDevice::Instantiate().AccuProfile(__func__, tim);
213 return Vec().Norm(p);
217 template<
typename Real>
221 if (CuDevice::Instantiate().Enabled()) {
222 if (dim_ == 0)
return;
225 CU_SAFE_CALL(cudaMemcpyAsync(
data_, mat.
Data(),
sizeof(Real)*dim_,
226 cudaMemcpyHostToDevice, cudaStreamPerThread));
228 Real* vec_data =
data_;
230 CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.
RowData(r),
232 cudaMemcpyHostToDevice, cudaStreamPerThread));
236 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
237 CuDevice::Instantiate().AccuProfile(__func__, tim);
241 Vec().CopyRowsFromMat(mat);
245 template<
typename Real>
249 if (CuDevice::Instantiate().Enabled()) {
250 if (num_rows_ == 0)
return;
252 if (Stride() == NumCols()) {
253 CU_SAFE_CALL(cudaMemcpyAsync(
data_, v.
Data(),
254 sizeof(Real)*v.
Dim(),
255 cudaMemcpyDeviceToHost,
256 cudaStreamPerThread));
258 const Real* vec_data = v.
Data();
260 CU_SAFE_CALL(cudaMemcpyAsync(RowData(r), vec_data,
261 sizeof(Real) * NumCols(),
262 cudaMemcpyDeviceToHost,
263 cudaStreamPerThread));
264 vec_data += NumCols();
267 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
268 CuDevice::Instantiate().AccuProfile(__func__, tim);
272 CopyRowsFromVec(v.
Vec());
280 template<
typename Real>
282 if (dim_ == 0)
return;
287 template<
typename Real>
289 if (dim_ == 0)
return;
296 template<
typename Real>
301 if (CuDevice::Instantiate().Enabled()) {
309 result = ans_cpu.
Sum();
313 int dimGrid = n_blocks(dim_, dimBlock);
318 cuda_vec_sum(dimGrid, dimBlock,
data_, ans.
Data(), dim_, 1);
319 CU_SAFE_CALL(cudaGetLastError());
321 result = ans_cpu.
Sum();
324 CuDevice::Instantiate().AccuProfile(__func__, tim);
333 template<
typename Real>
336 if (CuDevice::Instantiate().Enabled()) {
337 if (dim_ == 0)
return;
342 cuda_softmax_reduce(dimGrid, dimBlock,
data_,
data_, dim, this->dim_);
343 CU_SAFE_CALL(cudaGetLastError());
344 CuDevice::Instantiate().AccuProfile(__func__, tim);
348 Vec().ApplySoftMax();
352 template<
typename Real>
355 if (CuDevice::Instantiate().Enabled()) {
358 if (floored_count ==
nullptr) {
359 if (dim_ == 0)
return;
364 cuda_floor(dimGrid, dimBlock, this->
data_, src.
Data(), floor_val, dim, 1);
365 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::FloorNoCount", tim);
367 if (dim_ == 0) { *floored_count = 0;
return; }
372 cuda_vec_apply_floor(dimGrid, dimBlock,
data_, floor_val, count_vec.
Data(), dim_);
373 CU_SAFE_CALL(cudaGetLastError());
374 *floored_count = count_vec.
Sum();
375 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::Floor", tim);
380 Vec().Floor(src.
Vec(), floor_val, floored_count);
384 template<
typename Real>
388 if (CuDevice::Instantiate().Enabled()) {
391 if (ceiled_count ==
nullptr) {
392 if (dim_ == 0)
return;
397 cuda_ceiling(dimGrid, dimBlock, this->
data_, src.
Data(), ceiling_val, dim, 1);
399 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::CeilingNoCount", tim);
401 if (dim_ == 0) { *ceiled_count = 0;
return; }
406 cuda_vec_apply_ceiling(dimGrid, dimBlock,
data_, ceiling_val, count_vec.
Data(), dim_);
407 CU_SAFE_CALL(cudaGetLastError());
408 *ceiled_count = count_vec.
Sum();
409 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::Ceiling", tim);
414 Vec().Ceiling(src.
Vec(), ceiling_val, ceiled_count);
418 template<
typename Real>
421 if (CuDevice::Instantiate().Enabled()) {
422 if (dim_ == 0)
return;
427 dim3 dimGrid(n_blocks(Dim(),
CU1DBLOCK), 1);
430 cuda_pow(dimGrid, dimBlock, this->
data_, src.
Data(), power, fake_matrix_dim, 1);
431 CU_SAFE_CALL(cudaGetLastError());
432 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::ApplyPow", tim);
436 Vec().Pow(src.
Vec(), power);
441 template<
typename Real>
444 if (CuDevice::Instantiate().Enabled()) {
445 if (dim_ == 0)
return;
450 cuda_vec_apply_exp(dimGrid, dimBlock,
data_, dim_);
451 CU_SAFE_CALL(cudaGetLastError());
452 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::ApplyExp", tim);
461 template<
typename Real>
464 if (CuDevice::Instantiate().Enabled()) {
465 if (dim_ == 0)
return;
471 cuda_vec_apply_log(dimGrid, dimBlock,
data_, flag.
Data(), dim_);
472 CU_SAFE_CALL(cudaGetLastError());
474 KALDI_ERR <<
"Trying to take log of a negative number.";
475 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::ApplyLog", tim);
483 template<
typename Real>
486 if (CuDevice::Instantiate().Enabled()) {
487 if (dim_ == 0)
return;
493 cuda_log_softmax_reduce(dimGrid, dimBlock,
data_,
data_, dim, this->dim_);
494 CU_SAFE_CALL(cudaGetLastError());
495 CuDevice::Instantiate().AccuProfile(__func__, tim);
499 Vec().ApplyLogSoftMax();
505 template<
typename Real>
515 if (CuDevice::Instantiate().Enabled()) {
516 if (dim_ == 0)
return;
521 CUBLAS_SAFE_CALL(cublas_gemv(GetCublasHandle(),
522 (trans==
kTrans? CUBLAS_OP_N:CUBLAS_OP_T),
526 CuDevice::Instantiate().AccuProfile(__func__, tim);
530 Vec().AddMatVec(alpha,M.
Mat(),trans,v.
Vec(),beta);
534 template<
typename Real>
542 if (CuDevice::Instantiate().Enabled()) {
543 if (dim_ == 0)
return;
548 CUBLAS_SAFE_CALL(cublas_spmv(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, Dim(),
551 CuDevice::Instantiate().AccuProfile(__func__, tim);
555 Vec().AddSpVec(alpha,M.
Mat(),v.
Vec(),beta);
559 template<
typename Real>
565 if (CuDevice::Instantiate().Enabled()) {
566 if (dim_ == 0)
return;
571 cuda_add_vec_vec(dimGrid, dimBlock, alpha,
data_, v.
Data(), r.
Data(), beta, dim_);
572 CU_SAFE_CALL(cudaGetLastError());
573 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::AddVecVec", tim);
577 Vec().AddVecVec(alpha, v.
Vec(), r.
Vec(), beta);
582 template<
typename Real>
584 if (dim_ != other.
dim_)
KALDI_ERR <<
"ApproxEqual: size mismatch " 585 << dim_ <<
" vs. " << other.
dim_;
590 return tmp_norm <= static_cast<Real>(tol) * this_norm;
594 template<
typename Real>
598 if (CuDevice::Instantiate().Enabled()) {
599 if (dim_ == 0)
return;
602 this->AddDiagMatMat(alpha, M, trans, M, other_trans, beta);
606 Vec().AddDiagMat2(alpha, M.
Mat(), trans, beta);
610 template<
typename Real>
616 if (CuDevice::Instantiate().Enabled()) {
619 if (transM != transN) {
628 cuda_add_diag_mat_mat_MNT(dimGrid, dimBlock, alpha, M.
Data(), M.
Dim(),
640 const int32 warpSize = 32;
641 const int32 kOptNumBlocks = 512;
642 const int32 tile_dim =
645 dim3 dimBlock(tile_dim,
CU1DBLOCK / tile_dim);
646 dim3 dimGrid(n_blocks(N.
NumCols(), dimBlock.x),
647 n_blocks(N.
NumRows(), dimBlock.y));
648 dimGrid.y = std::min(dimGrid.y, (kOptNumBlocks - 1) / dimGrid.x + 1);
649 dimGrid.y = tile_dim == 16 ? 1 : dimGrid.y;
652 cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, Real(1), M.
Data(),
654 buf.Data(), buf.Stride());
655 this->AddRowSumMat(alpha, buf, beta);
657 cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, alpha, M.
Data(),
671 sizeof(Real) ==
sizeof(
float) && N.
NumCols() >= 2048 ? 32 : 16;
672 dim3 dimBlock(tile_dim,
CU1DBLOCK / tile_dim);
673 dim3 dimGrid(n_blocks(N.
NumCols(), tile_dim));
674 cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.
Data(), M.
Stride(),
680 sizeof(Real) ==
sizeof(
float) && N.
NumCols() >= 2048 ? 32 : 16;
681 dim3 dimBlock(tile_dim,
CU1DBLOCK / tile_dim);
682 dim3 dimGrid(n_blocks(M.
NumCols(), tile_dim));
683 cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.
Data(), N.
Stride(),
687 CU_SAFE_CALL(cudaGetLastError());
689 CuDevice::Instantiate().AccuProfile(__func__, tim);
693 Vec().AddDiagMatMat(alpha, M.
Mat(), transM, N.
Mat(), transN, beta);
697 template<
typename Real>
704 if (CuDevice::Instantiate().Enabled()) {
705 if (dim_ == 0)
return;
708 if (&v !=
this) CopyFromVec(v);
710 if (alpha != 1.0) Scale(alpha);
714 if (beta != 1.0) Scale(beta);
715 AddVec(alpha, tmp, 1.0);
717 CuDevice::Instantiate().AccuProfile(__func__, tim);
721 Vec().AddTpVec(alpha, M.
Mat(), trans, v.
Vec(), beta);
726 template<
typename Real>
730 if (CuDevice::Instantiate().Enabled()) {
731 if (dim_ == 0)
return;
733 cublas_tpmv(GetCublasHandle(), (trans==
kTrans? CUBLAS_OP_N:CUBLAS_OP_T),
735 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::MulTp", tim);
739 Vec().MulTp(M.
Mat(), trans);
743 template<
typename Real>
747 if (CuDevice::Instantiate().Enabled()) {
749 return std::numeric_limits<Real>::infinity();
757 result = ans_cpu.
Min();
761 int dimGrid = n_blocks(dim_, dimBlock);
766 cuda_vec_min(dimGrid, dimBlock,
data_, ans.
Data(), dim_, 1);
767 CU_SAFE_CALL(cudaGetLastError());
769 result = ans_cpu.
Min();
772 CuDevice::Instantiate().AccuProfile(__func__, tim);
776 result = (this->Vec()).Min();
781 template<
typename Real>
785 if (CuDevice::Instantiate().Enabled()) {
787 return -std::numeric_limits<Real>::infinity();
795 result = ans_cpu.
Max();
799 int dimGrid = n_blocks(dim_, dimBlock);
804 cuda_vec_max(dimGrid, dimBlock,
data_, ans.
Data(), dim_, 1);
805 CU_SAFE_CALL(cudaGetLastError());
807 result = ans_cpu.
Max();
810 CuDevice::Instantiate().AccuProfile(__func__, tim);
814 result = (this->Vec()).Max();
819 template<
typename Real>
822 if (CuDevice::Instantiate().Enabled()) {
823 if (dim_ == 0)
return;
827 cuda_replace_value(dimGrid, dimBlock,
data_, dim_, orig, changed);
828 CU_SAFE_CALL(cudaGetLastError());
829 CuDevice::Instantiate().AccuProfile(__func__, tim);
833 Vec().ReplaceValue(orig, changed);
837 template<
typename Real>
841 if (CuDevice::Instantiate().Enabled()) {
842 if (dim_ == 0)
return;
846 cuda_vec_mul_elements(dimGrid, dimBlock,
data_, v.
Data(), dim_);
847 CU_SAFE_CALL(cudaGetLastError());
848 CuDevice::Instantiate().AccuProfile(
"CuVectorBase::MulElements", tim);
852 Vec().MulElements(v.
Vec());
856 template<
typename Real>
861 v_mat(v.
Data(), 1, dim_, dim_);
872 if (CuDevice::Instantiate().Enabled()) {
873 if (dim_ == 0)
return;
875 CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, src.
Data(), 1,
data_, 1));
876 CuDevice::Instantiate().AccuProfile(__func__, tim);
880 Vec().CopyFromVec(src.
Vec());
889 if (CuDevice::Instantiate().Enabled()) {
890 if (dim_ == 0)
return;
892 CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, src.
Data(), 1,
data_, 1));
893 CuDevice::Instantiate().AccuProfile(__func__, tim);
897 Vec().CopyFromVec(src.
Vec());
902 template<
typename Real>
903 template<
typename OtherReal>
906 if (CuDevice::Instantiate().Enabled()) {
910 this->CopyFromVec(temp);
913 if (dim_ == 0)
return;
915 CU_SAFE_CALL(cudaMemcpyAsync(
data_, src.
Data(), src.
Dim()*
sizeof(Real),
916 cudaMemcpyHostToDevice, cudaStreamPerThread));
917 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
918 CuDevice::Instantiate().AccuProfile(
"CuVector::CopyFromVecH2D", tim);
923 Vec().CopyFromVec(src);
936 template<
typename Real>
937 template<
typename OtherReal>
941 if (CuDevice::Instantiate().Enabled()) {
946 if (dim_ == 0)
return;
948 CU_SAFE_CALL(cudaMemcpyAsync(dst->
Data(), this->
data_,
949 sizeof(Real) * dim_, cudaMemcpyDeviceToHost,
950 cudaStreamPerThread));
951 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
952 CuDevice::Instantiate().AccuProfile(__func__, tim);
962 template<
typename Real>
965 temp.
Read(is, binary);
972 template<
typename Real>
975 this->CopyToVec(&temp);
976 temp.
Write(os, binary);
980 template<
typename Real>
982 this->Resize(v.
Dim());
983 this->CopyFromVec(v);
986 template<
typename Real>
988 this->Resize(v.
dim_);
989 this->CopyFromVec(v);
992 template<
typename Real>
996 if (this->dim_ == dim) {
1000 if (this->dim_ != 0)
1002 if (dim == 0)
return;
1004 if (CuDevice::Instantiate().Enabled()) {
1006 this->
data_ =
static_cast<Real*
>(CuDevice::Instantiate().Malloc(dim *
sizeof(Real)));
1008 if (t ==
kSetZero) this->SetZero();
1009 CuDevice::Instantiate().AccuProfile(
"CuVector::Resize", tim);
1018 template<
typename Real>
1025 template<
typename Real>
1028 if (CuDevice::Instantiate().Enabled()) {
1029 if (this->dim_ == 0) {
1030 if (vec->
dim_ != 0) {
1033 this->CopyFromVec(*vec);
1038 if (vec->
dim_ != 0) {
1049 this->CopyToVec(vec);
1061 template<
typename Real>
1064 if (CuDevice::Instantiate().Enabled()) {
1065 if (this->
data_ != NULL)
1066 CuDevice::Instantiate().Free(this->
data_);
1077 template<
typename Real>
1081 if (CuDevice::Instantiate().Enabled()) {
1082 if (dim_ == 0)
return;
1086 cudaMemcpyDeviceToDevice, cudaStreamPerThread));
1087 CuDevice::Instantiate().AccuProfile(__func__, tim);
1091 memcpy(static_cast<void*>(
data_), static_cast<void*>(src.
data_),
1092 dim_ *
sizeof(Real));
1097 template<
typename Real>
1099 if (dim_==0 ||
data_==NULL)
return;
1101 if (CuDevice::Instantiate().Enabled()) {
1105 CU_SAFE_CALL(cudaMemsetAsync(
data_, 0, dim_*
sizeof(Real),
1106 cudaStreamPerThread));
1107 CuDevice::Instantiate().AccuProfile(
"CuVector::SetZero", tim);
1118 template<
typename Real>
1119 std::ostream &operator << (std::ostream &out, const CuVectorBase<Real> &vec) {
1121 vec.CopyToVec(&temp);
1127 std::ostream &operator << (std::ostream &out, const CuVectorBase<float> &vec);
1129 std::ostream &operator << (std::ostream &out, const CuVectorBase<double> &vec);
1134 template<
typename Real>
1137 if (CuDevice::Instantiate().Enabled()) {
1141 dim3 dimGrid(n_blocks(Dim(),
CU1DBLOCK));
1144 cuda_set_const(dimGrid, dimBlock,
data_, value, d);
1145 CU_SAFE_CALL(cudaGetLastError());
1146 CuDevice::Instantiate().AccuProfile(__func__, tim);
1156 template<
typename Real>
1159 if (CuDevice::Instantiate().Enabled()) {
1163 dim3 dimGrid(n_blocks(Dim(),
CU1DBLOCK));
1166 cuda_add(dimGrid, dimBlock,
data_, value, d);
1167 CU_SAFE_CALL(cudaGetLastError());
1168 CuDevice::Instantiate().AccuProfile(__func__, tim);
1176 template<
typename Real>
1179 if (CuDevice::Instantiate().Enabled()) {
1181 if (dim_ == 0)
return;
1184 int dimGrid(n_blocks(Dim(),
CU1DBLOCK));
1185 cuda_vec_copy_diag_from_packed(dimGrid, dimBlock,
data_, M.
Data(), dim_);
1186 CU_SAFE_CALL(cudaGetLastError());
1188 CuDevice::Instantiate().AccuProfile(__func__, tim);
1192 Vec().CopyDiagFromPacked(M.
Mat());
1197 template<
typename Real>
1200 if (CuDevice::Instantiate().Enabled()) {
1203 CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, M.
Data(), M.
Stride() + 1,
1206 CuDevice::Instantiate().AccuProfile(__func__, tim);
1210 Vec().CopyDiagFromMat(M.
Mat());
1215 template<
typename Real>
1218 if (CuDevice::Instantiate().Enabled()) {
1219 if (Dim() == 0 )
return;
1223 dim3 dimGrid(n_blocks(Dim(),
CU1DBLOCK));
1225 cuda_scale(dimGrid, dimBlock,
data_, value, d);
1226 CU_SAFE_CALL(cudaGetLastError());
1228 CuDevice::Instantiate().AccuProfile(__func__, tim);
1236 template<
typename Real>
1242 if (CuDevice::Instantiate().Enabled()) {
1244 int32 dim = this->dim_;
1245 Real *data = this->
data_;
1246 const Real *vec_data = vec.
data_;
1247 if (beta != 1.0) CU_SAFE_CALL(cuda_scal(GetCublasHandle(), dim, beta, data, 1));
1248 if (alpha != 0.0) CU_SAFE_CALL(cuda_axpy(GetCublasHandle(), dim, alpha, vec_data, 1, data, 1));
1249 CuDevice::Instantiate().AccuProfile(__func__, tim);
1253 if (beta != 1.0) Vec().Scale(beta);
1254 Vec().AddVec(alpha, vec.
Vec());
1259 template<
typename Real>
1260 template<
typename OtherReal>
1266 this->AddVec(alpha, temp, beta);
1276 template<
typename Real>
1283 if (CuDevice::Instantiate().Enabled()) {
1286 mat.
Dim(), alpha, beta);
1287 CU_SAFE_CALL(cudaGetLastError());
1289 CuDevice::Instantiate().AccuProfile(__func__, tim);
1293 Vec().AddRowSumMat(alpha, mat.
Mat(), beta);
1297 template<
typename Real>
1301 if (CuDevice::Instantiate().Enabled()) {
1306 mat.
Dim(), alpha, beta);
1307 CU_SAFE_CALL(cudaGetLastError());
1309 CuDevice::Instantiate().AccuProfile(__func__, tim);
1313 Vec().AddColSumMat(alpha, mat.
Mat(), beta);
1317 template<
typename Real>
1320 if (CuDevice::Instantiate().Enabled()) {
1324 dim3 dimGrid(n_blocks(dim_,
CU1DBLOCK));
1327 cuda_invert_elements(dimGrid, dimBlock,
data_, d);
1328 CU_SAFE_CALL(cudaGetLastError());
1330 CuDevice::Instantiate().AccuProfile(__func__, tim);
1334 Vec().InvertElements();
1339 template<
typename Real>
1345 if (CuDevice::Instantiate().Enabled()) {
1349 dim3 dimGrid(n_blocks(Dim(),
CU1DBLOCK));
1351 cuda_vector_copy_elements(dimGrid, dimBlock, this->
data_, Dim(),
1354 CU_SAFE_CALL(cudaGetLastError());
1355 CuDevice::Instantiate().AccuProfile(__func__, tim);
1361 const int32* index_map = elements.
Data();
1364 for (
int32 i = 0;
i < Dim();
i++) {
1369 this_vec(
i) = src_mat(
i, j);
1372 this_vec(
i) = src_mat(j,
i);
const MatrixBase< Real > & Mat() const
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
MatrixIndexT Stride() const
This class provides a way for switching between double and float types.
const PackedMatrix< Real > & Mat() const
void MulTp(const CuTpMatrix< Real > &M, const MatrixTransposeType trans)
Multiplies this vector by lower-triangular matrix: *this <– *this *M.
void RandUniform(CuMatrixBase< Real > *tgt)
Fill with uniform [0..1] floats.
MatrixIndexT NumRows() const
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Base class which provides matrix operations not involving resizing or allocation. ...
void CopyColFromMat(const CuMatrixBase< Real > &mat, MatrixIndexT col)
const Real * Data() const
Gives pointer to raw data (const).
Structure containing size of the matrix plus stride.
Real * data_
GPU data pointer (or regular data pointer if CUDA is not compiled in or we have no GPU)...
void Write(std::ostream &Out, bool binary) const
Writes to C++ stream (option to write in binary).
Real * RowData(MatrixIndexT i)
Returns pointer to data for one row (non-const)
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
Real VecMatVec(const VectorBase< Real > &v1, const MatrixBase< Real > &M, const VectorBase< Real > &v2)
Returns $v_1^T M v_2$.
const T * Data() const
Get raw pointer.
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
Real Min() const
Returns the minimum value of any element, or +infinity for the empty vector.
template double VecVec(const CuVectorBase< double > &A, const CuVectorBase< float > &B)
const SpMatrix< Real > & Mat() const
void RandGaussian(CuMatrixBase< Real > *tgt)
Fill with Normal random numbers.
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
MatrixIndexT Stride() const
Stride (distance in memory between each row). Will be >= NumCols.
void AddMatVec(const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType trans, const CuVectorBase< Real > &v, const Real beta)
void CopyFromVec(const CuVectorBase< Real > &src)
Copy functions; these will crash if the dimensions do not match.
MatrixIndexT dim_
dimension of vector
void Swap(Vector< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
Real Max() const
Returns the maximum value of any element, or -infinity for the empty vector.
#define KALDI_MEMALIGN_FREE(x)
This class is used for a piece of a CuMatrix.
Real * Data()
Returns a pointer to the start of the vector's data.
void DivElements(const CuMatrixBase< Real > &A)
Divide this matrix elementwise by A: *this = *this ./ A.
MatrixIndexT Dim() const
Returns the dimension of the vector.
Real Sum() const
Returns sum of the elements.
void AddVec(Real alpha, const CuVectorBase< Real > &vec, Real beta=1.0)
const Real * Data() const
Return data pointer (const).
Real * data_
data memory area
Matrix for CUDA computing.
MatrixIndexT NumCols() const
MatrixIndexT dim_
dimension of the vector
A class representing a vector.
const VectorBase< Real > & Vec() const
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Matrix for CUDA computing.
Real * Data()
Returns a pointer to the start of the vector's data.
MatrixIndexT NumRows() const
Dimensions.
MatrixIndexT NumCols() const
Provides a vector abstraction class.
MatrixIndexT Dim() const
Return the vector dimension.
const Matrix< Real > & Mat() const
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
void CopyToVec(VectorBase< OtherReal > *dst) const
void Read(std::istream &in, bool binary, bool add=false)
Read function using C++ streams.
const TpMatrix< Real > & Mat() const
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
MatrixIndexT Dim() const
Dimensions.
Vector for CUDA computing.
const Real * RowData(MatrixIndexT r) const
Get raw row pointer (const).