24 #include <cuda_runtime_api.h> 25 #include <cublas_v2.h> 39 template<
typename Real>
45 if (this->num_rows_ == rows) {
46 if (resize_type ==
kSetZero) this->SetZero();
50 if (this->num_rows_ != 0)
52 if (rows == 0)
return;
54 CuDevice &device = CuDevice::Instantiate();
55 if (device.Enabled()) {
57 this->num_rows_ = rows;
58 size_t nr =
static_cast<size_t>(num_rows_),
59 num_bytes = ((nr * (nr+1)) / 2) *
sizeof(Real);
60 this->
data_ =
static_cast<Real*
>(device.Malloc(num_bytes));
62 if (resize_type ==
kSetZero) this->SetZero();
63 device.AccuProfile(
"CuPackedMatrix::Resize", tim);
74 template<
typename Real>
84 template<
typename Real>
87 if (CuDevice::Instantiate().Enabled()) {
88 if (this->
data_ != NULL) {
89 CuDevice::Instantiate().Free(this->
data_);
100 template<
typename Real>
103 if (CuDevice::Instantiate().Enabled()) {
104 if (this->num_rows_ == 0) {
108 CopyFromPacked(*mat);
124 this->CopyToPacked(mat);
136 template<
typename Real>
140 if (CuDevice::Instantiate().Enabled()) {
141 if (num_rows_ == 0)
return;
143 size_t nr =
static_cast<size_t>(num_rows_),
144 num_bytes = ((nr * (nr+1)) / 2) *
sizeof(Real);
147 cudaMemcpyAsync(
data_, src.
data_, num_bytes, cudaMemcpyDeviceToDevice,
148 cudaStreamPerThread));
149 CuDevice::Instantiate().AccuProfile(
"CuPackedMatrix::CopyFromPacked1",
154 Mat().CopyFromPacked(src.
Mat());
158 template<
typename Real>
162 if (CuDevice::Instantiate().Enabled()) {
163 if (num_rows_ == 0)
return;
166 cudaMemcpyHostToDevice, cudaStreamPerThread));
167 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
168 CuDevice::Instantiate().AccuProfile(
"CuPackedMatrix::CopyFromPacked2", tim);
172 Mat().CopyFromPacked(src);
177 template<
typename Real>
182 if (CuDevice::Instantiate().Enabled()) {
183 if (num_rows_ == 0)
return;
185 size_t nr =
static_cast<size_t>(num_rows_),
186 num_bytes = ((nr * (nr+1)) / 2) *
sizeof(Real);
188 CU_SAFE_CALL(cudaMemcpyAsync(dst->
data_,
data_, num_bytes,
189 cudaMemcpyDeviceToHost, cudaStreamPerThread));
190 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
191 CuDevice::Instantiate().AccuProfile(
"CuPackedMatrix::CopyToPackedD2H", tim);
230 template<
typename Real>
233 temp.
Read(is, binary);
238 template<
typename Real>
241 this->CopyToPacked(&temp);
242 temp.
Write(os, binary);
245 template<
typename Real>
248 if (CuDevice::Instantiate().Enabled()) {
250 size_t nr =
static_cast<size_t>(num_rows_),
251 num_bytes = ((nr * (nr+1)) / 2) *
sizeof(Real);
253 CU_SAFE_CALL(cudaMemsetAsync(reinterpret_cast<void*>(this->
data_), 0,
254 num_bytes, cudaStreamPerThread));
255 CuDevice::Instantiate().AccuProfile(
"CuPackedMatrix::SetZero", tim);
263 template<
typename Real>
267 if (CuDevice::Instantiate().Enabled()) {
268 if (num_rows_ == 0)
return 0.0;
275 result = Mat().Trace();
280 template<
typename Real>
283 if (CuDevice::Instantiate().Enabled()) {
284 if (num_rows_ == 0)
return;
287 int dimGrid(n_blocks(NumRows(),
CU1DBLOCK));
288 cuda_set_diag_packed(dimGrid,dimBlock,
data_,alpha,num_rows_);
289 CU_SAFE_CALL(cudaGetLastError());
290 CuDevice::Instantiate().AccuProfile(
"CuPackedMatrix::SetDiag", tim);
294 Mat().SetDiag(alpha);
298 template<
typename Real>
301 if (CuDevice::Instantiate().Enabled()) {
303 size_t nr =
static_cast<size_t>(num_rows_),
304 num_elements = ((nr * (nr+1)) / 2);
305 CUBLAS_SAFE_CALL(cublas_scal(GetCublasHandle(), num_elements, alpha,
data_, 1));
307 CuDevice::Instantiate().AccuProfile(
"CuPackedMatrix::Scale", tim);
315 template<
typename Real>
318 if (CuDevice::Instantiate().Enabled()) {
321 int dimGrid(n_blocks(NumRows(),
CU1DBLOCK));
322 cuda_scale_diag_packed(dimGrid,dimBlock,
data_,alpha,num_rows_);
323 CU_SAFE_CALL(cudaGetLastError());
324 CuDevice::Instantiate().AccuProfile(
"CuPackedMatrix::ScaleDiag", tim);
328 Mat().ScaleDiag(alpha);
332 template<
typename Real>
336 if (CuDevice::Instantiate().Enabled()) {
337 if (num_rows_ == 0)
return;
339 size_t nr = num_rows_,
340 sz = (nr * (nr + 1)) / 2;
341 cublas_axpy(GetCublasHandle(), sz, alpha, M.
Data(), 1,
data_, 1);
342 CuDevice::Instantiate().AccuProfile(
"CuPackedMatrix::AddPacked", tim);
346 Mat().AddPacked(alpha, M.
Mat());
350 template<
typename Real>
353 if (CuDevice::Instantiate().Enabled()) {
354 if (num_rows_ == 0)
return;
357 int dimGrid(n_blocks(NumRows(),
CU1DBLOCK));
358 cuda_add_diag_packed(dimGrid,dimBlock,
data_,r,num_rows_);
359 CU_SAFE_CALL(cudaGetLastError());
360 CuDevice::Instantiate().AccuProfile(
"CuPackedMatrix::AddToDiag", tim);
369 template<
typename Real>
372 if (CuDevice::Instantiate().Enabled()) {
385 template<
typename Real>
386 std::ostream &operator << (std::ostream &out, const CuPackedMatrix<Real> &mat) {
388 mat.CopyToPacked(&temp);
395 std::ostream &operator << (std::ostream &out, const CuPackedMatrix<float> &mat);
397 std::ostream &operator << (std::ostream &out, const CuPackedMatrix<double> &mat);
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Packed symmetric matrix class.
void CopyDiagFromPacked(const CuPackedMatrix< Real > &M)
Extracts the diagonal of a packed matrix M; works for Sp or Tp.
const PackedMatrix< Real > & Mat() const
void Swap(CuPackedMatrix< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
void Read(std::istream &in, bool binary, bool add=false)
void Write(std::ostream &out, bool binary) const
void AddPacked(const Real alpha, const CuPackedMatrix< Real > &M)
void SetDiag(Real alpha)
< Set to random values of a normal distribution
MatrixIndexT NumRows() const
void CopyToPacked(PackedMatrix< Real > *dst) const
void AddToDiag(Real r)
< Set the diagonal value to alpha
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
MatrixIndexT NumRows() const
void SetRandn()
< Set to unit matrix.
void RandGaussian(CuMatrixBase< Real > *tgt)
Fill with normal random numbers.
Packed matrix: base class for triangular and symmetric matrices.
void Read(std::istream &in, bool binary)
void Swap(PackedMatrix< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
#define KALDI_MEMALIGN_FREE(x)
void SetUnit()
< Set to zero
void ScaleDiag(Real alpha)
void CopyFromPacked(const PackedMatrix< OtherReal > &orig)
size_t SizeInBytes() const
#define KALDI_ASSERT(cond)
Matrix for CUDA computing.
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type=kSetZero)
Set packed matrix to a specified size (can be zero).
void Write(std::ostream &out, bool binary) const
void CopyFromPacked(const CuPackedMatrix< Real > &src)
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type=kSetZero)
Set packed matrix to a specified size (can be zero).