22 #include <cuda_runtime_api.h> 23 #include <cublas_v2.h> 46 block_data_.resize(data.size());
47 MatrixIndexT row_offset = 0, col_offset = 0, max_num_rows = 0;
48 for (
size_t b = 0; b < data.size(); b++) {
49 MatrixIndexT num_rows = data[b].NumRows(), num_cols = data[b].NumCols();
56 row_offset += num_rows;
57 col_offset += num_cols;
58 max_num_rows = std::max(max_num_rows, num_rows);
59 block_data_[b] = block_data;
61 num_rows_ = row_offset;
62 data_.Resize(max_num_rows, col_offset);
63 for (
int32 b = 0; b < NumBlocks(); b++)
64 Block(b).CopyFromMat(data[b]);
71 KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
74 block_data.col_offset, block_data.num_cols);
79 KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
82 block_data.col_offset, block_data.num_cols);
108 if (cu_data_ != NULL) {
109 if (CuDevice::Instantiate().Enabled()) {
110 CuDevice::Instantiate().Free(cu_data_);
113 KALDI_ERR <<
"CuBlockMatrix: you have CUDA data pointer but " 114 <<
"no GPU is enabled: likely code error.";
125 if (block_data_.size() == 0)
return;
126 if (CuDevice::Instantiate().Enabled()) {
128 std::vector<CuBlockMatrixData> tmp_cu_data(NumBlocks());
129 int32 row_offset = 0, col_offset = 0;
130 for (
size_t b = 0; b < NumBlocks(); b++) {
137 row_offset += this_mat.
NumRows();
138 col_offset += this_mat.
NumCols();
142 CuDevice::Instantiate().Malloc(size));
143 CU_SAFE_CALL(cudaMemcpyAsync(cu_data_, &(tmp_cu_data[0]), size,
144 cudaMemcpyHostToDevice, cudaStreamPerThread));
145 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
146 CuDevice::Instantiate().AccuProfile(__func__, tim);
164 int32 num_blocks = NumBlocks();
166 for (
int32 b = 0; b < num_blocks; b++)
167 this->Block(b).Write(os, binary);
175 int i =
Peek(is, binary);
176 std::vector<CuMatrix<Real> > data;
177 if (i != static_cast<int>(
'<')) {
184 for (
int32 i = 0; i < size; i++)
185 data[i].Read(is, binary);
192 for (
int32 i = 0; i < size; i++)
193 data[i].Read(is, binary);
199 this->Swap(&block_mat);
219 A_row_stride = A.
Stride(), A_col_stride = 1,
221 B_row_stride = B.
Stride(), B_col_stride = 1;
230 KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols()
231 && A_num_cols == B_num_rows);
232 if (NumBlocks() == 0)
return;
234 if (CuDevice::Instantiate().Enabled()) {
241 int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols();
242 int32 y_blocksize = max_block_rows;
245 int32 z_blocksize = max_block_cols;
246 while (z_blocksize * x_blocksize * y_blocksize >
CU1DBLOCK || z_blocksize >
CU2DBLOCK)
249 dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize);
250 dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize),
251 n_blocks(max_block_rows, y_blocksize),
252 n_blocks(max_block_cols, z_blocksize));
253 cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(),
254 A.
Data(), A_num_cols, A_row_stride, A_col_stride,
255 B.
Data(), B_row_stride, B_col_stride, alpha, beta);
256 CU_SAFE_CALL(cudaGetLastError());
257 CuDevice::Instantiate().AccuProfile(__func__, tim);
261 int32 row_offset = 0, col_offset = 0;
265 this_num_cols = this_block.
NumCols();
267 A.
Range(row_offset, this_num_rows,
270 row_offset, this_num_rows)),
273 col_offset, this_num_cols) :
274 B.
Range(col_offset, this_num_cols,
276 this_block.
AddMatMat(alpha, A_part, transA, B_part, transB, beta);
277 row_offset += this_num_rows;
278 col_offset += this_num_cols;
280 KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
287 for (
size_t i = 0;
i < block_data_.size();
i++)
288 max_cols = std::max(max_cols, block_data_[
i].num_cols);
294 return data_.NumRows();
304 this_num_cols = this_block.
NumCols();
306 col_offset, this_num_cols);
308 row_offset += this_num_rows;
309 col_offset += this_num_cols;
311 KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
317 template<
typename Real>
318 std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat) {
320 mat.Write(out, binary);
325 std::ostream &operator << (std::ostream &out, const CuBlockMatrix<float> &mat);
327 std::ostream &operator << (std::ostream &out, const CuBlockMatrix<double> &mat);
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
MatrixIndexT Stride() const
struct CuBlockMatrixData_ CuBlockMatrixData
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
MatrixIndexT MaxBlockRows() const
CuBlockMatrix & operator=(const CuBlockMatrix &other)
Assignment operator.
std::vector< BlockMatrixData > block_data_
void FreeCudaData()
If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL.
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
void CopyFromMat(const CuMatrix< Real > &M)
Copies elements within the block structure from matrix M, discarding others.
CuSubMatrix< Real > Range(const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
The class CuBlockMatrix holds a vector of objects of type CuMatrix, say, M_1, M_2, ..., M_N, which form the blocks of a block-diagonal matrix.
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
int Peek(std::istream &is, bool binary)
Peek consumes whitespace (if binary == false) and then returns the peek() value of the stream...
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
void AddMatMat(BaseFloat alpha, const CuMatrix< Real > &A, MatrixTransposeType transA, const CuMatrix< Real > &B, MatrixTransposeType transB, BaseFloat beta)
Does *this = alpha A B + beta * *this, discarding elements of the product outside the block structure...
void SetCudaData()
If using GPU, allocate and set cu_data_ on the GPU to reflect "data_".
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C.
This class is used for a piece of a CuMatrix.
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
const Real * Data() const
Return data pointer (const).
MatrixIndexT NumCols() const
#define KALDI_ASSERT(cond)
void Swap(CuBlockMatrix *other)
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
void Read(std::istream &is, bool binary)
MatrixIndexT NumRows() const
Dimensions.
MatrixIndexT MaxBlockCols() const
void Destroy()
Frees and deinitializes everything.
const CuSubMatrix< Real > Block(MatrixIndexT b) const
void Write(std::ostream &os, bool binary) const