#include <cuda_runtime_api.h>
#include <cublas_v2.h>

// Constructor from a std::vector of blocks, "data" (fragment).
  block_data_.resize(data.size());
  MatrixIndexT row_offset = 0, col_offset = 0, max_num_rows = 0;
  for (size_t b = 0; b < data.size(); b++) {
    MatrixIndexT num_rows = data[b].NumRows(), num_cols = data[b].NumCols();
    // ... (elided: fill in this block's BlockMatrixData, "block_data") ...
    row_offset += num_rows;
    col_offset += num_cols;
    max_num_rows = std::max(max_num_rows, num_rows);
    block_data_[b] = block_data;
  }
  num_rows_ = row_offset;
  data_.Resize(max_num_rows, col_offset);
  for (int32 b = 0; b < NumBlocks(); b++)
    Block(b).CopyFromMat(data[b]);
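For orientation, a minimal usage sketch of this constructor; it is not part of the listing above, and the function name and block sizes are made up for illustration.

#include <vector>
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-block-matrix.h"

// Build a block-diagonal matrix from two dense blocks, 2x3 and 4x1.
// The resulting CuBlockMatrix is 6x4; entries outside the blocks are zero.
void BlockMatrixSketch() {
  using namespace kaldi;
  std::vector<CuMatrix<BaseFloat> > blocks(2);
  blocks[0].Resize(2, 3);
  blocks[1].Resize(4, 1);
  blocks[0].Set(1.0);
  blocks[1].Set(2.0);
  CuBlockMatrix<BaseFloat> M(blocks);  // the constructor shown above
  // Each Block(b) is a CuSubMatrix view into the shared storage data_.
  KALDI_ASSERT(M.NumRows() == 6 && M.NumCols() == 4);
}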
// Block(b) accessors, const and non-const (fragments).
  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
  // ... (elided: return a CuSubMatrix view placed at this block's offsets) ...
                           block_data.col_offset, block_data.num_cols);

  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
  // ...
                           block_data.col_offset, block_data.num_cols);
// FreeCudaData(): if using GPU and cu_data_ != NULL, free cu_data_ and set it
// to NULL (fragment).
  if (cu_data_ != NULL) {
    if (CuDevice::Instantiate().Enabled()) {
      CuDevice::Instantiate().Free(cu_data_);
      // ...
    } else {
      KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but "
                << "no GPU is enabled: likely code error.";
    }
  }
// SetCudaData(): if using GPU, allocate and set cu_data_ on the GPU to
// reflect "data_" (fragment).
  if (block_data_.size() == 0) return;
  if (CuDevice::Instantiate().Enabled()) {
    // ... (elided: timer) ...
    std::vector<CuBlockMatrixData> tmp_cu_data(NumBlocks());
    int32 row_offset = 0, col_offset = 0;
    for (int32 b = 0; b < NumBlocks(); b++) {
      // ... (elided: fill tmp_cu_data[b] from this block's dimensions and
      //      offsets) ...
      row_offset += this_mat.NumRows();
      col_offset += this_mat.NumCols();
    }
    // ... (elided: compute "size" in bytes) ...
    cu_data_ = static_cast<CuBlockMatrixData*>(
        CuDevice::Instantiate().Malloc(size));
    CU_SAFE_CALL(cudaMemcpyAsync(cu_data_, &(tmp_cu_data[0]), size,
                                 cudaMemcpyHostToDevice, cudaStreamPerThread));
    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
    CuDevice::Instantiate().AccuProfile(__func__, tim);
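The pattern here is: stage the per-block descriptors in a host-side std::vector, copy them to the device with one cudaMemcpyAsync, and synchronize the stream. A stand-alone sketch of that pattern follows; Descriptor and UploadDescriptors are illustrative stand-ins (the real code uses CuBlockMatrixData), and error handling is omitted.

#include <vector>
#include <cuda_runtime_api.h>

// A stand-in for the per-block descriptor; the fields mirror the ones the
// listing above touches (num_rows, num_cols, row_offset, col_offset).
struct Descriptor { int num_rows, num_cols, row_offset, col_offset; };

// Copy an array of descriptors host -> device and wait for the copy to finish.
Descriptor *UploadDescriptors(const std::vector<Descriptor> &host) {
  if (host.empty()) return NULL;
  Descriptor *dev = NULL;
  size_t size = host.size() * sizeof(Descriptor);
  cudaMalloc(reinterpret_cast<void**>(&dev), size);
  cudaMemcpyAsync(dev, &(host[0]), size, cudaMemcpyHostToDevice,
                  cudaStreamPerThread);
  cudaStreamSynchronize(cudaStreamPerThread);
  return dev;
}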
// Write() (fragment).
  int32 num_blocks = NumBlocks();
  // ...
  for (int32 b = 0; b < num_blocks; b++)
    this->Block(b).Write(os, binary);
// Read() (fragment).
  int i = Peek(is, binary);
  std::vector<CuMatrix<Real> > data;
  if (i != static_cast<int>('<')) {
    // ... (elided: token-less format; determine the number of blocks "size"
    //      and resize "data") ...
    for (int32 i = 0; i < size; i++)
      data[i].Read(is, binary);
    // ...
  } else {
    // ... (elided: token-based format; read tokens and "size", resize
    //      "data") ...
    for (int32 i = 0; i < size; i++)
      data[i].Read(is, binary);
    // ...
  }
  // ... (elided: build "block_mat" from "data") ...
  this->Swap(&block_mat);
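A hedged sketch, not taken from the Kaldi sources, of the Write/Read round trip these two methods support; the stream setup and function name are illustrative.

#include <sstream>
#include "cudamatrix/cu-block-matrix.h"

// Serialize a CuBlockMatrix and read it back through an in-memory stream.
void RoundTripSketch(const kaldi::CuBlockMatrix<kaldi::BaseFloat> &mat) {
  bool binary = true;
  std::ostringstream os;
  mat.Write(os, binary);

  kaldi::CuBlockMatrix<kaldi::BaseFloat> mat2;
  std::istringstream is(os.str());
  mat2.Read(is, binary);  // Read() swaps the freshly built matrix into *this
}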
// AddMatMat(alpha, A, transA, B, transB, beta): does
// *this = alpha A B + beta * *this, discarding elements of the product
// outside the block structure (fragment).
      A_row_stride = A.Stride(), A_col_stride = 1,
      B_num_rows = B.NumRows(), B_num_cols = B.NumCols(),
      B_row_stride = B.Stride(), B_col_stride = 1;
  // ... (elided: swap the effective dimensions and strides of A and/or B when
  //      transA / transB is kTrans) ...
  KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols()
               && A_num_cols == B_num_rows);
  if (NumBlocks() == 0) return;
  // ...
  if (CuDevice::Instantiate().Enabled()) {
    // ... (elided: timer; choice of x_blocksize over the block index) ...
    int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols();
    int32 y_blocksize = max_block_rows;
    // ... (elided: analogous shrink loop for y_blocksize) ...
    int32 z_blocksize = max_block_cols;
    while (z_blocksize * x_blocksize * y_blocksize > CU1DBLOCK || z_blocksize > CU2DBLOCK)
      z_blocksize--;  // shrink until the thread block fits the CUDA limits

    dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize);
    dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize),
                 n_blocks(max_block_rows, y_blocksize),
                 n_blocks(max_block_cols, z_blocksize));
    cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(),
                           A.Data(), A_num_cols, A_row_stride, A_col_stride,
                           B.Data(), B_row_stride, B_col_stride, alpha, beta);
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else {
    // No-GPU fallback: per-block AddMatMat on the matching ranges of A and B.
    int32 row_offset = 0, col_offset = 0;
    // ... (elided: loop over blocks; this_block = Block(b), this_num_rows) ...
          this_num_cols = this_block.NumCols();
      // ... (elided: A_part and B_part select the row/column ranges of A and
      //      B that line up with this block, depending on transA / transB;
      //      only fragments of the two expressions remain) ...
                                  A.Range(row_offset, this_num_rows,
                                          // ...
                                          row_offset, this_num_rows)),
                    // ...
                            col_offset, this_num_cols) :
                    B.Range(col_offset, this_num_cols,
                            // ...
      this_block.AddMatMat(alpha, A_part, transA, B_part, transB, beta);
      row_offset += this_num_rows;
      col_offset += this_num_cols;
    // ...
    KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
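In formulas, the per-block update carried out by both paths is the following; this is only a restatement of the code above, written out for the no-transpose case.

% Block b of *this, call it M_b, occupies rows r_b .. r_b + m_b - 1 and
% columns c_b .. c_b + n_b - 1.  With transA = transB = kNoTrans:
\[
  M_b \;\leftarrow\; \alpha \, A_{[\,r_b : r_b + m_b,\ :\,]} \; B_{[\,:\, ,\ c_b : c_b + n_b\,]} \;+\; \beta\, M_b ,
  \qquad b = 1, \dots, N ,
\]
% every entry of the product A B that lies outside all the blocks is discarded.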
// MaxBlockCols() (fragment).
  for (size_t i = 0; i < block_data_.size(); i++)
    max_cols = std::max(max_cols, block_data_[i].num_cols);
  return max_cols;

// MaxBlockRows(): data_ is allocated with MaxBlockRows() rows.
  return data_.NumRows();
// CopyFromMat(M): copies elements within the block structure from M,
// discarding others (fragment).
        this_num_cols = this_block.NumCols();
    // ... (elided: copy the matching sub-range of M into this_block) ...
                                col_offset, this_num_cols);
    // ...
    row_offset += this_num_rows;
    col_offset += this_num_cols;
  // ...
  KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat) {
  // ... (elided: text mode, binary == false) ...
  mat.Write(out, binary);
  // ...
}

// Instantiations for float and double (fragments).
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<float> &mat);
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<double> &mat);
 
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
 
 
MatrixIndexT Stride() const
 
struct CuBlockMatrixData_ CuBlockMatrixData
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
 
MatrixIndexT MaxBlockRows() const
 
CuBlockMatrix & operator=(const CuBlockMatrix &other)
Assignment operator. 
 
std::vector< BlockMatrixData > block_data_
 
void FreeCudaData()
If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL. 
 
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
 
void CopyFromMat(const CuMatrix< Real > &M)
Copies elements within the block structure from matrix M, discarding others. 
 
CuSubMatrix< Real > Range(const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
 
 
The class CuBlockMatrix holds a vector of objects of type CuMatrix, say, M_1, M_2, ...
 
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
 
int Peek(std::istream &is, bool binary)
Peek consumes whitespace (if binary == false) and then returns the peek() value of the stream...
 
 
void AddMatMat(BaseFloat alpha, const CuMatrix< Real > &A, MatrixTransposeType transA, const CuMatrix< Real > &B, MatrixTransposeType transB, BaseFloat beta)
Does *this = alpha A B + beta * *this, discarding elements of the product outside the block structure... (a usage sketch appears after this reference list).
 
void SetCudaData()
If using GPU, allocate and set cu_data_ on the GPU to reflect "data_". 
 
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
 
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C. 
 
This class is used for a piece of a CuMatrix. 
 
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters. 
 
const Real * Data() const
Return data pointer (const). 
 
MatrixIndexT NumCols() const
 
#define KALDI_ASSERT(cond)
 
void Swap(CuBlockMatrix *other)
 
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
 
void Read(std::istream &is, bool binary)
 
MatrixIndexT NumRows() const
Dimensions. 
 
MatrixIndexT MaxBlockCols() const
 
void Destroy()
Frees and deinitializes everything. 
 
const CuSubMatrix< Real > Block(MatrixIndexT b) const
 
void Write(std::ostream &os, bool binary) const
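To tie the AddMatMat entry above to a call site, a small illustrative sketch; it is not taken from the Kaldi sources, and the function name and dimensions are chosen only to satisfy the dimension checks shown in the listing.

#include <vector>
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-block-matrix.h"

// C is block-diagonal with blocks of sizes 2x3 and 4x1, so C is 6x4.
// AddMatMat requires A.NumRows() == C.NumRows(), B.NumCols() == C.NumCols(),
// and A.NumCols() == B.NumRows(); only the entries of A*B that fall inside
// the blocks of C are kept.
void AddMatMatSketch() {
  using namespace kaldi;
  std::vector<CuMatrix<BaseFloat> > blocks(2);
  blocks[0].Resize(2, 3);
  blocks[1].Resize(4, 1);
  CuBlockMatrix<BaseFloat> C(blocks);    // 6 x 4

  CuMatrix<BaseFloat> A(6, 5), B(5, 4);  // inner dimension 5 is arbitrary
  A.SetRandn();
  B.SetRandn();

  // C = 1.0 * A * B + 0.0 * C, restricted to the block structure of C.
  C.AddMatMat(1.0, A, kNoTrans, B, kNoTrans, 0.0);
}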