doc/cu-block-matrix_8cc_source.html

 // cudamatrix/cu-block-matrix.cc

 // Copyright 2013      Johns Hopkins University (author: Daniel Povey)

 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //  http://www.apache.org/licenses/LICENSE-2.0
 //
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.


 #if HAVE_CUDA == 1
 #include <cuda_runtime_api.h>
 #include <cublas_v2.h>
 #endif

 #include <algorithm>
 #include "base/timer.h"
 #include "cudamatrix/cu-block-matrix.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-device.h"

 namespace kaldi {

 template<class Real>
 CuBlockMatrix<Real>::CuBlockMatrix() {
 #if HAVE_CUDA == 1
   cu_data_ = NULL;
 #endif
 }

 template<class Real>
 CuBlockMatrix<Real>::CuBlockMatrix(const std::vector<CuMatrix<Real> >&data) {
 #if HAVE_CUDA == 1
   cu_data_ = NULL;
 #endif
   block_data_.resize(data.size());
   MatrixIndexT row_offset = 0, col_offset = 0, max_num_rows = 0;
   for (size_t b = 0; b < data.size(); b++) {
     MatrixIndexT num_rows = data[b].NumRows(), num_cols = data[b].NumCols();
     KALDI_ASSERT(num_rows > 0 && num_cols > 0);
     BlockMatrixData block_data;
     block_data.num_rows = num_rows;
     block_data.num_cols = num_cols;
     block_data.row_offset = row_offset;
     block_data.col_offset = col_offset;
     row_offset += num_rows;
     col_offset += num_cols;
     max_num_rows = std::max(max_num_rows, num_rows);
     block_data_[b] = block_data;
   }
   num_rows_ = row_offset;
   data_.Resize(max_num_rows, col_offset);
   for (int32 b = 0; b < NumBlocks(); b++)
     Block(b).CopyFromMat(data[b]);
   SetCudaData();
 }


 template<class Real>
 const CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) const {
   KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
   const BlockMatrixData &block_data = block_data_[b];
   return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
                            block_data.col_offset, block_data.num_cols);
 }

 template<class Real>
 CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) {
   KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
   BlockMatrixData &block_data = block_data_[b];
   return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
                            block_data.col_offset, block_data.num_cols);
 }


 template<class Real>
 CuBlockMatrix<Real>::CuBlockMatrix(const CuBlockMatrix<Real> &other):
     data_(other.data_), block_data_(other.block_data_), num_rows_(other.num_rows_) {
 #if HAVE_CUDA == 1
   cu_data_ = NULL;
 #endif
   SetCudaData();
 }

 template<class Real>
 CuBlockMatrix<Real> &CuBlockMatrix<Real>::operator =(const CuBlockMatrix<Real> &other) {
   FreeCudaData();
   data_ = other.data_;
   block_data_ = other.block_data_;
   num_rows_ = other.num_rows_;
   SetCudaData();
   return *this;
 }

 template<class Real>
 void CuBlockMatrix<Real>::FreeCudaData() {
 #if HAVE_CUDA == 1
   if (cu_data_ != NULL) {
     if (CuDevice::Instantiate().Enabled()) {
       CuDevice::Instantiate().Free(cu_data_);
       cu_data_ = NULL;
     } else {
       KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but "
                 << "no GPU is enabled: likely code error.";
     }
   }
 #endif
 }


 template<class Real>
 void CuBlockMatrix<Real>::SetCudaData() {
 #if HAVE_CUDA == 1
   KALDI_ASSERT(cu_data_ == NULL);
   if (block_data_.size() == 0) return; // Nothing to do.
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     std::vector<CuBlockMatrixData> tmp_cu_data(NumBlocks());
     int32 row_offset = 0, col_offset = 0;
     for (size_t b = 0; b < NumBlocks(); b++) {
       CuSubMatrix<Real> this_mat = Block(b);
       CuBlockMatrixData &this_cu_data = tmp_cu_data[b];
       this_cu_data.row_offset = row_offset;
       this_cu_data.col_offset = col_offset;
       this_cu_data.matrix_dim = this_mat.Dim();
       this_cu_data.matrix_data = static_cast<void*>(this_mat.Data());
       row_offset += this_mat.NumRows();
       col_offset += this_mat.NumCols();
     }
     size_t size = NumBlocks() * sizeof(CuBlockMatrixData);
     cu_data_ = static_cast<CuBlockMatrixData*>(
         CuDevice::Instantiate().Malloc(size));
     CU_SAFE_CALL(cudaMemcpyAsync(cu_data_, &(tmp_cu_data[0]), size,
                                  cudaMemcpyHostToDevice, cudaStreamPerThread));
     CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   }
 #endif
 }

 template<class Real>
 void CuBlockMatrix<Real>::Swap(CuBlockMatrix<Real> *other) {
   data_.Swap(&other->data_);
   block_data_.swap(other->block_data_);
   std::swap(num_rows_, other->num_rows_);
 #if HAVE_CUDA == 1
   std::swap(cu_data_, other->cu_data_);
 #endif
 }

 template<class Real>
 void CuBlockMatrix<Real>::Write(std::ostream &os, bool binary) const {
   WriteToken(os, binary, "<CuBlockMatrix>");
   int32 num_blocks = NumBlocks();
   WriteBasicType(os, binary, num_blocks);
   for (int32 b = 0; b < num_blocks; b++)
     this->Block(b).Write(os, binary);
   WriteToken(os, binary, "</CuBlockMatrix>");
 }


 template<class Real>
 void CuBlockMatrix<Real>::Read(std::istream &is, bool binary) {
   Destroy();
   int i = Peek(is, binary);
   std::vector<CuMatrix<Real> > data;
   if (i != static_cast<int>('<')) {
     // back-compatibility code so we can read the older format of
     // MixtureProbComponent.  This code should be deleted eventually.
     int32 size;
     ReadBasicType(is, binary, &size);
     KALDI_ASSERT(size >= 0);
     data.resize(size);
     for (int32 i = 0; i < size; i++)
       data[i].Read(is, binary);
   } else {
     ExpectToken(is, binary, "<CuBlockMatrix>");
     int32 size;
     ReadBasicType(is, binary, &size);
     KALDI_ASSERT(size >= 0);
     data.resize(size);
     for (int32 i = 0; i < size; i++)
       data[i].Read(is, binary);
     ExpectToken(is, binary, "</CuBlockMatrix>");
   }

   CuBlockMatrix<Real> block_mat(data); // initializer from std::vector<CuMatrix<Real> > does
   // the main job of initialization.
   this->Swap(&block_mat);
 }

 template<class Real>
 void CuBlockMatrix<Real>::Destroy() {
   data_.Resize(0, 0);
   block_data_.clear();
   num_rows_ = 0;
   FreeCudaData();
 }

 // Does *this = alpha A B + beta * *this, discarding elements outside
 // the block structure of the *this matrix.
 template<class Real>
 void CuBlockMatrix<Real>::AddMatMat(
     BaseFloat alpha,
     const CuMatrix<Real> &A, MatrixTransposeType transA,
     const CuMatrix<Real> &B, MatrixTransposeType transB,
     BaseFloat beta) {
   MatrixIndexT A_num_rows = A.NumRows(), A_num_cols = A.NumCols(),
       A_row_stride = A.Stride(), A_col_stride = 1,
       B_num_rows = B.NumRows(), B_num_cols = B.NumCols(),
       B_row_stride = B.Stride(), B_col_stride = 1;
   if (transA == kTrans) {
     std::swap(A_num_rows, A_num_cols);
     std::swap(A_row_stride, A_col_stride);
   }
   if (transB == kTrans) {
     std::swap(B_num_rows, B_num_cols);
     std::swap(B_row_stride, B_col_stride);
   }
   KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols()
                && A_num_cols == B_num_rows);
   if (NumBlocks() == 0) return; // empty matrix.
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;

     // (x,y,z) dimensions are (block-id, row-of-block, col-of-block)
     // First some logic to choose block dims...
     // we assume (which we can, safely) that CU1DBLOCK is <= the max threads per block.
     int32 x_blocksize = std::min(CU1DBLOCK, NumBlocks()); // x dim corresponds to block-idx.
     int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols();
     int32 y_blocksize = max_block_rows;
     while (y_blocksize * x_blocksize > CU1DBLOCK || y_blocksize > CU2DBLOCK)
       y_blocksize--;
     int32 z_blocksize = max_block_cols;
     while (z_blocksize * x_blocksize * y_blocksize > CU1DBLOCK || z_blocksize > CU2DBLOCK)
       z_blocksize--;

     dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize);
     dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize),
                  n_blocks(max_block_rows, y_blocksize),
                  n_blocks(max_block_cols, z_blocksize));
     cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(),
                            A.Data(), A_num_cols, A_row_stride, A_col_stride,
                            B.Data(), B_row_stride, B_col_stride, alpha, beta);
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     int32 row_offset = 0, col_offset = 0;
     for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
       CuSubMatrix<Real> this_block = Block(b);
       MatrixIndexT this_num_rows = this_block.NumRows(),
           this_num_cols = this_block.NumCols();
       CuSubMatrix<Real> A_part = (transA == kNoTrans ?
                                   A.Range(row_offset, this_num_rows,
                                           0, A.NumCols()) :
                                   A.Range(0, A.NumRows(),
                                           row_offset, this_num_rows)),
           B_part = (transB == kNoTrans ?
                     B.Range(0, B.NumRows(),
                             col_offset, this_num_cols) :
                     B.Range(col_offset, this_num_cols,
                             0, B.NumCols()));
       this_block.AddMatMat(alpha, A_part, transA, B_part, transB, beta);
       row_offset += this_num_rows;
       col_offset += this_num_cols;
     }
     KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
   }
 }

 template<class Real>
 MatrixIndexT CuBlockMatrix<Real>::MaxBlockCols() const {
   MatrixIndexT max_cols = 0;
   for (size_t i = 0; i < block_data_.size(); i++)
     max_cols = std::max(max_cols, block_data_[i].num_cols);
   return max_cols;
 }

 template<class Real>
 MatrixIndexT CuBlockMatrix<Real>::MaxBlockRows() const {
   return data_.NumRows();
 }

 template<class Real>
 void CuBlockMatrix<Real>::CopyFromMat(const CuMatrix<Real> &M) {
   KALDI_ASSERT(NumRows() == M.NumRows() && NumCols() == M.NumCols());
   MatrixIndexT row_offset = 0, col_offset = 0;
   for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
     CuSubMatrix<Real> this_block = Block(b);
     MatrixIndexT this_num_rows = this_block.NumRows(),
         this_num_cols = this_block.NumCols();
     const CuSubMatrix<Real> src(M, row_offset, this_num_rows,
                                 col_offset, this_num_cols);
     this_block.CopyFromMat(src);
     row_offset += this_num_rows;
     col_offset += this_num_cols;
   }
   KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
 }

 template<typename Real>
 std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat) {
   bool binary = false;
   mat.Write(out, binary);
   return out;
 }
 // instantiate the template
 template
 std::ostream &operator << (std::ostream &out, const CuBlockMatrix<float> &mat);
 template
 std::ostream &operator << (std::ostream &out, const CuBlockMatrix<double> &mat);

 // Instantiate the class for float and double.
 template class CuBlockMatrix<float>;
 template class CuBlockMatrix<double>;

 } // namespace kaldi
CuBlockMatrixData_::row_offset
int32_cuda row_offset
Definition: cu-matrixdim.h:69

kaldi::CuMatrixBase::CopyFromMat
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344

kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20

kaldi::CuMatrixBase::Stride
MatrixIndexT Stride() const
Definition: cu-matrix.h:217

CuBlockMatrixData
struct CuBlockMatrixData_ CuBlockMatrixData
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...

kaldi::CuBlockMatrix::MaxBlockRows
MatrixIndexT MaxBlockRows() const
Definition: cu-block-matrix.cc:293

kaldi::CuBlockMatrix::operator=
CuBlockMatrix & operator=(const CuBlockMatrix &other)
Assignment operator.
Definition: cu-block-matrix.cc:96

kaldi::CuBlockMatrix::block_data_
std::vector< BlockMatrixData > block_data_
Definition: cu-block-matrix.h:135

cu-device.h

kaldi::CuBlockMatrix::FreeCudaData
void FreeCudaData()
If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL.
Definition: cu-block-matrix.cc:106

kaldi::ReadBasicType
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:55

kaldi::CuBlockMatrix::CopyFromMat
void CopyFromMat(const CuMatrix< Real > &M)
Copies elements within the block structure from matrix M, discarding others.
Definition: cu-block-matrix.cc:298

kaldi::CuMatrixBase::Range
CuSubMatrix< Real > Range(const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
Definition: cu-matrix.h:653

kaldi::CuBlockMatrix::BlockMatrixData::num_cols
MatrixIndexT num_cols
Definition: cu-block-matrix.h:115

kaldi::swap
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
Definition: basic-filebuf.h:275

kaldi::CuBlockMatrix
The class CuBlockMatrix holds a vector of objects of type CuMatrix, say, M_1, M_2, .
Definition: cu-block-matrix.h:51

kaldi::int32
kaldi::int32 int32
Definition: online-tcp-source.cc:27

cu-matrix.h

kaldi::CuMatrix
This class represents a matrix that&#39;s stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71

cu-block-matrix.h

kaldi::Peek
int Peek(std::istream &is, bool binary)
Peek consumes whitespace (if binary == false) and then returns the peek() value of the stream...
Definition: io-funcs.cc:145

data_
uint64 data_
Definition: arpa-lm-compiler.cc:108

kaldi::CuBlockMatrix::BlockMatrixData::col_offset
MatrixIndexT col_offset
Definition: cu-block-matrix.h:117

CuBlockMatrixData_
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
Definition: cu-matrixdim.h:68

timer.h

kaldi::kTrans
Definition: matrix-common.h:33

kaldi::CuBlockMatrix::data_
CuMatrix< Real > data_
Definition: cu-block-matrix.h:110

kaldi::MatrixIndexT
int32 MatrixIndexT
Definition: matrix-common.h:98

CuBlockMatrixData_::col_offset
int32_cuda col_offset
Definition: cu-matrixdim.h:70

float

kaldi::CuBlockMatrix::AddMatMat
void AddMatMat(BaseFloat alpha, const CuMatrix< Real > &A, MatrixTransposeType transA, const CuMatrix< Real > &B, MatrixTransposeType transB, BaseFloat beta)
Does *this = alpha A B + beta * *this, discarding elements of the product outside the block structure...
Definition: cu-block-matrix.cc:213

kaldi::CuBlockMatrix::SetCudaData
void SetCudaData()
If using GPU, allocate and set cu_data_ on the GPU to reflect "data_".
Definition: cu-block-matrix.cc:122

kaldi::ExpectToken
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
Definition: io-funcs.cc:191

KALDI_ERR
#define KALDI_ERR
Definition: kaldi-error.h:147

CU1DBLOCK
#define CU1DBLOCK
Definition: cu-matrixdim.h:57

kaldi::kNoTrans
Definition: matrix-common.h:34

CU2DBLOCK
#define CU2DBLOCK
Definition: cu-matrixdim.h:61

kaldi::CuMatrixBase::AddMatMat
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C.
Definition: cu-matrix.cc:1291

kaldi::CuSubMatrix
This class is used for a piece of a CuMatrix.
Definition: matrix-common.h:70

kaldi::WriteToken
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
Definition: io-funcs.cc:134

CuBlockMatrixData_::matrix_data
void * matrix_data
Definition: cu-matrixdim.h:72

CuBlockMatrixData_::matrix_dim
MatrixDim matrix_dim
Definition: cu-matrixdim.h:71

kaldi::CuBlockMatrix::BlockMatrixData
Definition: cu-block-matrix.h:113

kaldi::CuMatrixBase::Data
const Real * Data() const
Return data pointer (const).
Definition: cu-matrix.h:746

rnnlm::i
int i
Definition: mikolov-rnnlm-lib.cc:66

kaldi::CuMatrixBase::NumCols
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216

KALDI_ASSERT
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

kaldi::CuBlockMatrix::BlockMatrixData::row_offset
MatrixIndexT row_offset
Definition: cu-block-matrix.h:116

kaldi::CuBlockMatrix::Swap
void Swap(CuBlockMatrix *other)
Definition: cu-block-matrix.cc:152

kaldi::MatrixTransposeType
MatrixTransposeType
Definition: matrix-common.h:32

kaldi::CuMatrixBase::Dim
::MatrixDim Dim() const
Definition: cu-matrix.h:221

kaldi::CuBlockMatrix::BlockMatrixData::num_rows
MatrixIndexT num_rows
Definition: cu-block-matrix.h:114

kaldi::WriteBasicType
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:34

kaldi::CuBlockMatrix::Read
void Read(std::istream &is, bool binary)
Definition: cu-block-matrix.cc:173

kaldi::CuBlockMatrix::num_rows_
MatrixIndexT num_rows_
Definition: cu-block-matrix.h:137

kaldi::CuMatrixBase::NumRows
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215

kaldi::CuBlockMatrix::MaxBlockCols
MatrixIndexT MaxBlockCols() const
Definition: cu-block-matrix.cc:285

kaldi::CuBlockMatrix::CuBlockMatrix
CuBlockMatrix()
Definition: cu-block-matrix.cc:35

kaldi::CuBlockMatrix::Destroy
void Destroy()
Frees and deinitializes everything.
Definition: cu-block-matrix.cc:203

kaldi::CuBlockMatrix::Block
const CuSubMatrix< Real > Block(MatrixIndexT b) const
Definition: cu-block-matrix.cc:70

kaldi::CuBlockMatrix::Write
void Write(std::ostream &os, bool binary) const
Definition: cu-block-matrix.cc:162