cu-block-matrix.cc
Go to the documentation of this file.
1 // cudamatrix/cu-block-matrix.cc
2 
3 // Copyright 2013 Johns Hopkins University (author: Daniel Povey)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 
21 #if HAVE_CUDA == 1
22 #include <cuda_runtime_api.h>
23 #include <cublas_v2.h>
24 #endif
25 
26 #include <algorithm>
27 #include "base/timer.h"
29 #include "cudamatrix/cu-matrix.h"
30 #include "cudamatrix/cu-device.h"
31 
32 namespace kaldi {
33 
34 template<class Real>
36 #if HAVE_CUDA == 1
37  cu_data_ = NULL;
38 #endif
39 }
40 
41 template<class Real>
43 #if HAVE_CUDA == 1
44  cu_data_ = NULL;
45 #endif
46  block_data_.resize(data.size());
47  MatrixIndexT row_offset = 0, col_offset = 0, max_num_rows = 0;
48  for (size_t b = 0; b < data.size(); b++) {
49  MatrixIndexT num_rows = data[b].NumRows(), num_cols = data[b].NumCols();
50  KALDI_ASSERT(num_rows > 0 && num_cols > 0);
51  BlockMatrixData block_data;
52  block_data.num_rows = num_rows;
53  block_data.num_cols = num_cols;
54  block_data.row_offset = row_offset;
55  block_data.col_offset = col_offset;
56  row_offset += num_rows;
57  col_offset += num_cols;
58  max_num_rows = std::max(max_num_rows, num_rows);
59  block_data_[b] = block_data;
60  }
61  num_rows_ = row_offset;
62  data_.Resize(max_num_rows, col_offset);
63  for (int32 b = 0; b < NumBlocks(); b++)
64  Block(b).CopyFromMat(data[b]);
65  SetCudaData();
66 }
67 
68 
69 template<class Real>
71  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
72  const BlockMatrixData &block_data = block_data_[b];
73  return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
74  block_data.col_offset, block_data.num_cols);
75 }
76 
77 template<class Real>
79  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
80  BlockMatrixData &block_data = block_data_[b];
81  return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
82  block_data.col_offset, block_data.num_cols);
83 }
84 
85 
86 template<class Real>
88  data_(other.data_), block_data_(other.block_data_), num_rows_(other.num_rows_) {
89 #if HAVE_CUDA == 1
90  cu_data_ = NULL;
91 #endif
92  SetCudaData();
93 }
94 
95 template<class Real>
97  FreeCudaData();
98  data_ = other.data_;
99  block_data_ = other.block_data_;
100  num_rows_ = other.num_rows_;
101  SetCudaData();
102  return *this;
103 }
104 
105 template<class Real>
107 #if HAVE_CUDA == 1
108  if (cu_data_ != NULL) {
109  if (CuDevice::Instantiate().Enabled()) {
110  CuDevice::Instantiate().Free(cu_data_);
111  cu_data_ = NULL;
112  } else {
113  KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but "
114  << "no GPU is enabled: likely code error.";
115  }
116  }
117 #endif
118 }
119 
120 
121 template<class Real>
123 #if HAVE_CUDA == 1
124  KALDI_ASSERT(cu_data_ == NULL);
125  if (block_data_.size() == 0) return; // Nothing to do.
126  if (CuDevice::Instantiate().Enabled()) {
127  CuTimer tim;
128  std::vector<CuBlockMatrixData> tmp_cu_data(NumBlocks());
129  int32 row_offset = 0, col_offset = 0;
130  for (size_t b = 0; b < NumBlocks(); b++) {
131  CuSubMatrix<Real> this_mat = Block(b);
132  CuBlockMatrixData &this_cu_data = tmp_cu_data[b];
133  this_cu_data.row_offset = row_offset;
134  this_cu_data.col_offset = col_offset;
135  this_cu_data.matrix_dim = this_mat.Dim();
136  this_cu_data.matrix_data = static_cast<void*>(this_mat.Data());
137  row_offset += this_mat.NumRows();
138  col_offset += this_mat.NumCols();
139  }
140  size_t size = NumBlocks() * sizeof(CuBlockMatrixData);
141  cu_data_ = static_cast<CuBlockMatrixData*>(
142  CuDevice::Instantiate().Malloc(size));
143  CU_SAFE_CALL(cudaMemcpyAsync(cu_data_, &(tmp_cu_data[0]), size,
144  cudaMemcpyHostToDevice, cudaStreamPerThread));
145  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
146  CuDevice::Instantiate().AccuProfile(__func__, tim);
147  }
148 #endif
149 }
150 
151 template<class Real>
153  data_.Swap(&other->data_);
154  block_data_.swap(other->block_data_);
155  std::swap(num_rows_, other->num_rows_);
156 #if HAVE_CUDA == 1
157  std::swap(cu_data_, other->cu_data_);
158 #endif
159 }
160 
161 template<class Real>
162 void CuBlockMatrix<Real>::Write(std::ostream &os, bool binary) const {
163  WriteToken(os, binary, "<CuBlockMatrix>");
164  int32 num_blocks = NumBlocks();
165  WriteBasicType(os, binary, num_blocks);
166  for (int32 b = 0; b < num_blocks; b++)
167  this->Block(b).Write(os, binary);
168  WriteToken(os, binary, "</CuBlockMatrix>");
169 }
170 
171 
172 template<class Real>
173 void CuBlockMatrix<Real>::Read(std::istream &is, bool binary) {
174  Destroy();
175  int i = Peek(is, binary);
176  std::vector<CuMatrix<Real> > data;
177  if (i != static_cast<int>('<')) {
178  // back-compatibility code so we can read the older format of
179  // MixtureProbComponent. This code should be deleted eventually.
180  int32 size;
181  ReadBasicType(is, binary, &size);
182  KALDI_ASSERT(size >= 0);
183  data.resize(size);
184  for (int32 i = 0; i < size; i++)
185  data[i].Read(is, binary);
186  } else {
187  ExpectToken(is, binary, "<CuBlockMatrix>");
188  int32 size;
189  ReadBasicType(is, binary, &size);
190  KALDI_ASSERT(size >= 0);
191  data.resize(size);
192  for (int32 i = 0; i < size; i++)
193  data[i].Read(is, binary);
194  ExpectToken(is, binary, "</CuBlockMatrix>");
195  }
196 
197  CuBlockMatrix<Real> block_mat(data); // initializer from std::vector<CuMatrix<Real> > does
198  // the main job of initialization.
199  this->Swap(&block_mat);
200 }
201 
202 template<class Real>
204  data_.Resize(0, 0);
205  block_data_.clear();
206  num_rows_ = 0;
207  FreeCudaData();
208 }
209 
210 // Does *this = alpha A B + beta * *this, discarding elements outside
211 // the block structure of the *this matrix.
212 template<class Real>
214  BaseFloat alpha,
215  const CuMatrix<Real> &A, MatrixTransposeType transA,
216  const CuMatrix<Real> &B, MatrixTransposeType transB,
217  BaseFloat beta) {
218  MatrixIndexT A_num_rows = A.NumRows(), A_num_cols = A.NumCols(),
219  A_row_stride = A.Stride(), A_col_stride = 1,
220  B_num_rows = B.NumRows(), B_num_cols = B.NumCols(),
221  B_row_stride = B.Stride(), B_col_stride = 1;
222  if (transA == kTrans) {
223  std::swap(A_num_rows, A_num_cols);
224  std::swap(A_row_stride, A_col_stride);
225  }
226  if (transB == kTrans) {
227  std::swap(B_num_rows, B_num_cols);
228  std::swap(B_row_stride, B_col_stride);
229  }
230  KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols()
231  && A_num_cols == B_num_rows);
232  if (NumBlocks() == 0) return; // empty matrix.
233 #if HAVE_CUDA == 1
234  if (CuDevice::Instantiate().Enabled()) {
235  CuTimer tim;
236 
237  // (x,y,z) dimensions are (block-id, row-of-block, col-of-block)
238  // First some logic to choose block dims...
239  // we assume (which we can, safely) that CU1DBLOCK is <= the max threads per block.
240  int32 x_blocksize = std::min(CU1DBLOCK, NumBlocks()); // x dim corresponds to block-idx.
241  int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols();
242  int32 y_blocksize = max_block_rows;
243  while (y_blocksize * x_blocksize > CU1DBLOCK || y_blocksize > CU2DBLOCK)
244  y_blocksize--;
245  int32 z_blocksize = max_block_cols;
246  while (z_blocksize * x_blocksize * y_blocksize > CU1DBLOCK || z_blocksize > CU2DBLOCK)
247  z_blocksize--;
248 
249  dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize);
250  dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize),
251  n_blocks(max_block_rows, y_blocksize),
252  n_blocks(max_block_cols, z_blocksize));
253  cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(),
254  A.Data(), A_num_cols, A_row_stride, A_col_stride,
255  B.Data(), B_row_stride, B_col_stride, alpha, beta);
256  CU_SAFE_CALL(cudaGetLastError());
257  CuDevice::Instantiate().AccuProfile(__func__, tim);
258  } else
259 #endif
260  {
261  int32 row_offset = 0, col_offset = 0;
262  for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
263  CuSubMatrix<Real> this_block = Block(b);
264  MatrixIndexT this_num_rows = this_block.NumRows(),
265  this_num_cols = this_block.NumCols();
266  CuSubMatrix<Real> A_part = (transA == kNoTrans ?
267  A.Range(row_offset, this_num_rows,
268  0, A.NumCols()) :
269  A.Range(0, A.NumRows(),
270  row_offset, this_num_rows)),
271  B_part = (transB == kNoTrans ?
272  B.Range(0, B.NumRows(),
273  col_offset, this_num_cols) :
274  B.Range(col_offset, this_num_cols,
275  0, B.NumCols()));
276  this_block.AddMatMat(alpha, A_part, transA, B_part, transB, beta);
277  row_offset += this_num_rows;
278  col_offset += this_num_cols;
279  }
280  KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
281  }
282 }
283 
284 template<class Real>
286  MatrixIndexT max_cols = 0;
287  for (size_t i = 0; i < block_data_.size(); i++)
288  max_cols = std::max(max_cols, block_data_[i].num_cols);
289  return max_cols;
290 }
291 
292 template<class Real>
294  return data_.NumRows();
295 }
296 
297 template<class Real>
299  KALDI_ASSERT(NumRows() == M.NumRows() && NumCols() == M.NumCols());
300  MatrixIndexT row_offset = 0, col_offset = 0;
301  for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
302  CuSubMatrix<Real> this_block = Block(b);
303  MatrixIndexT this_num_rows = this_block.NumRows(),
304  this_num_cols = this_block.NumCols();
305  const CuSubMatrix<Real> src(M, row_offset, this_num_rows,
306  col_offset, this_num_cols);
307  this_block.CopyFromMat(src);
308  row_offset += this_num_rows;
309  col_offset += this_num_cols;
310  }
311  KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
312 }
313 
317 template<typename Real>
318 std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat) {
319  bool binary = false;
320  mat.Write(out, binary);
321  return out;
322 }
323 // instantiate the template
324 template
325 std::ostream &operator << (std::ostream &out, const CuBlockMatrix<float> &mat);
326 template
327 std::ostream &operator << (std::ostream &out, const CuBlockMatrix<double> &mat);
328 
329 // Instantiate the class for float and double.
330 template class CuBlockMatrix<float>;
331 template class CuBlockMatrix<double>;
332 
333 } // namespace kaldi
int32_cuda row_offset
Definition: cu-matrixdim.h:69
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
MatrixIndexT Stride() const
Definition: cu-matrix.h:217
struct CuBlockMatrixData_ CuBlockMatrixData
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
MatrixIndexT MaxBlockRows() const
CuBlockMatrix & operator=(const CuBlockMatrix &other)
Assignment operator.
std::vector< BlockMatrixData > block_data_
void FreeCudaData()
If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL.
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:55
void CopyFromMat(const CuMatrix< Real > &M)
Copies elements within the block structure from matrix M, discarding others.
CuSubMatrix< Real > Range(const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
Definition: cu-matrix.h:653
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
The class CuBlockMatrix holds a vector of objects of type CuMatrix, say, M_1, M_2, .
kaldi::int32 int32
This class represents a matrix that&#39;s stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
int Peek(std::istream &is, bool binary)
Peek consumes whitespace (if binary == false) and then returns the peek() value of the stream...
Definition: io-funcs.cc:145
uint64 data_
This structure is used in cu-block-matrix.h to store information about a block-diagonal matrix...
Definition: cu-matrixdim.h:68
CuMatrix< Real > data_
int32 MatrixIndexT
Definition: matrix-common.h:98
int32_cuda col_offset
Definition: cu-matrixdim.h:70
void AddMatMat(BaseFloat alpha, const CuMatrix< Real > &A, MatrixTransposeType transA, const CuMatrix< Real > &B, MatrixTransposeType transB, BaseFloat beta)
Does *this = alpha A B + beta * *this, discarding elements of the product outside the block structure...
void SetCudaData()
If using GPU, allocate and set cu_data_ on the GPU to reflect "data_".
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
Definition: io-funcs.cc:191
#define KALDI_ERR
Definition: kaldi-error.h:147
#define CU1DBLOCK
Definition: cu-matrixdim.h:57
#define CU2DBLOCK
Definition: cu-matrixdim.h:61
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C.
Definition: cu-matrix.cc:1291
This class is used for a piece of a CuMatrix.
Definition: matrix-common.h:70
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
Definition: io-funcs.cc:134
MatrixDim matrix_dim
Definition: cu-matrixdim.h:71
const Real * Data() const
Return data pointer (const).
Definition: cu-matrix.h:746
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void Swap(CuBlockMatrix *other)
MatrixTransposeType
Definition: matrix-common.h:32
::MatrixDim Dim() const
Definition: cu-matrix.h:221
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:34
void Read(std::istream &is, bool binary)
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215
MatrixIndexT MaxBlockCols() const
void Destroy()
Frees and deinitializes everything.
const CuSubMatrix< Real > Block(MatrixIndexT b) const
void Write(std::ostream &os, bool binary) const