cu-packed-matrix.cc
Go to the documentation of this file.
1 // cudamatrix/cu-packed-matrix.cc
2 
3 // Copyright 2009-2013 Johns Hopkins University (author: Daniel Povey)
4 // Karel Vesely
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 
22 
23 #if HAVE_CUDA == 1
24 #include <cuda_runtime_api.h>
25 #include <cublas_v2.h>
26 #endif
27 
28 #include "base/timer.h"
29 #include "cudamatrix/cu-common.h"
30 #include "cudamatrix/cu-vector.h"
31 #include "cudamatrix/cu-device.h"
32 #include "cudamatrix/cu-kernels.h"
33 #include "cudamatrix/cu-math.h"
36 
37 namespace kaldi {
38 
39 template<typename Real>
41  MatrixResizeType resize_type) {
42  // This code does not currently support the other resize_type options.
43  KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined);
44 
45  if (this->num_rows_ == rows) {
46  if (resize_type == kSetZero) this->SetZero();
47  return;
48  }
49 
50  if (this->num_rows_ != 0)
51  this->Destroy();
52  if (rows == 0) return;
53 #if HAVE_CUDA == 1
54  CuDevice &device = CuDevice::Instantiate();
55  if (device.Enabled()) {
56  CuTimer tim;
57  this->num_rows_ = rows;
58  size_t nr = static_cast<size_t>(num_rows_),
59  num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
60  this->data_ = static_cast<Real*>(device.Malloc(num_bytes));
61 
62  if (resize_type == kSetZero) this->SetZero();
63  device.AccuProfile("CuPackedMatrix::Resize", tim);
64  } else
65 #endif
66  { // Let the initializer of SpMatrix<Real> handle the allocation,
67  // and then just do Swap which will switch the pointers.
68  // This wastes a few instructions but is simple to code.
69  SpMatrix<Real> mat(rows, resize_type);
70  this->Swap(&mat);
71  }
72 }
73 
74 template<typename Real>
76  if (num_rows_ != 0) {
77  MatrixIndexT size = num_rows_ * (num_rows_ + 1) / 2;
78  CuSubVector<Real> tmp(data_, size);
79  CuRand<Real> rand;
80  rand.RandGaussian(&tmp);
81  }
82 }
83 
84 template<typename Real>
86 #if HAVE_CUDA == 1
87  if (CuDevice::Instantiate().Enabled()) {
88  if (this->data_ != NULL) {
89  CuDevice::Instantiate().Free(this->data_);
90  }
91  } else
92 #endif
93  {
94  if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_);
95  }
96  this->data_ = NULL;
97  this->num_rows_ = 0;
98 }
99 
100 template<typename Real>
102 #if HAVE_CUDA == 1
103  if (CuDevice::Instantiate().Enabled()) {
104  if (this->num_rows_ == 0) {
105  if (mat->num_rows_ != 0) {
106  // *this is empty, but mat is nonempty.
107  Resize(mat->num_rows_, kUndefined);
108  CopyFromPacked(*mat);
109  mat->Resize(0);
110  }
111  // else both are empty.
112  } else { // *this is nonempty.
113  if (mat->num_rows_ != 0) {
114  // Both *this and *mat are nonempty. Recurse to simpler cases.
115  // this could be done more efficiently in the case where
116  // the size does not change.
117  PackedMatrix<Real> temp;
118  this->Swap(&temp); // now temp is full, *this is empty.
119  mat->Swap(&temp); // now mat has data from *this, temp has
120  // data from mat.
121  this->Swap(&temp); // copy data in mat to *this, which is now empty.
122  } else { // *this is full but *mat is empty.
123  mat->Resize(this->num_rows_, kUndefined);
124  this->CopyToPacked(mat);
125  this->Destroy();
126  }
127  }
128  } else
129 #endif
130  {
131  std::swap(mat->data_, this->data_);
132  std::swap(mat->num_rows_, this->num_rows_);
133  }
134 }
135 
136 template<typename Real>
138  KALDI_ASSERT(src.NumRows() == num_rows_);
139 #if HAVE_CUDA == 1
140  if (CuDevice::Instantiate().Enabled()) {
141  if (num_rows_ == 0) return; // Nothing to do.
142  CuTimer tim;
143  size_t nr = static_cast<size_t>(num_rows_),
144  num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
145 
146  CU_SAFE_CALL(
147  cudaMemcpyAsync(data_, src.data_, num_bytes, cudaMemcpyDeviceToDevice,
148  cudaStreamPerThread));
149  CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked1",
150  tim);
151  } else
152 #endif
153  {
154  Mat().CopyFromPacked(src.Mat());
155  }
156 }
157 
158 template<typename Real>
160  KALDI_ASSERT(src.NumRows() == num_rows_);
161 #if HAVE_CUDA == 1
162  if (CuDevice::Instantiate().Enabled()) {
163  if (num_rows_ == 0) return; // Nothing to do.
164  CuTimer tim;
165  CU_SAFE_CALL(cudaMemcpyAsync(data_, src.data_, src.SizeInBytes(),
166  cudaMemcpyHostToDevice, cudaStreamPerThread));
167  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
168  CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked2", tim);
169  } else
170 #endif
171  {
172  Mat().CopyFromPacked(src);
173  //memcpy(data_, src.Data(), SizeInBytes());
174  }
175 }
176 
177 template<typename Real>
179  KALDI_ASSERT(dst->NumRows() == NumRows());
180 
181 #if HAVE_CUDA == 1
182  if (CuDevice::Instantiate().Enabled()) {
183  if (num_rows_ == 0) return; // Nothing to do.
184  CuTimer tim;
185  size_t nr = static_cast<size_t>(num_rows_),
186  num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
187 
188  CU_SAFE_CALL(cudaMemcpyAsync(dst->data_, data_, num_bytes,
189  cudaMemcpyDeviceToHost, cudaStreamPerThread));
190  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
191  CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyToPackedD2H", tim);
192  } else
193 #endif
194  {
195  //memcpy(data_, dst->Data(), SizeInBytes());
196  dst->CopyFromPacked(Mat());
197  }
198 }
199 
200 /*
201 template<typename Real>
202 void CuPackedMatrix<Real>::CopyRowsFromPacked(int32 r, const CuPackedMatrix<Real> &src, int32 src_ro, int32 dst_ro) {
203  KALDI_ASSERT(r+src_ro <= src.NumRows());
204  KALDI_ASSERT(r+dst_ro <= NumRows());
205  KALDI_ASSERT(NumCols() == src.NumCols());
206 
207  #if HAVE_CUDA == 1
208  if (CuDevice::Instantiate().Enabled()) {
209  CuTimer tim;
210 
211  MatrixIndexT dst_pitch = stride_*sizeof(Real);
212  MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
213  MatrixIndexT width = src.NumCols()*sizeof(Real);
214 
215  const Real *p_src = src.Data() + src_ro*src.Stride();
216  Real *p_dst = data_ + dst_ro*stride_;
217 
218  CU_SAFE_CALL(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, r, cudaMemcpyDeviceToDevice));
219 
220  CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D", tim);
221  } else
222  #endif
223  {
224  memcpy(Data()+dst_ro*stride_, src.Data()+src_ro*src.Stride(), r*stride_*sizeof(Real));
225  }
226 } */
227 
228 
229 
230 template<typename Real>
231 void CuPackedMatrix<Real>::Read(std::istream &is, bool binary) {
232  PackedMatrix<Real> temp;
233  temp.Read(is, binary);
234  Destroy();
235  Swap(&temp);
236 }
237 
238 template<typename Real>
239 void CuPackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
240  PackedMatrix<Real> temp(this->num_rows_, kUndefined);
241  this->CopyToPacked(&temp);
242  temp.Write(os, binary);
243 }
244 
245 template<typename Real>
247  #if HAVE_CUDA == 1
248  if (CuDevice::Instantiate().Enabled()) {
249  CuTimer tim;
250  size_t nr = static_cast<size_t>(num_rows_),
251  num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
252 
253  CU_SAFE_CALL(cudaMemsetAsync(reinterpret_cast<void*>(this->data_), 0,
254  num_bytes, cudaStreamPerThread));
255  CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetZero", tim);
256  } else
257  #endif
258  {
259  Mat().SetZero();
260  }
261 }
262 
263 template<typename Real>
265  Real result = 0.0;
266 #if HAVE_CUDA == 1
267  if (CuDevice::Instantiate().Enabled()) {
268  if (num_rows_ == 0) return 0.0;
269  CuVector<Real> tmp(num_rows_, kUndefined);
270  tmp.CopyDiagFromPacked(*this);
271  return tmp.Sum();
272  } else
273 #endif
274  {
275  result = Mat().Trace();
276  }
277  return result;
278 }
279 
280 template<typename Real>
282 #if HAVE_CUDA == 1
283  if (CuDevice::Instantiate().Enabled()) {
284  if (num_rows_ == 0) return;
285  CuTimer tim;
286  int dimBlock(CU1DBLOCK);
287  int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
288  cuda_set_diag_packed(dimGrid,dimBlock,data_,alpha,num_rows_);
289  CU_SAFE_CALL(cudaGetLastError());
290  CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetDiag", tim);
291  } else
292 #endif
293  {
294  Mat().SetDiag(alpha);
295  }
296 }
297 
298 template<typename Real>
299 void CuPackedMatrix<Real>::Scale(Real alpha) {
300 #if HAVE_CUDA == 1
301  if (CuDevice::Instantiate().Enabled()) {
302  CuTimer tim;
303  size_t nr = static_cast<size_t>(num_rows_),
304  num_elements = ((nr * (nr+1)) / 2);
305  CUBLAS_SAFE_CALL(cublas_scal(GetCublasHandle(), num_elements, alpha, data_, 1));
306 
307  CuDevice::Instantiate().AccuProfile("CuPackedMatrix::Scale", tim);
308  } else
309 #endif
310  {
311  Mat().Scale(alpha);
312  }
313 }
314 
315 template<typename Real>
317 #if HAVE_CUDA == 1
318  if (CuDevice::Instantiate().Enabled()) {
319  CuTimer tim;
320  int dimBlock(CU1DBLOCK);
321  int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
322  cuda_scale_diag_packed(dimGrid,dimBlock,data_,alpha,num_rows_);
323  CU_SAFE_CALL(cudaGetLastError());
324  CuDevice::Instantiate().AccuProfile("CuPackedMatrix::ScaleDiag", tim);
325  } else
326 #endif
327  {
328  Mat().ScaleDiag(alpha);
329  }
330 }
331 
332 template<typename Real>
333 void CuPackedMatrix<Real>::AddPacked(const Real alpha, const CuPackedMatrix<Real> &M) {
334  KALDI_ASSERT(num_rows_ == M.NumRows());
335 #if HAVE_CUDA == 1
336  if (CuDevice::Instantiate().Enabled()) {
337  if (num_rows_ == 0) return;
338  CuTimer tim;
339  size_t nr = num_rows_,
340  sz = (nr * (nr + 1)) / 2;
341  cublas_axpy(GetCublasHandle(), sz, alpha, M.Data(), 1, data_, 1);
342  CuDevice::Instantiate().AccuProfile("CuPackedMatrix::AddPacked", tim);
343  } else
344 #endif
345  {
346  Mat().AddPacked(alpha, M.Mat());
347  }
348 }
349 
350 template<typename Real>
352 #if HAVE_CUDA == 1
353  if (CuDevice::Instantiate().Enabled()) {
354  if (num_rows_ == 0) return;
355  CuTimer tim;
356  int dimBlock(CU1DBLOCK);
357  int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
358  cuda_add_diag_packed(dimGrid,dimBlock,data_,r,num_rows_);
359  CU_SAFE_CALL(cudaGetLastError());
360  CuDevice::Instantiate().AccuProfile("CuPackedMatrix::AddToDiag", tim);
361  } else
362 #endif
363  {
364  // TODO
365  Mat().AddToDiag(r);
366  }
367 }
368 
369 template<typename Real>
371 #if HAVE_CUDA == 1
372  if (CuDevice::Instantiate().Enabled()) {
373  this->SetZero();
374  this->SetDiag(1.0);
375  } else
376 #endif
377  {
378  Mat().SetUnit();
379  }
380 }
381 
385 template<typename Real>
386 std::ostream &operator << (std::ostream &out, const CuPackedMatrix<Real> &mat) {
387  PackedMatrix<Real> temp(mat.NumRows());
388  mat.CopyToPacked(&temp);
389  out << temp;
390  return out;
391 }
392 
393 // instantiate the template
394 template
395 std::ostream &operator << (std::ostream &out, const CuPackedMatrix<float> &mat);
396 template
397 std::ostream &operator << (std::ostream &out, const CuPackedMatrix<double> &mat);
398 
399 
400 // Instantiate class CuPackedMatrix for float and double.
401 template class CuPackedMatrix<float>;
402 template class CuPackedMatrix<double>;
403 
404 
405 } // namespace kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
Packed symetric matrix class.
Definition: matrix-common.h:62
void CopyDiagFromPacked(const CuPackedMatrix< Real > &M)
Extracts the diagonal of a packed matrix M; works for Sp or Tp.
Definition: cu-vector.cc:1177
MatrixResizeType
Definition: matrix-common.h:37
const PackedMatrix< Real > & Mat() const
void Swap(CuPackedMatrix< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
void Read(std::istream &in, bool binary, bool add=false)
void Write(std::ostream &out, bool binary) const
void AddPacked(const Real alpha, const CuPackedMatrix< Real > &M)
void SetDiag(Real alpha)
< Set to random values of a normal distribution
MatrixIndexT NumRows() const
Real Sum() const
Definition: cu-vector.cc:297
void CopyToPacked(PackedMatrix< Real > *dst) const
void AddToDiag(Real r)
< Set the diagonal value to alpha
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
MatrixIndexT NumRows() const
void SetRandn()
< Set to unit matrix.
uint64 data_
MatrixIndexT num_rows_
void RandGaussian(CuMatrixBase< Real > *tgt)
Fill with Normal random numbers,.
Definition: cu-rand.cc:116
int32 MatrixIndexT
Definition: matrix-common.h:98
Packed matrix: base class for triangular and symmetric matrices.
Definition: matrix-common.h:64
void Read(std::istream &in, bool binary)
void Swap(PackedMatrix< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
#define CU1DBLOCK
Definition: cu-matrixdim.h:57
#define KALDI_MEMALIGN_FREE(x)
Definition: kaldi-utils.h:60
void Scale(Real alpha)
void SetUnit()
< Set to zero
void ScaleDiag(Real alpha)
void CopyFromPacked(const PackedMatrix< OtherReal > &orig)
size_t SizeInBytes() const
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
Matrix for CUDA computing.
Definition: matrix-common.h:75
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type=kSetZero)
Set packed matrix to a specified size (can be zero).
void Write(std::ostream &out, bool binary) const
void CopyFromPacked(const CuPackedMatrix< Real > &src)
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type=kSetZero)
Set packed matrix to a specified size (can be zero).