// cudamatrix/cu-matrix.cc
// (listing recovered from generated documentation of this file)
1 // cudamatrix/cu-matrix.cc
2 
3 // Copyright 2009-2012 Karel Vesely, Lucas Ondel
4 // 2013 Ehsan Variani
5 // 2013 Johns Hopkins University (author: Daniel Povey)
6 // 2013 Hainan Xu
7 // 2013 Xiaohui Zhang
8 // 2013-2015 Guoguo Chen
9 // 2016-2017 Shiyin Kang
10 // 2017 Hossein Hadian
11 // 2019 Yiwen Shao
12 
13 // See ../../COPYING for clarification regarding multiple authors
14 //
15 // Licensed under the Apache License, Version 2.0 (the "License");
16 // you may not use this file except in compliance with the License.
17 // You may obtain a copy of the License at
18 //
19 // http://www.apache.org/licenses/LICENSE-2.0
20 //
21 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
22 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
23 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
24 // MERCHANTABLITY OR NON-INFRINGEMENT.
25 // See the Apache 2 License for the specific language governing permissions and
26 // limitations under the License.
27 
28 
29 #if HAVE_CUDA == 1
30 #include <cuda_runtime_api.h>
31 #include <cublas_v2.h>
32 #endif
33 
34 #include "base/timer.h"
35 #include "cudamatrix/cu-common.h"
36 #include "cudamatrix/cu-vector.h"
37 #include "cudamatrix/cu-device.h"
38 #include "cudamatrix/cu-kernels.h"
39 #include "cudamatrix/cu-array.h"
40 #include "cudamatrix/cu-math.h"
46 
47 namespace kaldi {
48 
49 template<typename Real>
51  MatrixResizeType resize_type,
52  MatrixStrideType stride_type) {
53  // This code does not currently support the other resize_type options.
54  KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined);
55  if (rows * cols == 0) KALDI_ASSERT(rows == 0 && cols == 0);
56  if (this->num_rows_ == rows && this->num_cols_ == cols) {
57  if (resize_type == kSetZero) this->SetZero();
58  return;
59  }
60  if (this->num_rows_ != 0)
61  this->Destroy();
62  if (rows == 0) return;
63 #if HAVE_CUDA == 1
64  if (CuDevice::Instantiate().Enabled()) {
65  CuTimer tim;
66  MatrixIndexT row_bytes = cols * sizeof(Real);
67  size_t pitch;
68  if (stride_type == kDefaultStride) {
69  this->data_ = static_cast<Real*>(CuDevice::Instantiate().MallocPitch(
70  row_bytes, rows, &pitch));
71  this->num_rows_ = rows;
72  this->num_cols_ = cols;
73  this->stride_ = pitch / sizeof(Real);
74  } else { // kStrideEqualNumCols
75  size_t bytes = rows * cols * sizeof(Real);
76  this->data_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(bytes));
77  this->num_rows_ = rows;
78  this->num_cols_ = cols;
79  this->stride_ = cols;
80  }
81  if (resize_type == kSetZero) this->SetZero();
82  CuDevice::Instantiate().AccuProfile("CuMatrix::Resize", tim);
83  } else
84 #endif
85  { // Let the initializer of Matrix<Real> handle the allocation,
86  // and then just do Swap which will switch the pointers.
87  // This wastes a few instructions but is simple to code.
88  Matrix<Real> mat(rows, cols, resize_type, stride_type);
89  this->Swap(&mat);
90  }
91 }
92 
93 template<typename Real>
95 #if HAVE_CUDA == 1
96  if (CuDevice::Instantiate().Enabled()) {
97  if (this->data_ != NULL) {
98  CuTimer tim;
99  CuDevice::Instantiate().Free(this->data_);
100  CuDevice::Instantiate().AccuProfile(__func__, tim);
101  }
102  } else
103 #endif
104  {
105  if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_);
106  }
107  this->data_ = NULL;
108  this->num_rows_ = 0;
109  this->num_cols_ = 0;
110  this->stride_ = 0;
111 }
112 
113 template<typename Real>
115  std::swap(mat->data_, this->data_);
116  std::swap(mat->num_cols_, this->num_cols_);
117  std::swap(mat->num_rows_, this->num_rows_);
118  std::swap(mat->stride_, this->stride_);
119 }
120 
121 
122 template<typename Real>
124 #if HAVE_CUDA == 1
125  if (CuDevice::Instantiate().Enabled()) {
126  if (this->num_rows_ == 0) {
127  if (mat->num_rows_ != 0) {
128  // *this is empty, but mat is nonempty.
129  this->Resize(mat->num_rows_, mat->num_cols_, kUndefined);
130  this->CopyFromMat(*mat);
131  mat->Resize(0, 0);
132  }
133  // else both are empty.
134  } else { // *this is nonempty.
135  if (mat->num_rows_ != 0) {
136  // Both *this and *mat are nonempty. Recurse to simpler cases.
137  // this could be done more efficiently in the case where
138  // the size does not change.
139  Matrix<Real> temp;
140  this->Swap(&temp); // now temp is full, *this is empty.
141  mat->Swap(&temp); // now mat has data from *this, temp has
142  // data from mat.
143  this->Swap(&temp); // copy data in mat to *this, which is now empty.
144  } else { // *this is full but *mat is empty.
145  mat->Resize(this->num_rows_, this->num_cols_, kUndefined);
146  this->CopyToMat(mat);
147  this->Destroy();
148  }
149  }
150  } else
151 #endif
152  {
153  std::swap(mat->data_, this->data_);
154  std::swap(mat->num_cols_, this->num_cols_);
155  std::swap(mat->num_rows_, this->num_rows_);
156  std::swap(mat->stride_, this->stride_);
157  }
158 }
159 
160 template <typename Real>
162  MatrixTransposeType trans) {
163  this->SetZero();
164  if (trans == kNoTrans) {
165  KALDI_ASSERT(NumRows() == B.NumRows() && NumCols() == B.NumCols());
166  int32 row_offset = 0, col_offset = 0;
167  for (int32 b = 0; b < B.NumBlocks(); b++) {
168  const CuMatrixBase<Real> &block = B.Block(b);
169  int32 num_rows = block.NumRows(), num_cols = block.NumCols();
170  CuSubMatrix<Real> this_block(*this, row_offset, num_rows,
171  col_offset, num_cols);
172  this_block.CopyFromMat(block);
173  row_offset += num_rows;
174  col_offset += num_cols;
175  }
176  KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
177  } else {
178  KALDI_ASSERT(NumRows() == B.NumCols() && NumCols() == B.NumRows());
179  int32 row_offset = 0, col_offset = 0;
180  for (int32 b = 0; b < B.NumBlocks(); b++) {
181  const CuMatrixBase<Real> &block = B.Block(b);
182  int32 num_rows = block.NumCols(), num_cols = block.NumRows();
183  CuSubMatrix<Real> this_block(*this, row_offset, num_rows,
184  col_offset, num_cols);
185  this_block.CopyFromMat(block, kTrans);
186  row_offset += num_rows;
187  col_offset += num_cols;
188  }
189  KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
190  }
191 }
192 
193 
194 template <typename Real>
196  MatrixTransposeType trans): CuMatrixBase<Real>() {
197  if (trans == kNoTrans) {
198  Resize(B.NumRows(), B.NumCols(), kUndefined);
199  this->CopyFromBlock(B);
200  } else {
201  Resize(B.NumCols(), B.NumRows(), kUndefined);
202  this->CopyFromBlock(B, kTrans);
203  }
204 }
205 
206 template<class Real>
207 template<class OtherReal>
209  MatrixTransposeType trans) {
210  if (sizeof(Real) == sizeof(OtherReal) &&
211  static_cast<const void*>(M.Data()) ==
212  static_cast<const void*>(this->Data())) {
213  if (M.Data() == NULL)
214  return;
215  // CopyFromMat called on same data. Nothing to do (except sanity checks)
216  KALDI_ASSERT(trans == kNoTrans && M.NumRows() == NumRows() &&
217  M.NumCols() == NumCols() && M.Stride() == Stride());
218  return;
219  }
220 #if HAVE_CUDA == 1
221  if (CuDevice::Instantiate().Enabled()) {
222  if (trans == kNoTrans) {
223  KALDI_ASSERT(M.NumRows() == num_rows_ && M.NumCols() == num_cols_);
224  } else {
225  KALDI_ASSERT(M.NumCols() == num_rows_ && M.NumRows() == num_cols_);
226  }
227  if (M.num_rows_ == 0) return; // Nothing to do.
228  CuTimer tim;
229  if (sizeof(Real) == sizeof(OtherReal) && trans == kNoTrans ) {
230  MatrixIndexT dst_pitch = stride_ * sizeof(Real);
231  MatrixIndexT src_pitch = M.Stride() * sizeof(Real);
232  MatrixIndexT width = M.NumCols() * sizeof(Real);
233  CU_SAFE_CALL(
234  cudaMemcpy2DAsync(data_, dst_pitch, M.data_, src_pitch,
235  width, M.num_rows_, cudaMemcpyDeviceToDevice,
236  cudaStreamPerThread));
237  } else {
238  if (trans == kNoTrans) {
239  dim3 dimGrid, dimBlock;
240  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
241  &dimGrid, &dimBlock);
242  cuda_copy_from_mat(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim());
243  } else {
244  // 2D thread block with warps (blockDim.x) along the row-dim of input M.
245  // Each (8x32) thread block will transpose (32x32) data
246  const int32 warpSize = 32;
247  dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
248  dim3 dimGrid(n_blocks(M.NumCols(), warpSize),
249  n_blocks(M.NumRows(), warpSize));
250  cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, M.data_, Dim(),
251  M.Dim());
252  }
253  CU_SAFE_CALL(cudaGetLastError());
254  }
255  CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from other CuMatrixBase)", tim);
256  } else
257 #endif
258  {
259  Mat().CopyFromMat(M.Mat(), trans);
260  }
261 }
262 
263 // Instantiate the template above.
264 template
266  MatrixTransposeType trans);
267 template
269  MatrixTransposeType trans);
270 template
272  MatrixTransposeType trans);
273 template
275  MatrixTransposeType trans);
276 
277 
278 template<typename Real>
279 template<typename OtherReal>
281  MatrixTransposeType trans) {
282  KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
283  if (num_rows_ == 0)
284  return;
285 #if HAVE_CUDA == 1
286  if (CuDevice::Instantiate().Enabled()) {
287  CuTimer tim;
288  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
289  dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK),
290  n_blocks(num_rows_, CU2DBLOCK));
291  if (trans == kNoTrans) {
292  cuda_copy_from_tp(dimGrid, dimBlock, data_, M.Data(), Dim());
293  } else {
294  cuda_copy_from_tp_trans(dimGrid, dimBlock, data_, M.Data(), Dim());
295  }
296  CuDevice::Instantiate().AccuProfile(__func__, tim);
297  } else
298 #endif
299  {
300  Mat().CopyFromTp(M.Mat(), trans);
301  }
302 }
303 // instantiate the template above.
305  MatrixTransposeType trans);
306 template void CuMatrixBase<float>::CopyFromTp(const CuTpMatrix<double> &M,
307  MatrixTransposeType trans);
309  MatrixTransposeType trans);
310 template void CuMatrixBase<double>::CopyFromTp(const CuTpMatrix<double> &M,
311  MatrixTransposeType trans);
312 
313 template<typename Real>
315  MatrixTransposeType trans) {
316 #if HAVE_CUDA == 1
317  if (CuDevice::Instantiate().Enabled()) {
318  if (trans == kNoTrans) {
319  KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_);
320  CuTimer tim;
321 
322  MatrixIndexT dst_pitch = stride_*sizeof(Real);
323  MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
324  MatrixIndexT width = src.NumCols()*sizeof(Real);
325  CU_SAFE_CALL(cudaMemcpy2DAsync(data_, dst_pitch, src.Data(), src_pitch,
326  width, src.NumRows(), cudaMemcpyHostToDevice,
327  cudaStreamPerThread));
328  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
329 
330  CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)", tim);
331  } else {
332  CuMatrix<Real> trans_mat(src); // Do the transpose on the GPU board.
333  this->CopyFromMat(trans_mat, kTrans);
334  }
335  } else
336 #endif
337  {
338  Mat().CopyFromMat(src, trans);
339  }
340 }
341 
342 template<typename Real>
343 template<typename OtherReal>
345  MatrixTransposeType trans) {
346  CuMatrix<OtherReal> temp(src);
347  this->CopyFromMat(temp, trans);
348 }
349 
350 // instantiate the template above.
351 template
353  MatrixTransposeType trans);
354 template
356  MatrixTransposeType trans);
357 
358 
359 template<typename Real>
361  KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
362  if (num_rows_ == 0)
363  return;
364 #if HAVE_CUDA == 1
365  if (CuDevice::Instantiate().Enabled()) {
366  CuTimer tim;
367  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
368  dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK),
369  n_blocks(NumRows(), CU2DBLOCK));
370  cuda_copy_from_sp(dimGrid, dimBlock, M.Data(), data_, Dim());
371  CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromSp", tim);
372  } else
373 #endif
374  {
375  Mat().CopyFromSp(M.Mat());
376  }
377 }
378 
379 template<typename Real>
381  if (trans == kNoTrans)
382  this->Resize(other.NumRows(), other.NumCols(), kUndefined);
383  else
384  this->Resize(other.NumCols(), other.NumRows(), kUndefined);
385  this->CopyFromMat(other, trans);
386 }
387 
388 template<typename Real>
390  if (trans == kNoTrans)
391  this->Resize(other.NumRows(), other.NumCols(), kUndefined);
392  else
393  this->Resize(other.NumCols(), other.NumRows(), kUndefined);
394  this->CopyFromMat(other, trans);
395 }
396 
397 
398 template<typename Real>
399 template<typename OtherReal>
401  if (trans == kNoTrans)
402  this->Resize(other.NumRows(), other.NumCols(), kUndefined);
403  else
404  this->Resize(other.NumCols(), other.NumRows(), kUndefined);
405  this->CopyFromMat(other, trans);
406 }
407 // Instantiate the template above.
408 template
410 template
412 template
414 template
416 
417 
418 template <typename Real>
420  int32_t start_range, int32_t end_range,
421  int32_t clamp_low, int32_t clamp_high) {
422 
423  KALDI_ASSERT(NumCols() == this->NumCols());
424  KALDI_ASSERT(NumRows() == end_range-start_range);
425 
426 #if HAVE_CUDA == 1
427  if (CuDevice::Instantiate().Enabled()) {
428  cuda_mat_copy_range_clamped(start_range, end_range, NumCols(),
429  src.Data(), src.Stride(), clamp_low, clamp_high,
430  Data(), Stride());
431  } else
432 #endif
433  {
434  for (int32 t = start_range; t < end_range; t++) {
435  int32 t_clamped = t;
436  if (t_clamped < clamp_low) t_clamped = clamp_low;
437  if (t_clamped >= clamp_high) t_clamped = clamp_high;
438  CuSubVector<Real> dest_row=this->Row(t - start_range);
439  const CuSubVector<Real> src_row=src.Row(t_clamped);
440  dest_row.CopyFromVec(src_row);
441  }
442  }
443 }
444 
445 template<typename Real>
446 template<typename OtherReal>
448  MatrixTransposeType trans) const {
449 #if HAVE_CUDA == 1
450  if (CuDevice::Instantiate().Enabled()) {
451  if (trans == kTrans || sizeof(OtherReal) != sizeof(Real)) {
452  CuMatrix<OtherReal> this_trans(*this, trans);
453  this_trans.CopyToMat(dst, kNoTrans);
454  } else {
455  KALDI_ASSERT(dst->NumRows() == NumRows() && dst->NumCols() == NumCols());
456  if (num_rows_ == 0) return;
457  CuTimer tim;
458 
459  MatrixIndexT src_pitch = stride_*sizeof(Real);
460  MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real);
461  MatrixIndexT width = NumCols()*sizeof(Real);
462  CU_SAFE_CALL(cudaMemcpy2DAsync(dst->Data(), dst_pitch, this->data_,
463  src_pitch, width, this->num_rows_,
464  cudaMemcpyDeviceToHost, cudaStreamPerThread));
465  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
466  CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToMatD2H", tim);
467  }
468  } else
469  #endif
470  {
471  dst->CopyFromMat(Mat(), trans);
472  }
473 }
474 
475 // instantiate the template above.
476 template
478  MatrixTransposeType trans) const;
479 template
481  MatrixTransposeType trans) const;
482 template
483 void CuMatrixBase<float>::CopyToMat(MatrixBase<double> *dst,
484  MatrixTransposeType trans) const;
485 template
486 void CuMatrixBase<double>::CopyToMat(MatrixBase<double> *dst,
487  MatrixTransposeType trans) const;
488 
489 
490 
491 
492 
493 template<typename Real>
494 void CuMatrix<Real>::Read(std::istream &is, bool binary) {
495  Matrix<Real> temp;
496  temp.Read(is, binary);
497  Destroy();
498  Swap(&temp);
499 }
500 
501 template<typename Real>
502 void CuMatrixBase<Real>::Write(std::ostream &os, bool binary) const {
503  Matrix<Real> temp(this->num_rows_, this->num_cols_, kUndefined);
504  this->CopyToMat(&temp);
505  temp.Write(os, binary);
506 }
507 
508 template<typename Real>
510 #if HAVE_CUDA == 1
511  if (CuDevice::Instantiate().Enabled()) {
512  CuTimer tim;
513  CU_SAFE_CALL(cudaMemset2DAsync(data_, stride_ * sizeof(Real), 0,
514  num_cols_ * sizeof(Real), num_rows_ ,
515  cudaStreamPerThread));
516  CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim);
517  } else
518 #endif
519  {
520  Mat().SetZero();
521  }
522 }
523 
524 
525 
526 
527 /*
528  * Methods wrapping the ANSI-C CUDA kernels
529  */
530 template<typename Real>
531 void CuMatrixBase<Real>::Set(Real value) {
532  #if HAVE_CUDA == 1
533  if (CuDevice::Instantiate().Enabled()) {
534  if (num_rows_ == 0) return;
535  CuTimer tim;
536 
537  dim3 dimGrid, dimBlock;
538  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
539  &dimGrid, &dimBlock);
540 
541  cuda_set_const(dimGrid, dimBlock, data_, value, Dim());
542  CU_SAFE_CALL(cudaGetLastError());
543 
544  CuDevice::Instantiate().AccuProfile(__func__, tim);
545  } else
546  #endif
547  {
548  Mat().Set(value);
549  }
550 }
551 
552 // set zero the elements above the diagonal.
553 template<typename Real>
555 #if HAVE_CUDA == 1
556  if (CuDevice::Instantiate().Enabled()) {
557  if (num_rows_ == 0) return;
558  CuTimer tim;
559 
560  dim3 dimGrid, dimBlock;
561  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
562  &dimGrid, &dimBlock);
563 
564  cuda_set_zero_above_diag(dimGrid, dimBlock, data_, Dim());
565  CU_SAFE_CALL(cudaGetLastError());
566 
567  CuDevice::Instantiate().AccuProfile(__func__, tim);
568  } else
569 #endif
570  {
571  MatrixBase<Real> &mat = Mat();
572  int32 num_rows = mat.NumRows(), num_cols = mat.NumCols();
573  for (int32 r = 0; r + 1 < num_rows; r++) {
574  SubVector<Real> vec(mat, r),
575  vec_part(vec, r + 1, num_cols - (r + 1));
576  vec_part.SetZero();
577  }
578  }
579 }
580 
581 template<typename Real>
582 void CuMatrixBase<Real>::Add(Real value) {
583 #if HAVE_CUDA == 1
584  if (CuDevice::Instantiate().Enabled()) {
585  if (num_rows_ == 0) return;
586  CuTimer tim;
587 
588  dim3 dimGrid, dimBlock;
589  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
590  &dimGrid, &dimBlock);
591 
592  cuda_add(dimGrid, dimBlock, data_, value, Dim());
593  CU_SAFE_CALL(cudaGetLastError());
594 
595  CuDevice::Instantiate().AccuProfile(__func__, tim);
596  } else
597  #endif
598  {
599  Mat().Add(value);
600  }
601 }
602 
603 template<typename Real>
605 #if HAVE_CUDA == 1
606  if (CuDevice::Instantiate().Enabled()) {
607  if (num_rows_ == 0) return;
608  CuTimer tim;
609  // We'll create a fake matrix with "num_diag" rows, one
610  // columnn, and a stride of "this_stride". The y-value of
611  // the grid/blocks corresponds to the row, in this kernel.
612  MatrixIndexT num_diag = std::min(num_rows_, num_cols_),
613  this_stride = stride_ + 1;
614  dim3 dimBlock(1, CU1DBLOCK);
615  dim3 dimGrid(1, n_blocks(num_diag, CU1DBLOCK));
616  ::MatrixDim d = { num_diag, 1, this_stride };
617  cuda_add(dimGrid, dimBlock, data_, value, d);
618  CU_SAFE_CALL(cudaGetLastError());
619 
620  CuDevice::Instantiate().AccuProfile(__func__, tim);
621  } else
622  #endif
623  {
624  Mat().AddToDiag(value);
625  }
626 }
627 
628 template<typename Real>
629 bool CuMatrixBase<Real>::IsUnit(Real tol) const {
630  // want to return:
631  //FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.:
632  //sqrt (trace((*this - I)(*this-I)) <= tol * NumRows()
633  // trace((*this - I)(*this - I)) <= tol * NumRows()
634  // trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows()
635  // trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows()
636  KALDI_ASSERT(this->NumRows() == this->NumCols());
637  return (TraceMatMat(*this, *this, kTrans) + this->NumRows() - 2.0 * this->Trace() <=
638  tol * this->NumRows());
639 }
640 
641 
642 
643 template<typename Real>
644 void CuMatrixBase<Real>::Scale(Real value) {
645 #if HAVE_CUDA == 1
646  if (CuDevice::Instantiate().Enabled()) {
647  if (num_rows_ == 0) return;
648  CuTimer tim;
649 
650  dim3 dimGrid, dimBlock;
651  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
652  &dimGrid, &dimBlock);
653 
654  cuda_scale(dimGrid, dimBlock, data_, value, Dim());
655  CU_SAFE_CALL(cudaGetLastError());
656 
657  CuDevice::Instantiate().AccuProfile(__func__, tim);
658  } else
659 #endif
660  {
661  Mat().Scale(value);
662  }
663 }
664 
665 
666 template<typename Real>
668  #if HAVE_CUDA == 1
669  if (CuDevice::Instantiate().Enabled()) {
670  CuTimer tim;
671 
672  KALDI_ASSERT(num_cols_ == A.NumCols());
673  KALDI_ASSERT(num_rows_ == A.NumRows());
674 
675  dim3 dimGrid, dimBlock;
676  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
677  &dimGrid, &dimBlock);
678 
679  cuda_mul_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride());
680  CU_SAFE_CALL(cudaGetLastError());
681 
682  CuDevice::Instantiate().AccuProfile(__func__, tim);
683  } else
684  #endif
685  {
686  Mat().MulElements(A.Mat());
687  }
688 }
689 
690 template<typename Real>
692  #if HAVE_CUDA == 1
693  if (CuDevice::Instantiate().Enabled()) {
694  CuTimer tim;
695 
696  KALDI_ASSERT(num_cols_ == A.NumCols());
697  KALDI_ASSERT(num_rows_ == A.NumRows());
698 
699  dim3 dimGrid, dimBlock;
700  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
701  &dimGrid, &dimBlock);
702 
703  cuda_div_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride());
704  CU_SAFE_CALL(cudaGetLastError());
705 
706  CuDevice::Instantiate().AccuProfile(__func__, tim);
707  } else
708  #endif
709  {
710  Mat().DivElements(A.Mat());
711  }
712 }
713 
714 template<typename Real>
716  #if HAVE_CUDA == 1
717  if (CuDevice::Instantiate().Enabled()) {
718  CuTimer tim;
719 
720  KALDI_ASSERT(num_cols_ == A.NumCols());
721  KALDI_ASSERT(num_rows_ == A.NumRows());
722 
723  dim3 dimGrid, dimBlock;
724  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
725  &dimGrid, &dimBlock);
726 
727  cuda_max(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride());
728  CU_SAFE_CALL(cudaGetLastError());
729 
730  CuDevice::Instantiate().AccuProfile(__func__, tim);
731  } else
732  #endif
733  {
734  Mat().Max(A.Mat());
735  }
736 }
737 
738 
739 template<typename Real>
741  #if HAVE_CUDA == 1
742  if (CuDevice::Instantiate().Enabled()) {
743  CuTimer tim;
744 
745  KALDI_ASSERT(num_cols_ == A.NumCols());
746  KALDI_ASSERT(num_rows_ == A.NumRows());
747 
748  dim3 dimGrid, dimBlock;
749  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
750  &dimGrid, &dimBlock);
751 
752  cuda_min(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride());
753  CU_SAFE_CALL(cudaGetLastError());
754 
755  CuDevice::Instantiate().AccuProfile(__func__, tim);
756  } else
757  #endif
758  {
759  Mat().Min(A.Mat());
760  }
761 }
762 
763 
764 template<typename Real>
766 #if HAVE_CUDA == 1
767  if (CuDevice::Instantiate().Enabled()) {
768  CuTimer tim;
769 
770  KALDI_ASSERT(scale.Dim() == NumCols());
771 
772 
773  dim3 dimGrid, dimBlock;
774  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
775  &dimGrid, &dimBlock);
776 
777  cuda_mul_cols_vec(dimGrid, dimBlock, data_, scale.data_, Dim());
778  CU_SAFE_CALL(cudaGetLastError());
779 
780 
781  CuDevice::Instantiate().AccuProfile(__func__, tim);
782  } else
783 #endif
784  {
785  Mat().MulColsVec(scale.Vec());
786  }
787 }
788 
789 
790 
791 template<typename Real>
793  #if HAVE_CUDA == 1
794  if (CuDevice::Instantiate().Enabled()) {
795  CuTimer tim;
796 
797  KALDI_ASSERT(scale.Dim() == NumRows());
798 
799  dim3 dimGrid, dimBlock;
800  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
801  &dimGrid, &dimBlock);
802 
803  cuda_mul_rows_vec(dimGrid, dimBlock, data_, scale.data_, Dim());
804  CU_SAFE_CALL(cudaGetLastError());
805 
806 
807  CuDevice::Instantiate().AccuProfile(__func__, tim);
808  } else
809  #endif
810  {
811  Mat().MulRowsVec(scale.Vec());
812  }
813 }
814 
815 template<typename Real>
817  KALDI_ASSERT(src.NumCols() > 0);
818 #if HAVE_CUDA == 1
819  if (CuDevice::Instantiate().Enabled()) {
820  CuTimer tim;
821  int group_size = this->NumCols() / src.NumCols();
822 
823  dim3 dimGrid, dimBlock;
824  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
825  &dimGrid, &dimBlock);
826 
827  cuda_mul_rows_group_mat(dimGrid, dimBlock, this->data_, src.data_,
828  this->Dim(), src.Stride(), group_size);
829  CU_SAFE_CALL(cudaGetLastError());
830 
831  CuDevice::Instantiate().AccuProfile(__func__, tim);
832  } else
833 #endif
834  {
835  Mat().MulRowsGroupMat(src.Mat());
836  }
837 }
838 
839 
840 template<typename Real>
842  const CuMatrixBase<Real> &out_value,
843  const CuMatrixBase<Real> &out_deriv,
844  Real power) {
845  KALDI_ASSERT(out_value.NumCols() > 0);
846  KALDI_ASSERT(out_value.NumCols() == out_deriv.NumCols());
847  int group_size = this->NumCols() / out_value.NumCols();
848  KALDI_ASSERT(this->NumCols() == out_value.NumCols() * group_size);
849 #if HAVE_CUDA == 1
850  if (CuDevice::Instantiate().Enabled()) {
851  CuTimer tim;
852  const int kWarpSize = 32;
853  dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
854  dim3 dimGrid(n_blocks(NumCols(), dimBlock.x),
855  n_blocks(NumRows(), dimBlock.y));
856  if (dimGrid.x * dimGrid.y > 1024) {
857  dimGrid.y = std::max(1024 / dimGrid.x, unsigned(1));
858  }
859  cuda_diff_group_pnorm(dimGrid, dimBlock, this->data_, in_value.Data(),
860  out_value.Data(), out_deriv.Data(), Dim(),
861  in_value.Stride(), out_value.Stride(),
862  out_deriv.Stride(), group_size, power);
863  CU_SAFE_CALL(cudaGetLastError());
864  CuDevice::Instantiate().AccuProfile(__func__, tim);
865  } else
866 #endif
867  {
868  Mat().GroupPnormDeriv(in_value.Mat(), out_value.Mat(), power);
869  MulRowsGroupMat(out_deriv);
870  }
871 }
872 
873 template<typename Real>
875  const CuMatrixBase<Real> &src2) {
876  KALDI_ASSERT(src2.NumCols() > 0);
877  int group_size = this->NumCols() / src2.NumCols();
878  KALDI_ASSERT(this->NumCols() == src2.NumCols() * group_size);
879 #if HAVE_CUDA == 1
880  if (CuDevice::Instantiate().Enabled()) {
881  CuTimer tim;
882  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
883  dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK),
884  n_blocks(NumRows(), CU2DBLOCK));
885  cuda_calc_group_max_deriv(dimGrid, dimBlock, this->data_, src1.Data(),
886  src2.Data(), Dim(), src1.Stride(), src2.Stride(),
887  group_size);
888  CU_SAFE_CALL(cudaGetLastError());
889 
890  CuDevice::Instantiate().AccuProfile(__func__, tim);
891  } else
892 #endif
893  {
894  Mat().GroupMaxDeriv(src1.Mat(), src2.Mat());
895  }
896 }
897 
898 template<typename Real>
900 #if HAVE_CUDA == 1
901  if (CuDevice::Instantiate().Enabled()) {
902  CuTimer tim;
903 
904  KALDI_ASSERT(div.Dim() == NumRows());
905 
906  dim3 dimGrid, dimBlock;
907  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
908  &dimGrid, &dimBlock);
909  // For large matrix we do more work per thread by limiting the
910  // the grid size to reduce the block launching overhead.
911  if (dimGrid.x * dimGrid.y > 1024) {
912  dimGrid.x = 1024 / dimGrid.y;
913  if (dimGrid.x == 0) {
914  dimGrid.x = 1;
915  }
916  }
917  cuda_div_rows_vec(dimGrid, dimBlock, data_, div.data_, Dim());
918  CU_SAFE_CALL(cudaGetLastError());
919 
920  CuDevice::Instantiate().AccuProfile(__func__, tim);
921  } else
922 #endif
923  {
924  Vector<Real> temp(div.Vec()); // will copy.
925  temp.InvertElements();
926  Mat().MulRowsVec(temp);
927  }
928 }
929 
930 
931 template<typename Real>
933 #if HAVE_CUDA == 1
934  if (CuDevice::Instantiate().Enabled()) {
935  CuTimer tim;
936 
937  dim3 dimGrid, dimBlock;
938  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
939  &dimGrid, &dimBlock);
940 
941  cuda_invert_elements(dimGrid, dimBlock, data_, Dim());
942  CU_SAFE_CALL(cudaGetLastError());
943 
944  CuDevice::Instantiate().AccuProfile(__func__, tim);
945  } else
946 #endif
947  {
948  Mat().InvertElements();
949  }
950 }
951 
952 
953 template<typename Real>
955  MatrixTransposeType transA) {
956 
957 #if HAVE_CUDA == 1
958  if (CuDevice::Instantiate().Enabled()) {
959  if (transA == kNoTrans) {
960  KALDI_ASSERT(A.NumRows() == num_rows_ && A.NumCols() == num_cols_);
961  } else {
962  KALDI_ASSERT(A.NumCols() == num_rows_ && A.NumRows() == num_cols_);
963  }
964  if (num_rows_ == 0) return;
965  CuTimer tim;
966  // This block dimension seems to work better than the
967  // one from GetBlockSizesForSimpleMatrixOperation().
968  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
969  dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK),
970  n_blocks(NumRows(), CU2DBLOCK));
971  cuda_add_mat(dimGrid, dimBlock, alpha, A.data_,
972  data_, Dim(), A.Stride(),
973  (transA == kTrans ? 1 : 0));
974  CU_SAFE_CALL(cudaGetLastError());
975 
976  CuDevice::Instantiate().AccuProfile(__func__, tim);
977  } else
978 #endif
979  {
980  Mat().AddMat(alpha, A.Mat(), transA);
981  }
982 }
983 
984 template<typename Real>
986  MatrixTransposeType trans) {
987 #if HAVE_CUDA == 1
988  if (CuDevice::Instantiate().Enabled()) {
989  if (trans == kNoTrans) {
990  KALDI_ASSERT(NumRows() == A.NumRows());
991  KALDI_ASSERT(NumCols() == A.NumCols());
992  } else {
993  KALDI_ASSERT(NumRows() == A.NumCols());
994  KALDI_ASSERT(NumCols() == A.NumRows());
995  }
996 
997  CuTimer tim;
998 
999  // We use warpSize threads per row to access only the nonzero elements.
1000  // Every CU1DBLOCK/warpSize rows share one thread block.
1001  // 1D grid to cover all rows of A.
1002  const int warpSize = 32;
1003  dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
1004  dim3 dimGrid(n_blocks(A.NumRows(), dimBlock.y));
1005 
1006  if (trans == kNoTrans) {
1007  cuda_add_smat(dimGrid, dimBlock, Data(), Dim(), alpha, A.CsrRowPtr(),
1008  A.CsrColIdx(), A.CsrVal());
1009  } else {
1010  cuda_add_smat_trans(dimGrid, dimBlock, Data(), Dim(), alpha,
1011  A.CsrRowPtr(), A.CsrColIdx(), A.CsrVal());
1012  }
1013 
1014  CU_SAFE_CALL(cudaGetLastError());
1015  CuDevice::Instantiate().AccuProfile(__func__, tim);
1016  } else
1017 #endif
1018  {
1019  Mat().AddSmat(alpha, A.Smat(), trans);
1020  }
1021 }
1022 
1023 template<typename Real>
1025  MatrixTransposeType transA,
1026  const CuMatrixBase<Real> &B, Real beta) {
1027 #if HAVE_CUDA == 1
1028  if (CuDevice::Instantiate().Enabled()) {
1029  if (transA == kNoTrans) {
1030  KALDI_ASSERT(NumRows() == A.NumRows());
1031  KALDI_ASSERT(NumCols() == B.NumCols());
1032  KALDI_ASSERT(A.NumCols() == B.NumRows());
1033  } else {
1034  KALDI_ASSERT(NumRows() == A.NumCols());
1035  KALDI_ASSERT(NumCols() == B.NumCols());
1036  KALDI_ASSERT(A.NumRows() == B.NumRows());
1037  }
1038 
1039  CuTimer tim;
1040 
1041  // We have op(A) and BT in col-major (B in row-major).
1042  // We first compute C in col-major (CT in row-major)
1043  // with C = op(A) * BT^T by cusparse_csrmm2,
1044  // then transpose CT to get C in row-major
1045  CuMatrix<Real> CT(*this, kTrans);
1046 
1047  cusparseMatDescr_t descr;
1048  CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descr));
1049  if (transA == kTrans) {
1050  // Note: only op(A)=A is supported if op(B)=B^T according to cusparse doc
1051  // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmm2
1053  CU_SAFE_CALL(
1054  cusparse_csrmm2(GetCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE,
1055  CUSPARSE_OPERATION_TRANSPOSE, AT.NumRows(),
1056  CT.NumRows(), AT.NumCols(), AT.NumElements(), &alpha,
1057  descr, AT.CsrVal(), AT.CsrRowPtr(), AT.CsrColIdx(),
1058  B.Data(), B.Stride(), &beta, CT.Data(), CT.Stride()));
1059  } else {
1060  CU_SAFE_CALL(
1061  cusparse_csrmm2(GetCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE,
1062  CUSPARSE_OPERATION_TRANSPOSE, A.NumRows(),
1063  CT.NumRows(), A.NumCols(), A.NumElements(), &alpha,
1064  descr, A.CsrVal(), A.CsrRowPtr(), A.CsrColIdx(),
1065  B.Data(), B.Stride(), &beta, CT.Data(), CT.Stride()));
1066  }
1067  CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descr));
1068 
1069  this->CopyFromMat(CT, kTrans);
1070 
1071  CuDevice::Instantiate().AccuProfile(__func__, tim);
1072  } else
1073 #endif
1074  {
1075  Mat().AddSmatMat(alpha, A.Smat(), transA, B.Mat(), beta);
1076  }
1077 }
1078 
// Computes *this = alpha * A * op(B) + beta * *this, where A is dense and B is
// a sparse matrix in CSR format.
// NOTE(review): the declarator line is missing from this extract; the CPU
// fallback below shows this is CuMatrixBase<Real>::AddMatSmat(Real alpha,
// const CuMatrixBase<Real> &A, ...).
1079 template<typename Real>
1081  const CuSparseMatrix<Real> &B,
1082  MatrixTransposeType transB, Real beta) {
1083 #if HAVE_CUDA == 1
1084  if (CuDevice::Instantiate().Enabled()) {
 // Dimension checks depend on whether B is transposed in the product.
1085  if (transB == kNoTrans) {
1086  KALDI_ASSERT(NumRows() == A.NumRows());
1087  KALDI_ASSERT(NumCols() == B.NumCols());
1088  KALDI_ASSERT(A.NumCols() == B.NumRows());
1089  } else {
1090  KALDI_ASSERT(NumRows() == A.NumRows());
1091  KALDI_ASSERT(NumCols() == B.NumRows());
1092  KALDI_ASSERT(A.NumCols() == B.NumCols());
1093  }
1094 
1095  CuTimer tim;
1096 
1097  cusparseMatDescr_t descr;
1098  CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descr));
 // Note the operation flag is deliberately inverted (kNoTrans ->
 // CUSPARSE_OPERATION_TRANSPOSE): cuSPARSE works in column-major order while
 // kaldi matrices are row-major, so the sparse operand's transpose flag must
 // be flipped (cf. the row/col-major remapping comments elsewhere in this
 // file).
1099  CU_SAFE_CALL(
1100  cusparse_csrmm(
1101  GetCusparseHandle(),
1102  transB == kNoTrans ?
1103  CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
1104  B.NumRows(), NumRows(), B.NumCols(), B.NumElements(), &alpha, descr,
1105  B.CsrVal(), B.CsrRowPtr(), B.CsrColIdx(), A.Data(), A.Stride(),
1106  &beta, Data(), Stride()));
1107  CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descr));
1108 
1109  CuDevice::Instantiate().AccuProfile(__func__, tim);
1110  } else
1111 #endif
1112  {
 // CPU fallback: delegate to the corresponding MatrixBase routine.
1113  Mat().AddMatSmat(alpha, A.Mat(), B.Smat(), transB, beta);
1114  }
1115 }
1116 
1117 
// Adds blocks of matrix A (possibly transposed) to *this, scaled by alpha.
// Two modes, selected by relative sizes:
//  - "summing" mode (A at least as large as *this): A is viewed as a grid of
//    blocks the size of *this and all blocks are summed into *this.
//  - "broadcasting" mode (*this larger than A): A is tiled (repeated) across
//    *this; transposed operation is not supported in this mode.
// NOTE(review): the declarator line is missing from this extract; the
// parameters are (Real alpha, const CuMatrixBase<Real> &A,
// MatrixTransposeType transA) per the visible continuation line.
1118 template<typename Real>
1120  MatrixTransposeType transA) {
 // No-op on an empty destination.
1121  if (num_rows_ == 0 || num_cols_ == 0) return;
1122 
1123  if (A.NumRows() >= (transA == kNoTrans ? num_rows_ : num_cols_) &&
1124  A.NumCols() >= (transA == kNoTrans ? num_cols_ : num_rows_)) {
1125  // This is the "summing", not broadcasting, version of AddMatBlocks.
1126  // It supports both regular and transposed operation.
1127  int32 num_row_blocks, num_col_blocks;
1128  if (transA == kNoTrans) {
 // A must tile exactly into blocks the size of *this.
1129  KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0);
1130  num_row_blocks = A.Mat().NumRows() / num_rows_;
1131  num_col_blocks = A.Mat().NumCols() / num_cols_;
1132  } else {
 // Transposed: blocks of A have the transposed shape of *this.
1133  KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0);
1134  num_row_blocks = A.Mat().NumRows() / num_cols_;
1135  num_col_blocks = A.Mat().NumCols() / num_rows_;
1136  }
1137 #if HAVE_CUDA == 1
1138  if (CuDevice::Instantiate().Enabled()) {
1139  CuTimer tim;
1140  dim3 dimGrid, dimBlock;
1141  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1142  &dimGrid, &dimBlock);
1143  cuda_add_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks,
1144  num_col_blocks, data_, Dim(), A.Stride(),
1145  (transA == kTrans ? 1 : 0));
1146  CU_SAFE_CALL(cudaGetLastError());
1147 
1148  CuDevice::Instantiate().AccuProfile(__func__, tim);
1149  } else
1150 #endif
1151  {
 // CPU fallback: sum each (nr x nc) sub-block of A into *this.
1152  int32 nr, nc;
1153  if (transA == kNoTrans) {
1154  nr = num_rows_;
1155  nc = num_cols_;
1156  } else {
1157  nr = num_cols_;
1158  nc = num_rows_;
1159  }
1160  for (int32 i = 0; i < num_row_blocks; i++) {
1161  for (int32 j = 0; j < num_col_blocks; j++) {
1162  Mat().AddMat(alpha, SubMatrix<Real>(A.Mat(), i * nr, nr, j * nc, nc),
1163  transA);
1164  }
1165  }
1166  }
1167  } else {
1168  // This is the "broadcasting" version of AddMatBlocks, where
1169  // *this is larger than src.
1170  if (transA != kNoTrans)
1171  KALDI_ERR << "Transposed operation not supported currently.";
 // *this must be an exact multiple of A in both dimensions.
1172  if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0))
1173  KALDI_ERR << "Invalid sizes of arguments";
1174 #if HAVE_CUDA == 1
1175  if (CuDevice::Instantiate().Enabled()) {
1176  CuTimer tim;
1177  dim3 dimGrid, dimBlock;
1178  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1179  &dimGrid, &dimBlock);
1180  cuda_add_mat_repeated(dimGrid, dimBlock, alpha,
1181  A.data_, A.Dim(), data_, Dim());
1182  CU_SAFE_CALL(cudaGetLastError());
1183  CuDevice::Instantiate().AccuProfile(__func__, tim);
1184  } else
1185 #endif
1186  {
 // CPU fallback: add A into every tile-sized sub-matrix of *this.
1187  const MatrixBase<Real> &src_mat = A.Mat(),
1188  &this_mat = this->Mat();
1189  for (int32 row_offset = 0; row_offset < NumRows();
1190  row_offset += src_mat.NumRows()) {
1191  for (int32 col_offset = 0; col_offset < NumCols();
1192  col_offset += src_mat.NumCols()) {
1193  SubMatrix<Real> this_part(this_mat,
1194  row_offset, src_mat.NumRows(),
1195  col_offset, src_mat.NumCols());
1196  this_part.AddMat(alpha, src_mat);
1197  }
1198  }
1199  }
1200  }
1201 }
1202 
// Element-wise: sets *this from A, B and C (the CPU fallback is
// MatrixBase::SetMatMatDivMat; presumably *this = A .* B ./ C with some
// handling of C == 0 inside the kernel — confirm against the kernel /
// MatrixBase docs).  All four matrices must have identical dimensions.
1205 template<typename Real>
1207  const CuMatrixBase<Real> &B, const CuMatrixBase<Real> &C) {
1208 #if HAVE_CUDA == 1
1209  if (CuDevice::Instantiate().Enabled()) {
1210  CuTimer tim;
1211 
1212  KALDI_ASSERT(num_rows_ == A.num_rows_ && num_cols_ == A.num_cols_);
1213  KALDI_ASSERT(num_rows_ == B.num_rows_ && num_cols_ == B.num_cols_);
1214  KALDI_ASSERT(num_rows_ == C.num_rows_ && num_cols_ == C.num_cols_);
 // Empty matrix: nothing to do.
1215  if (num_rows_ == 0) return;
1216  dim3 dimGrid, dimBlock;
1217  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1218  &dimGrid, &dimBlock);
1219  cuda_set_mat_mat_div_mat(dimGrid, dimBlock, A.data_, B.data_, C.data_,
1220  data_, Dim(), A.Stride(), B.Stride(), C.Stride());
1221  CU_SAFE_CALL(cudaGetLastError());
1222 
1223  CuDevice::Instantiate().AccuProfile(__func__, tim);
1224  } else
1225 #endif
1226  {
1227  Mat().SetMatMatDivMat(A.Mat(), B.Mat(), C.Mat());
1228  }
1229 }
1230 
// For each column c: (*this)(r, c) = beta * (*this)(r, c) + alpha * col(r),
// i.e. adds a scaled column vector to every column of *this after scaling
// *this by beta.  col must have dimension NumRows().
1231 template<typename Real>
1233  const CuVectorBase<Real> &col,
1234  Real beta) {
1235  if (col.Dim() != NumRows()) {
1236  KALDI_ERR << "Non matching dimensions: Rows:" << NumRows() << " VectorDim:" << col.Dim();
1237  }
1238 
1239  #if HAVE_CUDA == 1
1240  if (CuDevice::Instantiate().Enabled()) {
1241  CuTimer tim;
1242  dim3 dimGrid, dimBlock;
1243  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1244  &dimGrid, &dimBlock);
1245  cuda_add_vec_to_cols(dimGrid, dimBlock, alpha, col.data_, beta,
1246  data_, Dim());
1247  CU_SAFE_CALL(cudaGetLastError());
1248 
1249  CuDevice::Instantiate().AccuProfile(__func__, tim);
1250  } else
1251  #endif
1252  {
 // CPU fallback: MatrixBase::AddVecToCols has no beta argument, so the
 // beta scaling is applied separately first.
1253  if (beta != 1.0) Mat().Scale(beta);
1254  Mat().AddVecToCols(alpha, col.Vec());
1255  }
1256 }
1257 
1258 
1259 
// For each row r: (*this)(r, c) = beta * (*this)(r, c) + alpha * row(c),
// i.e. adds a scaled row vector to every row of *this after scaling *this
// by beta.  row must have dimension NumCols().
1260 template<typename Real>
1262  const CuVectorBase<Real> &row,
1263  Real beta) {
1264  if (row.Dim() != NumCols()) {
1265  KALDI_ERR << "Non matching dimensions: Cols:" << NumCols() << " VectorDim:" << row.Dim();
1266  }
1267 #if HAVE_CUDA == 1
1268  if (CuDevice::Instantiate().Enabled()) {
1269  CuTimer tim;
1270  dim3 dimGrid, dimBlock;
1271  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1272  &dimGrid, &dimBlock);
1273  cuda_add_vec_to_rows(dimGrid, dimBlock, alpha, row.data_, beta, data_, Dim());
1274  CU_SAFE_CALL(cudaGetLastError());
1275 
1276  CuDevice::Instantiate().AccuProfile(__func__, tim);
1277  } else
1278 #endif
1279  {
 // CPU fallback: apply beta separately, then the unscaled row addition.
1280  if (beta != 1.0) Mat().Scale(beta);
1281  Mat().AddVecToRows(alpha, row.Vec());
1282  }
1283 }
1284 
1285 
1286 
1287 /*
1288  * Method wrapping the CUBLAS function GEMM
1289  */
// *this = alpha * op(A) * op(B) + beta * *this, via cuBLAS GEMM on GPU or
// MatrixBase::AddMatMat on CPU.
1290 template<typename Real>
1292  Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
1293  const CuMatrixBase<Real> &B, MatrixTransposeType transB, Real beta) {
1294 
1295 
1296  // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping?
1297  // keep trans..., just swap A&B matrices: A->B B->A
 // With operands swapped, the GEMM dimensions are:
 //   m = cols of the result, n = rows of the result, k = inner dimension.
1298  MatrixIndexT m = ((transB==kTrans)? B.NumRows() : B.NumCols());
1299  MatrixIndexT n = ((transA==kTrans)? A.NumCols() : A.NumRows());
1300  MatrixIndexT k = ((transB==kTrans)? B.NumCols() : B.NumRows());
1301  MatrixIndexT k1 = ((transA==kTrans)? A.NumRows() : A.NumCols());
1302 
1303  KALDI_ASSERT(m == NumCols());
1304  KALDI_ASSERT(n == NumRows());
1305  KALDI_ASSERT(k == k1);
1306 
 // Empty result: nothing to do.
1307  if (m == 0) return;
1308 
1309 
1310 #if HAVE_CUDA == 1
1311  if (CuDevice::Instantiate().Enabled()) {
1312  CuTimer tim;
 // Note B is passed first — the row-major/col-major swap described above.
1313  CUBLAS_SAFE_CALL(cublas_gemm(GetCublasHandle(),
1314  (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
1315  (transA==kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
1316  m, n, k, alpha, B.data_, B.Stride(),
1317  A.data_, A.Stride(), beta, data_, Stride()));
1318 
1319  CuDevice::Instantiate().AccuProfile(__func__, tim);
1320  } else
1321 #endif
1322  {
1323  Mat().AddMatMat(alpha, A.Mat(), transA, B.Mat(), transB, beta);
1324  }
1325 }
1326 
1327 
// Rank-1 update: *this += alpha * x * y^T, where x has dimension NumRows()
// and y has dimension NumCols().  Uses cuBLAS GER on GPU (again with the
// operands arranged for cuBLAS's column-major layout: y is passed as the
// "x" vector and vice versa).
1329 template<typename Real>
1330  Real alpha, const CuVectorBase<Real> &x, const CuVectorBase<Real> &y) {
1331 
1332  MatrixIndexT m = y.Dim();
1333  MatrixIndexT n = x.Dim();
1334  KALDI_ASSERT(m == NumCols());
1335  KALDI_ASSERT(n == NumRows());
1336 
1337 #if HAVE_CUDA == 1
1338  if (CuDevice::Instantiate().Enabled()) {
1339  CuTimer tim;
1340  CUBLAS_SAFE_CALL(cublas_ger(GetCublasHandle(), m, n, alpha,
1341  y.Data(), 1, x.Data(), 1, data_, Stride()));
1342 
1343  CuDevice::Instantiate().AccuProfile(__func__, tim);
1344  } else
1345 #endif
1346  {
1347  Mat().AddVecVec(alpha, x.Vec(), y.Vec());
1348  }
1349 }
1350 
1351 
// Symmetric rank-k update: *this = beta * *this + alpha * A * A^T
// (or alpha * A^T * A if transA == kTrans), for square *this.
// Only one triangle of *this is updated (cuBLAS SYRK semantics).
1353 template<typename Real>
1354  Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
1355  Real beta) {
1356  KALDI_ASSERT(num_rows_ == num_cols_ &&
1357  ((transA == kNoTrans && A.num_rows_ == num_rows_) ||
1358  (transA == kTrans && A.num_cols_ == num_cols_)));
1359  if (num_rows_ == 0) return;
 // In-place operation on A's own data is not allowed.
1360  KALDI_ASSERT(A.data_ != data_);
1361 
1362 #if HAVE_CUDA == 1
1363  if (CuDevice::Instantiate().Enabled()) {
1364  CuTimer tim;
 // The transpose flag is deliberately inverted: cuBLAS sees our row-major
 // data as its transpose (column-major), so kNoTrans here maps to
 // CUBLAS_OP_T.  NOTE(review): likewise FILL_MODE_UPPER in cuBLAS's
 // column-major view corresponds to the other triangle of this row-major
 // matrix — confirm against the header documentation of SymAddMat2.
1365  cublasOperation_t trans = (transA == kTrans ? CUBLAS_OP_N : CUBLAS_OP_T);
1366  MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_);
1367  CUBLAS_SAFE_CALL(cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER,
1368  trans, num_rows_, A_other_dim,
1369  alpha, A.Data(), A.Stride(),
1370  beta, this->data_, this->stride_));
1371 
1372  CuDevice::Instantiate().AccuProfile(__func__, tim);
1373  } else
1374 #endif
1375  {
1376  Mat().SymAddMat2(alpha, A.Mat(), transA, beta);
1377  }
1378 }
1379 
1380 
// *this = beta * *this + alpha * diag(v) * op(M), i.e. each row r of op(M)
// is scaled by v(r) and accumulated into *this.  v must have dimension
// NumRows(); op(M) must have the same dimensions as *this.
1382 template<typename Real>
1383  const Real alpha, const CuVectorBase<Real> &v,
1384  const CuMatrixBase<Real> &M, MatrixTransposeType transM,
1385  Real beta) {
1386 #if HAVE_CUDA == 1
1387  if (CuDevice::Instantiate().Enabled()) {
1388  if (transM == kNoTrans) {
1389  KALDI_ASSERT(SameDim(*this, M));
1390  } else {
1391  KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows());
1392  }
1393  KALDI_ASSERT(v.Dim() == this->NumRows());
1394 
1395  CuTimer tim;
1396  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
1397  dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK),
1398  n_blocks(num_rows_, CU2DBLOCK));
 // Transposition of M is expressed to the kernel by swapping its row and
 // column strides rather than by copying data.
1399  MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
1400  if (transM == kTrans)
1401  std::swap(M_row_stride, M_col_stride);
1402  cuda_add_diag_vec_mat(dimGrid, dimBlock, alpha, data_, Dim(),
1403  v.Data(), M.Data(), M_row_stride, M_col_stride, beta);
1404  CU_SAFE_CALL(cudaGetLastError());
1405  CuDevice::Instantiate().AccuProfile(__func__, tim);
1406  } else
1407 #endif
1408  {
1409  Mat().AddDiagVecMat(alpha, v.Vec(), M.Mat(), transM, beta);
1410  }
1411 }
1412 
1413 
// *this = beta * *this + alpha * op(M) * diag(v), i.e. each column c of
// op(M) is scaled by v(c) and accumulated into *this.  v must have
// dimension NumCols(); op(M) must have the same dimensions as *this.
1414 template<typename Real>
1416  const Real alpha,
1417  const CuMatrixBase<Real> &M, MatrixTransposeType transM,
1418  CuVectorBase<Real> &v,
1419  Real beta) {
1420 #if HAVE_CUDA == 1
1421  if (CuDevice::Instantiate().Enabled()) {
1422  if (transM == kNoTrans) {
1423  KALDI_ASSERT(SameDim(*this, M));
1424  } else {
1425  KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows());
1426  }
1427  KALDI_ASSERT(v.Dim() == this->NumCols());
1428 
1429  CuTimer tim;
1430  dim3 dimGrid, dimBlock;
1431  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1432  &dimGrid, &dimBlock);
 // As in AddDiagVecMat, transposition is handled via stride swapping.
1433  MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
1434  if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
1435  cuda_add_mat_diag_vec(dimGrid, dimBlock, alpha, data_, Dim(),
1436  M.Data(), M_row_stride, M_col_stride, v.Data(), beta);
1437  CU_SAFE_CALL(cudaGetLastError());
1438  CuDevice::Instantiate().AccuProfile(__func__, tim);
1439  } else
1440 #endif
1441  {
1442  Mat().AddMatDiagVec(alpha, M.Mat(), transM, v.Vec(), beta);
1443  }
1444 }
1445 
1446 template<typename Real>
1448  const CuMatrixBase<Real> &A, const CuMatrixBase<Real> &B, Real beta) {
1449 #if HAVE_CUDA == 1
1450  if (CuDevice::Instantiate().Enabled()) {
1451  KALDI_ASSERT(SameDim(*this, A) && SameDim(A, B));
1452  CuTimer tim;
1453  dim3 dimGrid, dimBlock;
1454  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1455  &dimGrid, &dimBlock);
1456  cuda_add_mat_mat_elements(dimGrid, dimBlock, this->data_, A.Data(),
1457  B.Data(), Dim(), A.Stride(), B.Stride(), alpha, beta);
1458  CuDevice::Instantiate().AccuProfile(__func__, tim);
1459  } else
1460 #endif
1461  {
1462  Mat().AddMatMatElements(alpha, A.Mat(), B.Mat(), beta);
1463  }
1464 }
1465 
// Parametric ReLU forward pass, column-wise parameters:
//   (*this)(r, c) = src(r, c) * (src(r, c) >= 0 ? alpha(c) : beta(c)),
// i.e. alpha is the slope for non-negative inputs and beta the slope for
// negative inputs, one slope pair per column.
1466 template<typename Real>
1468  const CuMatrixBase<Real> &src,
1469  const CuVectorBase<Real> &alpha,
1470  const CuVectorBase<Real> &beta) {
1471  KALDI_ASSERT(src.NumRows() == this->NumRows());
1472  KALDI_ASSERT(src.NumCols() == this->NumCols());
1473  KALDI_ASSERT(alpha.Dim() == this->NumCols());
1474  KALDI_ASSERT(beta.Dim() == this->NumCols());
1475 #if HAVE_CUDA == 1
1476  if (CuDevice::Instantiate().Enabled()) {
1477  CuTimer tim;
1478 
1479  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
1480  dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK));
1481 
1482  cuda_parametric_relu(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
1483  src.Stride(), alpha.data_, beta.data_);
1484  CU_SAFE_CALL(cudaGetLastError());
1485 
1486  CuDevice::Instantiate().AccuProfile(__func__, tim);
1487  } else
1488 #endif
1489  {
1490  // Do it on CPU,
1491  for (MatrixIndexT r = 0; r < NumRows(); r++) {
1492  for (MatrixIndexT c = 0; c < NumCols(); c++) {
1493  Real src_elem = src.Mat()(r,c);
1494  this->Mat()(r,c) = src_elem * (src_elem >= 0.0 ? alpha.Vec()(c) : beta.Vec()(c));
1495  }
1496  }
1497  }
1498 }
1499 
// Backprop through ParametricRelu:
//   (*this)(r, c) = diff(r, c) * (value(r, c) >= 0 ? alpha(c) : beta(c)),
// where "value" is the forward-pass output and "diff" the derivative at the
// output.  NOTE(review): unlike ParametricRelu above, no dimension asserts
// here — presumably callers guarantee matching dims; confirm.
1500 template<typename Real>
1502  const CuMatrixBase<Real> &value,
1503  const CuMatrixBase<Real> &diff,
1504  const CuVectorBase<Real> &alpha,
1505  const CuVectorBase<Real> &beta) {
1506 #if HAVE_CUDA == 1
1507  if (CuDevice::Instantiate().Enabled()) {
1508  CuTimer tim;
1509 
1510  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
1511  dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK));
1512 
1513  cuda_diff_parametric_relu(dimGrid, dimBlock, data_, diff.data_, value.data_,
1514  Dim(), diff.Stride(), value.Stride(),
1515  alpha.data_, beta.data_);
1516  CU_SAFE_CALL(cudaGetLastError());
1517 
1518  CuDevice::Instantiate().AccuProfile(__func__, tim);
1519  } else
1520 #endif
1521  {
1522  // Do it on CPU,
1523  for (MatrixIndexT r = 0; r < NumRows(); r++) {
1524  for (MatrixIndexT c = 0; c < NumCols(); c++) {
1525  Real value_elem = value.Mat()(r,c);
1526  this->Mat()(r,c) = diff.Mat()(r,c) *
1527  (value_elem >= 0.0 ? alpha.Vec()(c) : beta.Vec()(c));
1528  }
1529  }
1530  }
1531 }
1532 
// Element-wise sigmoid: *this = sigmoid(src); delegates to the cuda_sigmoid
// kernel on GPU and MatrixBase::Sigmoid on CPU.
1533 template<typename Real>
1535  KALDI_ASSERT(SameDim(*this, src));
1536 #if HAVE_CUDA == 1
1537  if (CuDevice::Instantiate().Enabled()) {
1538  CuTimer tim;
1539  dim3 dimGrid, dimBlock;
1540  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1541  &dimGrid, &dimBlock);
1542  cuda_sigmoid(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
1543  src.Stride());
1544  CU_SAFE_CALL(cudaGetLastError());
1545 
1546  CuDevice::Instantiate().AccuProfile(__func__, tim);
1547  } else
1548  #endif
1549  {
1550  Mat().Sigmoid(src.Mat());
1551  }
1552 }
1553 
// Element-wise soft-hinge (softplus-like) nonlinearity of src into *this;
// delegates to the cuda_soft_hinge kernel on GPU and MatrixBase::SoftHinge
// on CPU.
1554 template<typename Real>
1556  KALDI_ASSERT(SameDim(*this, src));
1557 #if HAVE_CUDA == 1
1558  if (CuDevice::Instantiate().Enabled()) {
1559  CuTimer tim;
1560  dim3 dimGrid, dimBlock;
1561  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1562  &dimGrid, &dimBlock);
1563  cuda_soft_hinge(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
1564  src.Stride());
1565  CU_SAFE_CALL(cudaGetLastError());
1566 
1567  CuDevice::Instantiate().AccuProfile(__func__, tim);
1568  } else
1569  #endif
1570  {
1571  Mat().SoftHinge(src.Mat());
1572  }
1573 }
1574 
// Group p-norm: src's columns are partitioned into NumCols() groups of
// group_size consecutive columns, and each element of *this is the p-norm
// of its group.  Special p values (0, 1, 2, inf) use a dedicated row-wise
// tree-reduction kernel; other powers use a generic 2-D kernel.
1576 template<typename Real>
1577  int group_size = src.NumCols() / this->NumCols();
1578  KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size &&
1579  this->NumRows() == src.NumRows());
1580 #if HAVE_CUDA == 1
1581  if (CuDevice::Instantiate().Enabled()) {
1582  CuTimer tim;
1583  if (power == Real(0) || power == Real(1) || power == Real(2)
1584  || power == std::numeric_limits<Real>::infinity()) {
1585  // One thread block per row.
1586  // Use 2D block for small group size to simplify the calculation
1587  // Each group is reduced by threads_per_group threads.
1588  // threads_per_group should be a power of 2 for fast tree reduction.
1589  int threads_per_group = CU1DBLOCK;
1590  while (threads_per_group * 3 / 2 >= group_size) {
1591  threads_per_group >>= 1;
1592  }
1593  if (group_size == 1) {
1594  threads_per_group = 1;
1595  }
1596  dim3 dimBlock(threads_per_group, CU1DBLOCK / threads_per_group);
1597  dim3 dimGrid(NumRows());
1598  cuda_group_spec_pnorm(dimGrid, dimBlock, this->data_, src.data_,
1599  this->Dim(), src.Stride(), group_size, power);
1600  } else {
 // General power: plain 2-D element-indexed kernel.
1601  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
1602  dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK),
1603  n_blocks(NumRows(), CU2DBLOCK));
1604  cuda_group_pnorm(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
1605  src.Stride(), group_size, power);
1606  }
1607  CU_SAFE_CALL(cudaGetLastError());
1608  CuDevice::Instantiate().AccuProfile(__func__, tim);
1609  } else
1610 #endif
1611  {
1612  Mat().GroupPnorm(src.Mat(), power);
1613  }
1614 }
1615 
// Group max: src's columns are partitioned into NumCols() groups of
// group_size consecutive columns, and each element of *this is the maximum
// over its group.  Uses the same per-row tree-reduction thread layout as
// GroupPnorm's special-power path.
1616 template<typename Real>
1617  int group_size = src.NumCols() / this->NumCols();
1618  KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size &&
1619  this->NumRows() == src.NumRows());
1620 #if HAVE_CUDA == 1
1621  if (CuDevice::Instantiate().Enabled()) {
1622  CuTimer tim;
1623  // One thread block per row.
1624  // Use 2D block for small group size to simplify the calculation.
1625  // Each group is reduced by threads_per_group threads.
1626  // threads_per_group should be a power of 2 for fast tree reduction.
1627  // group size: 1 2 3 4 5 6 7 .. 12 13 .. 24 25 .. 48 ...
1628  // threads_per_group: 1 1 1 2 2 2 4 .. 4 8 .. 8 16 .. 16 ...
1629  int threads_per_group = CU1DBLOCK;
1630  while (threads_per_group * 3 / 2 >= group_size) {
1631  threads_per_group >>= 1;
1632  }
1633  if (group_size == 1) {
1634  threads_per_group = 1;
1635  }
1636  dim3 dimBlock(threads_per_group, CU1DBLOCK / threads_per_group);
1637  dim3 dimGrid(NumRows());
1638  cuda_group_max(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
1639  src.Stride(), group_size);
1640  CU_SAFE_CALL(cudaGetLastError());
1641  CuDevice::Instantiate().AccuProfile(__func__, tim);
1642  } else
1643 #endif
1644  {
1645  Mat().GroupMax(src.Mat());
1646  }
1647 }
1649 
1650 /*
1651 Think of sv_labels as a Matrix, denoting the "correct" label of each frame to
1652 each phone-state; it's very likely to contain a LOT of zeros
1653 
1654 tot_weight = the sum of ALL element in matrix sv_labels
1655 tot_objf = the sum of the product of (each element in matrix sv_labels) and (the
1656  log of its counterpart in matrix output)
1657 
1658 an element in "this" matrix = (the element in matrix sv_labels) divided by (the element in matrix output)
1659 */
// Computes a cross-entropy-style objective and its derivative w.r.t. the
// softmax output (see the explanatory comment above this function).
// On output: *tot_weight = sum of all weights in sv_labels; *tot_objf =
// sum of weight * log(output(row, column)); and for each label this
// accumulates weight / output(row, column) into (*this)(row, column).
1660 template<typename Real>
1661 void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& sv_labels,
1662  const CuMatrix<Real> &output,
1663  Real *tot_objf, Real* tot_weight) {
1664  { // check the input.
1665  typedef typename std::vector<MatrixElement<Real> >::const_iterator Iter;
1666  MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
1667  for (Iter iter = sv_labels.begin(); iter != sv_labels.end(); ++iter) {
1668  KALDI_ASSERT(iter->row < num_rows && iter->row >= 0 &&
1669  iter->column < num_cols && iter->column >= 0);
1670  }
1671  }
1672 
1673 #if HAVE_CUDA == 1
1674  if (CuDevice::Instantiate().Enabled()) {
1675  if (sv_labels.empty()) {
1676  KALDI_WARN << "Empty supervision labels";
1677  *tot_objf = 0.0;
1678  *tot_weight = 0.0;
1679  return;
1680  }
 // Copy the (row, column, weight) triples to device memory; the async copy
 // on cudaStreamPerThread is ordered before the kernel launch below, which
 // runs on the same stream.
1681  void *addr = CuDevice::Instantiate().Malloc(sv_labels.size() * sizeof(MatrixElement<Real>));
1682  CU_SAFE_CALL(cudaMemcpyAsync(addr, sv_labels.data(), sv_labels.size() *
1683  sizeof(MatrixElement<Real>),
1684  cudaMemcpyHostToDevice,
1685  cudaStreamPerThread));
1686  CuTimer tim;
 // tmp holds the two reduction results: tmp(0) = objf, tmp(1) = weight.
1687  CuVector<Real> tmp(2, kUndefined);
1688  int dimBlock(CU1DBLOCK);
1689  int dimGrid = 1; // only 1 block here. we have loops in each thread.
1690  cuda_comp_obj_deriv(dimGrid, dimBlock, (MatrixElement<Real>*)addr,
1691  sv_labels.size(), output.Data(), output.Dim(),
1692  this->Data(), this->Dim(), tmp.Data());
 // Copying tmp to the CPU synchronizes with the kernel.
1693  Vector<Real> tmp_cpu(tmp);
1694  *tot_objf = tmp_cpu(0);
1695  *tot_weight = tmp_cpu(1);
1696  CuDevice::Instantiate().Free(addr);
1697  CuDevice::Instantiate().AccuProfile(__func__, tim);
1698  } else
1699 #endif
1700  {
1701  *tot_objf = 0.0;
1702  *tot_weight = 0.0;
1703  for(int32 i = 0; i<sv_labels.size(); i++) {
1704  int32 m = sv_labels[i].row, label = sv_labels[i].column;
1705  Real weight = sv_labels[i].weight;
1706  //KALDI_ASSERT(label >= 0 && label < nnet_.OutputDim());
1707  Real this_prob = output(m, label);
1708  KALDI_ASSERT(this_prob >= 0.99e-20); // we floored to 1.0e-20 in SoftmaxLayer.
1709  *tot_objf += weight * kaldi::Log(this_prob);
1710  *tot_weight += weight;
1711  (*this)(m, label) += weight / this_prob;
1712  }
1713  }
1714 }
1715 
// Applies softmax independently to each row of src, writing into *this.
// GPU path uses one thread block per row (reduction kernel); CPU path
// copies src and applies VectorBase::ApplySoftMax row by row.
1716 template<typename Real> // Y->this, X->src
1718  KALDI_ASSERT(SameDim(*this, src));
1719 #if HAVE_CUDA == 1
1720  if (CuDevice::Instantiate().Enabled()) {
1721  CuTimer tim;
1722  size_t dimBlock = CU1DBLOCK;
1723  size_t dimGrid = src.num_rows_;
1724  cuda_softmax_reduce(dimGrid, dimBlock, data_, src.data_, Dim(), src.Stride());
1725  CU_SAFE_CALL(cudaGetLastError());
1726 
1727  CuDevice::Instantiate().AccuProfile(__func__, tim);
1728  } else
1729  #endif
1730  {
1731  MatrixBase<Real> &mat(this->Mat());
1732  mat.CopyFromMat(src.Mat());
1733  for(MatrixIndexT r = 0; r < mat.NumRows(); r++) {
1734  mat.Row(r).ApplySoftMax();
1735  }
1736  }
1737 }
1738 
// Applies log-softmax independently to each row of src, writing into *this.
// Mirrors SoftMaxPerRow: one thread block per row on GPU, per-row
// ApplyLogSoftMax on CPU.
1739 template<typename Real> // Y->this, X->src
1741  KALDI_ASSERT(SameDim(*this, src));
1742 #if HAVE_CUDA == 1
1743  if (CuDevice::Instantiate().Enabled()) {
1744  CuTimer tim;
1745  size_t dimBlock = CU1DBLOCK;
1746  size_t dimGrid = src.num_rows_;
1747  cuda_log_softmax_reduce(dimGrid, dimBlock,
1748  data_, src.data_, Dim(), src.Stride());
1749  CU_SAFE_CALL(cudaGetLastError());
1750 
1751  CuDevice::Instantiate().AccuProfile(__func__, tim);
1752  } else
1753 #endif
1754  {
1755  MatrixBase<Real> &mat(this->Mat());
1756  mat.CopyFromMat(src.Mat());
1757  for(MatrixIndexT r = 0; r < mat.NumRows(); r++) {
1758  mat.Row(r).ApplyLogSoftMax();
1759  }
1760  }
1761 }
1762 
// Backprop through the sigmoid: given the forward output "value" and the
// output derivative "diff", computes the input derivative into *this
// (delegating to cuda_diff_sigmoid / MatrixBase::DiffSigmoid).
1763 template<typename Real>
1765  const CuMatrixBase<Real> &diff) {
1766  KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff));
1767 #if HAVE_CUDA == 1
1768  if (CuDevice::Instantiate().Enabled()) {
1769  CuTimer tim;
1770  dim3 dimGrid, dimBlock;
1771  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1772  &dimGrid, &dimBlock);
1773  cuda_diff_sigmoid(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride());
1774  CU_SAFE_CALL(cudaGetLastError());
1775 
1776  CuDevice::Instantiate().AccuProfile(__func__, tim);
1777  } else
1778 #endif
1779  {
1780  Mat().DiffSigmoid(value.Mat(), diff.Mat());
1781  }
1782 }
1783 
1784 
// Element-wise tanh: *this = tanh(src); delegates to the cuda_tanh kernel
// on GPU and MatrixBase::Tanh on CPU.
1785 template<typename Real>
1787  KALDI_ASSERT(SameDim(*this, src));
1788 #if HAVE_CUDA == 1
1789  if (CuDevice::Instantiate().Enabled()) {
1790  CuTimer tim;
1791  dim3 dimGrid, dimBlock;
1792  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1793  &dimGrid, &dimBlock);
1794 
1795  cuda_tanh(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride());
1796  CU_SAFE_CALL(cudaGetLastError());
1797 
1798  CuDevice::Instantiate().AccuProfile(__func__, tim);
1799  } else
1800 #endif
1801  {
1802  Mat().Tanh(src.Mat());
1803  }
1804 }
1805 
1806 
1807 
// Backprop through tanh: given the forward output "value" and the output
// derivative "diff", computes the input derivative into *this.
// NOTE(review): unlike DiffSigmoid, no SameDim asserts here — presumably
// callers guarantee matching dims; confirm.
1808 template<typename Real>
1810  const CuMatrixBase<Real> &diff) {
1811 #if HAVE_CUDA == 1
1812  if (CuDevice::Instantiate().Enabled()) {
1813  CuTimer tim;
1814  dim3 dimGrid, dimBlock;
1815  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
1816  &dimGrid, &dimBlock);
1817  cuda_diff_tanh(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride());
1818  CU_SAFE_CALL(cudaGetLastError());
1819 
1820  CuDevice::Instantiate().AccuProfile(__func__, tim);
1821  } else
1822 #endif
1823  {
1824  Mat().DiffTanh(value.Mat(), diff.Mat());
1825  }
1826 }
1827 
// For each row, finds the column index of the maximum element and writes it
// into *id (resized to NumRows()).  Ties/empty rows: the GPU kernel's
// behavior is defined in cu-kernels; the CPU path leaves -1 for an empty row.
1828 template<typename Real>
1830 #if HAVE_CUDA == 1
1831  if (CuDevice::Instantiate().Enabled()) {
1832  CuTimer tim;
1833  id->Resize(num_rows_);
1834  MatrixDim d = Dim();
1835 
1836  // CUDA thread layout: one thread block per matrix-row.
1837  dim3 dimBlock(CU1DBLOCK);
1838  dim3 dimGrid(num_rows_);
1839  cuda_find_row_max_id(dimGrid, dimBlock, data_, NULL, id->Data(), d);
1840  CU_SAFE_CALL(cudaGetLastError());
1841 
1842  // now we have the indices!
1843  CuDevice::Instantiate().AccuProfile(__func__, tim);
1844  } else
1845 #endif
1846  {
1847  // allocate index buffer
1848  id->Resize(num_rows_);
1849  id->Set(-1);
1850  // find maxima
1851  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
1852  for (MatrixIndexT r = 0; r < num_rows; r++) {
 // NOTE(review): -1e21 is a sentinel "smaller than anything expected";
 // rows whose every element is below -1e21 would yield max_id == -1.
 // Initializing from the row's first element (or -infinity) would be
 // more robust — consider fixing.
1853  Real max = -1e21;
1854  int32 max_id = -1;
1855  const Real *row_data = Mat().RowData(r);
1856  for (MatrixIndexT c = 0; c < num_cols; c++) {
1857  if (max < row_data[c]) {
1858  max = row_data[c];
1859  max_id = c;
1860  }
1861  }
1862  id->Data()[r] = max_id;
1863  }
1864  }
1865 }
1866 
// Backprop through a per-row softmax.  With P = softmax output ("value") and
// E = derivative at the output ("diff"), computes per row:
//   D = P .* E - diag(P . E) * P   (standard softmax Jacobian product),
// as spelled out in the CPU fallback below.  *this must not alias "value".
1867 template<typename Real>
1869  const CuMatrixBase<Real> &diff) {
1870 
1871  KALDI_ASSERT(SameDim(value, diff) && SameDim(value, *this) &&
1872  this != &value);
1873 
1874 #if HAVE_CUDA == 1
1875  if (CuDevice::Instantiate().Enabled()) {
1876  CuTimer tim;
1877 
1878  // CUDA thread layout: one thread block per matrix-row.
1879  dim3 dimBlock(CU1DBLOCK);
1880  dim3 dimGrid(num_rows_);
1881  cuda_diff_softmax(dimGrid, dimBlock, data_, this->Dim(), value.Data(),
1882  value.Stride(), diff.Data(), diff.Stride());
1883  CU_SAFE_CALL(cudaGetLastError());
1884 
1885  CuDevice::Instantiate().AccuProfile(__func__, tim);
1886  } else
1887 #endif
1888  {
1889  const CuMatrixBase<Real> &P(value), &E(diff);
1890  CuMatrixBase<Real> &D(*this);
1891 
1892  CuVector<Real> pe_vec(D.NumRows()); // For each row i, the dot product (p_t . e_t).
1893  pe_vec.AddDiagMatMat(1.0, P, kNoTrans, E, kTrans, 0.0);
1894 
1895  D.CopyFromMat(E);
1896  D.MulElements(P);
1897  // At this point, D = P .* E (in matlab notation)
1898  D.AddDiagVecMat(-1.0, pe_vec, P, kNoTrans, 1.0); // does D -= diag(pe_vec) * P.
1899  }
1900 }
1901 
// Backprop through a per-row log-softmax.  With Y = log-softmax output
// ("out_value") and E = derivative at the output ("out_deriv"), computes
// per row: d = e - exp(y) * Sum(e)  (see the derivation in the comment in
// the CPU branch).  *this must not alias "out_value"; aliasing with
// "out_deriv" is handled by the copy-and-recurse path below.
1902 template<typename Real>
1904  const CuMatrixBase<Real> &out_value, const CuMatrixBase<Real> &out_deriv) {
1905 
1906  KALDI_ASSERT(SameDim(out_value, out_deriv) && SameDim(out_value, *this) &&
1907  this != &out_value);
1908 
1909 #if HAVE_CUDA == 1
1910  if (CuDevice::Instantiate().Enabled()) {
1911  CuTimer tim;
1912 
1913  // CUDA thread layout: one thread block per matrix-row.
1914  dim3 dimBlock(CU1DBLOCK);
1915  dim3 dimGrid(num_rows_);
1916  cuda_diff_log_softmax(dimGrid, dimBlock, this->Dim(), out_value.Data(),
1917  out_value.Stride(), out_deriv.Data(),
1918  out_deriv.Stride(), data_);
1919  CU_SAFE_CALL(cudaGetLastError());
1920 
1921  CuDevice::Instantiate().AccuProfile(__func__, tim);
1922  } else
1923 #endif
1924  {
1925  if (this == &out_deriv) {
1926  // the code below doesn't work for in-place, so make a copy and recurse.
1927  CuMatrix<Real> temp(NumRows(), NumCols(), kUndefined);
1928  temp.DiffLogSoftmaxPerRow(out_value, out_deriv);
1929  CopyFromMat(temp);
1930  return;
1931  }
1932  /*
1933  Let the output be y, then
1934  y_i = x_i - log(sum_i exp(x_i))
1935  where x_i is the input to the component. The Jacobian matrix of this
1936  function is
1937  J = I - 1 exp(y^T)
1938  where 1 is a vector of ones. Let the derivative vector at the output be e,
1939  and at the input be d, then we have
1940  d = e - exp(y) Sum(e)
1941  d_i = e_i - exp(y_i) Sum(e)
1942  */
1943  const CuMatrixBase<Real> &Y(out_value), &E(out_deriv);
1944  CuMatrixBase<Real> &D(*this);
1945 
1946  D.CopyFromMat(Y);
1947  D.ApplyExp(); // exp(y)
1948  CuVector<Real> E_sum(D.NumRows()); // Initializes to zero
1949  E_sum.AddColSumMat(1.0, E); // Sum(e)
1950  D.MulRowsVec(E_sum); // exp(y) Sum(e)
1951  D.Scale(-1.0); // - exp(y) Sum(e)
1952  D.AddMat(1.0, E, kNoTrans); // e - exp(y_i) Sum(e)
1953  }
1954 }
1955 
// Cross-entropy derivative for hard (one-hot) targets, in place on *this
// (assumed to hold per-row posteriors): for each row r with target column
// tgt[r], records log((*this)(r, tgt[r])) into *log_post_tgt and subtracts
// 1.0 from that element, per the CPU path below.
1956 template<typename Real>
1958  CuVector<Real> *log_post_tgt) {
1959 
1960  KALDI_ASSERT(tgt.Dim() == num_rows_);
1961  log_post_tgt->Resize(tgt.Dim());
1962 
1963 #if HAVE_CUDA == 1
1964  if (CuDevice::Instantiate().Enabled()) {
1965  CuTimer tim;
1966  dim3 dimBlock(1, CU2DBLOCK*8);
1967  dim3 dimGrid(1, n_blocks(tgt.Dim(), CU2DBLOCK*8));
1968  cuda_diff_xent(dimGrid, dimBlock, tgt.Data(), data_,
1969  log_post_tgt->data_, Dim());
 // NOTE(review): unlike the sibling methods in this file, there is no
 // CU_SAFE_CALL(cudaGetLastError()) after this kernel launch — consider
 // adding one for consistency and earlier error detection.
1970 
1971  CuDevice::Instantiate().AccuProfile(__func__, tim);
1972  } else
1973 #endif
1974  {
1975  MatrixIndexT num_rows = num_rows_;
1976  for(int32 r = 0; r < num_rows; r++) {
1977  int32 col_tgt = tgt.Data()[r];
1978  Real &value = Mat()(r, col_tgt);
1979  log_post_tgt->Vec()(r) = kaldi::Log(value);
1980  value -= 1.0;
1981  }
1982  }
1983 }
1984 
1985 
1986 template<typename Real>
1988  KALDI_ASSERT(this->NumRows() == this->NumCols());
1989  const int32 block_size = 64; // We can tune this.
1990 #if HAVE_CUDA == 1
1991  bool have_gpu = CuDevice::Instantiate().Enabled();
1992 #else
1993  bool have_gpu = false;
1994 #endif
1995  if (this->NumRows() == 0) {
1996  return;
1997  }
1998  if (inv_cholesky == NULL && this->NumRows() >= block_size * 2 && have_gpu) {
1999  // Even if the user did not request the inverse Cholesky, for large enough
2000  // matrices (on GPUs) it's going to be more efficient to compute it anyway
2001  // as the recursion depends on it.
2002  CuMatrix<Real> inv(this->NumRows(), this->NumCols());
2003  Cholesky(&inv);
2004  return;
2005  }
2006  if (this->NumRows() <= block_size || inv_cholesky == NULL || !have_gpu) {
2007  // Don't recurse: compute the Cholesky (and inverse Cholesky, if requested)
2008  // directly, on the CPU.
2009  int32 dim = this->NumRows();
2010  CuSpMatrix<Real> this_sp(dim, kUndefined);
2011  this_sp.CopyFromMat(*this, kTakeLower);
2012  SpMatrix<Real> this_sp_cpu(this_sp);
2013  TpMatrix<Real> C_cpu(dim);
2014  C_cpu.Cholesky(this_sp_cpu);
2015  CuTpMatrix<Real> C(C_cpu);
2016  this->CopyFromTp(C);
2017  if (inv_cholesky != NULL) {
2018  C_cpu.Invert(); // Get inverse Cholesky on CPU.
2019  C.CopyFromTp(C_cpu);
2020  inv_cholesky->CopyFromTp(C); // Copy inverse Cholesky from CPU.
2021  }
2022  return;
2023  }
2024  // At this point, if none of the other cases apply, we recurse.
2025 
2026  // The selection of dim1 is a heuristic. We could also just take half.
2027  int32 tot_dim = this->NumRows();
2028  int32 dim1;
2029  // Break it up into a whole number of blocks, for better memory alignment.
2030  // The line below, setting dim1 can be decided on a heuristic basis: from
2031  // the point of view of correctness, it can really be any value
2032  // 0 < dim1 < tot_dim.
2033  dim1 = block_size * std::max<int32>(1, tot_dim / (2 * block_size));
2034 
2035  int32 dim2 = tot_dim - dim1;
2036  CuSubMatrix<Real> this_11(*this, 0, dim1, 0, dim1),
2037  this_12(*this, 0, dim1, dim1, dim2),
2038  this_21(*this, dim1, dim2, 0, dim1),
2039  this_22(*this, dim1, dim2, dim1, dim2);
2040  CuSubMatrix<Real> inv_11(*inv_cholesky, 0, dim1, 0, dim1),
2041  inv_12(*inv_cholesky, 0, dim1, dim1, dim2),
2042  inv_21(*inv_cholesky, dim1, dim2, 0, dim1),
2043  inv_22(*inv_cholesky, dim1, dim2, dim1, dim2);
2044  /*
2045  Here is the math on block-wise Cholesky. We'll use a Matlab-like notation for blocks of a matrix,
2046  e.g. [ A B; C D ], and also for transposes, e.g. A' is the transpose of A.
2047  Let A be the input matrix; we want to compute both its Cholesky L and its inverse Cholesky, which
2048  we'll call M.
2049  OK. let L = [ L11 0; L21 L22 ] be the Cholesky factor of A.
2050  We have A = L L' = [ L11 0; L21 L22 ] * [ L11' L21'; 0 L22' ]. Multiplying it out,
2051  if A = [ A11 A12; A21 A22 ]; then
2052  A11 = L11 L11', A21 = L21 L11', A22 = L21 L21' + L22 L22', and A12 = A21'.
2053 
2054  We also want an expression for the inverse of L (we call this M).
2055  If M = [ M11 0; M21 M22 ], then it's not hard to see that
2056  M11 = inv(L11), M22 = inv(L22).
2057  We can work out M21 as follows. We know that [ L11 0; L21 L22 ] [ M11 0; M21 M22 ] = [ I 0; 0 I ].
2058  Considering the zero on the bottom of the rhs, we have: L21 M11 + L22 M21 = 0, which gives us:
2059  M21 = - L22^{-1} L21 M11 = - M22 L21 M11.
2060 
2061  Next, we want expressions for L21 and L22. From the equation A21 = L21 L11', we have:
2062  L21 = A21 inv(L11') = A21 M11'
2063  We can compute L22 and M22 recursively by doing Cholesky (and computing the inverse Cholesky)
2064  on the quantity T = (A22 - L21 L21'). [we give it the name T just for easy reference.]
2065 
2066  Computationally, we do this as follows:
2067  (1) Recurse to get L11 and M11.
2068  (2) Compute L21 = A21 M11'
2069  (3) Compute T = A22 - L21 L21'
2070  (4) Recurse on T to get L22 and M22.
2071  (5) Compute M21 = -M22 L21 M11.
2072  Next, we have to consider the in-place nature of the computation, since L overwrites A
2073  [M has its own storage, in "inv_cholesky"].
2074  We address this here:
2075  (1) is in-place [L11 replaces A11, M11 has its own storage].
2076  (2) L21 gets written where M21 belongs.
2077  (3) T replaces A22.
2078  (4) is in-place [L22 replaces T where A22 was, M22 has its own storage]
2079  (5):(a) we first compute the transpose of (L21 M11) is done in the upper part of A/L,
2080  where A12 or L12 would be. Define a temporary expression
2081  U = (L21 M11)' = M11' L21'; this goes where A12 or L12 would be.
2082  (b) copy L21 to where it should be, in *this.
2083  (c) Compute M21 = -M22 U', in the correct place for M21.
2084  (d) zero L12 and M12. */
2085 
2086  // (1) compute L11 and M11.
2087  this_11.Cholesky(&inv_11);
2088  // (2) compute L21 = A21 M11'. For now it's in the "wrong place", where M21 should be.
2089  inv_21.AddMatMat(1.0, this_21, kNoTrans, inv_11, kTrans, 0.0);
2090  // (3) compute T = A22 - L21 L21'. Note: only the lower triangle of T will be valid, but
2091  // that's OK because Cholesky will ignore the upper part.
2092  this_22.SymAddMat2(-1.0, inv_21, kNoTrans, 1.0);
2093  // (4) Recurse to compute L22 and M22.
2094  this_22.Cholesky(&inv_22);
2095  // (5)(a) compute U = M11' L21'. We use the storage of this_12 for this. Note that L21 is
2096  // currently where M21 should be.
2097  this_12.AddMatMat(1.0, inv_11, kTrans, inv_21, kTrans, 0.0);
2098  // (5)(b) copy L21 to where it should be.
2099  this_21.CopyFromMat(inv_21);
2100  // (5)(c) compute M21 = -M22 U'.
2101  inv_21.AddMatMat(-1.0, inv_22, kNoTrans, this_12, kTrans, 0.0);
2102  // (5)(d) zero L12 and M12.
2103  this_12.SetZero();
2104  inv_12.SetZero();
2105 
2106 }
2107 
2108 
2109 
2110 template<typename Real>
2112  KALDI_ASSERT(num_rows_ == num_cols_);
2113  if (num_rows_ == 0) return;
2114 #if HAVE_CUDA == 1
2115  if (CuDevice::Instantiate().Enabled()) {
2116  CuTimer tim;
2117  CuMatrix<Real> inv_cholesky(num_rows_, num_rows_);
2118  this->Cholesky(&inv_cholesky);
2119  // note: SymAddMat2 only updates lower part of *this.
2120  this->SymAddMat2(1.0, inv_cholesky, kTrans, 0.0);
2121  this->CopyLowerToUpper();
2122  CuDevice::Instantiate().AccuProfile(__func__, tim);
2123  } else
2124 #endif
2125  {
2126  SpMatrix<Real> temp_sp(this->Mat(), kTakeLower);
2127  TpMatrix<Real> C(temp_sp.NumRows(), kUndefined);
2128  C.Cholesky(temp_sp);
2129  C.Invert();
2130  temp_sp.AddTp2(1.0, C, kTrans, 0.0);
2131  this->Mat().CopyFromSp(temp_sp);
2132  // was previously just: CuSpMatrix::Invert().
2133  }
2134 }
2135 
2136 template<typename Real>
2138  float tol) const {
2139  CuMatrix<Real> diff(*this);
2140  diff.AddMat(-1.0, other);
2141  return (diff.FrobeniusNorm() <= tol * (*this).FrobeniusNorm());
2142 }
2143 
2144 template<typename Real>
2146  const CuMatrixBase<Real> &B,
2147  MatrixTransposeType trans) {
2148  if (A.num_rows_ == 0) {
2149  KALDI_ASSERT(B.num_rows_ == 0);
2150  return 0.0;
2151  }
2152  Real result = 0;
2153 #if HAVE_CUDA == 1
2154  if (CuDevice::Instantiate().Enabled()) {
2155  if (trans == kNoTrans) {
2156  KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows());
2157  } else {
2158  KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols());
2159  }
2160  CuTimer tim;
2161  // 2D blocks: each (8x32) block sums up (32x32) elements.
2162  // 2D grid: try to cover all the matrix A unless it is too big.
2163  // Kernel will reduce to ~256 elements with good performance,
2164  // if the matrix is not in a very bad shape.
2165  // (wider or taller than 32x8192)
2166  // CPU will then reduce to 1 element.
2167  const int kWarpSize = 32;
2168  dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
2169  dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize),
2170  n_blocks(A.NumRows(), kWarpSize));
2171  if (dimGrid.x * dimGrid.y > 256) {
2172  dimGrid.y = 256 / dimGrid.x;
2173  if (dimGrid.y == 0) {
2174  dimGrid.y = 1;
2175  }
2176  }
2177  CuVector<Real> result_vec(dimGrid.x * dimGrid.y, kUndefined);
2178  if (trans == kNoTrans) {
2179  cuda_trace_mat_mat(dimGrid, dimBlock, A.Data(), B.Data(), A.Dim(),
2180  B.Stride(), result_vec.Data());
2181  } else {
2182  cuda_trace_mat_mat_trans(dimGrid, dimBlock, A.Data(), B.Data(), A.Dim(),
2183  B.Stride(), result_vec.Data());
2184  }
2185  CU_SAFE_CALL(cudaGetLastError());
2186  Vector<Real> result_cpu(result_vec); // copying from CUDA faster than summing in CUDA.
2187  result = result_cpu.Sum();
2188  CuDevice::Instantiate().AccuProfile(__func__, tim);
2189  } else
2190 #endif
2191  {
2192  result = TraceMatMat(A.Mat(), B.Mat(), trans);
2193  }
2194  return result;
2195 }
2196 
2197 template
2198 float TraceMatMat(const CuMatrixBase<float> &A,
2199  const CuMatrixBase<float> &B,
2200  MatrixTransposeType trans);
2201 template
2202 double TraceMatMat(const CuMatrixBase<double> &A,
2203  const CuMatrixBase<double> &B,
2204  MatrixTransposeType trans);
2205 
2206 template<typename Real>
2207 void AddMatMatBatched(const Real alpha, std::vector<CuSubMatrix<Real>* > &C,
2208  const std::vector<CuSubMatrix<Real>* > &A,
2209  MatrixTransposeType transA,
2210  const std::vector<CuSubMatrix<Real>* > &B,
2211  MatrixTransposeType transB, const Real beta) {
2212  KALDI_ASSERT(A.size() == B.size() && B.size() == C.size());
2213  int32 size = A.size();
2214 
2215  if (size == 0) return;
2216 
2217  // all elements must have the same num-rows, num-cols and stride
2218  for (int32 i = 0; i + 1 < size; i++) {
2219  KALDI_ASSERT(A[i]->NumRows() == A[i+1]->NumRows());
2220  KALDI_ASSERT(A[i]->NumCols() == A[i+1]->NumCols());
2221  KALDI_ASSERT(A[i]->Stride() == A[i+1]->Stride());
2222  KALDI_ASSERT(B[i]->NumRows() == B[i+1]->NumRows());
2223  KALDI_ASSERT(B[i]->NumCols() == B[i+1]->NumCols());
2224  KALDI_ASSERT(B[i]->Stride() == B[i+1]->Stride());
2225  KALDI_ASSERT(C[i]->NumRows() == C[i+1]->NumRows());
2226  KALDI_ASSERT(C[i]->NumCols() == C[i+1]->NumCols());
2227  KALDI_ASSERT(C[i]->Stride() == C[i+1]->Stride());
2228  }
2229  // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping?
2230  // keep trans..., just swap A&B matrices: A->B B->A
2231  MatrixIndexT m = ((transB==kTrans)? B[0]->NumRows() : B[0]->NumCols());
2232  MatrixIndexT n = ((transA==kTrans)? A[0]->NumCols() : A[0]->NumRows());
2233  MatrixIndexT k = ((transB==kTrans)? B[0]->NumCols() : B[0]->NumRows());
2234  MatrixIndexT k1 = ((transA==kTrans)? A[0]->NumRows() : A[0]->NumCols());
2235 
2236  KALDI_ASSERT(m == C[0]->NumCols());
2237  KALDI_ASSERT(n == C[0]->NumRows());
2238  KALDI_ASSERT(k == k1);
2239 
2240  if (m == 0) return;
2241 
2242 #if HAVE_CUDA == 1
2243  if (CuDevice::Instantiate().Enabled()) {
2244  CuTimer tim;
2245  Real **device_abc_array =
2246  static_cast<Real**>(CuDevice::Instantiate().Malloc(3 * size * sizeof(Real*)));
2247  const Real **device_a_array = const_cast<const Real**>(device_abc_array);
2248  const Real **device_b_array = const_cast<const Real**>(device_abc_array) + size;
2249  Real **device_c_array = device_abc_array + 2 * size;
2250  const Real **host_abc_array = new const Real*[3*size];
2251  const Real **host_a_array = host_abc_array;
2252  const Real **host_b_array = host_abc_array + size;
2253  const Real **host_c_array = host_abc_array + 2 * size;
2254 
2255  for (int32 i = 0; i < size; i++) {
2256  host_a_array[i] = A[i]->data_;
2257  host_b_array[i] = B[i]->data_;
2258  host_c_array[i] = C[i]->data_;
2259  }
2260 
2261  CU_SAFE_CALL(cudaMemcpyAsync(device_abc_array, host_abc_array,
2262  3*size*sizeof(Real*), cudaMemcpyHostToDevice,
2263  cudaStreamPerThread));
2264 
2265  CUBLAS_SAFE_CALL(cublas_gemmBatched(GetCublasHandle(),
2266  (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
2267  (transA==kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
2268  m, n, k, alpha, device_b_array,
2269  B[0]->Stride(), device_a_array,
2270  A[0]->Stride(), beta, device_c_array,
2271  C[0]->Stride(), size));
2272 
2273  CuDevice::Instantiate().Free(device_abc_array);
2274  delete[] host_abc_array;
2275 
2276  CuDevice::Instantiate().AccuProfile(__func__, tim);
2277  } else
2278 #endif
2279  {
2280  for (int32 i = 0; i < size; i++) {
2281  C[i]->Mat().AddMatMat(alpha, A[i]->Mat(), transA, B[i]->Mat(), transB, beta);
2282  }
2283  }
2284 }
2285 
2286 template
2287 void AddMatMatBatched(const float alpha, std::vector<CuSubMatrix<float>* > &C,
2288  const std::vector<CuSubMatrix<float>* > &A,
2289  MatrixTransposeType transA,
2290  const std::vector<CuSubMatrix<float>* > &B,
2291  MatrixTransposeType transB, const float beta);
2292 
2293 template
2294 void AddMatMatBatched(const double alpha, std::vector<CuSubMatrix<double>* > &C,
2295  const std::vector<CuSubMatrix<double>* > &A,
2296  MatrixTransposeType transA,
2297  const std::vector<CuSubMatrix<double>* > &B,
2298  MatrixTransposeType transB, const double beta);
2299 
2300 template<typename Real>
2302 #if HAVE_CUDA == 1
2303  if (CuDevice::Instantiate().Enabled()) {
2304  CuTimer tim;
2305  if (v.Dim() == num_rows_*num_cols_) {
2306  if (stride_ == num_cols_) {
2307  const Real* v_data = v.Data();
2308  CU_SAFE_CALL(
2309  cudaMemcpyAsync(data_, v_data, sizeof(Real)*num_rows_*num_cols_,
2310  cudaMemcpyDeviceToDevice, cudaStreamPerThread));
2311  } else {
2312  CU_SAFE_CALL(
2313  cudaMemcpy2DAsync(data_, stride_ * sizeof(Real), v.Data(),
2314  num_cols_*sizeof(Real), num_cols_*sizeof(Real),
2315  num_rows_, cudaMemcpyDeviceToDevice,
2316  cudaStreamPerThread));
2317  }
2318  } else if (v.Dim() == num_cols_) {
2319  dim3 dimGrid, dimBlock;
2320  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2321  &dimGrid, &dimBlock);
2322  cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data());
2323  CU_SAFE_CALL(cudaGetLastError());
2324  } else {
2325  KALDI_ERR << "Wrong sized arguments";
2326  }
2327  CuDevice::Instantiate().AccuProfile(__func__, tim);
2328  } else
2329 #endif
2330  {
2331  Mat().CopyRowsFromVec(v.Vec());
2332  }
2333 }
2334 
2335 template<typename Real>
2337 #if HAVE_CUDA == 1
2338  if (CuDevice::Instantiate().Enabled()) {
2339  CuTimer tim;
2340  if (v.Dim() == num_rows_*num_cols_) {
2341  if (stride_ == num_cols_) {
2342  const Real* v_data = v.Data();
2343  CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data,
2344  sizeof(Real)*num_rows_*num_cols_,
2345  cudaMemcpyHostToDevice,
2346  cudaStreamPerThread));
2347  } else {
2348  const Real *v_data = v.Data();
2349  for (MatrixIndexT r = 0; r < num_rows_; r++) {
2350  Real *row_data = RowData(r);
2351  CU_SAFE_CALL(cudaMemcpyAsync(row_data, v_data, sizeof(Real)*num_cols_,
2352  cudaMemcpyHostToDevice,
2353  cudaStreamPerThread));
2354  v_data += num_cols_;
2355  }
2356  }
2357  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
2358  } else if (v.Dim() == num_cols_) {
2359  dim3 dimGrid, dimBlock;
2360  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2361  &dimGrid, &dimBlock);
2362  cuda_copy_rows_from_vec(dimGrid, dimBlock, this->data_, this->Dim(), v.Data());
2363  CU_SAFE_CALL(cudaGetLastError());
2364  } else {
2365  KALDI_ERR << "Wrong sized arguments";
2366  }
2367  CuDevice::Instantiate().AccuProfile(__func__, tim);
2368  } else
2369 #endif
2370  {
2371  Mat().CopyRowsFromVec(v);
2372  }
2373 }
2374 
2375 template<typename Real>
2377 #if HAVE_CUDA == 1
2378  if (CuDevice::Instantiate().Enabled()) {
2379  CuTimer tim;
2380  if (rv.Dim() == num_rows_ * num_cols_) {
2381  // treat rv as a matrix of the size (num_cols x num_rows_)
2382  // and use transposed copy to fill *this
2383  // see CuMatrixBase<Real>::CopyFromMat() for more detail of the impl
2384  MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ };
2385  const int32 warpSize = 32;
2386  dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
2387  dim3 dimGrid(n_blocks(rv_dim.cols, warpSize),
2388  n_blocks(rv_dim.rows, warpSize));
2389  cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, rv.Data(), Dim(),
2390  rv_dim);
2391  CU_SAFE_CALL(cudaGetLastError());
2392  } else if (rv.Dim() == num_rows_) {
2393  // use 2D block (8x32) and large enough grid to cover matrix *this
2394  // dimBlock.x need to be at least warpSize for coalesced memory access.
2395  const int32 warpSize = 32;
2396  dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
2397  dim3 dimGrid(n_blocks(num_cols_, dimBlock.x),
2398  n_blocks(num_rows_, dimBlock.y));
2399  cuda_copy_cols_from_vec(dimGrid, dimBlock, Data(), Dim(), rv.Data());
2400  CU_SAFE_CALL(cudaGetLastError());
2401  } else {
2402  KALDI_ERR<< "Wrong sized arguments";
2403  }
2404  CuDevice::Instantiate().AccuProfile(__func__, tim);
2405  } else
2406 #endif
2407  {
2408  Mat().CopyColsFromVec(rv.Vec());
2409  }
2410 }
2411 
2412 
2413 template<typename Real>
2415  const MatrixIndexT col) {
2416  KALDI_ASSERT(v.Dim() == num_rows_ &&
2417  static_cast<UnsignedMatrixIndexT>(col) <
2418  static_cast<UnsignedMatrixIndexT>(num_cols_));
2419 #if HAVE_CUDA == 1
2420  if (CuDevice::Instantiate().Enabled()) {
2421  CuTimer tim;
2422  cublas_copy(GetCublasHandle(),
2423  v.Dim(), v.Data(), 1,
2424  this->data_ + col, this->stride_);
2425  CU_SAFE_CALL(cudaGetLastError());
2426  CuDevice::Instantiate().AccuProfile(__func__, tim);
2427  } else
2428 #endif
2429  {
2430  Mat().CopyColFromVec(v.Vec(), col);
2431  }
2432 }
2433 
2434 template<typename Real>
2436  KALDI_ASSERT(SameDim(*this, src));
2437 #if HAVE_CUDA == 1
2438  if (CuDevice::Instantiate().Enabled()) {
2439  CuTimer tim;
2440  dim3 dimGrid, dimBlock;
2441  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2442  &dimGrid, &dimBlock);
2443  cuda_heaviside(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
2444  src.Stride());
2445  CU_SAFE_CALL(cudaGetLastError());
2446 
2447  CuDevice::Instantiate().AccuProfile(__func__, tim);
2448  } else
2449  #endif
2450  {
2451  Mat().Heaviside(src.Mat());
2452  }
2453 }
2454 
2455 template<typename Real>
2457  KALDI_ASSERT(SameDim(*this, src));
2458 #if HAVE_CUDA == 1
2459  if (CuDevice::Instantiate().Enabled()) {
2460  CuTimer tim;
2461  dim3 dimGrid, dimBlock;
2462  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2463  &dimGrid, &dimBlock);
2464  cuda_exp(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
2465  src.Stride());
2466  CU_SAFE_CALL(cudaGetLastError());
2467 
2468  CuDevice::Instantiate().AccuProfile(__func__, tim);
2469  } else
2470  #endif
2471  {
2472  Mat().Exp(src.Mat());
2473  }
2474 }
2475 
2476 template<typename Real>
2478  KALDI_ASSERT(SameDim(*this, src));
2479 #if HAVE_CUDA == 1
2480  if (CuDevice::Instantiate().Enabled()) {
2481  if (num_rows_ == 0) return;
2482  CuTimer tim;
2483  dim3 dimGrid, dimBlock;
2484  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2485  &dimGrid, &dimBlock);
2486 
2487  cuda_log(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
2488  src.Stride());
2489  CU_SAFE_CALL(cudaGetLastError());
2490 
2491  CuDevice::Instantiate().AccuProfile(__func__, tim);
2492  } else
2493  #endif
2494  {
2495  Mat().Log(src.Mat());
2496  }
2497 }
2498 
2499 template<typename Real>
2500 void CuMatrixBase<Real>::Pow(const CuMatrixBase<Real> &src, Real power) {
2501  KALDI_ASSERT(SameDim(*this, src));
2502 #if HAVE_CUDA == 1
2503  if (CuDevice::Instantiate().Enabled()) {
2504  CuTimer tim;
2505  dim3 dimGrid, dimBlock;
2506  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2507  &dimGrid, &dimBlock);
2508  cuda_pow(dimGrid, dimBlock, this->data_, src.data_, power, this->Dim(),
2509  src.Stride());
2510  CU_SAFE_CALL(cudaGetLastError());
2511 
2512  CuDevice::Instantiate().AccuProfile(__func__, tim);
2513  } else
2514  #endif
2515  {
2516  Mat().Pow(src.Mat(), power);
2517  }
2518 }
2519 
2520 template<typename Real>
2521 void CuMatrixBase<Real>::PowAbs(const CuMatrixBase<Real> &src, Real power, bool include_sign) {
2522  KALDI_ASSERT(SameDim(*this, src));
2523 #if HAVE_CUDA == 1
2524  if (CuDevice::Instantiate().Enabled()) {
2525  CuTimer tim;
2526  dim3 dimGrid, dimBlock;
2527  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2528  &dimGrid, &dimBlock);
2529  cuda_pow_abs(dimGrid, dimBlock, this->data_, src.data_, power, include_sign,
2530  this->Dim(), src.Stride());
2531  CU_SAFE_CALL(cudaGetLastError());
2532  CuDevice::Instantiate().AccuProfile(__func__, tim);
2533  } else
2534 #endif
2535  {
2536  Mat().PowAbs(src.Mat(), power, include_sign);
2537  }
2538 }
2539 
2540 template<typename Real>
2541 void CuMatrixBase<Real>::ExpLimited(const CuMatrixBase<Real> &src, Real lower_limit, Real upper_limit) {
2542  KALDI_ASSERT(SameDim(*this, src));
2543  KALDI_ASSERT(upper_limit > lower_limit);
2544 #if HAVE_CUDA == 1
2545  if (CuDevice::Instantiate().Enabled()) {
2546  CuTimer tim;
2547  dim3 dimGrid, dimBlock;
2548  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2549  &dimGrid, &dimBlock);
2550  cuda_exp_limited(dimGrid, dimBlock, this->data_, src.data_, lower_limit, upper_limit,
2551  this->Dim(), src.Stride());
2552  CU_SAFE_CALL(cudaGetLastError());
2553  CuDevice::Instantiate().AccuProfile(__func__, tim);
2554  } else
2555 #endif
2556  {
2557  Mat().ExpLimited(src.Mat(), lower_limit, upper_limit);
2558  }
2559 }
2560 
2561 
2562 template<typename Real>
2564  KALDI_ASSERT(SameDim(*this, src));
2565 #if HAVE_CUDA == 1
2566  if (CuDevice::Instantiate().Enabled()) {
2567  CuTimer tim;
2568  dim3 dimGrid, dimBlock;
2569  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2570  &dimGrid, &dimBlock);
2571  cuda_exp_special(dimGrid, dimBlock, this->data_, src.data_, Dim(), src.Stride());
2572  CU_SAFE_CALL(cudaGetLastError());
2573  CuDevice::Instantiate().AccuProfile(__func__, tim);
2574  } else
2575 #endif
2576  {
2577  Mat().ExpSpecial(src.Mat());
2578  }
2579 }
2580 
2581 template<typename Real>
2582 void CuMatrixBase<Real>::Floor(const CuMatrixBase<Real> &src, Real floor_val) {
2583  KALDI_ASSERT(SameDim(*this, src));
2584 #if HAVE_CUDA == 1
2585  if (CuDevice::Instantiate().Enabled()) {
2586  CuTimer tim;
2587  dim3 dimGrid, dimBlock;
2588  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2589  &dimGrid, &dimBlock);
2590  cuda_floor(dimGrid, dimBlock, data_, src.data_, floor_val, this->Dim(), src.Stride());
2591  CU_SAFE_CALL(cudaGetLastError());
2592  CuDevice::Instantiate().AccuProfile(__func__, tim);
2593  } else
2594 #endif
2595  {
2596  Mat().Floor(src.Mat(), floor_val);
2597  }
2598 }
2599 
2600 template<typename Real>
2601 void CuMatrixBase<Real>::Ceiling(const CuMatrixBase<Real> &src, Real ceiling_val) {
2602  KALDI_ASSERT(SameDim(*this, src));
2603 #if HAVE_CUDA == 1
2604  if (CuDevice::Instantiate().Enabled()) {
2605  CuTimer tim;
2606  dim3 dimGrid, dimBlock;
2607  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2608  &dimGrid, &dimBlock);
2609  cuda_ceiling(dimGrid, dimBlock, this->data_, src.data_, ceiling_val, this->Dim(), src.Stride());
2610  CU_SAFE_CALL(cudaGetLastError());
2611  CuDevice::Instantiate().AccuProfile(__func__, tim);
2612  } else
2613 #endif
2614  {
2615  Mat().Ceiling(src.Mat(), ceiling_val);
2616  }
2617 }
2618 
2619 
2620 template<typename Real>
2622  KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
2623 #if HAVE_CUDA == 1
2624  if (CuDevice::Instantiate().Enabled()) {
2625  CuTimer tim;
2626  if (mat.Stride() == mat.NumCols()) {
2627  CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
2628  cudaMemcpyDeviceToHost, cudaStreamPerThread));
2629  } else {
2630  // we could definitely do better than the following.
2631  Real* vec_data = data_;
2632  for (MatrixIndexT r = 0; r < mat.NumRows(); r++) {
2633  CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
2634  sizeof(Real) * mat.NumCols(), cudaMemcpyDeviceToHost,
2635  cudaStreamPerThread));
2636  vec_data += mat.NumCols();
2637  }
2638  }
2639  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
2640  CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyRowsFromMat", tim);
2641  } else
2642 #endif
2643  {
2644  CopyRowsFromMat(mat.Mat());
2645  }
2646 }
2647 
2648 // Instantiate the template above.
2649 template
2651 template
2653 
2654 
2655 template<typename Real>
2657  const CuArrayBase<MatrixIndexT> &indices) {
2658 #if HAVE_CUDA == 1
2659  if (CuDevice::Instantiate().Enabled()) {
2660  KALDI_ASSERT(indices.Dim() == NumCols());
2661  KALDI_ASSERT(NumRows() == src.NumRows());
2662  CuTimer tim;
2663  dim3 dimGrid, dimBlock;
2664  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2665  &dimGrid, &dimBlock);
2666  cuda_copy_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride());
2667  CU_SAFE_CALL(cudaGetLastError());
2668  CuDevice::Instantiate().AccuProfile(__func__, tim);
2669  } else
2670 #endif
2671  {
2672  Mat().CopyCols(src.Mat(), indices.Data());
2673  }
2674 }
2675 
2676 
2677 template<typename Real>
2679  const CuArrayBase<MatrixIndexT> &indices) {
2680 #if HAVE_CUDA == 1
2681  if (CuDevice::Instantiate().Enabled()) {
2682  KALDI_ASSERT(static_cast<MatrixIndexT>(indices.Dim()) == NumRows());
2683  KALDI_ASSERT(NumCols() == src.NumCols());
2684 
2685  CuTimer tim;
2686  dim3 dimGrid, dimBlock;
2687  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2688  &dimGrid, &dimBlock);
2689  cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), indices.Data(),
2690  Dim(), src.Stride());
2691  CU_SAFE_CALL(cudaGetLastError());
2692  CuDevice::Instantiate().AccuProfile(__func__, tim);
2693  } else
2694 #endif
2695  {
2696  Mat().CopyRows(src.Mat(), indices.Data());
2697  }
2698 }
2699 
2700 template<typename Real>
2702  const CuArrayBase<MatrixIndexT> &indices) {
2703 #if HAVE_CUDA == 1
2704  if (CuDevice::Instantiate().Enabled()) {
2705  KALDI_ASSERT(indices.Dim() == NumCols());
2706  KALDI_ASSERT(NumRows() == src.NumRows());
2707  CuTimer tim;
2708  dim3 dimGrid, dimBlock;
2709  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2710  &dimGrid, &dimBlock);
2711  cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(),
2712  Dim(), src.Stride());
2713  CU_SAFE_CALL(cudaGetLastError());
2714  CuDevice::Instantiate().AccuProfile(__func__, tim);
2715  } else
2716 #endif
2717  {
2718  Mat().AddCols(src.Mat(), indices.Data());
2719  }
2720 }
2721 
2722 template<typename Real>
2724  if (NumRows() == 0) return;
2725 #if HAVE_CUDA == 1
2726  if (CuDevice::Instantiate().Enabled()) {
2727  KALDI_ASSERT(static_cast<MatrixIndexT>(src.Dim()) == NumRows());
2728  CuTimer tim;
2729  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
2730  dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK),
2731  n_blocks(num_rows_, CU2DBLOCK));
2732  cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), Dim());
2733  CU_SAFE_CALL(cudaGetLastError());
2734  CuDevice::Instantiate().AccuProfile(__func__, tim);
2735  } else
2736 #endif
2737  {
2738  Mat().CopyRows(src.Data());
2739  }
2740 }
2741 
2742 
2743 template<typename Real>
2745  if (NumRows() == 0) return;
2746 #if HAVE_CUDA == 1
2747  if (CuDevice::Instantiate().Enabled()) {
2748  KALDI_ASSERT(static_cast<MatrixIndexT>(dst.Dim()) == NumRows());
2749 
2750  CuTimer tim;
2751  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
2752  dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK),
2753  n_blocks(num_rows_, CU2DBLOCK));
2754  cuda_copy_to_rows(dimGrid, dimBlock, dst.Data(), data_, Dim());
2755  CU_SAFE_CALL(cudaGetLastError());
2756  CuDevice::Instantiate().AccuProfile(__func__, tim);
2757  } else
2758 #endif
2759  {
2760  Mat().CopyToRows(dst.Data());
2761  }
2762 }
2763 
2764 
2765 template<typename Real>
2767  const CuMatrixBase<Real> &src,
2768  const CuArrayBase<MatrixIndexT> &indexes) {
2769  if (NumRows() == 0) return;
2770 #if HAVE_CUDA == 1
2771  if (CuDevice::Instantiate().Enabled()) {
2772  KALDI_ASSERT(static_cast<MatrixIndexT>(indexes.Dim()) == NumRows());
2773  KALDI_ASSERT(src.NumCols() == NumCols());
2774  CuTimer tim;
2775  dim3 dimGrid, dimBlock;
2776  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2777  &dimGrid, &dimBlock);
2778  cuda_add_rows(dimGrid, dimBlock, alpha,
2779  data_, src.Data(), indexes.Data(), Dim(), src.Stride());
2780  CU_SAFE_CALL(cudaGetLastError());
2781  CuDevice::Instantiate().AccuProfile(__func__, tim);
2782  } else
2783 #endif
2784  {
2785  Mat().AddRows(alpha, src.Mat(), indexes.Data());
2786  }
2787 }
2788 
2789 template<typename Real>
2791  const CuArrayBase<MatrixIndexT> &indexes) {
2792  if (NumRows() == 0) return;
2793  KALDI_ASSERT(static_cast<MatrixIndexT>(indexes.Dim()) == NumRows());
2794 #if HAVE_CUDA == 1
2795  if (CuDevice::Instantiate().Enabled()) {
2796  KALDI_ASSERT(src.NumCols() == NumCols());
2797  CuTimer tim;
2798  dim3 dimGrid, dimBlock;
2799  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2800  &dimGrid, &dimBlock);
2801  cuda_mul_rows(dimGrid, dimBlock,
2802  data_, src.Data(), indexes.Data(), Dim(), src.Stride());
2803  CU_SAFE_CALL(cudaGetLastError());
2804  CuDevice::Instantiate().AccuProfile(__func__, tim);
2805  } else
2806 #endif
2807  {
2808  MatrixBase<Real> &this_mat(Mat());
2809  const MatrixBase<Real> &src_mat(src.Mat());
2810  int32 num_rows = NumRows();
2811  const MatrixIndexT *index_ptr = indexes.Data();
2812  for (int32 r = 0; r < num_rows; r++) {
2813  int32 src_r = index_ptr[r];
2814  if (src_r < 0)
2815  continue;
2816  SubVector<Real> this_row(this_mat, r),
2817  src_row(src_mat, src_r);
2818  this_row.MulElements(src_row);
2819  }
2820  }
2821 }
2822 
2823 
2824 
2825 template<typename Real>
2827  if (NumRows() == 0) return;
2828 #if HAVE_CUDA == 1
2829  if (CuDevice::Instantiate().Enabled()) {
2830  KALDI_ASSERT(static_cast<MatrixIndexT>(src.Dim()) == NumRows());
2831  CuTimer tim;
2832  dim3 dimGrid, dimBlock;
2833  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2834  &dimGrid, &dimBlock);
2835  cuda_add_rows(dimGrid, dimBlock, alpha, data_, src.Data(), Dim());
2836  CU_SAFE_CALL(cudaGetLastError());
2837  CuDevice::Instantiate().AccuProfile(__func__, tim);
2838  } else
2839 #endif
2840  {
2841  Mat().AddRows(alpha, src.Data());
2842  }
2843 }
2844 
2845 
2846 template<typename Real>
2848  const CuArrayBase<Real*> &dst) const {
2849  if (NumRows() == 0) return;
2850 #if HAVE_CUDA == 1
2851  if (CuDevice::Instantiate().Enabled()) {
2852  KALDI_ASSERT(static_cast<MatrixIndexT>(dst.Dim()) == NumRows());
2853  CuTimer tim;
2854  dim3 dimGrid, dimBlock;
2855  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2856  &dimGrid, &dimBlock);
2857  cuda_add_to_rows(dimGrid, dimBlock, alpha, dst.Data(), data_, Dim());
2858  CU_SAFE_CALL(cudaGetLastError());
2859  CuDevice::Instantiate().AccuProfile(__func__, tim);
2860  } else
2861 #endif
2862  {
2863  Mat().AddToRows(alpha, dst.Data());
2864  }
2865 }
2866 
2867 
2868 template<typename Real>
2870  const CuArrayBase<MatrixIndexT> &indexes,
2871  CuMatrixBase<Real> *dst) const {
2872  if (NumRows() == 0) return;
2873 #if HAVE_CUDA == 1
2874  if (CuDevice::Instantiate().Enabled()) {
2875  KALDI_ASSERT(static_cast<MatrixIndexT>(indexes.Dim()) == NumRows());
2876  KALDI_ASSERT(dst->NumCols() == NumCols());
2877  CuTimer tim;
2878  dim3 dimGrid, dimBlock;
2879  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2880  &dimGrid, &dimBlock);
2881  cuda_add_to_rows(dimGrid, dimBlock, alpha, dst->Data(), data_, indexes.Data(), Dim(), dst->Stride());
2882  CU_SAFE_CALL(cudaGetLastError());
2883  CuDevice::Instantiate().AccuProfile(__func__, tim);
2884  } else
2885 #endif
2886  {
2887  Mat().AddToRows(alpha, indexes.Data(), &(dst->Mat()));
2888  }
2889 }
2890 
2891 
2892 template<typename Real>
2894  const CuArrayBase<Int32Pair> &indices) {
2895  KALDI_ASSERT(static_cast<MatrixIndexT>(indices.Dim()) == NumCols());
2896  KALDI_ASSERT(NumRows() == src.NumRows());
2897  if (NumRows() == 0) return;
2898 #if HAVE_CUDA == 1
2899  if (CuDevice::Instantiate().Enabled()) {
2900  CuTimer tim;
2901  dim3 dimGrid, dimBlock;
2902  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2903  &dimGrid, &dimBlock);
2904  cuda_sum_column_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(),
2905  src.Dim(), indices.Data());
2906  CU_SAFE_CALL(cudaGetLastError());
2907  CuDevice::Instantiate().AccuProfile(__func__, tim);
2908  } else
2909 #endif
2910  {
2911  int32 num_rows = this->num_rows_, num_cols = this->num_cols_,
2912  this_stride = this->stride_, src_stride = src.stride_;
2913  Real *data = this->data_;
2914  const Real *src_data = src.data_;
2915  const Int32Pair *indices_data = indices.Data();
2916  for (int32 row = 0; row < num_rows; row++) {
2917  for (int32 col = 0; col < num_cols; col++) {
2918  int32 start_col = indices_data[col].first,
2919  end_col = indices_data[col].second;
2920  Real sum = 0.0;
2921  for (int32 src_col = start_col; src_col < end_col; src_col++)
2922  sum += src_data[row * src_stride + src_col];
2923  data[row * this_stride + col] = sum;
2924  }
2925  }
2926  }
2927 }
2928 
2929 
2930 template<typename Real>
2932  const CuArrayBase<Int32Pair> &indexes) {
2933  KALDI_ASSERT(static_cast<MatrixIndexT>(indexes.Dim()) == NumRows());
2934  KALDI_ASSERT(src.NumCols() == NumCols());
2935  if (NumRows() == 0) return;
2936 #if HAVE_CUDA == 1
2937  if (CuDevice::Instantiate().Enabled()) {
2938  CuTimer tim;
2939  dim3 dimGrid, dimBlock;
2940  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
2941  &dimGrid, &dimBlock);
2942  cuda_add_row_ranges(dimGrid, dimBlock,
2943  data_, Dim(), src.Data(), src.Dim(), indexes.Data());
2944  CU_SAFE_CALL(cudaGetLastError());
2945  CuDevice::Instantiate().AccuProfile(__func__, tim);
2946  } else
2947 #endif
2948  { // Implement here for the CPU..
2949  int32 num_rows = this->num_rows_, num_cols = this->num_cols_,
2950  this_stride = this->stride_, src_stride = src.stride_;
2951  Real *data = this->data_;
2952  const Real *src_data = src.data_;
2953  const Int32Pair *indexes_data = indexes.Data();
2954  for (int32 row = 0; row < num_rows; row++) {
2955  int32 start_row = indexes_data[row].first,
2956  end_row = indexes_data[row].second;
2957  for (int32 col = 0; col < num_cols; col++) {
2958  Real sum = 0.0;
2959  for (int32 src_row = start_row; src_row < end_row; src_row++)
2960  sum += src_data[src_row * src_stride + col];
2961  data[row * this_stride + col] += sum;
2962  }
2963  }
2964  }
2965 }
2966 
2967 
2968 template<typename Real>
2970  KALDI_ASSERT(num_cols_ == num_rows_);
2971  if (num_rows_ == 0) return;
2972 #if HAVE_CUDA == 1
2973  if (CuDevice::Instantiate().Enabled()) {
2974  CuTimer tim;
2975  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
2976  int32 dim = num_rows_;
2977  dim3 dimGrid(n_blocks(dim, CU2DBLOCK),
2978  n_blocks(dim, CU2DBLOCK));
2979  cuda_copy_low_upp(dimGrid, dimBlock, data_, Dim());
2980  CU_SAFE_CALL(cudaGetLastError());
2981  CuDevice::Instantiate().AccuProfile(__func__, tim);
2982  } else
2983 #endif
2984  {
2985  Mat().CopyLowerToUpper();
2986  }
2987 }
2988 
 2989 template<typename Real>
// NOTE(review): the signature line (source line 2990) was lost in this rendered
// view; the CPU fallback below shows this is CuMatrixBase<Real>::CopyUpperToLower().
// Mirror image of CopyLowerToUpper(): copies the upper triangle into the lower
// triangle of a square matrix, making it symmetric.
 2991  KALDI_ASSERT(num_cols_ == num_rows_);  // only defined for square matrices
 2992  if (num_rows_ == 0) return;  // empty matrix: no work
 2993 #if HAVE_CUDA == 1
 2994  if (CuDevice::Instantiate().Enabled()) {
 2995  CuTimer tim;
 2996  int32 dim = this->num_rows_;
 2997  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
// 2-D grid covering the full dim x dim index space.
 2998  dim3 dimGrid(n_blocks(dim, CU2DBLOCK),
 2999  n_blocks(dim, CU2DBLOCK));
 3000  cuda_copy_upp_low(dimGrid, dimBlock, data_, Dim());
 3001  CU_SAFE_CALL(cudaGetLastError());
 3002  CuDevice::Instantiate().AccuProfile(__func__, tim);
 3003  } else
 3004 #endif
 3005  {
 3006  Mat().CopyUpperToLower();  // CPU path: delegate to MatrixBase.
 3007  }
 3008 }
3009 
3010 
 3011 template<typename Real>
// NOTE(review): the signature line (source line 3012) was lost in this rendered
// view; from the body this is Real CuMatrixBase<Real>::Sum() const.
// Returns the sum of all elements.  The GPU path first reduces each row over
// its columns (one partial sum per row, via cuda_sum_mat_cols), then sums that
// vector on the device.
 3013 #if HAVE_CUDA == 1
 3014  if (CuDevice::Instantiate().Enabled()) {
// NOTE(review): the GPU path asserts a non-empty matrix, while the CPU
// fallback below accepts an empty one — an inconsistency to confirm upstream.
 3015  KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
 3016  CuTimer tim;
 3017 
 3018  CuVector<Real> col_sum(num_rows_, kUndefined);
 3019  cuda_sum_mat_cols(num_rows_, CU1DBLOCK, col_sum.Data(), data_, Dim());
 3020  Real ans = col_sum.Sum();
 3021 
 3022  CuDevice::Instantiate().AccuProfile(__func__, tim);
 3023  return ans;
 3024  } else
 3025 #endif
 3026  {
 3027  return Mat().Sum();
 3028  }
 3029 }
3030 
3031 
 3032 template<typename Real>
// NOTE(review): the signature line (source line 3033) was lost in this rendered
// view; from the body this is Real CuMatrixBase<Real>::Max() const.
// Returns the maximum element.  GPU path: per-row maxima via cuda_max_mat_cols,
// then a max-reduction over the resulting vector.
 3034 #if HAVE_CUDA == 1
 3035  if (CuDevice::Instantiate().Enabled()) {
// GPU path requires a non-empty matrix (the CPU fallback's behavior on an
// empty matrix is defined by MatrixBase::Max()).
 3036  KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
 3037  CuTimer tim;
 3038 
 3039  CuVector<Real> col_max(num_rows_, kUndefined);
 3040  cuda_max_mat_cols(num_rows_, CU1DBLOCK, col_max.Data(), data_, Dim());
 3041  Real ans = col_max.Max();
 3042 
 3043  CuDevice::Instantiate().AccuProfile(__func__, tim);
 3044  return ans;
 3045  } else
 3046 #endif
 3047  {
 3048  return Mat().Max();
 3049  }
 3050 }
3051 
3052 
 3053 template<typename Real>
// NOTE(review): the signature line (source line 3054) was lost in this rendered
// view; from the body this is Real CuMatrixBase<Real>::Min() const.
// Returns the minimum element.  GPU path: per-row minima via cuda_min_mat_cols,
// then a min-reduction over the resulting vector.
 3055 #if HAVE_CUDA == 1
 3056  if (CuDevice::Instantiate().Enabled()) {
// GPU path requires a non-empty matrix.
 3057  KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
 3058  CuTimer tim;
 3059 
 3060  CuVector<Real> col_min(num_rows_, kUndefined);
 3061  cuda_min_mat_cols(num_rows_, CU1DBLOCK, col_min.Data(), data_, Dim());
 3062  Real ans = col_min.Min();
 3063 
 3064  CuDevice::Instantiate().AccuProfile(__func__, tim);
 3065  return ans;
 3066  } else
 3067 #endif
 3068  {
 3069  return Mat().Min();
 3070  }
 3071 }
3072 
3073 
3074 template<typename Real>
3075 Real CuMatrixBase<Real>::Trace(bool check_square) const {
3076 #if HAVE_CUDA == 1
3077  if (CuDevice::Instantiate().Enabled()) {
3078  CuTimer tim;
3079  if (check_square) KALDI_ASSERT(this->num_rows_ == this->num_cols_);
3080  MatrixIndexT dim = std::min(this->num_rows_, this->num_cols_);
3081  CuVector<Real> tmp(1, kUndefined); // for result.
3082  int dimBlock(CU1DBLOCK);
3083  int dimGrid = 1;// only 1 block here. we have loops in each thread //(n_blocks(dim_, CU1DBLOCK));
3084  cuda_vec_sum(dimGrid, dimBlock, data_, tmp.Data(), dim, Stride() + 1);
3085  CU_SAFE_CALL(cudaGetLastError());
3086  CuDevice::Instantiate().AccuProfile("CuVectorBase::Sum", tim);
3087  return tmp(0);
3088  } else
3089 #endif
3090  {
3091  return Mat().Trace(check_square);
3092  }
3093 }
3094 
 3095 template <typename Real>
// NOTE(review): the first signature line (source line 3096) was lost in this
// rendered view; from the body this is
// void CuMatrixBase<Real>::CopyFromGeneralMat(const GeneralMatrix &src, ...).
// Copies from a GeneralMatrix (full, compressed, or sparse storage),
// dispatching on its runtime storage type; trans optionally transposes.
 3097  MatrixTransposeType trans) {
 3098  switch (src.Type()) {
 3099  case kFullMatrix: {
 3100  const Matrix<BaseFloat> &src_full_mat = src.GetFullMatrix();
 3101  this->CopyFromMat(src_full_mat, trans);
 3102  return;
 3103  }
 3104  case kCompressedMatrix: {
// Decompress into a temporary host matrix, then copy.
 3105  Matrix<BaseFloat> mat;
 3106  src.GetMatrix(&mat);
 3107  this->CopyFromMat(mat, trans);
 3108  return;
 3109  }
 3110  case kSparseMatrix: {
 3111  const SparseMatrix<BaseFloat> &smat = src.GetSparseMatrix();
 3112 #if HAVE_CUDA == 1
 3113  if (CuDevice::Instantiate().Enabled()) {
 3114  // only take this branch if we're actually using CUDA, or it would
 3115  // entail a wasteful copy of the sparse matrix.
 3116  CuSparseMatrix<BaseFloat> cu_smat(smat);
 3117  cu_smat.CopyToMat(this, trans);
 3118  return;
 3119  }
 3120 #endif
 3121  smat.CopyToMat(&(Mat()), trans);
 3122  return;
 3123  }
 3124  default:
 3125  KALDI_ERR << "Invalid GeneralMatrix type.";
 3126  }
 3127 }
3128 
3129 
3130 
 3131 template<typename Real>
// NOTE(review): the signature line (source line 3132) was lost in this rendered
// view; the CPU fallback below shows this is CuMatrixBase<Real>::SetRandn().
// Fills the matrix with Gaussian (normal) random values.
 3133  if (num_rows_ == 0) return;  // empty matrix: nothing to fill
 3134 #if HAVE_CUDA == 1
 3135  if (CuDevice::Instantiate().Enabled()) {
 3136  CuRand<Real> tmp;
 3137  tmp.RandGaussian(this);
 3138  } else
 3139 #endif
 3140  {
 3141  Mat().SetRandn();  // CPU path: delegate to MatrixBase.
 3142  }
 3143 }
3144 
 3145 template<typename Real>
// NOTE(review): the signature line (source line 3146) was lost in this rendered
// view; the CPU fallback shows this is CuMatrixBase<Real>::SetRandUniform().
// Fills the matrix with uniformly distributed random values.
 3147  if (num_rows_ == 0) return;  // empty matrix: nothing to fill
 3148 #if HAVE_CUDA == 1
 3149  if (CuDevice::Instantiate().Enabled()) {
 3150  CuRand<Real> tmp;
 3151  tmp.RandUniform(this);
 3152  } else
 3153 #endif
 3154  {
 3155  Mat().SetRandUniform();  // CPU path: delegate to MatrixBase.
 3156  }
 3157 }
3158 
3159 template<typename Real>
3160 void Matrix<Real>::Swap(CuMatrix<Real> *mat) { mat->Swap(this); }
3161 // instantiate the template above.
3162 template void Matrix<float>::Swap(CuMatrix<float> *mat);
3163 template void Matrix<double>::Swap(CuMatrix<double> *mat);
3164 
 3166 template<typename Real>
 3167 template<typename OtherReal>
// NOTE(review): the first signature line (source line 3168) was lost in this
// rendered view; from the instantiations below this is the cross-precision
// constructor CuMatrix<Real>::CuMatrix(const MatrixBase<OtherReal> &M, ...).
// Builds a CuMatrix from a host matrix of (possibly) different element type,
// optionally transposing.
 3169  MatrixTransposeType trans) : CuMatrixBase<Real>() {
 3170 
 3171  if (trans == kNoTrans) {
 3172  Resize(M.NumRows(), M.NumCols());
 3173  this->CopyFromMat(M);
 3174  } else {
// Transposed copy: dimensions are swapped relative to M.
 3175  Resize(M.NumCols(), M.NumRows());
 3176  this->CopyFromMat(M, kTrans);
 3177  }
 3178 }
 3179 
 3180 // Instantiate this constructor for float->double and double->float.
// NOTE(review): the instantiation signature lines (source lines 3182 and 3185)
// were lost in this rendered view; only their trailing parameter lines remain.
 3181 template
 3183  MatrixTransposeType trans);
 3184 template
 3186  MatrixTransposeType trans);
3187 
3188 
 3189 template<typename Real>
// NOTE(review): the signature line (source line 3190) was lost in this rendered
// view; from the body this is void CuMatrix<Real>::Transpose().
// In-place transpose implemented as transposed-copy plus shallow swap.
 3191  if (this->num_rows_ == 0)
 3192  return;
 3193  // Copy and swap for all cases.
 3194  // No need for a separate kernel for square-matrix in-place transpose.
 3195  // It has the same possible peak performance as copy_transpose,
 3196  // if allocate/deallocate overhead can be ignored.
 3197  CuMatrix<Real> tmp(*this, kTrans);
 3198  this->Swap(&tmp);  // tmp's destructor frees the old storage.
 3199 }
3200 
3201 
 3202 // Version of AddMatMat where 2nd argument is of type CuBlockMatrix.
 3203 // Caution:
 3204 template<typename Real>
// NOTE(review): the first signature line (source line 3205) was lost in this
// rendered view; from the body this is
// void CuMatrixBase<Real>::AddMatBlock(Real alpha, const CuMatrixBase<Real> &A,
//     MatrixTransposeType transA, const CuBlockMatrix<Real> &B,
//     MatrixTransposeType transB, Real beta).
// Computes *this = alpha * A(^T) * B(^T) + beta * *this, where B is
// block-diagonal.
 3206  Real alpha,
 3207  const CuMatrixBase<Real> &A, MatrixTransposeType transA,
 3208  const CuBlockMatrix<Real> &B, MatrixTransposeType transB,
 3209  Real beta) {
 3210  // Check dimensions
 3211  int32 A_num_rows = A.NumRows(), A_num_cols = A.NumCols(),
 3212  A_row_stride = A.Stride(), A_col_stride = 1,
 3213  B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
 3214  if (transA == kTrans) {
 3215  std::swap(A_num_rows, A_num_cols);
 3216  std::swap(A_row_stride, A_col_stride);
 3217  }
 3218  if (transB == kTrans) {
 3219  std::swap(B_num_rows, B_num_cols);
 3220  }
 3221  // At this point the {A,B}_{rows,cols} variables are
 3222  // after any transposition.
 3223  KALDI_ASSERT(NumRows() == A_num_rows && NumCols() == B_num_cols);
 3224  KALDI_ASSERT(A_num_cols == B_num_rows);
 3225  int32 B_num_blocks = B.NumBlocks();
 3226 
 3227  if (num_rows_ == 0) return;
 3228 #if HAVE_CUDA == 1
 3229  if (CuDevice::Instantiate().Enabled()) {
 3230  CuTimer tim;
 3231  MatrixDim this_dim = Dim();
 3232 
 3233  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
 3234  // (x,y) indices will be (row of *this, block of B)
 3235  dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK),
 3236  n_blocks(B_num_blocks, CU2DBLOCK));
 3237 
 3238  // caution: the use of x as the row-index is not good, but
 3239  // this code is not much used, so I'm not updating it.
 3240  cuda_add_mat_blockmat(dimGrid, dimBlock, data_, this_dim, A.Data(),
 3241  A_num_rows, A_num_cols, A_row_stride, A_col_stride,
 3242  B.CuData(), B_num_blocks, alpha, beta,
 3243  (transB == kTrans ? 1 : 0));
 3244 
 3245  CU_SAFE_CALL(cudaGetLastError());
 3246 
 3247  CuDevice::Instantiate().AccuProfile(__func__, tim);
 3248  } else
 3249 #endif
 3250  {
 3251  // "row_offset" and "col_offset" are offsets into B (or into B^T, if
 3252  // transB == kTrans).
 3253  int32 row_offset = 0, col_offset = 0;
// CPU fallback: process B block by block, multiplying the matching
// column-slice of *this by the matching slice of A and this block of B.
 3254  for (int32 b = 0; b < B_num_blocks; b++) {
 3255  const CuSubMatrix<Real> this_block = B.Block(b);
 3256  int32 this_num_rows = this_block.NumRows(),
 3257  this_num_cols = this_block.NumCols();
 3258  if (transB == kTrans) std::swap(this_num_rows, this_num_cols);
 3259  CuSubMatrix<Real> this_part(*this, 0, num_rows_,
 3260  col_offset, this_num_cols);
 3261  CuSubMatrix<Real> A_part = (transA == kNoTrans ?
 3262  CuSubMatrix<Real>(A, 0, num_rows_,
 3263  row_offset, this_num_rows) :
 3264  CuSubMatrix<Real>(A, row_offset, this_num_rows,
 3265  0, num_rows_));
 3266  this_part.AddMatMat(alpha, A_part, transA, this_block, transB, beta);
 3267  row_offset += this_num_rows;
 3268  col_offset += this_num_cols;
 3269  }
 3270  // Note: the values being compared below are all after applying any
 3271  // transposition to B.
 3272  KALDI_ASSERT(row_offset == B_num_rows && col_offset == B_num_cols);
 3273  }
 3274 }
3275 
 3276 template<typename Real>
// NOTE(review): the first signature line (source line 3277) was lost in this
// rendered view; from the body this is
// void CuMatrixBase<Real>::AddElements(Real alpha,
//     const std::vector<MatrixElement<Real> >& input).
// For each element e in input, adds alpha * e.weight to (*this)(e.row, e.column).
 3278  const std::vector<MatrixElement<Real> >& input) {
 3279  // Checks the dimension.
 3280  MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
 3281  for (int32 i = 0; i < input.size(); ++i) {
 3282  KALDI_ASSERT(input[i].row < num_rows && input[i].row >= 0 &&
 3283  input[i].column < num_cols && input[i].column >= 0);
 3284  }
 3285 #if HAVE_CUDA == 1
 3286  if (CuDevice::Instantiate().Enabled()) {
// Stage the host-side element list into device memory.
// NOTE(review): input.size() == 0 would request a zero-byte device
// allocation here — presumably Malloc handles that; confirm upstream.
 3287  void *addr = CuDevice::Instantiate().Malloc(input.size() * sizeof(MatrixElement<Real>));
 3288  CU_SAFE_CALL(cudaMemcpyAsync(addr, input.data(),
 3289  input.size() * sizeof(MatrixElement<Real>),
 3290  cudaMemcpyHostToDevice, cudaStreamPerThread));
 3291 
 3292  CuTimer tim;
 3293  int dimBlock(CU1DBLOCK);
 3294  int dimGrid(n_blocks(input.size(), CU1DBLOCK));
 3295 
 3296  cuda_matrix_add_elements(dimGrid, dimBlock, this->data_, this->Dim(),
 3297  alpha, (MatrixElement<Real>*)addr, input.size());
 3298  CU_SAFE_CALL(cudaGetLastError());
 3299  CuDevice::Instantiate().Free(addr);
 3300  CuDevice::Instantiate().AccuProfile(__func__, tim);
 3301  } else
 3302 #endif
 3303  {
 3304  for (int32 i = 0; i < input.size(); i++) {
 3305  (*this)(input[i].row, input[i].column) += alpha * input[i].weight;
 3306  }
 3307  }
 3308 }
3309 
 3310 template<typename Real>
// NOTE(review): the first signature line (source line 3311) was lost in this
// rendered view; from the body this is
// void CuMatrixBase<Real>::AddElements(Real alpha,
//     const CuArrayBase<Int32Pair> &indexes, const Real *input).
// For each i, adds alpha * input[i] to (*this)(indexes[i].first,
// indexes[i].second); input is a host pointer with indexes.Dim() entries.
 3312  const Real *input) {
 3313  if (indexes.Dim() == 0) return;
 3314  KALDI_ASSERT(input != NULL);
 3315 
 3316 #if HAVE_CUDA == 1
 3317  if (CuDevice::Instantiate().Enabled()) {
 3318  CuTimer tim;
// Stage the host-side values into a device vector.
 3319  CuVector<Real> tmp_vec(indexes.Dim(), kUndefined);
 3320  CU_SAFE_CALL(cudaMemcpyAsync(tmp_vec.Data(), input,
 3321  indexes.Dim() * sizeof(Real),
 3322  cudaMemcpyHostToDevice, cudaStreamPerThread));
 3323 
 3324  int dimBlock(CU1DBLOCK);
 3325  int dimGrid = n_blocks(indexes.Dim(), CU1DBLOCK);
 3326  cuda_matrix_add_indexed_values(dimGrid, dimBlock, this->Dim(), alpha,
 3327  indexes.Data(), tmp_vec.Data(), indexes.Dim(), this->data_);
 3328  CU_SAFE_CALL(cudaGetLastError());
 3329  CuDevice::Instantiate().AccuProfile(__func__, tim);
 3330  } else
 3331 #endif
 3332  {
// CPU fallback validates each index pair; the GPU path above does not.
 3333  MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
 3334  const Int32Pair *index = indexes.Data();
 3335  for (int32 i = 0; i < indexes.Dim(); i++) {
 3336  KALDI_ASSERT(index[i].first < num_rows && index[i].first >= 0 &&
 3337  index[i].second < num_cols && index[i].second >= 0);
 3338  (*this)(index[i].first, index[i].second) += alpha * input[i];
 3339  }
 3340  }
 3341 }
3342 
3343 template<typename Real>
3344 void CuMatrixBase<Real>::AddToElements(Real alpha, const CuArrayBase<int32> &elements) {
3345  KALDI_ASSERT(elements.Dim() == NumRows());
3346 #if HAVE_CUDA == 1
3347  if (CuDevice::Instantiate().Enabled()) {
3348  CuTimer tim;
3349 
3350  dim3 dimBlock(CU1DBLOCK);
3351  dim3 dimGrid(n_blocks(NumRows(), CU1DBLOCK));
3352 
3353  cuda_matrix_add_to_elements(dimGrid, dimBlock, alpha, data_, Dim(), elements.Data());
3354  CU_SAFE_CALL(cudaGetLastError());
3355  CuDevice::Instantiate().AccuProfile(__func__, tim);
3356  } else
3357 #endif
3358  {
3359  MatrixBase<Real> &this_mat = this->Mat();
3360  const int32* row_to_col = elements.Data();
3361  for (int32 r = 0; r < this_mat.NumRows(); r++) {
3362  KALDI_ASSERT(row_to_col[r] >= -1);
3363  if (row_to_col[r] >= 0)
3364  this_mat(r, row_to_col[r]) += alpha;
3365  }
3366  }
3367 }
3368 
3369 template<typename Real>
3370 void CuMatrixBase<Real>::Lookup(const std::vector<Int32Pair> &indices,
3371  Real *output) const {
3372  // Checks the dimension.
3373  MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
3374  for (int32 i = 0; i < indices.size(); ++i) {
3375  KALDI_ASSERT(indices[i].first < num_rows && indices[i].first >= 0 &&
3376  indices[i].second < num_cols && indices[i].second >= 0);
3377  }
3378  if (indices.size() == 0) return;
3379  KALDI_ASSERT(output != NULL);
3380 
3381 #if HAVE_CUDA == 1
3382  if (CuDevice::Instantiate().Enabled()) {
3383  CuArray<Int32Pair> cuda_indices(indices);
3384  Lookup(cuda_indices, output);
3385  } else
3386 #endif
3387  {
3388  for (int32 i = 0; i < indices.size(); i++) {
3389  output[i] = (*this)(indices[i].first, indices[i].second);
3390  }
3391  }
3392 }
3393 
 3394 template<typename Real>
// NOTE(review): the first signature line (source line 3395) was lost in this
// rendered view; from the body this is
// void CuMatrixBase<Real>::Lookup(const CuArrayBase<Int32Pair> &indices,
//     Real *output) const.
// Gathers elements at the given (row, col) pairs into the host array output.
 3396  Real *output) const {
 3397  int32 num_elements = indices.Dim();
 3398  if (num_elements == 0) return;
 3399  KALDI_ASSERT(output != NULL);
 3400 
 3401 #if HAVE_CUDA == 1
 3402  if (CuDevice::Instantiate().Enabled()) {
// Gather on the device, then copy the results back to the host buffer.
 3403  CuArray<Real> cuda_output(num_elements);
 3404  CuTimer tim;
 3405  dim3 dimBlock(CU1DBLOCK, 1);
 3406  dim3 dimGrid(n_blocks(num_elements, CU1DBLOCK), 1);
 3407 
 3408  cuda_matrix_lookup(dimGrid, dimBlock, this->data_, this->Dim(),
 3409  indices.Data(), num_elements, cuda_output.Data());
 3410  CU_SAFE_CALL(cudaGetLastError());
 3411 
 3412  cuda_output.CopyToHost(output);
 3413  CuDevice::Instantiate().AccuProfile(__func__, tim);
 3414  } else
 3415 #endif
 3416  {
// CPU fallback range-checks each index; the GPU path above does not.
 3417  MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
 3418  const Int32Pair *index = indices.Data();
 3419  for (int32 i = 0; i < num_elements; i++) {
 3420  KALDI_ASSERT(index[i].first < num_rows && index[i].first >= 0 &&
 3421  index[i].second < num_cols && index[i].second >= 0);
 3422  output[i] = (*this)(index[i].first, index[i].second);
 3423  }
 3424  }
 3425 }
3426 
3427 
 3428 template<typename Real>
// NOTE(review): the signature line (source line 3429) was lost in this rendered
// view; from the body this is
// void CuMatrixBase<Real>::EqualElementMask(const CuMatrixBase<Real> &mat,
//     CuMatrix<Real> *mask) const.
// Element-wise equality mask: resizes *mask to this matrix's dimensions and
// sets (*mask)(r,c) = 1.0 where (*this)(r,c) == mat(r,c), else 0.0.
 3430  // Check the inputs:
 3431  KALDI_ASSERT(mat.NumRows() == NumRows() && mat.NumCols() == NumCols());
 3432  KALDI_ASSERT(mask != NULL);
 3433  // Resizes the output matrix:
 3434  mask->Resize(NumRows(), NumCols(), kSetZero);
 3435 
 3436 #if HAVE_CUDA == 1
 3437  if (CuDevice::Instantiate().Enabled()) {
 3438  CuTimer tim;
 3439  dim3 dimGrid, dimBlock;
 3440  GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
 3441  &dimGrid, &dimBlock);
 3442  cuda_equal_element_mask(dimGrid, dimBlock, this->data_, mat.Data(),
 3443  mask->Data(), this->Dim(), mat.Stride(),
 3444  mask->Stride());
 3445  CU_SAFE_CALL(cudaGetLastError());
 3446 
 3447  CuDevice::Instantiate().AccuProfile(__func__, tim);
 3448  } else
 3449 #endif
 3450  {
 3451  for (int32 r = 0; r < NumRows(); r++) {
 3452  for (int32 c = 0; c < NumCols(); c++) {
 3453  (*mask)(r,c) = ((*this)(r,c) == mat(r,c) ? 1.0 : 0.0);
 3454  }
 3455  }
 3456  }
 3457 }
3458 
3459 
3463 template<typename Real>
3464 std::ostream &operator << (std::ostream &out, const CuMatrixBase<Real> &mat) {
3465  Matrix<Real> temp(mat.NumRows(), mat.NumCols());
3466  mat.CopyToMat(&temp);
3467  out << temp;
3468  return out;
3469 }
3470 // instantiate the template
3471 template
3472 std::ostream &operator << (std::ostream &out, const CuMatrixBase<float> &mat);
3473 template
3474 std::ostream &operator << (std::ostream &out, const CuMatrixBase<double> &mat);
3475 
3476 
 3477 // Instantiate classes CuMatrix and CuMatrixBase for float and double.
// These explicit instantiations are the only element types this translation
// unit provides definitions for.
 3478 template class CuMatrix<float>;
 3479 template class CuMatrix<double>;
 3480 template class CuMatrixBase<float>;
 3481 template class CuMatrixBase<double>;
3482 
3483 
3484 
3485 
3486 
3487 
3488 
3489 } // namespace kaldi
const MatrixBase< Real > & Mat() const
Definition: cu-matrix.h:755
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
double Exp(double x)
Definition: kaldi-math.h:83
MatrixIndexT Stride() const
Definition: cu-matrix.h:217
MatrixIndexT NumRows() const
This class provides a way for switching between double and float types.
Definition: matrix-common.h:84
Packed symetric matrix class.
Definition: matrix-common.h:62
void Write(std::ostream &out, bool binary) const
write to stream.
This class is a wrapper that enables you to store a matrix in one of three forms: either as a Matrix<...
const Real * CsrVal() const
Returns pointer to the data array of length nnz_ that holds all nonzero values in zero-based CSR form...
int32_cuda rows
Definition: cu-matrixdim.h:47
MatrixResizeType
Definition: matrix-common.h:37
void CopyToMat(CuMatrixBase< OtherReal > *dest, MatrixTransposeType trans=kNoTrans) const
void CopyFromTp(const CuTpMatrix< Real > &other)
Definition: cu-tp-matrix.h:68
const CuSubVector< Real > Row(MatrixIndexT i) const
Definition: cu-matrix.h:670
MatrixIndexT stride_
< Number of rows
Definition: kaldi-matrix.h:816
void RandUniform(CuMatrixBase< Real > *tgt)
Fill with uniform [0..1] floats,.
Definition: cu-rand.cc:60
void CopyToMat(MatrixBase< OtherReal > *dst, MatrixTransposeType trans=kNoTrans) const
Definition: cu-matrix.cc:447
void GetMatrix(Matrix< BaseFloat > *mat) const
Outputs the contents as a matrix.
MatrixIndexT NumRows() const
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
const Real * Data() const
Gives pointer to raw data (const).
Definition: kaldi-matrix.h:79
Structure containing size of the matrix plus stride.
Definition: cu-matrixdim.h:46
Real * data_
GPU data pointer (or regular data pointer if CUDA is not compiled in or we have no GPU)...
Definition: cu-vector.h:248
Real * data_
data memory area
Definition: kaldi-matrix.h:808
Real Sum() const
Definition: cu-vector.cc:297
void AddMat(const Real alpha, const MatrixBase< Real > &M, MatrixTransposeType transA=kNoTrans)
*this += alpha * M [or M^T]
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
The class CuBlockMatrix holds a vector of objects of type CuMatrix, say, M_1, M_2, .
kaldi::int32 int32
void AddMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType trans=kNoTrans)
*this += alpha * A
Definition: cu-matrix.cc:954
A class for storing matrices.
Definition: kaldi-matrix.h:823
const T * Data() const
Get raw pointer.
Definition: cu-array.h:52
const Matrix< BaseFloat > & GetFullMatrix() const
Returns the contents as a Matrix<BaseFloat>.
This class represents a matrix that&#39;s stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
Real * data_
GPU data pointer (or regular matrix data pointer,.
Definition: cu-matrix.h:777
void Swap(Matrix< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
void CopyFromMat(const MatrixBase< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Copy given matrix. (no resize is done).
MatrixIndexT NumRows() const
uint32 UnsignedMatrixIndexT
uint64 data_
template void AddMatMatBatched(const double alpha, std::vector< CuSubMatrix< double > * > &C, const std::vector< CuSubMatrix< double > * > &A, MatrixTransposeType transA, const std::vector< CuSubMatrix< double > * > &B, MatrixTransposeType transB, const double beta)
MatrixIndexT NumRows() const
MatrixIndexT stride_
Definition: cu-matrix.h:787
void AddDiagMatMat(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const CuMatrixBase< Real > &N, MatrixTransposeType transN, Real beta=1.0)
Add the diagonal of a matrix product: *this = diag(M N), assuming the "trans" arguments are both kNoT...
Definition: cu-vector.cc:611
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
void CopyToMat(MatrixBase< OtherReal > *other, MatrixTransposeType t=kNoTrans) const
Copy to matrix. It must already have the correct size.
void Scale(Real value)
Definition: cu-matrix.cc:644
const SpMatrix< Real > & Mat() const
Definition: cu-sp-matrix.h:132
void RandGaussian(CuMatrixBase< Real > *tgt)
Fill with Normal random numbers,.
Definition: cu-rand.cc:116
void Read(std::istream &in, bool binary, bool add=false)
read from stream.
void Cholesky(const SpMatrix< Real > &orig)
Definition: tp-matrix.cc:88
MatrixIndexT Stride() const
Stride (distance in memory between each row). Will be >= NumCols.
Definition: kaldi-matrix.h:70
int32 MatrixIndexT
Definition: matrix-common.h:98
void CopyFromVec(const CuVectorBase< Real > &src)
Copy functions; these will crash if the dimension do not match.
Definition: cu-vector.cc:1078
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
const int * CsrColIdx() const
Returns pointer to the integer array of length nnz_ that contains the column indices of the correspon...
double Log(double x)
Definition: kaldi-math.h:100
GeneralMatrixType Type() const
Returns the type of the matrix: kSparseMatrix, kCompressedMatrix or kFullMatrix.
void AddColSumMat(Real alpha, const CuMatrixBase< Real > &mat, Real beta=1.0)
Sum the columns of the matrix, add to vector.
Definition: cu-vector.cc:1298
void Swap(Matrix< Real > *mat)
Definition: cu-matrix.cc:123
MatrixIndexT num_rows_
< Number of columns
Definition: kaldi-matrix.h:813
const SparseMatrix< Real > & Smat() const
void MulElements(const CuMatrixBase< Real > &A)
Multiply two matrices elementwise: C = C .* A.
Definition: cu-matrix.cc:667
struct rnnlm::@11::@12 n
void CopyFromBlock(const CuBlockMatrix< Real > &B, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:161
MatrixStrideType
Definition: matrix-common.h:44
void AddTp2(const Real alpha, const TpMatrix< Real > &T, MatrixTransposeType transM, const Real beta=0.0)
The following function does: this <– beta*this + alpha * T * T^T.
Definition: sp-matrix.cc:1156
void SymAddMat2(const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transA, Real beta)
*this = beta * *this + alpha * M M^T, for symmetric matrices.
Definition: cu-matrix.cc:1353
void Resize(MatrixIndexT dim, MatrixResizeType t=kSetZero)
Allocate the memory.
Definition: cu-vector.cc:993
#define KALDI_ERR
Definition: kaldi-error.h:147
#define CU1DBLOCK
Definition: cu-matrixdim.h:57
#define KALDI_MEMALIGN_FREE(x)
Definition: kaldi-utils.h:60
#define CU2DBLOCK
Definition: cu-matrixdim.h:61
MatrixIndexT NumElements() const
Packed symetric matrix class.
Definition: matrix-common.h:63
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C.
Definition: cu-matrix.cc:1291
MatrixIndexT num_cols_
these attributes store the real matrix size as it is stored in memory including memalignment ...
Definition: kaldi-matrix.h:812
void Cholesky(CuMatrixBase< Real > *inv_cholesky=NULL)
This function does sets *this to the Cholesky factor of *this (i.e.
Definition: cu-matrix.cc:1987
#define KALDI_WARN
Definition: kaldi-error.h:150
This class is used for a piece of a CuMatrix.
Definition: matrix-common.h:70
Real * Data()
Returns a pointer to the start of the vector&#39;s data.
Definition: kaldi-vector.h:70
void CopyToHost(T *dst) const
Version of the above function that copies contents to a host array (i.e.
Definition: cu-array-inl.h:198
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
int32_cuda cols
Definition: cu-matrixdim.h:48
Real Sum() const
Returns sum of the elements.
void CopyFromMat(const CuMatrixBase< Real > &orig, SpCopyType copy_type=kTakeLower)
Definition: cu-sp-matrix.cc:39
Real Max() const
Returns the maximum value of any element, or -infinity for the empty vector.
Definition: cu-vector.cc:782
const Real * Data() const
Return data pointer (const).
Definition: cu-matrix.h:746
const int * CsrRowPtr() const
Returns pointer to the integer array of length NumRows()+1 that holds indices of the first nonzero el...
Matrix for CUDA computing.
Definition: matrix-common.h:69
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216
MatrixIndexT NumBlocks() const
void DiffLogSoftmaxPerRow(const CuMatrixBase< Real > &out_value, const CuMatrixBase< Real > &out_deriv)
Differentiate backward through the log softmax function.
Definition: cu-matrix.cc:1903
void Destroy()
Definition: cu-matrix.cc:94
A class representing a vector.
Definition: kaldi-vector.h:406
void InvertElements()
Invert all elements.
const VectorBase< Real > & Vec() const
Definition: cu-vector.h:235
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
void CopyFromTp(const CuTpMatrix< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:280
MatrixTransposeType
Definition: matrix-common.h:32
Real * Data()
Returns a pointer to the start of the vector&#39;s data.
Definition: cu-vector.h:72
MatrixIndexT num_cols_
Definition: cu-matrix.h:785
::MatrixDim Dim() const
Definition: cu-matrix.h:221
Real FrobeniusNorm() const
Definition: cu-matrix.h:226
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
int32_cuda second
Definition: cu-matrixdim.h:80
const SparseMatrix< BaseFloat > & GetSparseMatrix() const
Returns the contents as a SparseMatrix.
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
MatrixIndexT Dim() const
Return the vector dimension.
Definition: cu-array.h:49
MatrixIndexT NumCols() const
void SetZero()
Set vector to all zeros.
void MulRowsVec(const CuVectorBase< Real > &scale)
scale i&#39;th row by scale[i]
Definition: cu-matrix.cc:792
Sub-matrix representation.
Definition: kaldi-matrix.h:988
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
const TpMatrix< Real > & Mat() const
Definition: cu-tp-matrix.h:80
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
Definition: kaldi-math.h:265
void Resize(MatrixIndexT rows, MatrixIndexT cols, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Allocate the memory.
Definition: cu-matrix.cc:50
Real Min() const
Returns the minimum value of any element, or +infinity for the empty vector.
Definition: cu-vector.cc:744
MatrixIndexT NumCols() const
const CuSubMatrix< Real > Block(MatrixIndexT b) const
template double TraceMatMat(const CuMatrixBase< double > &A, const CuMatrixBase< double > &B, MatrixTransposeType trans)
MatrixIndexT Dim() const
Dimensions.
Definition: cu-vector.h:69
Vector for CUDA computing.
Definition: matrix-common.h:72
void AddDiagVecMat(const Real alpha, const CuVectorBase< Real > &v, const CuMatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T].
Definition: cu-matrix.cc:1382
const Real * RowData(MatrixIndexT r) const
Get raw row pointer (const).
Definition: cu-matrix.h:740
int32_cuda first
Definition: cu-matrixdim.h:79
MatrixIndexT num_rows_
Definition: cu-matrix.h:786