cu-vector.cc
1 // cudamatrix/cu-vector.cc
2 
3 // Copyright 2012-2013 Karel Vesely
4 // 2012-2014 Johns Hopkins University (author: Daniel Povey)
5 // 2017 Daniel Galvez
6 // 2016-2018 Shiyin Kang
7 // 2019 Yiwen Shao
8 
9 // See ../../COPYING for clarification regarding multiple authors
10 //
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 //
15 // http://www.apache.org/licenses/LICENSE-2.0
16 //
17 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
19 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
20 // MERCHANTABLITY OR NON-INFRINGEMENT.
21 // See the Apache 2 License for the specific language governing permissions and
22 // limitations under the License.
23 
24 #if HAVE_CUDA == 1
25 #include <cuda_runtime_api.h>
26 #include <cublas_v2.h>
27 #endif
28 
29 #include "base/timer.h"
30 #include "cudamatrix/cu-common.h"
31 #include "cudamatrix/cu-vector.h"
32 #include "cudamatrix/cu-device.h"
33 #include "cudamatrix/cu-kernels.h"
34 #include "cudamatrix/cu-math.h"
35 #include "cudamatrix/cu-vector.h"
36 #include "cudamatrix/cu-matrix.h"
37 #include "cudamatrix/cu-rand.h"
42 
43 namespace kaldi {
44 
45 
46 template<typename Real>
47 Real VecVec(const CuVectorBase<Real> &a,
48  const CuVectorBase<Real> &b) {
49  //MatrixIndexT a_dim = a.Dim();
50  KALDI_ASSERT(a.Dim() == b.Dim());
51  Real result = 0;
52 #if HAVE_CUDA == 1
53  if (CuDevice::Instantiate().Enabled()) {
54  CuTimer tim;
55  CUBLAS_SAFE_CALL(cublas_dot(GetCublasHandle(), a.Dim(), a.Data(), 1, b.Data(),
56  1, &result));
57  CuDevice::Instantiate().AccuProfile(__func__, tim);
58 } else
59 #endif
60  {
61  result = VecVec(a.Vec(), b.Vec());
62  }
63  return result;
64 }
65 // instantiate the template above
66 template float VecVec(const CuVectorBase<float> &a, const CuVectorBase<float> &b);
67 template double VecVec(const CuVectorBase<double> &a, const CuVectorBase<double> &b);
68 
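// Usage sketch (illustrative; the variable names below are not from this file):
// computing a dot product with the template above. The same call falls back to the
// CPU implementation when no GPU has been configured via CuDevice.
//
//   CuVector<float> a(1000), b(1000);
//   a.SetRandn();
//   b.SetRandn();
//   float dot = VecVec(a, b);  // cublas_dot on the GPU, Kaldi's CPU VecVec otherwise.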
69 // The version of VecVec that can do type conversion. For now we give this a
70 // stupid implementation that converts one of the vectors. If it ever becomes
71 // an efficiency bottleneck, we can revisit this.
72 template<typename Real, typename OtherReal>
73 Real VecVec(const CuVectorBase<Real> &A, const CuVectorBase<OtherReal> &B) {
74  CuVector<Real> B2(B);
75  return VecVec(A, B2); // This will call the single-parameter template.
76 }
77 // instantiate the template above
78 template float VecVec(const CuVectorBase<float> &A, const CuVectorBase<double> &B);
79 template double VecVec(const CuVectorBase<double> &A, const CuVectorBase<float> &B);
80 
81 
82 template<typename Real>
83 Real VecMatVec(const CuVectorBase<Real> &v1, const CuMatrixBase<Real> &M,
84  const CuVectorBase<Real> &v2) {
85  KALDI_ASSERT(v1.Dim() == M.NumRows() && M.NumCols() == v2.Dim());
86  if (v1.Dim() > v2.Dim()) { // do v2*M first
87  CuVector<Real> v2M(v1.Dim());
88  v2M.AddMatVec(1.0, M, kNoTrans, v2, 0.0);
89  return VecVec(v2M, v1);
90  } else { // do v1*M first
91  CuVector<Real> v1M(v2.Dim());
92  v1M.AddMatVec(1.0, M, kTrans, v1, 0.0);
93  return VecVec(v1M, v2);
94  }
95 }
96 // instantiate the template above
97 template float VecMatVec(const CuVectorBase<float> &v1, const CuMatrixBase<float> &M,
98  const CuVectorBase<float> &v2);
99 template double VecMatVec(const CuVectorBase<double> &v1, const CuMatrixBase<double> &M,
100  const CuVectorBase<double> &v2);
101 
102 template<typename Real>
103 void CuVectorBase<Real>::CopyColFromMat(const CuMatrixBase<Real> &mat, MatrixIndexT col) {
104  KALDI_ASSERT(col < mat.NumCols());
105  KALDI_ASSERT(dim_ == mat.NumRows());
106 #if HAVE_CUDA == 1
107  if (CuDevice::Instantiate().Enabled()) {
108  CuTimer tim;
109  cublas_copy(GetCublasHandle(),
110  this->dim_, mat.Data() + col, mat.Stride(), this->data_, 1);
111  CU_SAFE_CALL(cudaGetLastError());
112  CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim);
113  } else
114 #endif
115  {
116  Vec().CopyColFromMat(mat.Mat(),col);
117  }
118 }
119 
120 template<>
121 template<>
122 void CuVectorBase<double>::CopyColFromMat(const CuMatrixBase<float> &mat, MatrixIndexT col) {
123  KALDI_ASSERT(col < mat.NumCols());
124  KALDI_ASSERT(dim_ == mat.NumRows());
125 #if HAVE_CUDA == 1
126  if (CuDevice::Instantiate().Enabled()) {
127  CuTimer tim;
128  int dimBlock(CU1DBLOCK);
129  int dimGrid(n_blocks(dim_,CU1DBLOCK));
130 
131  cuda_copy_col_from_mat_df(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_);
132  CU_SAFE_CALL(cudaGetLastError());
133  CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim);
134  } else
135 #endif
136  {
137  Vec().CopyColFromMat(mat.Mat(), col);
138  }
139 }
140 
141 
142 template<>
143 template<>
144 void CuVectorBase<float>::CopyColFromMat(const CuMatrixBase<double> &mat, MatrixIndexT col) {
145  KALDI_ASSERT(col < mat.NumCols());
146  KALDI_ASSERT(dim_ == mat.NumRows());
147 #if HAVE_CUDA == 1
148  if (CuDevice::Instantiate().Enabled()) {
149  CuTimer tim;
150  int dimBlock(CU1DBLOCK);
151  int dimGrid(n_blocks(dim_,CU1DBLOCK));
152 
153  cuda_copy_col_from_mat_fd(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_);
154  CU_SAFE_CALL(cudaGetLastError());
155  CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim);
156  } else
157 #endif
158  {
159  Vec().CopyColFromMat(mat.Mat(), col);
160  }
161 }
162 
163 template<typename Real>
164 void CuVectorBase<Real>::CopyRowsFromMat(const CuMatrixBase<Real> &mat) {
165  KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
166 #if HAVE_CUDA == 1
167  if (CuDevice::Instantiate().Enabled()) {
168  if (dim_ == 0) return;
169  CuTimer tim;
170  if (mat.Stride() == mat.NumCols() && mat.NumRows() != 0) {
171  CU_SAFE_CALL(
172  cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
173  cudaMemcpyDeviceToDevice, cudaStreamPerThread));
174  } else {
175  Real* vec_data = data_;
176  for (MatrixIndexT r = 0; r < mat.NumRows(); r++) {
177  CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
178  sizeof(Real) * mat.NumCols(),
179  cudaMemcpyDeviceToDevice,
180  cudaStreamPerThread));
181  vec_data += mat.NumCols();
182  }
183  }
184  CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyRowsFromMat", tim);
185  } else
186 #endif
187  {
188  Vec().CopyRowsFromMat(mat.Mat());
189  }
190 }
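// Usage sketch (illustrative, not part of the file): flattening a matrix row by row
// into a vector. As the code above shows, this is a single device-to-device memcpy
// when the rows are contiguous (Stride() == NumCols()), and one memcpy per row otherwise.
//
//   CuMatrix<float> m(10, 20);
//   m.SetRandn();
//   CuVector<float> flat(10 * 20);
//   flat.CopyRowsFromMat(m);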
191 
192 template<typename Real>
193 Real CuVectorBase<Real>::Norm(Real p) {
194 #if HAVE_CUDA == 1
195  if (CuDevice::Instantiate().Enabled()) {
196  CuTimer tim;
197  Real ans;
198  KALDI_ASSERT(p == 1.0 || p == 2.0);
199  if (dim_ == 0) return 0.0;
200  if (p == 1.0) {
201  cublas_asum(GetCublasHandle(), dim_, data_, 1, &ans);
202  } else {
203  cublas_nrm2(GetCublasHandle(), dim_, data_, 1, &ans);
204  }
205  CuDevice::Instantiate().AccuProfile(__func__, tim);
206  if (ans != ans) {
207  KALDI_ERR << "NaN in norm " << *this;
208  }
209  return ans;
210  } else
211 #endif
212  {
213  return Vec().Norm(p);
214  }
215 }
216 
217 template<typename Real>
218 void CuVectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
219  KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
220 #if HAVE_CUDA == 1
221  if (CuDevice::Instantiate().Enabled()) {
222  if (dim_ == 0) return;
223  CuTimer tim;
224  if (mat.Stride() == mat.NumCols()) {
225  CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
226  cudaMemcpyHostToDevice, cudaStreamPerThread));
227  } else {
228  Real* vec_data = data_;
229  for (MatrixIndexT r = 0; r < mat.NumRows(); r++) {
230  CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
231  sizeof(Real) * mat.NumCols(),
232  cudaMemcpyHostToDevice, cudaStreamPerThread));
233  vec_data += mat.NumCols();
234  }
235  }
236  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
237  CuDevice::Instantiate().AccuProfile(__func__, tim);
238  } else
239 #endif
240  {
241  Vec().CopyRowsFromMat(mat);
242  }
243 }
244 
245 template<typename Real>
246 void MatrixBase<Real>::CopyRowsFromVec(const CuVectorBase<Real> &v) {
247  KALDI_ASSERT(v.Dim() == NumCols() * NumRows());
248 #if HAVE_CUDA == 1
249  if (CuDevice::Instantiate().Enabled()) {
250  if (num_rows_ == 0) return;
251  CuTimer tim;
252  if (Stride() == NumCols()) {
253  CU_SAFE_CALL(cudaMemcpyAsync(data_, v.Data(),
254  sizeof(Real)*v.Dim(),
255  cudaMemcpyDeviceToHost,
256  cudaStreamPerThread));
257  } else {
258  const Real* vec_data = v.Data();
259  for (MatrixIndexT r = 0; r < NumRows(); r++) {
260  CU_SAFE_CALL(cudaMemcpyAsync(RowData(r), vec_data,
261  sizeof(Real) * NumCols(),
262  cudaMemcpyDeviceToHost,
263  cudaStreamPerThread));
264  vec_data += NumCols();
265  }
266  }
267  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
268  CuDevice::Instantiate().AccuProfile(__func__, tim);
269  } else
270 #endif
271  {
272  CopyRowsFromVec(v.Vec());
273  }
274 }
275 
276 // instantiate the template above.
277 template void MatrixBase<float>::CopyRowsFromVec(const CuVectorBase<float> &v);
278 template void MatrixBase<double>::CopyRowsFromVec(const CuVectorBase<double> &v);
279 
280 template<typename Real>
281 void CuVectorBase<Real>::SetRandn() {
282  if (dim_ == 0) return;
283  CuRand<Real> tmp;
284  tmp.RandGaussian(this);
285 }
286 
287 template<typename Real>
288 void CuVectorBase<Real>::SetRandUniform() {
289  if (dim_ == 0) return;
290  CuRand<Real> tmp;
291  tmp.RandUniform(this);
292 }
293 
294 
295 
296 template<typename Real>
297 Real CuVectorBase<Real>::Sum() const {
298  if (dim_ == 0)
299  return 0.0;
300 #if HAVE_CUDA == 1
301  if (CuDevice::Instantiate().Enabled()) {
302  Real result;
303  CuTimer tim;
304 
305  // Small vectors are copied to RAM and reduced on CPU.
306  // The length is chosen by cu-vector-speed-test
307  if (dim_ < 4096) {
308  Vector<Real> ans_cpu(*this);
309  result = ans_cpu.Sum();
310  } else {
311  // Use no more than 256 blocks (still too many?)
312  int dimBlock = CU1DBLOCK;
313  int dimGrid = n_blocks(dim_, dimBlock);
314  if (dimGrid > 256) {
315  dimGrid = 256;
316  }
317  CuVector<Real> ans(dimGrid, kUndefined);
318  cuda_vec_sum(dimGrid, dimBlock, data_, ans.Data(), dim_, 1);
319  CU_SAFE_CALL(cudaGetLastError());
320  Vector<Real> ans_cpu(ans);
321  result = ans_cpu.Sum();
322  }
323 
324  CuDevice::Instantiate().AccuProfile(__func__, tim);
325  return result;
326  } else
327 #endif
328  {
329  return Vec().Sum();
330  }
331 }
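// Usage sketch (illustrative): Sum() chooses its strategy by length, as the comments
// above describe -- vectors shorter than 4096 elements are copied to the host and
// summed there, while longer ones are reduced on the GPU into at most 256 per-block
// partial sums (cuda_vec_sum) that are then summed on the host.
//
//   CuVector<double> v(100000);
//   v.SetRandUniform();
//   double total = v.Sum();  // takes the GPU reduction path, since Dim() >= 4096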
332 
333 template<typename Real>
334 void CuVectorBase<Real>::ApplySoftMax() {
335 #if HAVE_CUDA == 1
336  if (CuDevice::Instantiate().Enabled()) {
337  if (dim_ == 0) return;
338  CuTimer tim;
339  size_t dimBlock = CU1DBLOCK;
340  size_t dimGrid = 1; // dimGrid value represent the number of rows
341  ::MatrixDim dim = { 1, this->dim_, this->dim_};
342  cuda_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_);//actually dim is not stride...
343  CU_SAFE_CALL(cudaGetLastError());
344  CuDevice::Instantiate().AccuProfile(__func__, tim);
345  } else
346 #endif
347  {
348  Vec().ApplySoftMax();
349  }
350 }
351 
352 template<typename Real>
353 void CuVectorBase<Real>::Floor(const CuVectorBase<Real> &src, Real floor_val, MatrixIndexT *floored_count) {
354 #if HAVE_CUDA == 1
355  if (CuDevice::Instantiate().Enabled()) {
356  int dimBlock(CU1DBLOCK);
357  int dimGrid(n_blocks(dim_,CU1DBLOCK));
358  if (floored_count == nullptr) {
359  if (dim_ == 0) return;
360  CuTimer tim;
361  // We are calling a function meant for matrices, by viewing the
362  // vector as a matrix with a single row.
363  ::MatrixDim dim = {1, Dim(), 1};
364  cuda_floor(dimGrid, dimBlock, this->data_, src.Data(), floor_val, dim, 1);
365  CuDevice::Instantiate().AccuProfile("CuVectorBase::FloorNoCount", tim);
366  } else {
367  if (dim_ == 0) { *floored_count = 0; return; }
368  CuTimer tim;
369 
370  CuVector<float> count_vec(dim_, kUndefined);
371 
372  cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_);
373  CU_SAFE_CALL(cudaGetLastError());
374  *floored_count = count_vec.Sum();
375  CuDevice::Instantiate().AccuProfile("CuVectorBase::Floor", tim);
376  }
377  } else
378 #endif
379  {
380  Vec().Floor(src.Vec(), floor_val, floored_count);
381  }
382 }
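// Usage sketch (illustrative; the names are not from this file): flooring a vector in
// place at zero and asking how many elements were clipped. Passing NULL for the count
// selects the cheaper kernel path above.
//
//   CuVector<float> v(1000);
//   v.SetRandn();
//   MatrixIndexT n_floored;
//   v.Floor(v, 0.0, &n_floored);  // v(i) = max(v(i), 0.0)
//   KALDI_LOG << n_floored << " elements were floored.";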
383 
384 template<typename Real>
385 void CuVectorBase<Real>::Ceiling(const CuVectorBase<Real> &src, Real ceiling_val,
386  MatrixIndexT *ceiled_count) {
387 #if HAVE_CUDA == 1
388  if (CuDevice::Instantiate().Enabled()) {
389  int dimBlock(CU1DBLOCK);
390  int dimGrid(n_blocks(dim_,CU1DBLOCK));
391  if (ceiled_count == nullptr) {
392  if (dim_ == 0) return;
393  CuTimer tim;
394  // We are calling a function meant for matrices, by viewing the
395  // vector as a matrix with a single row.
396  ::MatrixDim dim = {1, Dim(), 1};
397  cuda_ceiling(dimGrid, dimBlock, this->data_, src.Data(), ceiling_val, dim, 1);
398 
399  CuDevice::Instantiate().AccuProfile("CuVectorBase::CeilingNoCount", tim);
400  } else {
401  if (dim_ == 0) { *ceiled_count = 0; return; }
402  CuTimer tim;
403 
404  CuVector<float> count_vec(dim_, kUndefined);
405 
406  cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_);
407  CU_SAFE_CALL(cudaGetLastError());
408  *ceiled_count = count_vec.Sum();
409  CuDevice::Instantiate().AccuProfile("CuVectorBase::Ceiling", tim);
410  }
411  } else
412 #endif
413  {
414  Vec().Ceiling(src.Vec(), ceiling_val, ceiled_count);
415  }
416 }
417 
418 template<typename Real>
419 void CuVectorBase<Real>::Pow(const CuVectorBase<Real> &src, Real power) {
420 #if HAVE_CUDA == 1
421  if (CuDevice::Instantiate().Enabled()) {
422  if (dim_ == 0) return;
423  CuTimer tim;
424  // for this particular kernel, x is #rows, y is #cols. so
425  // fake matrix with 1 row, Dim() cols.
426  dim3 dimBlock(CU1DBLOCK, 1);
427  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK), 1);
428  ::MatrixDim fake_matrix_dim = { 1, Dim(), 1 };
429  // num_cols is Dim(), num_rows is 1, stride is 1 (it's a don't-care).
430  cuda_pow(dimGrid, dimBlock, this->data_, src.Data(), power, fake_matrix_dim, 1);
431  CU_SAFE_CALL(cudaGetLastError());
432  CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyPow", tim);
433  } else
434 #endif
435  {
436  Vec().Pow(src.Vec(), power);
437  }
438 }
439 
440 
441 template<typename Real>
442 void CuVectorBase<Real>::ApplyExp() {
443 #if HAVE_CUDA == 1
444  if (CuDevice::Instantiate().Enabled()) {
445  if (dim_ == 0) return;
446  CuTimer tim;
447  int dimBlock(CU1DBLOCK);
448  int dimGrid(n_blocks(dim_,CU1DBLOCK));
449 
450  cuda_vec_apply_exp(dimGrid, dimBlock, data_, dim_);
451  CU_SAFE_CALL(cudaGetLastError());
452  CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyExp", tim);
453  } else
454 #endif
455  {
456  Vec().ApplyExp();
457  }
458 }
459 
460 
461 template<typename Real>
462 void CuVectorBase<Real>::ApplyLog() {
463 #if HAVE_CUDA == 1
464  if (CuDevice::Instantiate().Enabled()) {
465  if (dim_ == 0) return;
466  CuTimer tim;
467  int dimBlock(CU1DBLOCK);
468  int dimGrid(n_blocks(dim_,CU1DBLOCK));
469 
470  CuVector<Real> flag(1);
471  cuda_vec_apply_log(dimGrid, dimBlock, data_, flag.Data(), dim_);
472  CU_SAFE_CALL(cudaGetLastError());
473  if (flag(0) > 0)
474  KALDI_ERR << "Trying to take log of a negative number.";
475  CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyLog", tim);
476  } else
477 #endif
478  {
479  Vec().ApplyLog();
480  }
481 }
482 
483 template<typename Real>
484 void CuVectorBase<Real>::ApplyLogSoftMax() {
485 #if HAVE_CUDA == 1
486  if (CuDevice::Instantiate().Enabled()) {
487  if (dim_ == 0) return;
488  CuTimer tim;
489  size_t dimBlock = CU1DBLOCK;
490  size_t dimGrid = 1; // dimGrid value represent the number of rows
491  ::MatrixDim dim = { 1, this->dim_, this->dim_};
492 
493  cuda_log_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_);
494  CU_SAFE_CALL(cudaGetLastError());
495  CuDevice::Instantiate().AccuProfile(__func__, tim);
496  } else
497 #endif
498  {
499  Vec().ApplyLogSoftMax();
500  }
501 }
502 
503 
504 
505 template<typename Real>
506 void CuVectorBase<Real>::AddMatVec(const Real alpha,
507  const CuMatrixBase<Real> &M,
508  MatrixTransposeType trans,
509  const CuVectorBase<Real> &v,
510  const Real beta) {
511  KALDI_ASSERT((trans == kNoTrans && M.NumCols() == v.dim_ && M.NumRows() == dim_) ||
512  (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
513  KALDI_ASSERT(&v != this);
514 #if HAVE_CUDA == 1
515  if (CuDevice::Instantiate().Enabled()) {
516  if (dim_ == 0) return;
517  CuTimer tim;
518 
519  // Everything is backwards in CuBlas. We need to reverse rows, columns,
520  // transpose-ness.
521  CUBLAS_SAFE_CALL(cublas_gemv(GetCublasHandle(),
522  (trans==kTrans? CUBLAS_OP_N:CUBLAS_OP_T),
523  M.NumCols(), M.NumRows(), alpha, M.Data(),
524  M.Stride(), v.Data(), 1, beta, data_, 1));
525 
526  CuDevice::Instantiate().AccuProfile(__func__, tim);
527  } else
528 #endif
529  {
530  Vec().AddMatVec(alpha,M.Mat(),trans,v.Vec(),beta);
531  }
532 }
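// Note on the cuBLAS call above (an illustrative restatement, not from the original
// comments): cuBLAS assumes column-major storage while Kaldi matrices are row-major.
// A row-major M of size NumRows() x NumCols() with leading dimension Stride() has,
// byte for byte, the layout of a column-major NumCols() x NumRows() matrix, i.e. M^T;
// that is why the call swaps the row/column counts and inverts the transpose flag.
//
//   CuMatrix<float> M(20, 30);
//   CuVector<float> x(30), y(20);
//   M.SetRandn(); x.SetRandn();
//   y.AddMatVec(1.0, M, kNoTrans, x, 0.0);  // y = M * x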
533 
534 template<typename Real>
535 void CuVectorBase<Real>::AddSpVec(const Real alpha,
536  const CuSpMatrix<Real> &M,
537  const CuVectorBase<Real> &v,
538  const Real beta) {
539  KALDI_ASSERT(M.NumCols() == v.dim_ && M.NumRows() == dim_);
540  KALDI_ASSERT(&v != this);
541 #if HAVE_CUDA == 1
542  if (CuDevice::Instantiate().Enabled()) {
543  if (dim_ == 0) return;
544  CuTimer tim;
545 
546  // Note: in our opinion the CuSpMatrix represents a lower-triangular matrix, but
547  // in CUBLAS, for some stupid reason, everything is reversed.
548  CUBLAS_SAFE_CALL(cublas_spmv(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, Dim(),
549  alpha, M.Data(), v.Data(), 1, beta, data_, 1));
550 
551  CuDevice::Instantiate().AccuProfile(__func__, tim);
552  } else
553 #endif
554  {
555  Vec().AddSpVec(alpha,M.Mat(),v.Vec(),beta);
556  }
557 }
558 
559 template<typename Real>
560 void CuVectorBase<Real>::AddVecVec(Real alpha, const CuVectorBase<Real> &v,
561  const CuVectorBase<Real> &r, Real beta) {
562  KALDI_ASSERT((dim_ == v.dim_ && dim_ == r.dim_));
563  KALDI_ASSERT(this != &v && this != &r);
564 #if HAVE_CUDA == 1
565  if (CuDevice::Instantiate().Enabled()) {
566  if (dim_ == 0) return;
567  CuTimer tim;
568  int dimBlock(CU1DBLOCK);
569  int dimGrid(n_blocks(dim_,CU1DBLOCK));
570 
571  cuda_add_vec_vec(dimGrid, dimBlock, alpha, data_, v.Data(), r.Data(), beta, dim_);
572  CU_SAFE_CALL(cudaGetLastError());
573  CuDevice::Instantiate().AccuProfile("CuVectorBase::AddVecVec", tim);
574  } else
575 #endif
576  {
577  Vec().AddVecVec(alpha, v.Vec(), r.Vec(), beta);
578  }
579 }
580 
581 
582 template<typename Real>
583 bool CuVectorBase<Real>::ApproxEqual(const CuVectorBase<Real> &other, float tol) const {
584  if (dim_ != other.dim_) KALDI_ERR << "ApproxEqual: size mismatch "
585  << dim_ << " vs. " << other.dim_;
586  KALDI_ASSERT(tol >= 0.0);
587  CuVector<Real> tmp(*this);
588  tmp.AddVec(-1.0, other);
589  BaseFloat tmp_norm = sqrt(VecVec(tmp, tmp)), this_norm = sqrt(VecVec(*this, *this));
590  return tmp_norm <= static_cast<Real>(tol) * this_norm;
591 }
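// Note (illustrative): the test above is asymmetric -- it checks
// ||*this - other|| <= tol * ||*this||, so the norm of *this, not of other, sets the
// scale. A minimal usage sketch, with names that are not from this file:
//
//   CuVector<float> a(10);
//   a.SetRandn();
//   CuVector<float> b(a);
//   KALDI_ASSERT(a.ApproxEqual(b, 0.01));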
592 
593 
594 template<typename Real>
595 void CuVectorBase<Real>::AddDiagMat2(Real alpha, const CuMatrixBase<Real> &M,
596  MatrixTransposeType trans, Real beta) {
597 #if HAVE_CUDA == 1
598  if (CuDevice::Instantiate().Enabled()) {
599  if (dim_ == 0) return;
600  MatrixTransposeType other_trans = (trans == kTrans ? kNoTrans : kTrans);
601  KALDI_ASSERT(dim_ == (trans == kNoTrans ? M.NumRows() : M.NumCols()));
602  this->AddDiagMatMat(alpha, M, trans, M, other_trans, beta);
603  } else
604 #endif
605  {
606  Vec().AddDiagMat2(alpha, M.Mat(), trans, beta);
607  }
608 }
609 
610 template<typename Real>
611 void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
612  MatrixTransposeType transM,
613  const CuMatrixBase<Real> &N,
614  MatrixTransposeType transN, Real beta) {
615 #if HAVE_CUDA == 1
616  if (CuDevice::Instantiate().Enabled()) {
617  CuTimer tim;
618 
619  if (transM != transN) {
620  KALDI_ASSERT(M.NumCols() == N.NumCols());
621  KALDI_ASSERT(M.NumRows() == N.NumRows());
622  if (transM == kNoTrans) {
623  // Case 1: diag(M*N') == sum(M.*N, 2)
624  // 1D grid and 1D block. One block per row of N.
625  // 1D grid expands along the column of N.
626  int dimBlock(CU1DBLOCK);
627  int dimGrid(M.NumRows());
628  cuda_add_diag_mat_mat_MNT(dimGrid, dimBlock, alpha, M.Data(), M.Dim(),
629  N.Data(), N.Stride(), beta, data_);
630  } else {
631  // Case 2: diag(M'*N) == sum(M.*N, 1)
632  // 16x16 or 8x32 2D block for coalesced memory access.
633  // Grid shape is designed as follows,
634  // 1. for small matrices, use 1D grid with only 1 row of 16x16 block,
635  // to avoid multiple kernel launch;
636  // 2. for large enough matrices (no matter thin or fat),
637  // use 1- or 2-D grid so that the grid contains
638  // at least and not much larger than 'kOptNumBlocks' blocks
639  // to fully utilize the GPU;
640  const int32 warpSize = 32;
641  const int32 kOptNumBlocks = 512;
642  const int32 tile_dim =
643  (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ?
644  16 : 32;
645  dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
646  dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x),
647  n_blocks(N.NumRows(), dimBlock.y));
648  dimGrid.y = std::min(dimGrid.y, (kOptNumBlocks - 1) / dimGrid.x + 1);
649  dimGrid.y = tile_dim == 16 ? 1 : dimGrid.y;
650  if (dimGrid.y > 1) {
651  CuMatrix<Real> buf(dimGrid.y, N.NumCols());
652  cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, Real(1), M.Data(),
653  M.Stride(), N.Data(), N.Dim(), Real(0),
654  buf.Data(), buf.Stride());
655  this->AddRowSumMat(alpha, buf, beta);
656  } else {
657  cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, alpha, M.Data(),
658  M.Stride(), N.Data(), N.Dim(), beta, data_,
659  dim_);
660  }
661  }
662  } else {
663  KALDI_ASSERT(M.NumCols() == N.NumRows());
664  KALDI_ASSERT(N.NumCols() == M.NumRows());
665  if (transM == kNoTrans) {
666  // Case 3: diag(M*N) == sum(M'.*N, 1)
667  // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access.
668  // One block per 'tile_dim' columns of N.
669  // 1D grid expands along the row of N.
670  int tile_dim =
671  sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
672  dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
673  dim3 dimGrid(n_blocks(N.NumCols(), tile_dim));
674  cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(),
675  N.Data(), N.Dim(), beta, data_);
676  } else {
677  // Case 4: diag(M'*N') == sum(N'.*M, 1)
678  // Same kernel and config as case 3 except M and N are swapped.
679  int tile_dim =
680  sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
681  dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
682  dim3 dimGrid(n_blocks(M.NumCols(), tile_dim));
683  cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(),
684  M.Data(), M.Dim(), beta, data_);
685  }
686  }
687  CU_SAFE_CALL(cudaGetLastError());
688 
689  CuDevice::Instantiate().AccuProfile(__func__, tim);
690  } else
691 #endif
692  {
693  Vec().AddDiagMatMat(alpha, M.Mat(), transM, N.Mat(), transN, beta);
694  }
695 }
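// Worked identity behind the four cases above (an illustrative restatement): for
// conforming M and N,
//   diag(M * N^T)(i) = sum_j M(i,j) * N(i,j)   (Case 1: transM == kNoTrans, transN == kTrans)
//   diag(M^T * N)(i) = sum_j M(j,i) * N(j,i)   (Case 2)
// so every case reduces to an elementwise product followed by a row or column sum,
// which is what the cuda_add_diag_mat_mat_* kernels compute. A usage sketch (names
// not from this file):
//
//   CuMatrix<float> M(50, 40), N(50, 40);
//   M.SetRandn(); N.SetRandn();
//   CuVector<float> d(50);
//   d.AddDiagMatMat(1.0, M, kNoTrans, N, kTrans, 0.0);  // d = diag(M * N^T)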
696 
697 template<typename Real>
698 void CuVectorBase<Real>::AddTpVec(const Real alpha, const CuTpMatrix<Real> &M,
699  const MatrixTransposeType trans,
700  const CuVectorBase<Real> &v,
701  const Real beta) {
702  KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows());
703 #if HAVE_CUDA == 1
704  if (CuDevice::Instantiate().Enabled()) {
705  if (dim_ == 0) return;
706  CuTimer tim;
707  if (beta == 0.0) {
708  if (&v != this) CopyFromVec(v);
709  MulTp(M, trans);
710  if (alpha != 1.0) Scale(alpha);
711  } else {
712  CuVector<Real> tmp(v);
713  tmp.MulTp(M, trans);
714  if (beta != 1.0) Scale(beta); // *this <-- beta * *this
715  AddVec(alpha, tmp, 1.0); // *this += alpha * M * v
716  }
717  CuDevice::Instantiate().AccuProfile(__func__, tim);
718  } else
719 #endif
720  {
721  Vec().AddTpVec(alpha, M.Mat(), trans, v.Vec(), beta);
722  }
723 }
724 
725 
726 template<typename Real>
727 void CuVectorBase<Real>::MulTp(const CuTpMatrix<Real> &M, const MatrixTransposeType trans) {
728  KALDI_ASSERT(M.NumRows() == dim_);
729 #if HAVE_CUDA == 1
730  if (CuDevice::Instantiate().Enabled()) {
731  if (dim_ == 0) return;
732  CuTimer tim;
733  cublas_tpmv(GetCublasHandle(), (trans==kTrans? CUBLAS_OP_N:CUBLAS_OP_T),
734  M.NumRows(), M.Data(), data_, 1);
735  CuDevice::Instantiate().AccuProfile("CuVectorBase::MulTp", tim);
736  } else
737 #endif
738  {
739  Vec().MulTp(M.Mat(), trans);
740  }
741 }
742 
743 template<typename Real>
744 Real CuVectorBase<Real>::Min() const {
745  Real result = 0.0;
746 #if HAVE_CUDA == 1
747  if (CuDevice::Instantiate().Enabled()) {
748  if (dim_ == 0) { // min of an empty set is infinity.
749  return std::numeric_limits<Real>::infinity();
750  }
751  CuTimer tim;
752 
753  // Small vectors are copied to RAM and reduced on CPU.
754  // The length is chosen by cu-vector-speed-test
755  if (dim_ < 4096) {
756  Vector<Real> ans_cpu(*this);
757  result = ans_cpu.Min();
758  } else {
759  // Use no more than 256 blocks (still too many?)
760  int dimBlock = CU1DBLOCK;
761  int dimGrid = n_blocks(dim_, dimBlock);
762  if (dimGrid > 256) {
763  dimGrid = 256;
764  }
765  CuVector<Real> ans(dimGrid, kUndefined);
766  cuda_vec_min(dimGrid, dimBlock, data_, ans.Data(), dim_, 1);
767  CU_SAFE_CALL(cudaGetLastError());
768  Vector<Real> ans_cpu(ans);
769  result = ans_cpu.Min();
770  }
771 
772  CuDevice::Instantiate().AccuProfile(__func__, tim);
773  } else
774 #endif
775  {
776  result = (this->Vec()).Min();
777  }
778  return result;
779 }
780 
781 template<typename Real>
782 Real CuVectorBase<Real>::Max() const {
783  Real result = 0.0;
784 #if HAVE_CUDA == 1
785  if (CuDevice::Instantiate().Enabled()) {
786  if (dim_ == 0) { // max of an empty set is -infinity.
787  return -std::numeric_limits<Real>::infinity();
788  }
789  CuTimer tim;
790 
791  // Small vectors are copied to RAM and reduced on CPU.
792  // The length is chosen by cu-vector-speed-test
793  if (dim_ < 4096) {
794  Vector<Real> ans_cpu(*this);
795  result = ans_cpu.Max();
796  } else {
797  // Use no more than 256 blocks (still too many?)
798  int dimBlock = CU1DBLOCK;
799  int dimGrid = n_blocks(dim_, dimBlock);
800  if (dimGrid > 256) {
801  dimGrid = 256;
802  }
803  CuVector<Real> ans(dimGrid, kUndefined);
804  cuda_vec_max(dimGrid, dimBlock, data_, ans.Data(), dim_, 1);
805  CU_SAFE_CALL(cudaGetLastError());
806  Vector<Real> ans_cpu(ans);
807  result = ans_cpu.Max();
808  }
809 
810  CuDevice::Instantiate().AccuProfile(__func__, tim);
811  } else
812 #endif
813  {
814  result = (this->Vec()).Max();
815  }
816  return result;
817 }
818 
819 template<typename Real>
820 void CuVectorBase<Real>::ReplaceValue(Real orig, Real changed) {
821 #if HAVE_CUDA == 1
822  if (CuDevice::Instantiate().Enabled()) {
823  if (dim_ == 0) return;
824  CuTimer tim;
825  int dimBlock(CU1DBLOCK);
826  int dimGrid(n_blocks(dim_, CU1DBLOCK));
827  cuda_replace_value(dimGrid, dimBlock, data_, dim_, orig, changed);
828  CU_SAFE_CALL(cudaGetLastError());
829  CuDevice::Instantiate().AccuProfile(__func__, tim);
830  } else
831 #endif
832  {
833  Vec().ReplaceValue(orig, changed);
834  }
835 }
836 
837 template<typename Real>
838 void CuVectorBase<Real>::MulElements(const CuVectorBase<Real> &v) {
839  KALDI_ASSERT(dim_ == v.dim_);
840 #if HAVE_CUDA == 1
841  if (CuDevice::Instantiate().Enabled()) {
842  if (dim_ == 0) return;
843  CuTimer tim;
844  int dimBlock(CU1DBLOCK);
845  int dimGrid(n_blocks(dim_, CU1DBLOCK));
846  cuda_vec_mul_elements(dimGrid, dimBlock, data_, v.Data(), dim_);
847  CU_SAFE_CALL(cudaGetLastError());
848  CuDevice::Instantiate().AccuProfile("CuVectorBase::MulElements", tim);
849  } else
850 #endif
851  {
852  Vec().MulElements(v.Vec());
853  }
854 }
855 
856 template<typename Real>
857 void CuVectorBase<Real>::DivElements(const CuVectorBase<Real> &v) {
858  // this just creates a matrix and calls the matrix version.
859  KALDI_ASSERT(dim_ == v.dim_);
860  CuSubMatrix<Real> this_mat(this->Data(), 1, dim_, dim_),
861  v_mat(v.Data(), 1, dim_, dim_);
862  this_mat.DivElements(v_mat);
863 }
864 
865 
866 
867 template<>
868 template<>
869 void CuVectorBase<double>::CopyFromVec(const CuVectorBase<float> &src) {
870  KALDI_ASSERT(src.Dim() == dim_);
871 #if HAVE_CUDA == 1
872  if (CuDevice::Instantiate().Enabled()) {
873  if (dim_ == 0) return;
874  CuTimer tim;
875  CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, src.Data(), 1, data_, 1));
876  CuDevice::Instantiate().AccuProfile(__func__, tim);
877  } else
878 #endif
879  {
880  Vec().CopyFromVec(src.Vec());
881  }
882 }
883 
884 template<>
885 template<>
886 void CuVectorBase<float>::CopyFromVec(const CuVectorBase<double> &src) {
887  KALDI_ASSERT(src.Dim() == dim_);
888 #if HAVE_CUDA == 1
889  if (CuDevice::Instantiate().Enabled()) {
890  if (dim_ == 0) return;
891  CuTimer tim;
892  CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, src.Data(), 1, data_, 1));
893  CuDevice::Instantiate().AccuProfile(__func__, tim);
894  } else
895 #endif
896  {
897  Vec().CopyFromVec(src.Vec());
898  }
899 }
900 
901 
902 template<typename Real>
903 template<typename OtherReal>
904 void CuVectorBase<Real>::CopyFromVec(const VectorBase<OtherReal> &src) {
905 #if HAVE_CUDA == 1
906  if (CuDevice::Instantiate().Enabled()) {
907  if (sizeof(Real) != sizeof(OtherReal)) {
908  CuVector<OtherReal> temp(dim_, kUndefined);
909  temp.CopyFromVec(src);
910  this->CopyFromVec(temp);
911  } else {
912  KALDI_ASSERT(src.Dim() == dim_);
913  if (dim_ == 0) return;
914  CuTimer tim;
915  CU_SAFE_CALL(cudaMemcpyAsync(data_, src.Data(), src.Dim()*sizeof(Real),
916  cudaMemcpyHostToDevice, cudaStreamPerThread));
917  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
918  CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D", tim);
919  }
920  } else
921  #endif
922  {
923  Vec().CopyFromVec(src);
924  }
925 }
926 // Instantiate the template above.
927 template
928 void CuVectorBase<float>::CopyFromVec(const VectorBase<float> &src);
929 template
930 void CuVectorBase<double>::CopyFromVec(const VectorBase<float> &src);
931 template
932 void CuVectorBase<float>::CopyFromVec(const VectorBase<double> &src);
933 template
934 void CuVectorBase<double>::CopyFromVec(const VectorBase<double> &src);
935 
936 template<typename Real>
937 template<typename OtherReal>
938 void CuVectorBase<Real>::CopyToVec(VectorBase<OtherReal> *dst) const {
939  KALDI_ASSERT(dim_ == dst->Dim());
940 #if HAVE_CUDA == 1
941  if (CuDevice::Instantiate().Enabled()) {
942  if (sizeof(Real) != sizeof(OtherReal)) {
943  CuVector<OtherReal> temp(*this);
944  temp.CopyToVec(dst);
945  } else {
946  if (dim_ == 0) return;
947  CuTimer tim;
948  CU_SAFE_CALL(cudaMemcpyAsync(dst->Data(), this->data_,
949  sizeof(Real) * dim_, cudaMemcpyDeviceToHost,
950  cudaStreamPerThread));
951  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
952  CuDevice::Instantiate().AccuProfile(__func__, tim);
953  }
954  } else
955 #endif
956  {
957  dst->CopyFromVec(this->Vec());
958  }
959 }
960 
961 
962 template<typename Real>
963 void CuVector<Real>::Read(std::istream &is, bool binary) {
964  Vector<Real> temp;
965  temp.Read(is, binary);
966  Destroy();
967  Swap(&temp);
968 }
969 
970 
971 
972 template<typename Real>
973 void CuVector<Real>::Write(std::ostream &os, bool binary) const {
974  Vector<BaseFloat> temp(this->dim_, kUndefined);
975  this->CopyToVec(&temp);
976  temp.Write(os, binary);
977 }
978 
979 
980 template<typename Real>
981 CuVector<Real>::CuVector(const CuVectorBase<Real> &v) {
982  this->Resize(v.Dim());
983  this->CopyFromVec(v);
984 }
985 
986 template<typename Real>
987 CuVector<Real>::CuVector(const VectorBase<Real> &v) {
988  this->Resize(v.dim_);
989  this->CopyFromVec(v);
990 }
991 
992 template<typename Real>
993 void CuVector<Real>::Resize(MatrixIndexT dim, MatrixResizeType t) {
994  KALDI_ASSERT(t == kSetZero || t == kUndefined); // Others not implemented
995  // yet.
996  if (this->dim_ == dim) {
997  this->SetZero();
998  return;
999  }
1000  if (this->dim_ != 0)
1001  this->Destroy();
1002  if (dim == 0) return;
1003 #if HAVE_CUDA == 1
1004  if (CuDevice::Instantiate().Enabled()) {
1005  CuTimer tim;
1006  this->data_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(dim * sizeof(Real)));
1007  this->dim_ = dim;
1008  if (t == kSetZero) this->SetZero();
1009  CuDevice::Instantiate().AccuProfile("CuVector::Resize", tim);
1010  } else
1011 #endif
1012  {
1013  Vector<Real> vec(dim);
1014  this->Swap(&vec);
1015  }
1016 }
1017 
1018 template<typename Real>
1019 void CuVector<Real>::Swap(CuVector<Real> *vec) {
1020  std::swap(this->data_, vec->data_);
1021  std::swap(this->dim_, vec->dim_);
1022 }
1023 
1024 
1025 template<typename Real>
1026 void CuVector<Real>::Swap(Vector<Real> *vec) {
1027 #if HAVE_CUDA == 1
1028  if (CuDevice::Instantiate().Enabled()) {
1029  if (this->dim_ == 0) {
1030  if (vec->dim_ != 0) {
1031  // *this is empty, but vec is nonempty.
1032  Resize(vec->dim_, kUndefined);
1033  this->CopyFromVec(*vec);
1034  vec->Resize(0);
1035  }
1036  // else both are empty.
1037  } else { // *this is nonempty.
1038  if (vec->dim_ != 0) {
1039  // Both *this and *vec are nonempty. Recurse to simpler cases.
1040  // this could be done more efficiently in the case where
1041  // the size does not change.
1042  Vector<Real> temp;
1043  this->Swap(&temp); // now temp is full, *this is empty.
1044  vec->Swap(&temp); // now vec has data from *this, temp has
1045  // data from vec.
1046  Swap(&temp); // copy data in vec to *this, which is now empty.
1047  } else { // *this is full but *vec is empty.
1048  vec->Resize(this->dim_, kUndefined);
1049  this->CopyToVec(vec);
1050  this->Destroy();
1051  }
1052  }
1053  } else
1054 #endif
1055  {
1056  std::swap(vec->data_, this->data_);
1057  std::swap(vec->dim_, this->dim_);
1058  }
1059 }
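// Usage sketch (illustrative): Swap(Vector<Real>*) moves host data into a CuVector
// without explicit copy code at the call site; Read() above relies on exactly this
// pattern. After the call the CuVector owns the data (on the GPU if one is enabled)
// and the host vector is left empty.
//
//   Vector<double> host(1024);
//   host.SetRandn();
//   CuVector<double> dev;
//   dev.Swap(&host);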
1060 
1061 template<typename Real>
1062 void CuVector<Real>::Destroy() {
1063 #if HAVE_CUDA == 1
1064  if (CuDevice::Instantiate().Enabled()) {
1065  if (this->data_ != NULL)
1066  CuDevice::Instantiate().Free(this->data_);
1067  } else
1068 #endif
1069  {
1070  if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_);
1071  }
1072  this->data_ = NULL;
1073  this->dim_ = 0;
1074 }
1075 
1076 
1077 template<typename Real>
1078 void CuVectorBase<Real>::CopyFromVec(const CuVectorBase<Real> &src) {
1079  KALDI_ASSERT(src.Dim() == dim_);
1080 #if HAVE_CUDA == 1
1081  if (CuDevice::Instantiate().Enabled()) {
1082  if (dim_ == 0) return;
1083  CuTimer tim;
1084  CU_SAFE_CALL(
1085  cudaMemcpyAsync(data_, src.data_, src.dim_ * sizeof(Real),
1086  cudaMemcpyDeviceToDevice, cudaStreamPerThread));
1087  CuDevice::Instantiate().AccuProfile(__func__, tim);
1088  } else
1089  #endif
1090  {
1091  memcpy(static_cast<void*>(data_), static_cast<void*>(src.data_),
1092  dim_ * sizeof(Real));
1093  }
1094 }
1095 
1096 
1097 template<typename Real>
1098 void CuVectorBase<Real>::SetZero() {
1099  if (dim_==0 || data_==NULL) return;
1100 #if HAVE_CUDA == 1
1101  if (CuDevice::Instantiate().Enabled()) {
1102  KALDI_ASSERT(dim_>=0);
1103  KALDI_ASSERT(data_!=NULL);
1104  CuTimer tim;
1105  CU_SAFE_CALL(cudaMemsetAsync(data_, 0, dim_*sizeof(Real),
1106  cudaStreamPerThread));
1107  CuDevice::Instantiate().AccuProfile("CuVector::SetZero", tim);
1108  } else
1109 #endif
1110  {
1111  Vec().SetZero();
1112  }
1113 }
1114 
1115 
1116 
1118 template<typename Real>
1119 std::ostream &operator << (std::ostream &out, const CuVectorBase<Real> &vec) {
1120  Vector<Real> temp(vec.Dim());
1121  vec.CopyToVec(&temp);
1122  out << temp;
1123  return out;
1124 }
1125 // Instantiate the above.
1126 template
1127 std::ostream &operator << (std::ostream &out, const CuVectorBase<float> &vec);
1128 template
1129 std::ostream &operator << (std::ostream &out, const CuVectorBase<double> &vec);
1130 
1131 /*
1132  * Methods wrapping the ANSI-C CUDA kernels
1133  */
1134 template<typename Real>
1135 void CuVectorBase<Real>::Set(Real value) {
1136 #if HAVE_CUDA == 1
1137  if (CuDevice::Instantiate().Enabled()) {
1138  CuTimer tim;
1139 
1140  dim3 dimBlock(CU1DBLOCK);
1141  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK));
1142  ::MatrixDim d = { 1, Dim(), Dim() };
1143 
1144  cuda_set_const(dimGrid, dimBlock, data_, value, d);
1145  CU_SAFE_CALL(cudaGetLastError());
1146  CuDevice::Instantiate().AccuProfile(__func__, tim);
1147  } else
1148 #endif
1149  {
1150  Vec().Set(value);
1151  }
1152 }
1153 
1154 
1155 
1156 template<typename Real>
1157 void CuVectorBase<Real>::Add(Real value) {
1158 #if HAVE_CUDA == 1
1159  if (CuDevice::Instantiate().Enabled()) {
1160  CuTimer tim;
1161 
1162  dim3 dimBlock(CU1DBLOCK);
1163  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK));
1164  ::MatrixDim d = { 1, Dim(), Dim() };
1165 
1166  cuda_add(dimGrid, dimBlock, data_, value, d);
1167  CU_SAFE_CALL(cudaGetLastError());
1168  CuDevice::Instantiate().AccuProfile(__func__, tim);
1169  } else
1170 #endif
1171  {
1172  Vec().Add(value);
1173  }
1174 }
1175 
1176 template<typename Real>
1177 void CuVectorBase<Real>::CopyDiagFromPacked(const CuPackedMatrix<Real> &M) {
1178 #if HAVE_CUDA == 1
1179  if (CuDevice::Instantiate().Enabled()) {
1180  KALDI_ASSERT(dim_ == M.NumRows());
1181  if (dim_ == 0) return;
1182  CuTimer tim;
1183  int dimBlock(CU1DBLOCK);
1184  int dimGrid(n_blocks(Dim(), CU1DBLOCK));
1185  cuda_vec_copy_diag_from_packed(dimGrid, dimBlock, data_, M.Data(), dim_);
1186  CU_SAFE_CALL(cudaGetLastError());
1187 
1188  CuDevice::Instantiate().AccuProfile(__func__, tim);
1189  } else
1190 #endif
1191  {
1192  Vec().CopyDiagFromPacked(M.Mat());
1193  }
1194 }
1195 
1196 
1197 template<typename Real>
1198 void CuVectorBase<Real>::CopyDiagFromMat(const CuMatrix<Real> &M) {
1199 #if HAVE_CUDA == 1
1200  if (CuDevice::Instantiate().Enabled()) {
1201  KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
1202  CuTimer tim;
1203  CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, M.Data(), M.Stride() + 1,
1204  data_, 1));
1205 
1206  CuDevice::Instantiate().AccuProfile(__func__, tim);
1207  } else
1208 #endif
1209  {
1210  Vec().CopyDiagFromMat(M.Mat());
1211  }
1212 }
1213 
1214 
1215 template<typename Real>
1216 void CuVectorBase<Real>::Scale(Real value) {
1217  #if HAVE_CUDA == 1
1218  if (CuDevice::Instantiate().Enabled()) {
1219  if (Dim() == 0 ) return;
1220 
1221  CuTimer tim;
1222  dim3 dimBlock(CU1DBLOCK);
1223  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK));
1224  ::MatrixDim d = { 1, Dim(), Dim() };
1225  cuda_scale(dimGrid, dimBlock, data_, value, d);
1226  CU_SAFE_CALL(cudaGetLastError());
1227 
1228  CuDevice::Instantiate().AccuProfile(__func__, tim);
1229  } else
1230  #endif
1231  {
1232  Vec().Scale(value);
1233  }
1234 }
1235 
1236 template<typename Real>
1237 void CuVectorBase<Real>::AddVec(Real alpha, const CuVectorBase<Real> &vec,
1238  Real beta) {
1239  KALDI_ASSERT(vec.Dim() == Dim());
1240 
1241 #if HAVE_CUDA == 1
1242  if (CuDevice::Instantiate().Enabled()) {
1243  CuTimer tim;
1244  int32 dim = this->dim_;
1245  Real *data = this->data_;
1246  const Real *vec_data = vec.data_;
1247  if (beta != 1.0) CU_SAFE_CALL(cuda_scal(GetCublasHandle(), dim, beta, data, 1));
1248  if (alpha != 0.0) CU_SAFE_CALL(cuda_axpy(GetCublasHandle(), dim, alpha, vec_data, 1, data, 1));
1249  CuDevice::Instantiate().AccuProfile(__func__, tim);
1250  } else
1251  #endif
1252  {
1253  if (beta != 1.0) Vec().Scale(beta);
1254  Vec().AddVec(alpha, vec.Vec());
1255  }
1256 }
1257 
1258 
1259 template<typename Real>
1260 template<typename OtherReal>
1261 void CuVectorBase<Real>::AddVec(Real alpha, const CuVectorBase<OtherReal> &vec,
1262  Real beta) {
1263  // We could implement this directly, without using a temporary-- this can
1264  // be done later, when we have time.
1265  CuVector<Real> temp(vec);
1266  this->AddVec(alpha, temp, beta);
1267 }
1268 // instantiate the template above.
1269 template
1270 void CuVectorBase<float>::AddVec(float alpha, const CuVectorBase<double> &vec,
1271  float beta);
1272 template
1273 void CuVectorBase<double>::AddVec(double alpha, const CuVectorBase<float> &vec,
1274  double beta);
1275 
1276 template<typename Real>
1277 void CuVectorBase<Real>::AddRowSumMat(Real alpha, const CuMatrixBase<Real> &mat,
1278  Real beta) {
1279  KALDI_ASSERT(mat.NumCols() == Dim());
1280  if (Dim() == 0)
1281  return;
1282 #if HAVE_CUDA == 1
1283  if (CuDevice::Instantiate().Enabled()) {
1284  CuTimer tim;
1285  cuda_add_row_sum_mat(mat.NumCols(), CU1DBLOCK, Data(), mat.Data(),
1286  mat.Dim(), alpha, beta);
1287  CU_SAFE_CALL(cudaGetLastError());
1288 
1289  CuDevice::Instantiate().AccuProfile(__func__, tim);
1290  } else
1291 #endif
1292  {
1293  Vec().AddRowSumMat(alpha, mat.Mat(), beta);
1294  }
1295 }
1296 
1297 template<typename Real>
1298 void CuVectorBase<Real>::AddColSumMat(Real alpha, const CuMatrixBase<Real> &mat,
1299  Real beta) {
1300 #if HAVE_CUDA == 1
1301  if (CuDevice::Instantiate().Enabled()) {
1302  CuTimer tim;
1303  KALDI_ASSERT(mat.NumRows() == Dim());
1304 
1305  cuda_add_col_sum_mat(mat.NumRows(), CU1DBLOCK, Data(), mat.Data(),
1306  mat.Dim(), alpha, beta);
1307  CU_SAFE_CALL(cudaGetLastError());
1308 
1309  CuDevice::Instantiate().AccuProfile(__func__, tim);
1310  } else
1311 #endif
1312  {
1313  Vec().AddColSumMat(alpha, mat.Mat(), beta);
1314  }
1315 }
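// Note on the naming (illustrative examples, not from this file): AddRowSumMat sums
// over the rows, so the result has NumCols() elements; AddColSumMat sums over the
// columns, so the result has NumRows() elements.
//
//   CuMatrix<float> m(6, 4);
//   m.SetRandn();
//   CuVector<float> col_sums(4), row_sums(6);
//   col_sums.AddRowSumMat(1.0, m, 0.0);  // col_sums(j) = sum_i m(i, j)
//   row_sums.AddColSumMat(1.0, m, 0.0);  // row_sums(i) = sum_j m(i, j)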
1316 
1317 template<typename Real>
1318 void CuVectorBase<Real>::InvertElements() {
1319 #if HAVE_CUDA == 1
1320  if (CuDevice::Instantiate().Enabled()) {
1321  CuTimer tim;
1322 
1323  dim3 dimBlock(CU1DBLOCK, 1);
1324  dim3 dimGrid(n_blocks(dim_, CU1DBLOCK));
1325  MatrixDim d = {1, dim_, dim_};
1326 
1327  cuda_invert_elements(dimGrid, dimBlock, data_, d);
1328  CU_SAFE_CALL(cudaGetLastError());
1329 
1330  CuDevice::Instantiate().AccuProfile(__func__, tim);
1331  } else
1332 #endif
1333  {
1334  Vec().InvertElements();
1335  }
1336 }
1337 
1338 
1339 template<typename Real>
1340 void CuVectorBase<Real>::CopyElements(const CuMatrixBase<Real> &mat,
1341  const MatrixTransposeType trans,
1342  const CuArrayBase<int32> &elements) {
1343  KALDI_ASSERT(elements.Dim() == Dim());
1344 #if HAVE_CUDA == 1
1345  if (CuDevice::Instantiate().Enabled()) {
1346  CuTimer tim;
1347 
1348  dim3 dimBlock(CU1DBLOCK);
1349  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK));
1350 
1351  cuda_vector_copy_elements(dimGrid, dimBlock, this->data_, Dim(),
1352  mat.Data(), mat.Stride(), trans == kTrans,
1353  elements.Data());
1354  CU_SAFE_CALL(cudaGetLastError());
1355  CuDevice::Instantiate().AccuProfile(__func__, tim);
1356  } else
1357 #endif
1358  {
1359  VectorBase<Real> &this_vec = this->Vec();
1360  const MatrixBase<Real> &src_mat = mat.Mat();
1361  const int32* index_map = elements.Data();
1362  KALDI_ASSERT((Dim() == mat.NumRows() && trans == kNoTrans)
1363  || (Dim() == mat.NumCols() && trans == kTrans));
1364  for (int32 i = 0; i < Dim(); i++) {
1365  int32 j = index_map[i];
1366  KALDI_ASSERT(j >= 0);
1367  if (trans == kNoTrans) {
1368  KALDI_ASSERT(j < mat.NumCols());
1369  this_vec(i) = src_mat(i, j);
1370  } else {
1371  KALDI_ASSERT(j < mat.NumRows());
1372  this_vec(i) = src_mat(j, i);
1373  }
1374  }
1375  }
1376 }
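// Usage sketch (illustrative; the index values and names are made up): with
// trans == kNoTrans the call gathers one element per row, out(i) = mat(i, elements[i]),
// mirroring the CPU fallback loop above.
//
//   CuMatrix<float> mat(5, 8);
//   mat.SetRandn();
//   std::vector<int32> idx = {0, 3, 7, 2, 5};
//   CuArray<int32> elements(idx);
//   CuVector<float> out(5);
//   out.CopyElements(mat, kNoTrans, elements);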
1377 
1378 
1379 
1380 template
1381 void CuVectorBase<float>::CopyToVec(VectorBase<float> *dst) const;
1382 template
1383 void CuVectorBase<double>::CopyToVec(VectorBase<float> *dst) const;
1384 template
1385 void CuVectorBase<float>::CopyToVec(VectorBase<double> *dst) const;
1386 template
1387 void CuVectorBase<double>::CopyToVec(VectorBase<double> *dst) const;
1388 
1389 template class CuVectorBase<float>;
1390 template class CuVectorBase<double>;
1391 
1392 template class CuVector<float>;
1393 template class CuVector<double>;
1394 
1395 } // namespace