cu-sp-matrix.cc
Go to the documentation of this file.
1 // cudamatrix/cu-sp-matrix.cc
2 
3 // Copyright 2013 Karel Vesely
4 // 2014-2015 Johns Hopkins University (author: Daniel Povey)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #if HAVE_CUDA == 1
22 #include <cuda_runtime_api.h>
23 #include <cublas_v2.h>
24 #endif
25 
26 #include "base/timer.h"
27 #include "cudamatrix/cu-common.h"
28 #include "cudamatrix/cu-vector.h"
29 #include "cudamatrix/cu-device.h"
30 #include "cudamatrix/cu-kernels.h"
31 #include "cudamatrix/cu-math.h"
33 #include "cudamatrix/cu-matrix.h"
35 
36 namespace kaldi {
37 
38 template<typename Real>
40  SpCopyType copy_type) {
41  KALDI_ASSERT(this->num_rows_ == M.NumRows() &&
42  this->num_rows_ == M.NumCols());
43  if (this->num_rows_ == 0)
44  return;
45 #if HAVE_CUDA == 1
46  if (CuDevice::Instantiate().Enabled()) {
47  CuTimer tim;
48  MatrixIndexT D = this->NumRows();
49  if (D == 0)
50  return;
51  switch (copy_type) {
52  case kTakeMeanAndCheck:
53  KALDI_ERR << "kTakeMeanAndCheck not supported!";
54  // The grid/block dimensions have been very roughly tuned for the
55  // individual cases.
56  case kTakeMean:
57  {
58  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
59  dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
60  cuda_take_mean(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
61  CU_SAFE_CALL(cudaGetLastError());
62  }
63  break;
64  case kTakeLower:
65  {
66  int32 block_size = std::min(CU1DBLOCK, this->num_rows_);
67  dim3 dimBlock(1, block_size);
68  dim3 dimGrid(D, n_blocks(D, block_size));
69  cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
70  CU_SAFE_CALL(cudaGetLastError());
71  }
72  break;
73  case kTakeUpper:
74  {
75  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
76  dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
77  cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
78  CU_SAFE_CALL(cudaGetLastError());
79  }
80  break;
81  default:
82  KALDI_ASSERT("Invalid argument to CuSpMatrix::CopyFromMat");
83  }
84  CuDevice::Instantiate().AccuProfile("CuSpMatrix::CopyFromMat(from CuMatrixBase)", tim);
85  } else
86 #endif
87  {
88  Mat().CopyFromMat(M.Mat(), copy_type);
89  }
90 }
91 
92 template<typename Real>
94 #if HAVE_CUDA == 1
95  if (CuDevice::Instantiate().Enabled()) {
96  CuMatrix<Real> mat(this->num_rows_, this->num_rows_);
97  mat.CopyFromSp(*this);
98  mat.SymInvertPosDef();
99  this->CopyFromMat(mat);
100  } else
101 #endif
102  { // Use inversion of CPU-based SpMatrix.
103  Mat().Invert();
104  }
105 }
106 
107 template<typename Real>
108 void CuSpMatrix<Real>::AddVec2(const Real alpha, const CuVectorBase<Real> &v) {
109  KALDI_ASSERT(v.Dim() == this->NumRows());
110 #if HAVE_CUDA == 1
111  if (CuDevice::Instantiate().Enabled()) {
112  if (this->num_rows_ == 0) return;
113  CuTimer tim;
114  size_t nr = this->num_rows_;
115  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
116  dim3 dimGrid(n_blocks(nr, CU2DBLOCK), n_blocks(nr, CU2DBLOCK));
117 
118  CUBLAS_SAFE_CALL(cublas_spr(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, this->num_rows_, alpha, v.Data(),
119  1, this->Data()));
120 
121  CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddVec2", tim);
122  } else
123 #endif
124  {
125  Mat().AddVec2(alpha, v.Vec());
126  }
127 }
128 
129 template<typename Real>
130 void CuSpMatrix<Real>::AddMat2(const Real alpha, const CuMatrixBase<Real> &M,
131  MatrixTransposeType transM, const Real beta) {
132  KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
133  || (transM == kTrans && this->NumRows() == M.NumCols()));
134 
135 #if HAVE_CUDA == 1
136  if (CuDevice::Instantiate().Enabled()) {
137  if (this->num_rows_ == 0) return;
138  CuTimer tim;
139  MatrixIndexT this_dim = this->NumRows(),
140  m_other_dim = (transM == kNoTrans ? M.NumCols() : M.NumRows());
141 
142  if (this_dim == 0) return;
143  if (alpha == 0.0) {
144  if (beta != 1.0) this->Scale(beta);
145  return;
146  }
147 
148  cublasOperation_t trans = (transM == kTrans ? CUBLAS_OP_N : CUBLAS_OP_T);
149 
150  CuMatrix<Real> tmp_mat(*this);
151  cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, trans, this_dim, m_other_dim, alpha, M.Data(),
152  M.Stride(), beta, tmp_mat.Data(), tmp_mat.Stride());
153  this->CopyFromMat(tmp_mat, kTakeLower);
154 
155  CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddMat2", tim);
156  } else
157 #endif
158  {
159  Mat().AddMat2(alpha, M.Mat(), transM, beta);
160  }
161 }
162 
167 template<typename Real, typename OtherReal>
169  KALDI_ASSERT(A.NumRows() == B.NumRows());
170 #if HAVE_CUDA == 1
171  if (CuDevice::Instantiate().Enabled()) {
172  if (A.NumRows() == 0) return 0.0;
173  MatrixIndexT nr = A.NumRows(), size = nr * (nr+1) / 2;
174  CuVector<Real> Adiag(nr, kUndefined);
175  CuVector<OtherReal> Bdiag(nr, kUndefined);
176  Adiag.CopyDiagFromPacked(A);
177  Bdiag.CopyDiagFromPacked(B);
178  CuSubVector<Real> Aall(A.Data(), size);
179  CuSubVector<OtherReal> Ball(B.Data(), size);
180  // Below, we subtrace VecVec(Adiag, Bdiag) to remove double-counting
181  // on the diagonal.
182  return 2.0 * VecVec(Aall, Ball) - VecVec(Adiag, Bdiag);
183  } else
184 #endif
185  {
186  return TraceSpSp(A.Mat(), B.Mat());
187  }
188 }
189 template
190 float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<float> &B);
191 template
192 float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<double> &B);
193 template
194 double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<float> &B);
195 template
196 double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<double> &B);
197 
198 
199 template<typename Real>
200 bool CuSpMatrix<Real>::ApproxEqual(const CuSpMatrix<Real> &B, Real tol) const {
201  KALDI_ASSERT(this->NumRows() == B.NumRows());
202  CuSpMatrix<Real> diff(*this);
203  diff.AddSp(-1.0, B);
204  Real a = this->FrobeniusNorm(), b = B.FrobeniusNorm(),
205  d = diff.FrobeniusNorm();
206  return (d <= tol * std::max(a, b));
207 }
208 
209 template<typename Real>
210 bool CuSpMatrix<Real>::IsUnit(Real tol) const {
211  // want to return:
212  //FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.:
213  //sqrt (trace((*this - I)(*this-I)) <= tol * NumRows()
214  // trace((*this - I)(*this - I)) <= tol * NumRows()
215  // trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows()
216  // trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows()
217 
218  // Note: we could do this more efficiently still, by slightly changing the
219  // definition of IsUnit and getting rid of the extra stuff inside TraceSpSp
220  // that corrects for the diagonal being counted twice.
221  return (TraceSpSp(*this, *this) + this->NumRows() - 2.0 * this->Trace() <=
222  tol * this->NumRows());
223 }
224 
225 template <class Real>
227  this->Resize(in.NumRows(), kUndefined);
228  this->CopyFromPacked(in);
229  return *this;
230 }
231 
232 template class CuSpMatrix<float>;
233 template class CuSpMatrix<double>;
234 
235 
236 
237 } // namespace
const MatrixBase< Real > & Mat() const
Definition: cu-matrix.h:755
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
MatrixIndexT Stride() const
Definition: cu-matrix.h:217
CuSpMatrix< Real > & operator=(const CuSpMatrix< Real > &in)
void CopyDiagFromPacked(const CuPackedMatrix< Real > &M)
Extracts the diagonal of a packed matrix M; works for Sp or Tp.
Definition: cu-vector.cc:1177
MatrixIndexT NumRows() const
kaldi::int32 int32
This class represents a matrix that&#39;s stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
void Invert()
Note: the CuMatrix version of the Invert() function will only work for positive definite matrices; it...
Definition: cu-sp-matrix.cc:93
uint64 data_
const SpMatrix< Real > & Mat() const
Definition: cu-sp-matrix.h:132
void SymInvertPosDef()
Inversion for positive definite symmetric matrices.
Definition: cu-matrix.cc:2111
int32 MatrixIndexT
Definition: matrix-common.h:98
void CopyFromSp(const CuSpMatrix< Real > &M)
Definition: cu-matrix.cc:360
#define KALDI_ERR
Definition: kaldi-error.h:147
#define CU1DBLOCK
Definition: cu-matrixdim.h:57
double TraceSpSp(const SpMatrix< double > &A, const SpMatrix< double > &B)
Definition: sp-matrix.cc:326
#define CU2DBLOCK
Definition: cu-matrixdim.h:61
bool IsUnit(Real tol=0.001) const
Real FrobeniusNorm() const
Definition: cu-sp-matrix.h:79
void CopyFromMat(const CuMatrixBase< Real > &orig, SpCopyType copy_type=kTakeLower)
Definition: cu-sp-matrix.cc:39
const Real * Data() const
Return data pointer (const).
Definition: cu-matrix.h:746
Matrix for CUDA computing.
Definition: matrix-common.h:69
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216
const VectorBase< Real > & Vec() const
Definition: cu-vector.h:235
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixTransposeType
Definition: matrix-common.h:32
Real * Data()
Returns a pointer to the start of the vector&#39;s data.
Definition: cu-vector.h:72
::MatrixDim Dim() const
Definition: cu-matrix.h:221
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37
void AddMat2(const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const Real beta)
MatrixIndexT Dim() const
Dimensions.
Definition: cu-vector.h:69
Vector for CUDA computing.
Definition: matrix-common.h:72
void AddVec2(const Real alpha, const CuVectorBase< Real > &v)
bool ApproxEqual(const CuSpMatrix< Real > &other, Real tol=0.001) const