cu-vector.cc
1 // cudamatrix/cu-vector.cc
2 
3 // Copyright 2012-2013 Karel Vesely
4 // 2012-2014 Johns Hopkins University (author: Daniel Povey)
5 // 2017 Daniel Galvez
6 // 2016-2018 Shiyin Kang
7 // 2019 Yiwen Shao
8 
9 // See ../../COPYING for clarification regarding multiple authors
10 //
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 //
15 // http://www.apache.org/licenses/LICENSE-2.0
16 //
17 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
19 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
20 // MERCHANTABLITY OR NON-INFRINGEMENT.
21 // See the Apache 2 License for the specific language governing permissions and
22 // limitations under the License.
23 
24 #if HAVE_CUDA == 1
25 #include <cuda_runtime_api.h>
26 #include <cublas_v2.h>
27 #endif
28 
29 #include "base/timer.h"
30 #include "cudamatrix/cu-common.h"
31 #include "cudamatrix/cu-vector.h"
32 #include "cudamatrix/cu-device.h"
33 #include "cudamatrix/cu-kernels.h"
34 #include "cudamatrix/cu-math.h"
35 #include "cudamatrix/cu-vector.h"
36 #include "cudamatrix/cu-matrix.h"
37 #include "cudamatrix/cu-rand.h"
42 
43 namespace kaldi {
44 
45 
46 template<typename Real>
47 Real VecVec(const CuVectorBase<Real> &a,
48  const CuVectorBase<Real> &b) {
49  //MatrixIndexT a_dim = a.Dim();
50  KALDI_ASSERT(a.Dim() == b.Dim());
51  Real result = 0;
52 #if HAVE_CUDA == 1
53  if (CuDevice::Instantiate().Enabled()) {
54  CuTimer tim;
55  CUBLAS_SAFE_CALL(cublas_dot(GetCublasHandle(), a.Dim(), a.Data(), 1, b.Data(),
56  1, &result));
57  CuDevice::Instantiate().AccuProfile(__func__, tim);
58 } else
59 #endif
60  {
61  result = VecVec(a.Vec(), b.Vec());
62  }
63  return result;
64 }
65 // instantiate the template above
66 template float VecVec(const CuVectorBase<float> &a, const CuVectorBase<float> &b);
67 template double VecVec(const CuVectorBase<double> &a, const CuVectorBase<double> &b);
68 
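// Usage sketch (illustrative; the variable names below are not from this file):
// computing a dot product with the template above. The same call falls back to the
// CPU implementation when no GPU has been configured via CuDevice.
//
//   CuVector<float> a(1000), b(1000);
//   a.SetRandn();
//   b.SetRandn();
//   float dot = VecVec(a, b);  // cublas_dot on the GPU, Kaldi's CPU VecVec otherwise.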
69 // The version of VecVec that can do type conversion. For now we give this a
70 // stupid implementation that converts one of the vectors. If it ever becomes
71 // an efficiency bottleneck, we can revisit this.
72 template<typename Real, typename OtherReal>
73 Real VecVec(const CuVectorBase<Real> &A, const CuVectorBase<OtherReal> &B) {
74  CuVector<Real> B2(B);
75  return VecVec(A, B2); // This will call the single-parameter template.
76 }
77 // instantiate the template above
78 template float VecVec(const CuVectorBase<float> &A, const CuVectorBase<double> &B);
79 template double VecVec(const CuVectorBase<double> &A, const CuVectorBase<float> &B);
80 
81 
82 template<typename Real>
83 Real VecMatVec(const CuVectorBase<Real> &v1, const CuMatrixBase<Real> &M,
84  const CuVectorBase<Real> &v2) {
85  KALDI_ASSERT(v1.Dim() == M.NumRows() && M.NumCols() == v2.Dim());
86  if (v1.Dim() > v2.Dim()) { // do v2*M first
87  CuVector<Real> v2M(v1.Dim());
88  v2M.AddMatVec(1.0, M, kNoTrans, v2, 0.0);
89  return VecVec(v2M, v1);
90  } else { // do v1*M first
91  CuVector<Real> v1M(v2.Dim());
92  v1M.AddMatVec(1.0, M, kTrans, v1, 0.0);
93  return VecVec(v1M, v2);
94  }
95 }
96 // instantiate the template above
97 template float VecMatVec(const CuVectorBase<float> &v1, const CuMatrixBase<float> &M,
98  const CuVectorBase<float> &v2);
99 template double VecMatVec(const CuVectorBase<double> &v1, const CuMatrixBase<double> &M,
100  const CuVectorBase<double> &v2);
101 
102 template<typename Real>
103 void CuVectorBase<Real>::CopyColFromMat(const CuMatrixBase<Real> &mat, MatrixIndexT col) {
104  KALDI_ASSERT(col < mat.NumCols());
105  KALDI_ASSERT(dim_ == mat.NumRows());
106 #if HAVE_CUDA == 1
107  if (CuDevice::Instantiate().Enabled()) {
108  CuTimer tim;
109  cublas_copy(GetCublasHandle(),
110  this->dim_, mat.Data() + col, mat.Stride(), this->data_, 1);
111  CU_SAFE_CALL(cudaGetLastError());
112  CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim);
113  } else
114 #endif
115  {
116  Vec().CopyColFromMat(mat.Mat(),col);
117  }
118 }
119 
120 template<>
121 template<>
122 void CuVectorBase<double>::CopyColFromMat(const CuMatrixBase<float> &mat, MatrixIndexT col) {
123  KALDI_ASSERT(col < mat.NumCols());
124  KALDI_ASSERT(dim_ == mat.NumRows());
125 #if HAVE_CUDA == 1
126  if (CuDevice::Instantiate().Enabled()) {
127  CuTimer tim;
128  int dimBlock(CU1DBLOCK);
129  int dimGrid(n_blocks(dim_,CU1DBLOCK));
130 
131  cuda_copy_col_from_mat_df(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_);
132  CU_SAFE_CALL(cudaGetLastError());
133  CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim);
134  } else
135 #endif
136  {
137  Vec().CopyColFromMat(mat.Mat(), col);
138  }
139 }
140 
141 
142 template<>
143 template<>
144 void CuVectorBase<float>::CopyColFromMat(const CuMatrixBase<double> &mat, MatrixIndexT col) {
145  KALDI_ASSERT(col < mat.NumCols());
146  KALDI_ASSERT(dim_ == mat.NumRows());
147 #if HAVE_CUDA == 1
148  if (CuDevice::Instantiate().Enabled()) {
149  CuTimer tim;
150  int dimBlock(CU1DBLOCK);
151  int dimGrid(n_blocks(dim_,CU1DBLOCK));
152 
153  cuda_copy_col_from_mat_fd(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_);
154  CU_SAFE_CALL(cudaGetLastError());
155  CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim);
156  } else
157 #endif
158  {
159  Vec().CopyColFromMat(mat.Mat(), col);
160  }
161 }
162 
163 template<typename Real>
164 void CuVectorBase<Real>::CopyRowsFromMat(const CuMatrixBase<Real> &mat) {
165  KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
166 #if HAVE_CUDA == 1
167  if (CuDevice::Instantiate().Enabled()) {
168  if (dim_ == 0) return;
169  CuTimer tim;
170  if (mat.Stride() == mat.NumCols() && mat.NumRows() != 0) {
171  CU_SAFE_CALL(
172  cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
173  cudaMemcpyDeviceToDevice, cudaStreamPerThread));
174  } else {
175  Real* vec_data = data_;
176  for (MatrixIndexT r = 0; r < mat.NumRows(); r++) {
177  CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
178  sizeof(Real) * mat.NumCols(),
179  cudaMemcpyDeviceToDevice,
180  cudaStreamPerThread));
181  vec_data += mat.NumCols();
182  }
183  }
184  CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyRowsFromMat", tim);
185  } else
186 #endif
187  {
188  Vec().CopyRowsFromMat(mat.Mat());
189  }
190 }
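// Usage sketch (illustrative, not part of the file): flattening a matrix row by row
// into a vector. As the code above shows, this is a single device-to-device memcpy
// when the rows are contiguous (Stride() == NumCols()), and one memcpy per row otherwise.
//
//   CuMatrix<float> m(10, 20);
//   m.SetRandn();
//   CuVector<float> flat(10 * 20);
//   flat.CopyRowsFromMat(m);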
191 
192 template<typename Real>
193 Real CuVectorBase<Real>::Norm(Real p) {
194 #if HAVE_CUDA == 1
195  if (CuDevice::Instantiate().Enabled()) {
196  CuTimer tim;
197  Real ans;
198  KALDI_ASSERT(p == 1.0 || p == 2.0);
199  if (dim_ == 0) return 0.0;
200  if (p == 1.0) {
201  cublas_asum(GetCublasHandle(), dim_, data_, 1, &ans);
202  } else {
203  cublas_nrm2(GetCublasHandle(), dim_, data_, 1, &ans);
204  }
205  CuDevice::Instantiate().AccuProfile(__func__, tim);
206  if (ans != ans) {
207  KALDI_ERR << "NaN in norm " << *this;
208  }
209  return ans;
210  } else
211 #endif
212  {
213  return Vec().Norm(p);
214  }
215 }
216 
217 template<typename Real>
218 void CuVectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
219  KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows());
220 #if HAVE_CUDA == 1
221  if (CuDevice::Instantiate().Enabled()) {
222  if (dim_ == 0) return;
223  CuTimer tim;
224  if (mat.Stride() == mat.NumCols()) {
225  CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_,
226  cudaMemcpyHostToDevice, cudaStreamPerThread));
227  } else {
228  Real* vec_data = data_;
229  for (MatrixIndexT r = 0; r < mat.NumRows(); r++) {
230  CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r),
231  sizeof(Real) * mat.NumCols(),
232  cudaMemcpyHostToDevice, cudaStreamPerThread));
233  vec_data += mat.NumCols();
234  }
235  }
236  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
237  CuDevice::Instantiate().AccuProfile(__func__, tim);
238  } else
239 #endif
240  {
241  Vec().CopyRowsFromMat(mat);
242  }
243 }
244 
245 template<typename Real>
246 void MatrixBase<Real>::CopyRowsFromVec(const CuVectorBase<Real> &v) {
247  KALDI_ASSERT(v.Dim() == NumCols() * NumRows());
248 #if HAVE_CUDA == 1
249  if (CuDevice::Instantiate().Enabled()) {
250  if (num_rows_ == 0) return;
251  CuTimer tim;
252  if (Stride() == NumCols()) {
253  CU_SAFE_CALL(cudaMemcpyAsync(data_, v.Data(),
254  sizeof(Real)*v.Dim(),
255  cudaMemcpyDeviceToHost,
256  cudaStreamPerThread));
257  } else {
258  const Real* vec_data = v.Data();
259  for (MatrixIndexT r = 0; r < NumRows(); r++) {
260  CU_SAFE_CALL(cudaMemcpyAsync(RowData(r), vec_data,
261  sizeof(Real) * NumCols(),
262  cudaMemcpyDeviceToHost,
263  cudaStreamPerThread));
264  vec_data += NumCols();
265  }
266  }
267  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
268  CuDevice::Instantiate().AccuProfile(__func__, tim);
269  } else
270 #endif
271  {
272  CopyRowsFromVec(v.Vec());
273  }
274 }
275 
276 // instantiate the template above.
277 template void MatrixBase<float>::CopyRowsFromVec(const CuVectorBase<float> &v);
278 template void MatrixBase<double>::CopyRowsFromVec(const CuVectorBase<double> &v);
279 
280 template<typename Real>
281 void CuVectorBase<Real>::SetRandn() {
282  if (dim_ == 0) return;
283  CuRand<Real> tmp;
284  tmp.RandGaussian(this);
285 }
286 
287 template<typename Real>
288 void CuVectorBase<Real>::SetRandUniform() {
289  if (dim_ == 0) return;
290  CuRand<Real> tmp;
291  tmp.RandUniform(this);
292 }
293 
294 
295 
296 template<typename Real>
297 Real CuVectorBase<Real>::Sum() const {
298  if (dim_ == 0)
299  return 0.0;
300 #if HAVE_CUDA == 1
301  if (CuDevice::Instantiate().Enabled()) {
302  Real result;
303  CuTimer tim;
304 
305  // Small vectors are copied to RAM and reduced on CPU.
306  // The length is chosen by cu-vector-speed-test
307  if (dim_ < 4096) {
308  Vector<Real> ans_cpu(*this);
309  result = ans_cpu.Sum();
310  } else {
311  // Use no more than 256 blocks (still too many?)
312  int dimBlock = CU1DBLOCK;
313  int dimGrid = n_blocks(dim_, dimBlock);
314  if (dimGrid > 256) {
315  dimGrid = 256;
316  }
317  CuVector<Real> ans(dimGrid, kUndefined);
318  cuda_vec_sum(dimGrid, dimBlock, data_, ans.Data(), dim_, 1);
319  CU_SAFE_CALL(cudaGetLastError());
320  Vector<Real> ans_cpu(ans);
321  result = ans_cpu.Sum();
322  }
323 
324  CuDevice::Instantiate().AccuProfile(__func__, tim);
325  return result;
326  } else
327 #endif
328  {
329  return Vec().Sum();
330  }
331 }
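// Usage sketch (illustrative): Sum() chooses its strategy by length, as the comments
// above describe -- vectors shorter than 4096 elements are copied to the host and
// summed there, while longer ones are reduced on the GPU into at most 256 per-block
// partial sums (cuda_vec_sum) that are then summed on the host.
//
//   CuVector<double> v(100000);
//   v.SetRandUniform();
//   double total = v.Sum();  // takes the GPU reduction path, since Dim() >= 4096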
332 
333 template<typename Real>
334 void CuVectorBase<Real>::ApplySoftMax() {
335 #if HAVE_CUDA == 1
336  if (CuDevice::Instantiate().Enabled()) {
337  if (dim_ == 0) return;
338  CuTimer tim;
339  size_t dimBlock = CU1DBLOCK;
340  size_t dimGrid = 1; // dimGrid value represent the number of rows
341  ::MatrixDim dim = { 1, this->dim_, this->dim_};
342  cuda_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_);//actually dim is not stride...
343  CU_SAFE_CALL(cudaGetLastError());
344  CuDevice::Instantiate().AccuProfile(__func__, tim);
345  } else
346 #endif
347  {
348  Vec().ApplySoftMax();
349  }
350 }
351 
352 template<typename Real>
353 void CuVectorBase<Real>::Floor(const CuVectorBase<Real> &src, Real floor_val, MatrixIndexT *floored_count) {
354 #if HAVE_CUDA == 1
355  if (CuDevice::Instantiate().Enabled()) {
356  int dimBlock(CU1DBLOCK);
357  int dimGrid(n_blocks(dim_,CU1DBLOCK));
358  if (floored_count == nullptr) {
359  if (dim_ == 0) return;
360  CuTimer tim;
361  // We are calling a function meant for matrices, by viewing the
362  // vector as a matrix with a single row.
363  ::MatrixDim dim = {1, Dim(), 1};
364  cuda_floor(dimGrid, dimBlock, this->data_, src.Data(), floor_val, dim, 1);
365  CuDevice::Instantiate().AccuProfile("CuVectorBase::FloorNoCount", tim);
366  } else {
367  if (dim_ == 0) { *floored_count = 0; return; }
368  CuTimer tim;
369 
370  CuVector<float> count_vec(dim_, kUndefined);
371 
372  cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_);
373  CU_SAFE_CALL(cudaGetLastError());
374  *floored_count = count_vec.Sum();
375  CuDevice::Instantiate().AccuProfile("CuVectorBase::Floor", tim);
376  }
377  } else
378 #endif
379  {
380  Vec().Floor(src.Vec(), floor_val, floored_count);
381  }
382 }
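// Usage sketch (illustrative; the names are not from this file): flooring a vector in
// place at zero and asking how many elements were clipped. Passing NULL for the count
// selects the cheaper kernel path above.
//
//   CuVector<float> v(1000);
//   v.SetRandn();
//   MatrixIndexT n_floored;
//   v.Floor(v, 0.0, &n_floored);  // v(i) = max(v(i), 0.0)
//   KALDI_LOG << n_floored << " elements were floored.";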
383 
384 template<typename Real>
385 void CuVectorBase<Real>::Ceiling(const CuVectorBase<Real> &src, Real ceiling_val,
386  MatrixIndexT *ceiled_count) {
387 #if HAVE_CUDA == 1
388  if (CuDevice::Instantiate().Enabled()) {
389  int dimBlock(CU1DBLOCK);
390  int dimGrid(n_blocks(dim_,CU1DBLOCK));
391  if (ceiled_count == nullptr) {
392  if (dim_ == 0) return;
393  CuTimer tim;
394  // We are calling a function meant for matrices, by viewing the
395  // vector as a matrix with a single row.
396  ::MatrixDim dim = {1, Dim(), 1};
397  cuda_ceiling(dimGrid, dimBlock, this->data_, src.Data(), ceiling_val, dim, 1);
398 
399  CuDevice::Instantiate().AccuProfile("CuVectorBase::CeilingNoCount", tim);
400  } else {
401  if (dim_ == 0) { *ceiled_count = 0; return; }
402  CuTimer tim;
403 
404  CuVector<float> count_vec(dim_, kUndefined);
405 
406  cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_);
407  CU_SAFE_CALL(cudaGetLastError());
408  *ceiled_count = count_vec.Sum();
409  CuDevice::Instantiate().AccuProfile("CuVectorBase::Ceiling", tim);
410  }
411  } else
412 #endif
413  {
414  Vec().Ceiling(src.Vec(), ceiling_val, ceiled_count);
415  }
416 }
417 
418 template<typename Real>
419 void CuVectorBase<Real>::Pow(const CuVectorBase<Real> &src, Real power) {
420 #if HAVE_CUDA == 1
421  if (CuDevice::Instantiate().Enabled()) {
422  if (dim_ == 0) return;
423  CuTimer tim;
424  // for this particular kernel, x is #rows, y is #cols. so
425  // fake matrix with 1 row, Dim() cols.
426  dim3 dimBlock(CU1DBLOCK, 1);
427  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK), 1);
428  ::MatrixDim fake_matrix_dim = { 1, Dim(), 1 };
429  // num_cols is Dim(), num_rows is 1, stride is 1 (it's a don't-care).
430  cuda_pow(dimGrid, dimBlock, this->data_, src.Data(), power, fake_matrix_dim, 1);
431  CU_SAFE_CALL(cudaGetLastError());
432  CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyPow", tim);
433  } else
434 #endif
435  {
436  Vec().Pow(src.Vec(), power);
437  }
438 }
439 
440 
441 template<typename Real>
442 void CuVectorBase<Real>::ApplyExp() {
443 #if HAVE_CUDA == 1
444  if (CuDevice::Instantiate().Enabled()) {
445  if (dim_ == 0) return;
446  CuTimer tim;
447  int dimBlock(CU1DBLOCK);
448  int dimGrid(n_blocks(dim_,CU1DBLOCK));
449 
450  cuda_vec_apply_exp(dimGrid, dimBlock, data_, dim_);
451  CU_SAFE_CALL(cudaGetLastError());
452  CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyExp", tim);
453  } else
454 #endif
455  {
456  Vec().ApplyExp();
457  }
458 }
459 
460 
461 template<typename Real>
462 void CuVectorBase<Real>::ApplyLog() {
463 #if HAVE_CUDA == 1
464  if (CuDevice::Instantiate().Enabled()) {
465  if (dim_ == 0) return;
466  CuTimer tim;
467  int dimBlock(CU1DBLOCK);
468  int dimGrid(n_blocks(dim_,CU1DBLOCK));
469 
470  CuVector<Real> flag(1);
471  cuda_vec_apply_log(dimGrid, dimBlock, data_, flag.Data(), dim_);
472  CU_SAFE_CALL(cudaGetLastError());
473  if (flag(0) > 0)
474  KALDI_ERR << "Trying to take log of a negative number.";
475  CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyLog", tim);
476  } else
477 #endif
478  {
479  Vec().ApplyLog();
480  }
481 }
482 
483 template<typename Real>
484 void CuVectorBase<Real>::ApplyLogSoftMax() {
485 #if HAVE_CUDA == 1
486  if (CuDevice::Instantiate().Enabled()) {
487  if (dim_ == 0) return;
488  CuTimer tim;
489  size_t dimBlock = CU1DBLOCK;
490  size_t dimGrid = 1; // dimGrid value represent the number of rows
491  ::MatrixDim dim = { 1, this->dim_, this->dim_};
492 
493  cuda_log_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_);
494  CU_SAFE_CALL(cudaGetLastError());
495  CuDevice::Instantiate().AccuProfile(__func__, tim);
496  } else
497 #endif
498  {
499  Vec().ApplyLogSoftMax();
500  }
501 }
502 
503 
504 
505 template<typename Real>
506 void CuVectorBase<Real>::AddMatVec(const Real alpha,
507  const CuMatrixBase<Real> &M,
508  MatrixTransposeType trans,
509  const CuVectorBase<Real> &v,
510  const Real beta) {
511  KALDI_ASSERT((trans == kNoTrans && M.NumCols() == v.dim_ && M.NumRows() == dim_) ||
512  (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
513  KALDI_ASSERT(&v != this);
514 #if HAVE_CUDA == 1
515  if (CuDevice::Instantiate().Enabled()) {
516  if (dim_ == 0) return;
517  CuTimer tim;
518 
519  // Everything is backwards in CuBlas. We need to reverse rows, columns,
520  // transpose-ness.
521  CUBLAS_SAFE_CALL(cublas_gemv(GetCublasHandle(),
522  (trans==kTrans? CUBLAS_OP_N:CUBLAS_OP_T),
523  M.NumCols(), M.NumRows(), alpha, M.Data(),
524  M.Stride(), v.Data(), 1, beta, data_, 1));
525 
526  CuDevice::Instantiate().AccuProfile(__func__, tim);
527  } else
528 #endif
529  {
530  Vec().AddMatVec(alpha,M.Mat(),trans,v.Vec(),beta);
531  }
532 }
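// Note on the cuBLAS call above (an illustrative restatement, not from the original
// comments): cuBLAS assumes column-major storage while Kaldi matrices are row-major.
// A row-major M of size NumRows() x NumCols() with leading dimension Stride() has,
// byte for byte, the layout of a column-major NumCols() x NumRows() matrix, i.e. M^T;
// that is why the call swaps the row/column counts and inverts the transpose flag.
//
//   CuMatrix<float> M(20, 30);
//   CuVector<float> x(30), y(20);
//   M.SetRandn(); x.SetRandn();
//   y.AddMatVec(1.0, M, kNoTrans, x, 0.0);  // y = M * x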
533 
534 template<typename Real>
535 void CuVectorBase<Real>::AddSpVec(const Real alpha,
536  const CuSpMatrix<Real> &M,
537  const CuVectorBase<Real> &v,
538  const Real beta) {
539  KALDI_ASSERT(M.NumCols() == v.dim_ && M.NumRows() == dim_);
540  KALDI_ASSERT(&v != this);
541 #if HAVE_CUDA == 1
542  if (CuDevice::Instantiate().Enabled()) {
543  if (dim_ == 0) return;
544  CuTimer tim;
545 
546  // Note: in our opinion the CuSpMatrix represents a lower-triangular matrix, but
547  // in CUBLAS, for some stupid reason, everything is reversed.
548  CUBLAS_SAFE_CALL(cublas_spmv(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, Dim(),
549  alpha, M.Data(), v.Data(), 1, beta, data_, 1));
550 
551  CuDevice::Instantiate().AccuProfile(__func__, tim);
552  } else
553 #endif
554  {
555  Vec().AddSpVec(alpha,M.Mat(),v.Vec(),beta);
556  }
557 }
558 
559 template<typename Real>
560 void CuVectorBase<Real>::AddVecVec(Real alpha, const CuVectorBase<Real> &v,
561  const CuVectorBase<Real> &r, Real beta) {
562  KALDI_ASSERT((dim_ == v.dim_ && dim_ == r.dim_));
563  KALDI_ASSERT(this != &v && this != &r);
564 #if HAVE_CUDA == 1
565  if (CuDevice::Instantiate().Enabled()) {
566  if (dim_ == 0) return;
567  CuTimer tim;
568  int dimBlock(CU1DBLOCK);
569  int dimGrid(n_blocks(dim_,CU1DBLOCK));
570 
571  cuda_add_vec_vec(dimGrid, dimBlock, alpha, data_, v.Data(), r.Data(), beta, dim_);
572  CU_SAFE_CALL(cudaGetLastError());
573  CuDevice::Instantiate().AccuProfile("CuVectorBase::AddVecVec", tim);
574  } else
575 #endif
576  {
577  Vec().AddVecVec(alpha, v.Vec(), r.Vec(), beta);
578  }
579 }
580 
581 
582 template<typename Real>
583 bool CuVectorBase<Real>::ApproxEqual(const CuVectorBase<Real> &other, float tol) const {
584  if (dim_ != other.dim_) KALDI_ERR << "ApproxEqual: size mismatch "
585  << dim_ << " vs. " << other.dim_;
586  KALDI_ASSERT(tol >= 0.0);
587  CuVector<Real> tmp(*this);
588  tmp.AddVec(-1.0, other);
589  BaseFloat tmp_norm = sqrt(VecVec(tmp, tmp)), this_norm = sqrt(VecVec(*this, *this));
590  return tmp_norm <= static_cast<Real>(tol) * this_norm;
591 }
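// Note (illustrative): the test above is asymmetric -- it checks
// ||*this - other|| <= tol * ||*this||, so the norm of *this, not of other, sets the
// scale. A minimal usage sketch, with names that are not from this file:
//
//   CuVector<float> a(10);
//   a.SetRandn();
//   CuVector<float> b(a);
//   KALDI_ASSERT(a.ApproxEqual(b, 0.01));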
592 
593 
594 template<typename Real>
595 void CuVectorBase<Real>::AddDiagMat2(Real alpha, const CuMatrixBase<Real> &M,
596  MatrixTransposeType trans, Real beta) {
597 #if HAVE_CUDA == 1
598  if (CuDevice::Instantiate().Enabled()) {
599  if (dim_ == 0) return;
600  MatrixTransposeType other_trans = (trans == kTrans ? kNoTrans : kTrans);
601  KALDI_ASSERT(dim_ == (trans == kNoTrans ? M.NumRows() : M.NumCols()));
602  this->AddDiagMatMat(alpha, M, trans, M, other_trans, beta);
603  } else
604 #endif
605  {
606  Vec().AddDiagMat2(alpha, M.Mat(), trans, beta);
607  }
608 }
609 
610 template<typename Real>
611 void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
612  MatrixTransposeType transM,
613  const CuMatrixBase<Real> &N,
614  MatrixTransposeType transN, Real beta) {
615 #if HAVE_CUDA == 1
616  if (CuDevice::Instantiate().Enabled()) {
617  CuTimer tim;
618 
619  if (transM != transN) {
620  KALDI_ASSERT(M.NumCols() == N.NumCols());
621  KALDI_ASSERT(M.NumRows() == N.NumRows());
622  if (transM == kNoTrans) {
623  // Case 1: diag(M*N') == sum(M.*N, 2)
624  // 1D grid and 1D block. One block per row of N.
625  // 1D grid expands along the column of N.
626  int dimBlock(CU1DBLOCK);
627  int dimGrid(M.NumRows());
628  cuda_add_diag_mat_mat_MNT(dimGrid, dimBlock, alpha, M.Data(), M.Dim(),
629  N.Data(), N.Stride(), beta, data_);
630  } else {
631  // Case 2: diag(M'*N) == sum(M.*N, 1)
632  // 16x16 or 8x32 2D block for coalesced memory access.
633  // Grid shape is designed as follows,
634  // 1. for small matrices, use 1D grid with only 1 row of 16x16 block,
635  // to avoid multiple kernel launch;
636  // 2. for large enough matrices (no matter thin or fat),
637  // use 1- or 2-D grid so that the grid contains
638  // at least and not much larger than 'kOptNumBlocks' blocks
639  // to fully utilize the GPU;
640  const int32 warpSize = 32;
641  const int32 kOptNumBlocks = 512;
642  const int32 tile_dim =
643  (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ?
644  16 : 32;
645  dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
646  dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x),
647  n_blocks(N.NumRows(), dimBlock.y));
648  dimGrid.y = std::min(dimGrid.y, (kOptNumBlocks - 1) / dimGrid.x + 1);
649  dimGrid.y = tile_dim == 16 ? 1 : dimGrid.y;
650  if (dimGrid.y > 1) {
651  CuMatrix<Real> buf(dimGrid.y, N.NumCols());
652  cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, Real(1), M.Data(),
653  M.Stride(), N.Data(), N.Dim(), Real(0),
654  buf.Data(), buf.Stride());
655  this->AddRowSumMat(alpha, buf, beta);
656  } else {
657  cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, alpha, M.Data(),
658  M.Stride(), N.Data(), N.Dim(), beta, data_,
659  dim_);
660  }
661  }
662  } else {
663  KALDI_ASSERT(M.NumCols() == N.NumRows());
664  KALDI_ASSERT(N.NumCols() == M.NumRows());
665  if (transM == kNoTrans) {
666  // Case 3: diag(M*N) == sum(M'.*N, 1)
667  // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access.
668  // One block per 'tile_dim' columns of N.
669  // 1D grid expands along the row of N.
670  int tile_dim =
671  sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
672  dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
673  dim3 dimGrid(n_blocks(N.NumCols(), tile_dim));
674  cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(),
675  N.Data(), N.Dim(), beta, data_);
676  } else {
677  // Case 4: diag(M'*N') == sum(N'.*M, 1)
678  // Same kernel and config as case 3 except M and N are swapped.
679  int tile_dim =
680  sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
681  dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
682  dim3 dimGrid(n_blocks(M.NumCols(), tile_dim));
683  cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(),
684  M.Data(), M.Dim(), beta, data_);
685  }
686  }
687  CU_SAFE_CALL(cudaGetLastError());
688 
689  CuDevice::Instantiate().AccuProfile(__func__, tim);
690  } else
691 #endif
692  {
693  Vec().AddDiagMatMat(alpha, M.Mat(), transM, N.Mat(), transN, beta);
694  }
695 }
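// Worked identity behind the four cases above (an illustrative restatement): for
// conforming M and N,
//   diag(M * N^T)(i) = sum_j M(i,j) * N(i,j)   (Case 1: transM == kNoTrans, transN == kTrans)
//   diag(M^T * N)(i) = sum_j M(j,i) * N(j,i)   (Case 2)
// so every case reduces to an elementwise product followed by a row or column sum,
// which is what the cuda_add_diag_mat_mat_* kernels compute. A usage sketch (names
// not from this file):
//
//   CuMatrix<float> M(50, 40), N(50, 40);
//   M.SetRandn(); N.SetRandn();
//   CuVector<float> d(50);
//   d.AddDiagMatMat(1.0, M, kNoTrans, N, kTrans, 0.0);  // d = diag(M * N^T)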
696 
697 template<typename Real>
698 void CuVectorBase<Real>::AddTpVec(const Real alpha, const CuTpMatrix<Real> &M,
699  const MatrixTransposeType trans,
700  const CuVectorBase<Real> &v,
701  const Real beta) {
702  KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows());
703 #if HAVE_CUDA == 1
704  if (CuDevice::Instantiate().Enabled()) {
705  if (dim_ == 0) return;
706  CuTimer tim;
707  if (beta == 0.0) {
708  if (&v != this) CopyFromVec(v);
709  MulTp(M, trans);
710  if (alpha != 1.0) Scale(alpha);
711  } else {
712  CuVector<Real> tmp(v);
713  tmp.MulTp(M, trans);
714  if (beta != 1.0) Scale(beta); // *this <-- beta * *this
715  AddVec(alpha, tmp, 1.0); // *this += alpha * M * v
716  }
717  CuDevice::Instantiate().AccuProfile(__func__, tim);
718  } else
719 #endif
720  {
721  Vec().AddTpVec(alpha, M.Mat(), trans, v.Vec(), beta);
722  }
723 }
724 
725 
726 template<typename Real>
727 void CuVectorBase<Real>::MulTp(const CuTpMatrix<Real> &M, const MatrixTransposeType trans) {
728  KALDI_ASSERT(M.NumRows() == dim_);
729 #if HAVE_CUDA == 1
730  if (CuDevice::Instantiate().Enabled()) {
731  if (dim_ == 0) return;
732  CuTimer tim;
733  cublas_tpmv(GetCublasHandle(), (trans==kTrans? CUBLAS_OP_N:CUBLAS_OP_T),
734  M.NumRows(), M.Data(), data_, 1);
735  CuDevice::Instantiate().AccuProfile("CuVectorBase::MulTp", tim);
736  } else
737 #endif
738  {
739  Vec().MulTp(M.Mat(), trans);
740  }
741 }
742 
743 template<typename Real>
744 Real CuVectorBase<Real>::Min() const {
745  Real result = 0.0;
746 #if HAVE_CUDA == 1
747  if (CuDevice::Instantiate().Enabled()) {
748  if (dim_ == 0) { // min of an empty set is infinity.
749  return std::numeric_limits<Real>::infinity();
750  }
751  CuTimer tim;
752 
753  // Small vectors are copied to RAM and reduced on CPU.
754  // The length is chosen by cu-vector-speed-test
755  if (dim_ < 4096) {
756  Vector<Real> ans_cpu(*this);
757  result = ans_cpu.Min();
758  } else {
759  // Use no more than 256 blocks (still too many?)
760  int dimBlock = CU1DBLOCK;
761  int dimGrid = n_blocks(dim_, dimBlock);
762  if (dimGrid > 256) {
763  dimGrid = 256;
764  }
765  CuVector<Real> ans(dimGrid, kUndefined);
766  cuda_vec_min(dimGrid, dimBlock, data_, ans.Data(), dim_, 1);
767  CU_SAFE_CALL(cudaGetLastError());
768  Vector<Real> ans_cpu(ans);
769  result = ans_cpu.Min();
770  }
771 
772  CuDevice::Instantiate().AccuProfile(__func__, tim);
773  } else
774 #endif
775  {
776  result = (this->Vec()).Min();
777  }
778  return result;
779 }
780 
781 template<typename Real>
782 Real CuVectorBase<Real>::Max() const {
783  Real result = 0.0;
784 #if HAVE_CUDA == 1
785  if (CuDevice::Instantiate().Enabled()) {
786  if (dim_ == 0) { // max of an empty set is -infinity.
787  return -std::numeric_limits<Real>::infinity();
788  }
789  CuTimer tim;
790 
791  // Small vectors are copied to RAM and reduced on CPU.
792  // The length is chosen by cu-vector-speed-test
793  if (dim_ < 4096) {
794  Vector<Real> ans_cpu(*this);
795  result = ans_cpu.Max();
796  } else {
797  // Use no more than 256 blocks (still too many?)
798  int dimBlock = CU1DBLOCK;
799  int dimGrid = n_blocks(dim_, dimBlock);
800  if (dimGrid > 256) {
801  dimGrid = 256;
802  }
803  CuVector<Real> ans(dimGrid, kUndefined);
804  cuda_vec_max(dimGrid, dimBlock, data_, ans.Data(), dim_, 1);
805  CU_SAFE_CALL(cudaGetLastError());
806  Vector<Real> ans_cpu(ans);
807  result = ans_cpu.Max();
808  }
809 
810  CuDevice::Instantiate().AccuProfile(__func__, tim);
811  } else
812 #endif
813  {
814  result = (this->Vec()).Max();
815  }
816  return result;
817 }
818 
819 template<typename Real>
820 void CuVectorBase<Real>::ReplaceValue(Real orig, Real changed) {
821 #if HAVE_CUDA == 1
822  if (CuDevice::Instantiate().Enabled()) {
823  if (dim_ == 0) return;
824  CuTimer tim;
825  int dimBlock(CU1DBLOCK);
826  int dimGrid(n_blocks(dim_, CU1DBLOCK));
827  cuda_replace_value(dimGrid, dimBlock, data_, dim_, orig, changed);
828  CU_SAFE_CALL(cudaGetLastError());
829  CuDevice::Instantiate().AccuProfile(__func__, tim);
830  } else
831 #endif
832  {
833  Vec().ReplaceValue(orig, changed);
834  }
835 }
836 
837 template<typename Real>
838 void CuVectorBase<Real>::MulElements(const CuVectorBase<Real> &v) {
839  KALDI_ASSERT(dim_ == v.dim_);
840 #if HAVE_CUDA == 1
841  if (CuDevice::Instantiate().Enabled()) {
842  if (dim_ == 0) return;
843  CuTimer tim;
844  int dimBlock(CU1DBLOCK);
845  int dimGrid(n_blocks(dim_, CU1DBLOCK));
846  cuda_vec_mul_elements(dimGrid, dimBlock, data_, v.Data(), dim_);
847  CU_SAFE_CALL(cudaGetLastError());
848  CuDevice::Instantiate().AccuProfile("CuVectorBase::MulElements", tim);
849  } else
850 #endif
851  {
852  Vec().MulElements(v.Vec());
853  }
854 }
855 
856 template<typename Real>
857 void CuVectorBase<Real>::DivElements(const CuVectorBase<Real> &v) {
858  // this just creates a matrix and calls the matrix version.
859  KALDI_ASSERT(dim_ == v.dim_);
860  CuSubMatrix<Real> this_mat(this->Data(), 1, dim_, dim_),
861  v_mat(v.Data(), 1, dim_, dim_);
862  this_mat.DivElements(v_mat);
863 }
864 
865 
866 
867 template<>
868 template<>
869 void CuVectorBase<double>::CopyFromVec(const CuVectorBase<float> &src) {
870  KALDI_ASSERT(src.Dim() == dim_);
871 #if HAVE_CUDA == 1
872  if (CuDevice::Instantiate().Enabled()) {
873  if (dim_ == 0) return;
874  CuTimer tim;
875  CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, src.Data(), 1, data_, 1));
876  CuDevice::Instantiate().AccuProfile(__func__, tim);
877  } else
878 #endif
879  {
880  Vec().CopyFromVec(src.Vec());
881  }
882 }
883 
884 template<>
885 template<>
886 void CuVectorBase<float>::CopyFromVec(const CuVectorBase<double> &src) {
887  KALDI_ASSERT(src.Dim() == dim_);
888 #if HAVE_CUDA == 1
889  if (CuDevice::Instantiate().Enabled()) {
890  if (dim_ == 0) return;
891  CuTimer tim;
892  CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, src.Data(), 1, data_, 1));
893  CuDevice::Instantiate().AccuProfile(__func__, tim);
894  } else
895 #endif
896  {
897  Vec().CopyFromVec(src.Vec());
898  }
899 }
900 
901 
902 template<typename Real>
903 template<typename OtherReal>
904 void CuVectorBase<Real>::CopyFromVec(const VectorBase<OtherReal> &src) {
905 #if HAVE_CUDA == 1
906  if (CuDevice::Instantiate().Enabled()) {
907  if (sizeof(Real) != sizeof(OtherReal)) {
908  CuVector<OtherReal> temp(dim_, kUndefined);
909  temp.CopyFromVec(src);
910  this->CopyFromVec(temp);
911  } else {
912  KALDI_ASSERT(src.Dim() == dim_);
913  if (dim_ == 0) return;
914  CuTimer tim;
915  CU_SAFE_CALL(cudaMemcpyAsync(data_, src.Data(), src.Dim()*sizeof(Real),
916  cudaMemcpyHostToDevice, cudaStreamPerThread));
917  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
918  CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D", tim);
919  }
920  } else
921  #endif
922  {
923  Vec().CopyFromVec(src);
924  }
925 }
926 // Instantiate the template above.
927 template
928 void CuVectorBase<float>::CopyFromVec(const VectorBase<float> &src);
929 template
930 void CuVectorBase<double>::CopyFromVec(const VectorBase<float> &src);
931 template
932 void CuVectorBase<float>::CopyFromVec(const VectorBase<double> &src);
933 template
934 void CuVectorBase<double>::CopyFromVec(const VectorBase<double> &src);
935 
936 template<typename Real>
937 template<typename OtherReal>
938 void CuVectorBase<Real>::CopyToVec(VectorBase<OtherReal> *dst) const {
939  KALDI_ASSERT(dim_ == dst->Dim());
940 #if HAVE_CUDA == 1
941  if (CuDevice::Instantiate().Enabled()) {
942  if (sizeof(Real) != sizeof(OtherReal)) {
943  CuVector<OtherReal> temp(*this);
944  temp.CopyToVec(dst);
945  } else {
946  if (dim_ == 0) return;
947  CuTimer tim;
948  CU_SAFE_CALL(cudaMemcpyAsync(dst->Data(), this->data_,
949  sizeof(Real) * dim_, cudaMemcpyDeviceToHost,
950  cudaStreamPerThread));
951  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
952  CuDevice::Instantiate().AccuProfile(__func__, tim);
953  }
954  } else
955 #endif
956  {
957  dst->CopyFromVec(this->Vec());
958  }
959 }
960 
961 
962 template<typename Real>
963 void CuVector<Real>::Read(std::istream &is, bool binary) {
964  Vector<Real> temp;
965  temp.Read(is, binary);
966  Destroy();
967  Swap(&temp);
968 }
969 
970 
971 
972 template<typename Real>
973 void CuVector<Real>::Write(std::ostream &os, bool binary) const {
974  Vector<BaseFloat> temp(this->dim_, kUndefined);
975  this->CopyToVec(&temp);
976  temp.Write(os, binary);
977 }
978 
979 
980 template<typename Real>
981 CuVector<Real>::CuVector(const CuVectorBase<Real> &v) {
982  this->Resize(v.Dim());
983  this->CopyFromVec(v);
984 }
985 
986 template<typename Real>
987 CuVector<Real>::CuVector(const VectorBase<Real> &v) {
988  this->Resize(v.dim_);
989  this->CopyFromVec(v);
990 }
991 
992 template<typename Real>
993 void CuVector<Real>::Resize(MatrixIndexT dim, MatrixResizeType t) {
994  KALDI_ASSERT(t == kSetZero || t == kUndefined); // Others not implemented
995  // yet.
996  if (this->dim_ == dim) {
997  this->SetZero();
998  return;
999  }
1000  if (this->dim_ != 0)
1001  this->Destroy();
1002  if (dim == 0) return;
1003 #if HAVE_CUDA == 1
1004  if (CuDevice::Instantiate().Enabled()) {
1005  CuTimer tim;
1006  this->data_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(dim * sizeof(Real)));
1007  this->dim_ = dim;
1008  if (t == kSetZero) this->SetZero();
1009  CuDevice::Instantiate().AccuProfile("CuVector::Resize", tim);
1010  } else
1011 #endif
1012  {
1013  Vector<Real> vec(dim);
1014  this->Swap(&vec);
1015  }
1016 }
1017 
1018 template<typename Real>
1019 void CuVector<Real>::Swap(CuVector<Real> *vec) {
1020  std::swap(this->data_, vec->data_);
1021  std::swap(this->dim_, vec->dim_);
1022 }
1023 
1024 
1025 template<typename Real>
1026 void CuVector<Real>::Swap(Vector<Real> *vec) {
1027 #if HAVE_CUDA == 1
1028  if (CuDevice::Instantiate().Enabled()) {
1029  if (this->dim_ == 0) {
1030  if (vec->dim_ != 0) {
1031  // *this is empty, but vec is nonempty.
1032  Resize(vec->dim_, kUndefined);
1033  this->CopyFromVec(*vec);
1034  vec->Resize(0);
1035  }
1036  // else both are empty.
1037  } else { // *this is nonempty.
1038  if (vec->dim_ != 0) {
1039  // Both *this and *vec are nonempty. Recurse to simpler cases.
1040  // this could be done more efficiently in the case where
1041  // the size does not change.
1042  Vector<Real> temp;
1043  this->Swap(&temp); // now temp is full, *this is empty.
1044  vec->Swap(&temp); // now vec has data from *this, temp has
1045  // data from vec.
1046  Swap(&temp); // copy data in vec to *this, which is now empty.
1047  } else { // *this is full but *vec is empty.
1048  vec->Resize(this->dim_, kUndefined);
1049  this->CopyToVec(vec);
1050  this->Destroy();
1051  }
1052  }
1053  } else
1054 #endif
1055  {
1056  std::swap(vec->data_, this->data_);
1057  std::swap(vec->dim_, this->dim_);
1058  }
1059 }
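// Usage sketch (illustrative): Swap(Vector<Real>*) moves host data into a CuVector
// without explicit copy code at the call site; Read() above relies on exactly this
// pattern. After the call the CuVector owns the data (on the GPU if one is enabled)
// and the host vector is left empty.
//
//   Vector<double> host(1024);
//   host.SetRandn();
//   CuVector<double> dev;
//   dev.Swap(&host);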
1060 
1061 template<typename Real>
1062 void CuVector<Real>::Destroy() {
1063 #if HAVE_CUDA == 1
1064  if (CuDevice::Instantiate().Enabled()) {
1065  if (this->data_ != NULL)
1066  CuDevice::Instantiate().Free(this->data_);
1067  } else
1068 #endif
1069  {
1070  if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_);
1071  }
1072  this->data_ = NULL;
1073  this->dim_ = 0;
1074 }
1075 
1076 
1077 template<typename Real>
1078 void CuVectorBase<Real>::CopyFromVec(const CuVectorBase<Real> &src) {
1079  KALDI_ASSERT(src.Dim() == dim_);
1080 #if HAVE_CUDA == 1
1081  if (CuDevice::Instantiate().Enabled()) {
1082  if (dim_ == 0) return;
1083  CuTimer tim;
1084  CU_SAFE_CALL(
1085  cudaMemcpyAsync(data_, src.data_, src.dim_ * sizeof(Real),
1086  cudaMemcpyDeviceToDevice, cudaStreamPerThread));
1087  CuDevice::Instantiate().AccuProfile(__func__, tim);
1088  } else
1089  #endif
1090  {
1091  memcpy(static_cast<void*>(data_), static_cast<void*>(src.data_),
1092  dim_ * sizeof(Real));
1093  }
1094 }
1095 
1096 
1097 template<typename Real>
1098 void CuVectorBase<Real>::SetZero() {
1099  if (dim_==0 || data_==NULL) return;
1100 #if HAVE_CUDA == 1
1101  if (CuDevice::Instantiate().Enabled()) {
1102  KALDI_ASSERT(dim_>=0);
1103  KALDI_ASSERT(data_!=NULL);
1104  CuTimer tim;
1105  CU_SAFE_CALL(cudaMemsetAsync(data_, 0, dim_*sizeof(Real),
1106  cudaStreamPerThread));
1107  CuDevice::Instantiate().AccuProfile("CuVector::SetZero", tim);
1108  } else
1109 #endif
1110  {
1111  Vec().SetZero();
1112  }
1113 }
1114 
1115 
1116 
1118 template<typename Real>
1119 std::ostream &operator << (std::ostream &out, const CuVectorBase<Real> &vec) {
1120  Vector<Real> temp(vec.Dim());
1121  vec.CopyToVec(&temp);
1122  out << temp;
1123  return out;
1124 }
1125 // Instantiate the above.
1126 template
1127 std::ostream &operator << (std::ostream &out, const CuVectorBase<float> &vec);
1128 template
1129 std::ostream &operator << (std::ostream &out, const CuVectorBase<double> &vec);
1130 
1131 /*
1132  * Methods wrapping the ANSI-C CUDA kernels
1133  */
1134 template<typename Real>
1135 void CuVectorBase<Real>::Set(Real value) {
1136 #if HAVE_CUDA == 1
1137  if (CuDevice::Instantiate().Enabled()) {
1138  CuTimer tim;
1139 
1140  dim3 dimBlock(CU1DBLOCK);
1141  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK));
1142  ::MatrixDim d = { 1, Dim(), Dim() };
1143 
1144  cuda_set_const(dimGrid, dimBlock, data_, value, d);
1145  CU_SAFE_CALL(cudaGetLastError());
1146  CuDevice::Instantiate().AccuProfile(__func__, tim);
1147  } else
1148 #endif
1149  {
1150  Vec().Set(value);
1151  }
1152 }
1153 
1154 
1155 
1156 template<typename Real>
1157 void CuVectorBase<Real>::Add(Real value) {
1158 #if HAVE_CUDA == 1
1159  if (CuDevice::Instantiate().Enabled()) {
1160  CuTimer tim;
1161 
1162  dim3 dimBlock(CU1DBLOCK);
1163  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK));
1164  ::MatrixDim d = { 1, Dim(), Dim() };
1165 
1166  cuda_add(dimGrid, dimBlock, data_, value, d);
1167  CU_SAFE_CALL(cudaGetLastError());
1168  CuDevice::Instantiate().AccuProfile(__func__, tim);
1169  } else
1170 #endif
1171  {
1172  Vec().Add(value);
1173  }
1174 }
1175 
1176 template<typename Real>
1177 void CuVectorBase<Real>::CopyDiagFromPacked(const CuPackedMatrix<Real> &M) {
1178 #if HAVE_CUDA == 1
1179  if (CuDevice::Instantiate().Enabled()) {
1180  KALDI_ASSERT(dim_ == M.NumRows());
1181  if (dim_ == 0) return;
1182  CuTimer tim;
1183  int dimBlock(CU1DBLOCK);
1184  int dimGrid(n_blocks(Dim(), CU1DBLOCK));
1185  cuda_vec_copy_diag_from_packed(dimGrid, dimBlock, data_, M.Data(), dim_);
1186  CU_SAFE_CALL(cudaGetLastError());
1187 
1188  CuDevice::Instantiate().AccuProfile(__func__, tim);
1189  } else
1190 #endif
1191  {
1192  Vec().CopyDiagFromPacked(M.Mat());
1193  }
1194 }
1195 
1196 
1197 template<typename Real>
1198 void CuVectorBase<Real>::CopyDiagFromMat(const CuMatrix<Real> &M) {
1199 #if HAVE_CUDA == 1
1200  if (CuDevice::Instantiate().Enabled()) {
1201  KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
1202  CuTimer tim;
1203  CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, M.Data(), M.Stride() + 1,
1204  data_, 1));
1205 
1206  CuDevice::Instantiate().AccuProfile(__func__, tim);
1207  } else
1208 #endif
1209  {
1210  Vec().CopyDiagFromMat(M.Mat());
1211  }
1212 }
1213 
1214 
1215 template<typename Real>
1216 void CuVectorBase<Real>::Scale(Real value) {
1217  #if HAVE_CUDA == 1
1218  if (CuDevice::Instantiate().Enabled()) {
1219  if (Dim() == 0 ) return;
1220 
1221  CuTimer tim;
1222  dim3 dimBlock(CU1DBLOCK);
1223  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK));
1224  ::MatrixDim d = { 1, Dim(), Dim() };
1225  cuda_scale(dimGrid, dimBlock, data_, value, d);
1226  CU_SAFE_CALL(cudaGetLastError());
1227 
1228  CuDevice::Instantiate().AccuProfile(__func__, tim);
1229  } else
1230  #endif
1231  {
1232  Vec().Scale(value);
1233  }
1234 }
1235 
1236 template<typename Real>
1237 void CuVectorBase<Real>::AddVec(Real alpha, const CuVectorBase<Real> &vec,
1238  Real beta) {
1239  KALDI_ASSERT(vec.Dim() == Dim());
1240 
1241 #if HAVE_CUDA == 1
1242  if (CuDevice::Instantiate().Enabled()) {
1243  CuTimer tim;
1244  int32 dim = this->dim_;
1245  Real *data = this->data_;
1246  const Real *vec_data = vec.data_;
1247  if (beta != 1.0) CU_SAFE_CALL(cuda_scal(GetCublasHandle(), dim, beta, data, 1));
1248  if (alpha != 0.0) CU_SAFE_CALL(cuda_axpy(GetCublasHandle(), dim, alpha, vec_data, 1, data, 1));
1249  CuDevice::Instantiate().AccuProfile(__func__, tim);
1250  } else
1251  #endif
1252  {
1253  if (beta != 1.0) Vec().Scale(beta);
1254  Vec().AddVec(alpha, vec.Vec());
1255  }
1256 }
1257 
1258 
1259 template<typename Real>
1260 template<typename OtherReal>
1261 void CuVectorBase<Real>::AddVec(Real alpha, const CuVectorBase<OtherReal> &vec,
1262  Real beta) {
1263  // We could implement this directly, without using a temporary-- this can
1264  // be done later, when we have time.
1265  CuVector<Real> temp(vec);
1266  this->AddVec(alpha, temp, beta);
1267 }
1268 // instantiate the template above.
1269 template
1270 void CuVectorBase<float>::AddVec(float alpha, const CuVectorBase<double> &vec,
1271  float beta);
1272 template
1273 void CuVectorBase<double>::AddVec(double alpha, const CuVectorBase<float> &vec,
1274  double beta);
1275 
1276 template<typename Real>
1277 void CuVectorBase<Real>::AddRowSumMat(Real alpha, const CuMatrixBase<Real> &mat,
1278  Real beta) {
1279  KALDI_ASSERT(mat.NumCols() == Dim());
1280  if (Dim() == 0)
1281  return;
1282 #if HAVE_CUDA == 1
1283  if (CuDevice::Instantiate().Enabled()) {
1284  CuTimer tim;
1285  cuda_add_row_sum_mat(mat.NumCols(), CU1DBLOCK, Data(), mat.Data(),
1286  mat.Dim(), alpha, beta);
1287  CU_SAFE_CALL(cudaGetLastError());
1288 
1289  CuDevice::Instantiate().AccuProfile(__func__, tim);
1290  } else
1291 #endif
1292  {
1293  Vec().AddRowSumMat(alpha, mat.Mat(), beta);
1294  }
1295 }
1296 
1297 template<typename Real>
1298 void CuVectorBase<Real>::AddColSumMat(Real alpha, const CuMatrixBase<Real> &mat,
1299  Real beta) {
1300 #if HAVE_CUDA == 1
1301  if (CuDevice::Instantiate().Enabled()) {
1302  CuTimer tim;
1303  KALDI_ASSERT(mat.NumRows() == Dim());
1304 
1305  cuda_add_col_sum_mat(mat.NumRows(), CU1DBLOCK, Data(), mat.Data(),
1306  mat.Dim(), alpha, beta);
1307  CU_SAFE_CALL(cudaGetLastError());
1308 
1309  CuDevice::Instantiate().AccuProfile(__func__, tim);
1310  } else
1311 #endif
1312  {
1313  Vec().AddColSumMat(alpha, mat.Mat(), beta);
1314  }
1315 }
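// Note on the naming (illustrative examples, not from this file): AddRowSumMat sums
// over the rows, so the result has NumCols() elements; AddColSumMat sums over the
// columns, so the result has NumRows() elements.
//
//   CuMatrix<float> m(6, 4);
//   m.SetRandn();
//   CuVector<float> col_sums(4), row_sums(6);
//   col_sums.AddRowSumMat(1.0, m, 0.0);  // col_sums(j) = sum_i m(i, j)
//   row_sums.AddColSumMat(1.0, m, 0.0);  // row_sums(i) = sum_j m(i, j)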
1316 
1317 template<typename Real>
1318 void CuVectorBase<Real>::InvertElements() {
1319 #if HAVE_CUDA == 1
1320  if (CuDevice::Instantiate().Enabled()) {
1321  CuTimer tim;
1322 
1323  dim3 dimBlock(CU1DBLOCK, 1);
1324  dim3 dimGrid(n_blocks(dim_, CU1DBLOCK));
1325  MatrixDim d = {1, dim_, dim_};
1326 
1327  cuda_invert_elements(dimGrid, dimBlock, data_, d);
1328  CU_SAFE_CALL(cudaGetLastError());
1329 
1330  CuDevice::Instantiate().AccuProfile(__func__, tim);
1331  } else
1332 #endif
1333  {
1334  Vec().InvertElements();
1335  }
1336 }
1337 
1338 
1339 template<typename Real>
1340 void CuVectorBase<Real>::CopyElements(const CuMatrixBase<Real> &mat,
1341  const MatrixTransposeType trans,
1342  const CuArrayBase<int32> &elements) {
1343  KALDI_ASSERT(elements.Dim() == Dim());
1344 #if HAVE_CUDA == 1
1345  if (CuDevice::Instantiate().Enabled()) {
1346  CuTimer tim;
1347 
1348  dim3 dimBlock(CU1DBLOCK);
1349  dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK));
1350 
1351  cuda_vector_copy_elements(dimGrid, dimBlock, this->data_, Dim(),
1352  mat.Data(), mat.Stride(), trans == kTrans,
1353  elements.Data());
1354  CU_SAFE_CALL(cudaGetLastError());
1355  CuDevice::Instantiate().AccuProfile(__func__, tim);
1356  } else
1357 #endif
1358  {
1359  VectorBase<Real> &this_vec = this->Vec();
1360  const MatrixBase<Real> &src_mat = mat.Mat();
1361  const int32* index_map = elements.Data();
1362  KALDI_ASSERT((Dim() == mat.NumRows() && trans == kNoTrans)
1363  || (Dim() == mat.NumCols() && trans == kTrans));
1364  for (int32 i = 0; i < Dim(); i++) {
1365  int32 j = index_map[i];
1366  KALDI_ASSERT(j >= 0);
1367  if (trans == kNoTrans) {
1368  KALDI_ASSERT(j < mat.NumCols());
1369  this_vec(i) = src_mat(i, j);
1370  } else {
1371  KALDI_ASSERT(j < mat.NumRows());
1372  this_vec(i) = src_mat(j, i);
1373  }
1374  }
1375  }
1376 }
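// Usage sketch (illustrative; the index values and names are made up): with
// trans == kNoTrans the call gathers one element per row, out(i) = mat(i, elements[i]),
// mirroring the CPU fallback loop above.
//
//   CuMatrix<float> mat(5, 8);
//   mat.SetRandn();
//   std::vector<int32> idx = {0, 3, 7, 2, 5};
//   CuArray<int32> elements(idx);
//   CuVector<float> out(5);
//   out.CopyElements(mat, kNoTrans, elements);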
1377 
1378 
1379 
1380 template
1381 void CuVectorBase<float>::CopyToVec(VectorBase<float> *dst) const;
1382 template
1383 void CuVectorBase<double>::CopyToVec(VectorBase<float> *dst) const;
1384 template
1385 void CuVectorBase<float>::CopyToVec(VectorBase<double> *dst) const;
1386 template
1387 void CuVectorBase<double>::CopyToVec(VectorBase<double> *dst) const;
1388 
1389 template class CuVectorBase<float>;
1390 template class CuVectorBase<double>;
1391 
1392 template class CuVector<float>;
1393 template class CuVector<double>;
1394 
1395 } // namespace