doc/cu-math_8cc_source.html

 // cudamatrix/cu-math.cc

 // Copyright 2009-2012  Karel Vesely
 //                      Johns Hopkins University (author: Daniel Povey)

 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //  http://www.apache.org/licenses/LICENSE-2.0
 //
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.

 #include "base/timer.h"
 #include "cudamatrix/cu-common.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-device.h"
 #include "cudamatrix/cu-kernels.h"

 namespace kaldi {

 namespace cu {

 /*
  * templated functions wrapping the ANSI-C CUDA kernel functions
  */


 template<typename Real>
 void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
   KALDI_ASSERT(SameDim(*weight, *grad));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;

     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));

     cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
                        weight->Dim(), grad->Stride());
     CU_SAFE_CALL(cudaGetLastError());

     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     MatrixBase<Real> &weight2 = weight->Mat();
     MatrixBase<Real> &grad2 = grad->Mat();
     for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
       for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {

         if(weight2(r,c)==0.0) continue; // skip L1 if zero weightght!

         Real l1_signed = l1;
         if (weight2(r, c) < 0.0)
           l1_signed = -l1;

         Real before = weight2(r, c);
         Real after = weight2(r, c) - lr*grad2(r, c) - l1_signed;
         if ((after > 0.0) ^ (before > 0.0)) {
           weight2(r, c) = 0.0;
           grad2(r, c) = 0.0;
         } else {
           weight2(r, c) -= l1_signed;
         }
       }
     }
   }
 }


 template<typename Real>
 void Randomize(const CuMatrixBase<Real> &src,
                const CuArray<int32> &copy_from_idx,
                CuMatrixBase<Real> *tgt) {

   KALDI_ASSERT(src.NumCols() == tgt->NumCols());
   KALDI_ASSERT(src.NumRows() == tgt->NumRows());
   KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());

   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;

     /*
     Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
     */

     /*
      * Let's use blocksize 4 x 128 (512 threads/block)
      * and extend the randomizable matrices to: col 4*65535, row 128*65535
      * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
      */
     dim3 dimBlock(4, 128);
     dim3 dimGrid(n_blocks(tgt->NumCols(), 4), n_blocks(copy_from_idx.Dim(), 128));
     /*
      */

     MatrixDim dimsrc = src.Dim(); dimsrc.rows=copy_from_idx.Dim();
     MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();

     cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
                    copy_from_idx.Data(), dimtgt, dimsrc);
     CU_SAFE_CALL(cudaGetLastError());

     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     // randomize in CPU
     const MatrixBase<Real> &srcmat = src.Mat();
     const int32 *copy_from_idxvec = copy_from_idx.Data();
     MatrixBase<Real> &tgtmat = tgt->Mat();
     for(int32 i=0; i<copy_from_idx.Dim(); i++) {
       tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
     }
   }
 }


 template<typename Real>
 void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
             CuMatrixBase<Real> *tgt) {

   KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
   KALDI_ASSERT(src.NumRows() == tgt->NumRows());

   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;

     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));

     cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
                 frame_offsets.Data(), tgt->Dim(), src.Dim());
     CU_SAFE_CALL(cudaGetLastError());

     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     // expand in CPU
     const MatrixBase<Real> &srcmat = src.Mat();
     const int32 *frame_offsetvec = frame_offsets.Data();
     int32 dim = frame_offsets.Dim();
     MatrixBase<Real> &tgtmat = tgt->Mat();
     //
     for(int32 r=0; r < tgtmat.NumRows(); r++) {
       for(int32 off=0; off < dim; off++) {
         int32 r_off = r + frame_offsetvec[off];
         if(r_off < 0) r_off = 0;
         if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;
         memcpy(tgtmat.RowData(r)+off*srcmat.NumCols(),srcmat.RowData(r_off),sizeof(Real)*srcmat.NumCols());
       }
     }
   }
 }


 template<typename Real>
 void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices,
           CuMatrixBase<Real> *tgt) {

   KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
   KALDI_ASSERT(src.NumRows() == tgt->NumRows());

   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;

     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));

     cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
               copy_from_indices.Data(), tgt->Dim(), src.Dim());
     CU_SAFE_CALL(cudaGetLastError());

     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     // expand in CPU
     const MatrixBase<Real> &srcmat = src.Mat();
     const int32 *copy_from_indicesvec = copy_from_indices.Data();
     int32 dim = copy_from_indices.Dim();
     MatrixBase<Real> &tgtmat = tgt->Mat();
     //
     for(int32 r = 0; r < tgtmat.NumRows(); r++) {
       for(int32 c = 0; c < dim; c++) {
         tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
       }
     }
   }
 }

 template <typename Real>
 void EnsureNonzero(const CuMatrixBase<Real> &src,
                    Real epsilon,
                    CuMatrixBase<Real> *dest) {
   KALDI_ASSERT(SameDim(*dest, src) && epsilon > 0.0);
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(src.NumRows(), src.NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_ensure_nonzero(dimGrid, dimBlock, src.Data(), src.Dim(),
                         epsilon, dest->Stride(), dest->Data());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     int32 num_rows = src.NumRows(), num_cols = src.NumCols();
     for (int32 r = 0; r < num_rows; r++) {
       const Real *src_data = src.RowData(r);
       Real *dest_data = dest->RowData(r);
       for (int32 c = 0; c < num_cols; c++) {
         Real x = src_data[c], y;
         if (x <= -epsilon || x >= epsilon) y = x;
         else if (x >= 0.0) y = epsilon;
         else y = -epsilon;
         dest_data[c] = y;
       }
     }
   }
 }


 // instantiate the templates.
 template
 void RegularizeL1(CuMatrixBase<float> *weight, CuMatrixBase<float> *grad, float l1, float lr);
 template
 void RegularizeL1(CuMatrixBase<double> *weight, CuMatrixBase<double> *grad, double l1, double lr);

 template
 void Splice(const CuMatrixBase<float> &src, const CuArray<int32> &frame_offsets,
             CuMatrixBase<float> *tgt);
 template
 void Splice(const CuMatrixBase<double> &src, const CuArray<int32> &frame_offsets,
             CuMatrixBase<double> *tgt);
 template
 void Copy(const CuMatrixBase<float> &src, const CuArray<int32> &copy_from_indices,
           CuMatrixBase<float> *tgt);
 template
 void Copy(const CuMatrixBase<double> &src, const CuArray<int32> &copy_from_indices,
           CuMatrixBase<double> *tgt);

 template
 void Randomize(const CuMatrixBase<float> &src,
                const CuArray<int32> &copy_from_idx,
                CuMatrixBase<float> *tgt);
 template
 void Randomize(const CuMatrixBase<double> &src,
                const CuArray<int32> &copy_from_idx,
                CuMatrixBase<double> *tgt);

 // The output y_i = scale * x_i,
 // and we want to RMS value of the y_i to equal target_rms,
 // so y^t y = D * target_rms^2 (if y is one row of the input).
 // we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)).
 // there is also flooring involved, to avoid division-by-zero
 // problems.  It's important for the backprop, that the floor's
 // square root is exactly representable as float.
 // If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D)))
 // is an extra dimension of the output.
 template<typename Real>
 void NormalizePerRow(const CuMatrixBase<Real>& in, const Real target_rms,
                      const bool add_log_stddev, CuMatrixBase<Real>* out) {
   const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
   if (add_log_stddev) {
     KALDI_ASSERT(in.NumRows() == out->NumRows());
     KALDI_ASSERT(in.NumCols() + 1 == out->NumCols());
   } else {
     KALDI_ASSERT(SameDim(in, *out));
   }

 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     size_t dimBlock = CU1DBLOCK;
     size_t dimGrid = out->NumRows();
     cuda_normalize_per_row(dimGrid, dimBlock, out->Data(), out->Stride(),
                            in.Data(), in.Dim(), target_rms, add_log_stddev);
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     CuSubMatrix<Real> out_no_log(*out, 0, out->NumRows(), 0, in.NumCols());
     if (in.Data() != out_no_log.Data())
       out_no_log.CopyFromMat(in);
     CuVector<Real> in_norm(in.NumRows());
     Real d_scaled = in.NumCols() * target_rms * target_rms;
     in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
     in_norm.ApplyFloor(kSquaredNormFloor);
     in_norm.ApplyPow(-0.5);
     out_no_log.MulRowsVec(in_norm);
     if (add_log_stddev) {
       in_norm.ApplyLog();
       in_norm.Scale(-1.0);
       in_norm.Add(log(target_rms));
       out->CopyColFromVec(in_norm, in.NumCols());
     }
   }
 }

 template
 void NormalizePerRow(const CuMatrixBase<float>& in, const float target_rms,
                      const bool add_log_stddev, CuMatrixBase<float>* out);
 template
 void NormalizePerRow(const CuMatrixBase<double>& in, const double target_rms,
                      const bool add_log_stddev, CuMatrixBase<double>* out);


 // A note on the derivative of NormalizeComponent...
 // let both row_in and row_out be vectors of dimension D.
 // Let p = row_in^T row_in / (D * target_rms^2), and let
 // f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as:
 // row_out = f row_in.
 // Suppose we have a quantity deriv_out which is the derivative
 // of the objective function w.r.t. row_out.  We want to compute
 // deriv_in which is the derivative of the objective function w.r.t.
 // row_in.  Let the objective function be F.  One term is obvious: we have
 // deriv_in = f deriv_out + ....
 // next we have to take into account the derivative that gets back-propagated
 // through f.  Obviously, dF/df = deriv_out^T row_in.
 // And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3),
 // and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued].
 // So this term in dF/d(row_in) equals:
 // dF/df df/dp dp/d(row_in)   =    2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor)  ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in
 // So
 // deriv_in = f deriv_out + (f == 1.0 ? 0.0 : -f^3  / (D * target_rms^2) ) (deriv_out^T row_in) row_in
 //  if add_log_stddev_ true, the deriv_in has another term as
 // dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x)
 template<typename Real>
 void DiffNormalizePerRow(const CuMatrixBase<Real> &in_value,
                          const CuMatrixBase<Real> &out_deriv,
                          const Real target_rms, const bool add_log_stddev,
                          CuMatrixBase<Real>* in_deriv) {
   const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     size_t dimBlock = CU1DBLOCK;
     size_t dimGrid = in_deriv->NumRows();
     cuda_diff_normalize_per_row(dimGrid, dimBlock, in_deriv->Data(),
                                 in_deriv->Stride(), in_value.Data(),
                                 in_value.Dim(), out_deriv.Data(),
                                 out_deriv.Stride(), target_rms, add_log_stddev);
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     const CuSubMatrix<Real> out_deriv_no_log(out_deriv, 0, out_deriv.NumRows(),
                                              0, in_value.NumCols());
     CuVector<Real> dot_products(out_deriv.NumRows());
     dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value,
                                kTrans, 0.0);
     CuVector<Real> in_norm(in_value.NumRows());
     Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
     in_norm.AddDiagMat2(1.0, in_value, kNoTrans, 0.0);

     if (add_log_stddev) {
       CuVector<Real> log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1
       out_deriv_for_stddev(out_deriv.NumRows(), kUndefined);
       // f = log(sqrt(max(epsi, x^T x / D)))
       // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x  : 0.
       // we don't compute this exactly below for the case when x^2 x is very
       // small, but we do make sure that the deriv isn't infinity when the input
       // is zero.
       log_stddev_deriv.ApplyFloor(in_value.NumCols() * kSquaredNormFloor);
       log_stddev_deriv.ApplyPow(-1.0);
       out_deriv_for_stddev.CopyColFromMat(out_deriv, (out_deriv.NumCols() - 1));
       log_stddev_deriv.MulElements(out_deriv_for_stddev);
       if (in_deriv)
         in_deriv->AddDiagVecMat(1.0, log_stddev_deriv, in_value, kNoTrans, 1.0);
     }
     in_norm.Scale(1.0 / d_scaled);
     in_norm.ApplyFloor(kSquaredNormFloor);
     in_norm.ApplyPow(-0.5);
     if (in_deriv) {
       if (in_deriv->Data() != out_deriv_no_log.Data())
         in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans, 1.0);
       else
         in_deriv->MulRowsVec(in_norm);
       in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
       in_norm.ApplyPow(3.0);
       dot_products.MulElements(in_norm);

       in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value, kNoTrans,
                               1.0);
     }
   }
 }

 template
 void DiffNormalizePerRow(const CuMatrixBase<float> &in_value,
                          const CuMatrixBase<float> &out_deriv,
                          const float target_rms, const bool add_log_stddev,
                          CuMatrixBase<float>* in_deriv);
 template
 void DiffNormalizePerRow(const CuMatrixBase<double> &in_value,
                          const CuMatrixBase<double> &out_deriv,
                          const double target_rms, const bool add_log_stddev,
                          CuMatrixBase<double>* in_deriv);


 // not calling this Sigmoid to reduce the chance of future collisions.
 template<typename Real>
 static inline Real ScalarSigmoid(Real a) {
   if (a > Real(0)) {
     return Real(1) / (Real(1) + Exp(-a));
   } else {
     Real x = Exp(a);
     return x / (x + Real(1));
   }
 }

 template<typename Real>
 static inline Real ScalarTanh(Real a) {
   if (a > Real(0)) {
     Real inv_expa = Exp(-a);
     return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa);
   } else {
     Real expa = Exp(a);
     return Real(1) - Real(2) / (Real(1) + expa * expa);
   }
 }

 template<typename Real>
 void CpuComputeLstmNonlinearity(const MatrixBase<Real> &input_mat,
                                 const MatrixBase<Real> &params_mat,
                                 MatrixBase<Real> *output) {
   int32 num_rows = input_mat.NumRows(),
       input_cols = input_mat.NumCols(),
         cell_dim = input_cols / 5;
   KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
   KALDI_ASSERT(output->NumRows() == num_rows);
   KALDI_ASSERT(params_mat.NumRows() == 3);
   KALDI_ASSERT(params_mat.NumCols() == cell_dim);
   KALDI_ASSERT(output->NumCols() == 2 * cell_dim);

   MatrixBase<Real> &output_mat = *output;
   const Real *params_data = params_mat.Data();
   int32 params_stride = params_mat.Stride();
   for (int32 r = 0; r < num_rows; r++) {
     const Real *input_row = input_mat.RowData(r);
     // i_scale and f_scale relate to dropout, they will normally be 1.0.
     Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]),
          f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]),
          o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]);

     Real *output_row = output_mat.RowData(r);
     for (int32 c = 0; c < cell_dim; c++) {
       Real i_part = input_row[c];
       Real f_part = input_row[c + cell_dim];
       Real c_part = input_row[c + 2 * cell_dim];
       Real o_part = input_row[c + 3 * cell_dim];
       Real c_prev = input_row[c + 4 * cell_dim];
       Real w_ic = params_data[c];
       Real w_fc = params_data[c + params_stride];
       Real w_oc = params_data[c + params_stride * 2];
       Real i_t = ScalarSigmoid(i_part + w_ic * c_prev);
       Real f_t = ScalarSigmoid(f_part + w_fc * c_prev);
       Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part);
       Real o_t = ScalarSigmoid(o_part + w_oc * c_t);
       Real m_t = o_t * o_scale * ScalarTanh(c_t);
       output_row[c] = c_t;
       output_row[c + cell_dim] = m_t;
     }
   }
 }

 template<typename Real>
 void ComputeLstmNonlinearity(const CuMatrixBase<Real> &input,
                              const CuMatrixBase<Real> &params,
                              CuMatrixBase<Real> *output) {
   int32 num_rows = input.NumRows(),
       input_cols = input.NumCols(),
         cell_dim = input_cols / 5;
   KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
   KALDI_ASSERT(output->NumRows() == num_rows);
   KALDI_ASSERT(params.NumRows() == 3);
   KALDI_ASSERT(params.NumCols() == cell_dim);
   KALDI_ASSERT(output->NumCols() == 2 * cell_dim);

 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;

     int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);

     // Each thread block is working on 1 row of the data.
     // It's best that cell dim is a multiple fo CU1DBLOCK
     dim3 dimBlock(CU1DBLOCK);
     dim3 dimGrid(num_rows);

     cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(),
                            params.Data(), params.Stride(), output->Stride(),
                            cell_dim, have_dropout_mask, num_rows, output->Data());
     CU_SAFE_CALL(cudaGetLastError());

     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     CpuComputeLstmNonlinearity(input.Mat(), params.Mat(), &output->Mat());
   }
 }

 template
 void CpuComputeLstmNonlinearity(const MatrixBase<float> &input_mat,
                                 const MatrixBase<float> &params_mat,
                                 MatrixBase<float> *output);
 template
 void CpuComputeLstmNonlinearity(const MatrixBase<double> &input_mat,
                                 const MatrixBase<double> &params_mat,
                                 MatrixBase<double> *output);
 template
 void ComputeLstmNonlinearity(const CuMatrixBase<float> &input,
                              const CuMatrixBase<float> &params,
                              CuMatrixBase<float> *output);
 template
 void ComputeLstmNonlinearity(const CuMatrixBase<double> &input,
                              const CuMatrixBase<double> &params,
                              CuMatrixBase<double> *output);

 template<typename Real>
 void CpuBackpropLstmNonlinearity(const MatrixBase<Real> &input,
                                  const MatrixBase<Real> &params,
                                  const MatrixBase<Real> &output_deriv,
                                  const MatrixBase<double> &deriv_sum_in,
                                  const VectorBase<Real> &self_repair_config,
                                  double count_in,
                                  MatrixBase<Real> *input_deriv,
                                  MatrixBase<Real> *params_deriv,
                                  MatrixBase<double> *value_sum_out,
                                  MatrixBase<double> *deriv_sum_out,
                                  MatrixBase<Real> *self_repair_sum_out) {
   int32 num_rows = input.NumRows(),
       input_cols = input
                    .NumCols(),
         cell_dim = input.NumCols() / 5;
   // Check dimensions.
   KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
   KALDI_ASSERT(params.NumRows() == 3);
   KALDI_ASSERT(params.NumCols() == cell_dim);
   KALDI_ASSERT(output_deriv.NumRows() == num_rows);
   KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
   KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
   KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
   KALDI_ASSERT(self_repair_config.Dim() == 10);
   if (input_deriv != NULL) {
     KALDI_ASSERT(SameDim(input, *input_deriv));
   }
   if (params_deriv == NULL) {
     KALDI_ASSERT(value_sum_out == NULL);
     KALDI_ASSERT(deriv_sum_out == NULL);
     KALDI_ASSERT(self_repair_sum_out == NULL);
   } else {
     KALDI_ASSERT(value_sum_out != NULL);
     KALDI_ASSERT(deriv_sum_out != NULL);
     KALDI_ASSERT(self_repair_sum_out != NULL);
     KALDI_ASSERT(SameDim(params, *params_deriv));
     KALDI_ASSERT(value_sum_out->NumRows() == 5);
     KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
     KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
     KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
     KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
   }

   const MatrixBase<Real> &input_mat = input;
   const MatrixBase<Real> &params_mat = params;
   const MatrixBase<Real> &output_deriv_mat = output_deriv;
   const MatrixBase<double> &deriv_sum_in_mat = deriv_sum_in;
   const VectorBase<Real> &sr_config = self_repair_config;
   MatrixBase<Real> *input_deriv_mat = (
       input_deriv == NULL ? NULL : input_deriv);
   MatrixBase<Real> *params_deriv_mat = NULL;
   MatrixBase<Real> *self_repair_sum_out_mat = NULL;
   MatrixBase<double> *value_sum_out_mat = NULL;
   MatrixBase<double> *deriv_sum_out_mat = NULL;
   if (params_deriv != NULL) {
     params_deriv_mat = params_deriv;
     value_sum_out_mat = value_sum_out;
     deriv_sum_out_mat = deriv_sum_out;
     self_repair_sum_out_mat = self_repair_sum_out;
   }


   // We add 1.0 (i.e. a small value) to the count to avoid division by zero.
   Real count = 1.0 + count_in;
   for (int32 c = 0; c < cell_dim; c++) {
     // parameters
     Real w_ic = params_mat(0, c);
     Real w_fc = params_mat(1, c);
     Real w_oc = params_mat(2, c);
     // derivative sums w.r.t. parameters.
     Real w_ic_deriv_sum = 0.0;
     Real w_fc_deriv_sum = 0.0;
     Real w_oc_deriv_sum = 0.0;

     // average derivatives, for self-repair.
     // The 5 nonlinearities that are subject to self-repair are written as:
     //  Sigmoid(i_t_input), Sigmoid(f_t_input),
     //  Tanh(c_part), Sigmoid(o_t_input),  Tanh(c_t)
     Real i_t_self_repair = (
         deriv_sum_in_mat(0, c) / count < sr_config(0) ? sr_config(5) : 0.0);
     Real f_t_self_repair = (
         deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0);
     Real c_part_self_repair = (
         deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0);
     Real o_t_self_repair = (
         deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0);
     Real c_t_self_repair = (
         deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0);
     // Note on how we add self-repair for sigmoids/tanh's.  If self-repair
     // is activated for this unit, then...
     // For sigmoids we'd add -self_repair_scale * (2 * sigmoid(x) - 1.0)
     // ... to the input-deriv;
     // For tanh's we'd add -self_repair_scale * tanh(x)
     // If self-repair is not activated, the 'self_repair' scales are set to zero.

     // The following variables are for the accumulation of stats on the
     // sigmoid and tanh units.
     Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0;
     Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0;
     Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0;
     Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0;
     Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0;


     for (int32 r = 0; r < num_rows; r++) {
       Real i_part = input_mat(r, c),
           f_part = input_mat(r, c + cell_dim),
           c_part = input_mat(r, c + 2 * cell_dim),
           o_part = input_mat(r, c + 3 * cell_dim),
           c_prev = input_mat(r, c + 4 * cell_dim);

       Real i_scale = (input_cols == cell_dim * 5 ? 1.0 :
                       input_mat(r, cell_dim * 5)),
            f_scale = (input_cols == cell_dim * 5 ? 1.0 :
                       input_mat(r, cell_dim * 5 + 1)),
            o_scale = (input_cols == cell_dim * 5 ? 1.0 :
                       input_mat(r, cell_dim * 5 + 2));

       // For greater clarity, we give some of the quantities in the
       // forward equations their own names.
       Real i_t_input = i_part + w_ic * c_prev,
           i_t = ScalarSigmoid(i_t_input),
           f_t_input = f_part + w_fc * c_prev,
           f_t = ScalarSigmoid(f_t_input),
           tanh_c_part = ScalarTanh(c_part),
           c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part,
           o_t_input = o_part + w_oc * c_t,
           o_t = ScalarSigmoid(o_t_input),
           tanh_c_t = ScalarTanh(c_t);
       // we'd also compute, in the forward pass,
       //   m_t = o_t * tanh_c_t;
       // but this variable is not needed.

       // Accumulate nonlinearity value and derivative stats.
       // Note:
       //    tanh'(x)  = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)
       //  sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
       i_t_value_sum += i_t;
       i_t_deriv_sum += i_t * (1.0F - i_t);
       f_t_value_sum += f_t;
       f_t_deriv_sum += f_t * (1.0F - f_t);
       c_part_value_sum += tanh_c_part;
       c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part;
       o_t_value_sum += o_t;
       o_t_deriv_sum += o_t * (1.0F - o_t);
       c_t_value_sum += tanh_c_t;
       c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t;


       // the derivative of the objective function w.r.t. a particular quantity
       // will be written by prepending "d" to the name.
       // We compute these derivatives in the reverse of the order in which
       // we computed the original quantities.
       // dc_t_out is the part of the derivative w.r.t. c_t that
       // comes directly from the output of this function.
       Real dc_t_out = output_deriv_mat(r, c);
       Real dm_t = output_deriv_mat(r, c + cell_dim);
       Real dtanh_c_t = o_t * o_scale * dm_t;
       Real do_t = o_scale * tanh_c_t * dm_t;
       Real do_t_input = (o_t * (1.0F - o_t) * do_t
           - (2.0F * o_t - 1.0F) * o_t_self_repair);
       Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out
           + do_t_input * w_oc) - tanh_c_t * c_t_self_repair;
       Real dtanh_c_part = i_t * i_scale * dc_t;
       Real df_t = dc_t * f_scale * c_prev;
       Real df_t_input = ((df_t * f_t * (1.0F - f_t)
                           - (2.0F * f_t - 1.0F) * f_t_self_repair));
       Real di_t = dc_t * i_scale * tanh_c_part;
       Real di_t_input = ((di_t * i_t * (1.0F - i_t)
                           - (2.0F * i_t - 1.0F) * i_t_self_repair));

       w_ic_deriv_sum += c_prev * di_t_input;
       w_fc_deriv_sum += c_prev * df_t_input;
       w_oc_deriv_sum += c_t * do_t_input;

       Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t;
       Real do_part = do_t_input;
       Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part
           - tanh_c_part * c_part_self_repair);
       Real df_part = df_t_input;
       Real di_part = di_t_input;

       if (input_deriv_mat != NULL) {
         (*input_deriv_mat)(r, c) = di_part;
         (*input_deriv_mat)(r, c + cell_dim) = df_part;
         (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part;
         (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part;
         (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev;
       }
     }

     if (params_deriv != NULL) {
       // note: for optimizing things you can assume that params_deriv and
       // input_deriv_mat are non-NULL (i.e. all the output matrices are
       // non-NULL).  The situations when some of the output matrices are NULL
       // does not happen often (mainly only in testing code).

       (*params_deriv_mat)(0, c) = w_ic_deriv_sum;
       (*params_deriv_mat)(1, c) = w_fc_deriv_sum;
       (*params_deriv_mat)(2, c) = w_oc_deriv_sum;

       (*value_sum_out_mat)(0, c) += i_t_value_sum;
       (*value_sum_out_mat)(1, c) += f_t_value_sum;
       (*value_sum_out_mat)(2, c) += c_part_value_sum;
       (*value_sum_out_mat)(3, c) += o_t_value_sum;
       (*value_sum_out_mat)(4, c) += c_t_value_sum;

       // need to update self_repair_sum_out before deriv_sum_out, because
       // deriv_sum_out and deriv_sum_in might point to the same memory.
       for (int32 i = 0; i < 5; i++)
         (*self_repair_sum_out_mat)(i, c) =
             (deriv_sum_in_mat(i, c) / count < sr_config(i) ? num_rows : 0);

       (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum;
       (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum;
       (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum;
       (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum;
       (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum;
     }
   }
 }


 template<typename Real>
 void BackpropLstmNonlinearity(const CuMatrixBase<Real> &input,
                               const CuMatrixBase<Real> &params,
                               const CuMatrixBase<Real> &output_deriv,
                               const CuMatrixBase<double> &deriv_sum_in,
                               const CuVectorBase<Real> &self_repair_config,
                               double count_in,
                               CuMatrixBase<Real> *input_deriv,
                               CuMatrixBase<Real> *params_deriv,
                               CuMatrixBase<double> *value_sum_out,
                               CuMatrixBase<double> *deriv_sum_out,
                               CuMatrixBase<Real> *self_repair_sum_out) {
   int32 num_rows = input.NumRows(),
         cell_dim = input.NumCols() / 5,
       input_cols = input.NumCols();
   // Check dimensions.
   KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3);
   KALDI_ASSERT(params.NumRows() == 3);
   KALDI_ASSERT(params.NumCols() == cell_dim);
   KALDI_ASSERT(output_deriv.NumRows() == num_rows);
   KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
   KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
   KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
   KALDI_ASSERT(self_repair_config.Dim() == 10);
   if (input_deriv != NULL) {
     KALDI_ASSERT(SameDim(input, *input_deriv));
   }
   if (params_deriv == NULL) {
     KALDI_ASSERT(value_sum_out == NULL);
     KALDI_ASSERT(deriv_sum_out == NULL);
     KALDI_ASSERT(self_repair_sum_out == NULL);
   } else {
     KALDI_ASSERT(value_sum_out != NULL);
     KALDI_ASSERT(deriv_sum_out != NULL);
     KALDI_ASSERT(self_repair_sum_out != NULL);
     KALDI_ASSERT(SameDim(params, *params_deriv));
     KALDI_ASSERT(value_sum_out->NumRows() == 5);
     KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
     KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
     KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
     KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
   }


 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     // Each thread block is working on 1 row of the data.
     // It's best that cell dim is a multiple fo CU1DBLOCK

     int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);

     // Use 2D block (8x32 threads) as we need to compute column sum.
     // Use 1D grid to cover the data matrix width `cell_dim`.
     const int kWarpSize = 32;
     dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
 //    dim3 dimGrid(n_blocks(cell_dim, dimBlock.x),
 //                 n_blocks(num_rows, dimBlock.y));
 //    if (dimGrid.x * dimGrid.y > 1024) {
 //      dimGrid.y = std::max(1024 / dimGrid.x, 1);
 //    }
     dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
     if (input_deriv == NULL) {
       if (params_deriv == NULL) {
         cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
                                     have_dropout_mask, num_rows,
                                     input.Data(), input.Stride(), params.Data(),
                                     params.Stride(), output_deriv.Data(),
                                     output_deriv.Stride(), deriv_sum_in.Data(),
                                     deriv_sum_in.Stride(),
                                     self_repair_config.Data(), count_in + 1,
                                     NULL,
                                     0,
                                     NULL,
                                     0,
                                     NULL,
                                     0,
                                     NULL,
                                     0,
                                     NULL,
                                     0);

       } else {
         cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
                                     have_dropout_mask, num_rows,
                                     input.Data(), input.Stride(), params.Data(),
                                     params.Stride(), output_deriv.Data(),
                                     output_deriv.Stride(), deriv_sum_in.Data(),
                                     deriv_sum_in.Stride(),
                                     self_repair_config.Data(), count_in + 1,
                                     NULL,
                                     0, params_deriv->Data(),
                                     params_deriv->Stride(),
                                     value_sum_out->Data(),
                                     value_sum_out->Stride(),
                                     deriv_sum_out->Data(),
                                     deriv_sum_out->Stride(),
                                     self_repair_sum_out->Data(),
                                     self_repair_sum_out->Stride());
       }
     } else {
       if (params_deriv == NULL) {
         cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
                                     have_dropout_mask, num_rows,
                                     input.Data(), input.Stride(), params.Data(),
                                     params.Stride(), output_deriv.Data(),
                                     output_deriv.Stride(), deriv_sum_in.Data(),
                                     deriv_sum_in.Stride(),
                                     self_repair_config.Data(), count_in + 1,
                                     input_deriv->Data(), input_deriv->Stride(),
                                     NULL,
                                     0, NULL, 0, NULL, 0, NULL, 0);
       } else {
         cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
                                     have_dropout_mask, num_rows,
                                     input.Data(), input.Stride(), params.Data(),
                                     params.Stride(), output_deriv.Data(),
                                     output_deriv.Stride(), deriv_sum_in.Data(),
                                     deriv_sum_in.Stride(),
                                     self_repair_config.Data(), count_in + 1,
                                     input_deriv->Data(), input_deriv->Stride(),
                                     params_deriv->Data(),
                                     params_deriv->Stride(),
                                     value_sum_out->Data(),
                                     value_sum_out->Stride(),
                                     deriv_sum_out->Data(),
                                     deriv_sum_out->Stride(),
                                     self_repair_sum_out->Data(),
                                     self_repair_sum_out->Stride());
       }
     }

     CU_SAFE_CALL(cudaGetLastError());

     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     CpuBackpropLstmNonlinearity(input.Mat(), params.Mat(), output_deriv.Mat(),
                                 deriv_sum_in.Mat(), self_repair_config.Vec(),
                                 count_in, &(input_deriv->Mat()),
                                 &(params_deriv->Mat()), &(value_sum_out->Mat()),
                                 &(deriv_sum_out->Mat()),
                                 &(self_repair_sum_out->Mat()));
   }
 }

 template <typename Real>
 void EnsureNonzero(const CuVectorBase<Real> &src,
                    Real epsilon,
                    CuVectorBase<Real> *dest) {
   KALDI_ASSERT(src.Dim() == dest->Dim());
   int32 dim = src.Dim();
   // fake it with a 1-row matrix.
   CuSubMatrix<Real> src_mat(src.Data(), 1,  dim, dim),
       dest_mat(dest->Data(), 1, dim, dim);
   EnsureNonzero(src_mat, epsilon, &dest_mat);
 }

 // Instantiate the templates we defined above.

 template
 void EnsureNonzero(const CuMatrixBase<float> &src,
                    float epsilon,
                    CuMatrixBase<float> *dest);
 template
 void EnsureNonzero(const CuMatrixBase<double> &src,
                    double epsilon,
                    CuMatrixBase<double> *dest);

 template
 void EnsureNonzero(const CuVectorBase<float> &src,
                    float epsilon,
                    CuVectorBase<float> *dest);
 template
 void EnsureNonzero(const CuVectorBase<double> &src,
                    double epsilon,
                    CuVectorBase<double> *dest);

 template
 void CpuBackpropLstmNonlinearity(const MatrixBase<float> &input,
                                  const MatrixBase<float> &params,
                                  const MatrixBase<float> &output_deriv,
                                  const MatrixBase<double> &deriv_sum_in,
                                  const VectorBase<float> &self_repair_config,
                                  double count_in,
                                  MatrixBase<float> *input_deriv,
                                  MatrixBase<float> *params_deriv,
                                  MatrixBase<double> *value_sum_out,
                                  MatrixBase<double> *deriv_sum_out,
                                  MatrixBase<float> *self_repair_sum_out);
 template
 void CpuBackpropLstmNonlinearity(const MatrixBase<double> &input,
                                  const MatrixBase<double> &params,
                                  const MatrixBase<double> &output_deriv,
                                  const MatrixBase<double> &deriv_sum_in,
                                  const VectorBase<double> &self_repair_config,
                                  double count_in,
                                  MatrixBase<double> *input_deriv,
                                  MatrixBase<double> *params_deriv,
                                  MatrixBase<double> *value_sum_out,
                                  MatrixBase<double> *deriv_sum_out,
                                  MatrixBase<double> *self_repair_sum_out);
 template
 void BackpropLstmNonlinearity(const CuMatrixBase<float> &input,
                               const CuMatrixBase<float> &params,
                               const CuMatrixBase<float> &output_deriv,
                               const CuMatrixBase<double> &deriv_sum_in,
                               const CuVectorBase<float> &self_repair_config,
                               double count_in,
                               CuMatrixBase<float> *input_deriv,
                               CuMatrixBase<float> *params_deriv,
                               CuMatrixBase<double> *value_sum_out,
                               CuMatrixBase<double> *deriv_sum_out,
                               CuMatrixBase<float> *self_repair_sum_out);
 template
 void BackpropLstmNonlinearity(const CuMatrixBase<double> &input,
                               const CuMatrixBase<double> &params,
                               const CuMatrixBase<double> &output_deriv,
                               const CuMatrixBase<double> &deriv_sum_in,
                               const CuVectorBase<double> &self_repair_config,
                               double count_in,
                               CuMatrixBase<double> *input_deriv,
                               CuMatrixBase<double> *params_deriv,
                               CuMatrixBase<double> *value_sum_out,
                               CuMatrixBase<double> *deriv_sum_out,
                               CuMatrixBase<double> *self_repair_sum_out);


 } //namespace cu

 } //namespace kaldi
kaldi::CuMatrixBase::Mat
const MatrixBase< Real > & Mat() const
Definition: cu-matrix.h:755

kaldi::CuMatrixBase::CopyFromMat
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344

kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20

kaldi::Exp
double Exp(double x)
Definition: kaldi-math.h:83

kaldi::CuMatrixBase::Stride
MatrixIndexT Stride() const
Definition: cu-matrix.h:217

kaldi::kUndefined
Definition: matrix-common.h:39

MatrixDim_::rows
int32_cuda rows
Definition: cu-matrixdim.h:47

kaldi::CuVector
Definition: matrix-common.h:74

kaldi::cu::CpuComputeLstmNonlinearity
void CpuComputeLstmNonlinearity(const MatrixBase< Real > &input_mat, const MatrixBase< Real > &params_mat, MatrixBase< Real > *output)
Definition: cu-math.cc:445

kaldi::MatrixBase::NumCols
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67

cu-device.h

kaldi::MatrixBase
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49

kaldi::MatrixBase::Data
const Real * Data() const
Gives pointer to raw data (const).
Definition: kaldi-matrix.h:79

MatrixDim_
Structure containing size of the matrix plus stride.
Definition: cu-matrixdim.h:46

kaldi::cu::Randomize
void Randomize(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< Real > *tgt)
Copies a permutation of src into tgt.
Definition: cu-math.cc:80

kaldi::MatrixBase::RowData
Real * RowData(MatrixIndexT i)
Returns pointer to data for one row (non-const)
Definition: kaldi-matrix.h:87

kaldi::CuVectorBase::AddDiagMat2
void AddDiagMat2(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType trans, Real beta)
Add the diagonal of a matrix times itself: *this = diag(M M^T) + beta * *this (if trans == kNoTrans)...
Definition: cu-vector.cc:595

kaldi::int32
kaldi::int32 int32
Definition: online-tcp-source.cc:27

cu-matrix.h

kaldi::CuArrayBase::Data
const T * Data() const
Get raw pointer.
Definition: cu-array.h:52

kaldi::CuVectorBase::ApplyFloor
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=NULL)
Definition: cu-vector.h:139

timer.h

kaldi::CuMatrixBase::CopyColFromVec
void CopyColFromVec(const CuVectorBase< Real > &v, const MatrixIndexT col)
Copy vector into specific column of matrix.
Definition: cu-matrix.cc:2414

kaldi::cu::BackpropLstmNonlinearity
void BackpropLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, const CuMatrixBase< Real > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< Real > &self_repair_config, double count_in, CuMatrixBase< Real > *input_deriv, CuMatrixBase< Real > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< Real > *self_repair_sum_out)
This function does the &#39;backward&#39; pass corresponding to the function ComputeLstmNonlinearity.
Definition: cu-math.cc:768

kaldi::CuVectorBase::AddDiagMatMat
void AddDiagMatMat(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const CuMatrixBase< Real > &N, MatrixTransposeType transN, Real beta=1.0)
Add the diagonal of a matrix product: *this = diag(M N), assuming the "trans" arguments are both kNoT...
Definition: cu-vector.cc:611

kaldi::kTrans
Definition: matrix-common.h:33

kaldi::SameDim
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
Definition: kaldi-matrix.h:1111

count
const size_t count
Definition: arpa-file-parser-test.cc:66

kaldi::MatrixBase::Stride
MatrixIndexT Stride() const
Stride (distance in memory between each row). Will be >= NumCols.
Definition: kaldi-matrix.h:70

kaldi::MatrixIndexT
int32 MatrixIndexT
Definition: matrix-common.h:98

kaldi::cu::DiffNormalizePerRow
void DiffNormalizePerRow(const CuMatrixBase< Real > &in_value, const CuMatrixBase< Real > &out_deriv, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *in_deriv)
Definition: cu-math.cc:349

kaldi::MatrixBase::Row
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188

cu-kernels.h

CU1DBLOCK
#define CU1DBLOCK
Definition: cu-matrixdim.h:57

kaldi::kNoTrans
Definition: matrix-common.h:34

CU2DBLOCK
#define CU2DBLOCK
Definition: cu-matrixdim.h:61

kaldi::CuSubMatrix
This class is used for a piece of a CuMatrix.
Definition: matrix-common.h:70

kaldi::VectorBase::Dim
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64

kaldi::CuMatrixBase::Data
const Real * Data() const
Return data pointer (const).
Definition: cu-matrix.h:746

rnnlm::i
int i
Definition: mikolov-rnnlm-lib.cc:66

kaldi::cu::ScalarTanh
static Real ScalarTanh(Real a)
Definition: cu-math.cc:434

kaldi::cu::EnsureNonzero
void EnsureNonzero(const CuMatrixBase< Real > &src, Real epsilon, CuMatrixBase< Real > *dest)
This function requires that src and dest have the same dimension and epsilon > 0. ...
Definition: cu-math.cc:209

kaldi::cu::Splice
void Splice(const CuMatrixBase< Real > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< Real > *tgt)
Splice concatenates frames of src as specified in frame_offsets into tgt.
Definition: cu-math.cc:132

kaldi::CuMatrixBase
Matrix for CUDA computing.
Definition: matrix-common.h:69

kaldi::CuMatrixBase::NumCols
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216

kaldi::CuVectorBase::Vec
const VectorBase< Real > & Vec() const
Definition: cu-vector.h:235

kaldi::CuArray< int32 >

KALDI_ASSERT
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

cu-common.h

kaldi::MatrixBase::NumRows
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64

kaldi::CuVectorBase::Data
Real * Data()
Returns a pointer to the start of the vector&#39;s data.
Definition: cu-vector.h:72

kaldi::CuMatrixBase::Dim
::MatrixDim Dim() const
Definition: cu-matrix.h:221

kaldi::CuMatrixBase::NumRows
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215

kaldi::VectorBase
Provides a vector abstraction class.
Definition: kaldi-vector.h:41

kaldi::CuArrayBase::Dim
MatrixIndexT Dim() const
Return the vector dimension.
Definition: cu-array.h:49

kaldi::cu::NormalizePerRow
void NormalizePerRow(const CuMatrixBase< Real > &in, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *out)
Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square ...
Definition: cu-math.cc:280

kaldi::cu::ScalarSigmoid
static Real ScalarSigmoid(Real a)
Definition: cu-math.cc:424

kaldi::CuMatrixBase::MulRowsVec
void MulRowsVec(const CuVectorBase< Real > &scale)
scale i&#39;th row by scale[i]
Definition: cu-matrix.cc:792

kaldi::cu::RegularizeL1
void RegularizeL1(CuMatrixBase< Real > *weight, CuMatrixBase< Real > *grad, Real l1, Real lr)
RegularizeL1 is a gradient step with l1 regularization added to the gradient.
Definition: cu-math.cc:37

kaldi::CuVectorBase::Dim
MatrixIndexT Dim() const
Dimensions.
Definition: cu-vector.h:69

kaldi::CuVectorBase
Vector for CUDA computing.
Definition: matrix-common.h:72

kaldi::CuMatrixBase::AddDiagVecMat
void AddDiagVecMat(const Real alpha, const CuVectorBase< Real > &v, const CuMatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T].
Definition: cu-matrix.cc:1382

kaldi::CuMatrixBase::RowData
const Real * RowData(MatrixIndexT r) const
Get raw row pointer (const).
Definition: cu-matrix.h:740

kaldi::cu::Copy
void Copy(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
Copies elements from src into tgt as given by copy_from_indices.
Definition: cu-math.cc:173

kaldi::cu::CpuBackpropLstmNonlinearity
void CpuBackpropLstmNonlinearity(const MatrixBase< Real > &input, const MatrixBase< Real > &params, const MatrixBase< Real > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< Real > &self_repair_config, double count_in, MatrixBase< Real > *input_deriv, MatrixBase< Real > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< Real > *self_repair_sum_out)
Definition: cu-math.cc:543

kaldi::cu::ComputeLstmNonlinearity
void ComputeLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, CuMatrixBase< Real > *output)
this is a special-purpose function used by class LstmNonlinearityComponent, to do its forward propaga...
Definition: cu-math.cc:489