All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
kaldi::cu Namespace Reference

Functions

template<typename Real >
void RegularizeL1 (CuMatrixBase< Real > *weight, CuMatrixBase< Real > *gradient, Real l1_penalty, Real learning_rate)
 RegularizeL1 is a gradient step with l1 regularization added to the gradient. More...
 
template<typename Real >
void Randomize (const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< Real > *tgt)
 Copies a permutation of src into tgt. More...
 
template<typename Real >
void Splice (const CuMatrixBase< Real > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< Real > *tgt)
 Splice concatenates frames of src as specified in frame_offsets into tgt. More...
 
template<typename Real >
void Copy (const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
 Copies elements from src into tgt as given by copy_from_indices. More...
 
template void RegularizeL1 (CuMatrixBase< float > *weight, CuMatrixBase< float > *grad, float l1, float lr)
 
template void RegularizeL1 (CuMatrixBase< double > *weight, CuMatrixBase< double > *grad, double l1, double lr)
 
template void Splice (const CuMatrixBase< float > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< float > *tgt)
 
template void Splice (const CuMatrixBase< double > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< double > *tgt)
 
template void Copy (const CuMatrixBase< float > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< float > *tgt)
 
template void Copy (const CuMatrixBase< double > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< double > *tgt)
 
template void Randomize (const CuMatrixBase< float > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< float > *tgt)
 
template void Randomize (const CuMatrixBase< double > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< double > *tgt)
 
template<typename Real >
void NormalizePerRow (const CuMatrixBase< Real > &in, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *out)
 This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity (note: this brief appears to describe BackpropLstmNonlinearity rather than NormalizePerRow). More...
 
template void NormalizePerRow (const CuMatrixBase< float > &in, const float target_rms, const bool add_log_stddev, CuMatrixBase< float > *out)
 
template void NormalizePerRow (const CuMatrixBase< double > &in, const double target_rms, const bool add_log_stddev, CuMatrixBase< double > *out)
 
template<typename Real >
static Real ScalarSigmoid (Real a)
 
template<typename Real >
static Real ScalarTanh (Real a)
 
template<typename Real >
void CpuComputeLstmNonlinearity (const MatrixBase< Real > &input_mat, const MatrixBase< Real > &params_mat, MatrixBase< Real > *output)
 
template<typename Real >
void ComputeLstmNonlinearity (const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, CuMatrixBase< Real > *output)
 This is a special-purpose function used by class LstmNonlinearityComponent to do its forward propagation. More...
 
template void CpuComputeLstmNonlinearity (const MatrixBase< float > &input_mat, const MatrixBase< float > &params_mat, MatrixBase< float > *output)
 
template void CpuComputeLstmNonlinearity (const MatrixBase< double > &input_mat, const MatrixBase< double > &params_mat, MatrixBase< double > *output)
 
template void ComputeLstmNonlinearity (const CuMatrixBase< float > &input, const CuMatrixBase< float > &params, CuMatrixBase< float > *output)
 
template void ComputeLstmNonlinearity (const CuMatrixBase< double > &input, const CuMatrixBase< double > &params, CuMatrixBase< double > *output)
 
template<typename Real >
void CpuBackpropLstmNonlinearity (const MatrixBase< Real > &input, const MatrixBase< Real > &params, const MatrixBase< Real > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< Real > &self_repair_config, double count_in, MatrixBase< Real > *input_deriv, MatrixBase< Real > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< Real > *self_repair_sum_out)
 
template<typename Real >
void BackpropLstmNonlinearity (const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, const CuMatrixBase< Real > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< Real > &self_repair_config, double count_in, CuMatrixBase< Real > *input_deriv, CuMatrixBase< Real > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< Real > *self_repair_sum_out)
 
template void CpuBackpropLstmNonlinearity (const MatrixBase< float > &input, const MatrixBase< float > &params, const MatrixBase< float > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< float > &self_repair_config, double count_in, MatrixBase< float > *input_deriv, MatrixBase< float > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< float > *self_repair_sum_out)
 
template void CpuBackpropLstmNonlinearity (const MatrixBase< double > &input, const MatrixBase< double > &params, const MatrixBase< double > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< double > &self_repair_config, double count_in, MatrixBase< double > *input_deriv, MatrixBase< double > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< double > *self_repair_sum_out)
 
template void BackpropLstmNonlinearity (const CuMatrixBase< float > &input, const CuMatrixBase< float > &params, const CuMatrixBase< float > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< float > &self_repair_config, double count_in, CuMatrixBase< float > *input_deriv, CuMatrixBase< float > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< float > *self_repair_sum_out)
 
template void BackpropLstmNonlinearity (const CuMatrixBase< double > &input, const CuMatrixBase< double > &params, const CuMatrixBase< double > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< double > &self_repair_config, double count_in, CuMatrixBase< double > *input_deriv, CuMatrixBase< double > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< double > *self_repair_sum_out)
 
template<typename Real >
void Group2norm (const CuMatrixBase< Real > &src, CuMatrixBase< Real > *dest, int32 group_stride)
 

Function Documentation

void BackpropLstmNonlinearity ( const CuMatrixBase< Real > &  input,
const CuMatrixBase< Real > &  params,
const CuMatrixBase< Real > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< Real > &  self_repair_config,
double  count_in,
CuMatrixBase< Real > *  input_deriv,
CuMatrixBase< Real > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< Real > *  self_repair_sum_out 
)

Definition at line 622 of file cu-math.cc.

References CpuBackpropLstmNonlinearity(), CU1DBLOCK, CuVectorBase< Real >::Data(), CuMatrixBase< Real >::Data(), CuVectorBase< Real >::Dim(), Timer::Elapsed(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), CuMatrixBase< Real >::Stride(), and CuVectorBase< Real >::Vec().

Referenced by LstmNonlinearityComponent::Backprop(), kaldi::UnitTestBackpropLstmNonlinearity(), and kaldi::UnitTestLstmNonlinearity().

632  {
633  int32 num_rows = input.NumRows();
634  int32 cell_dim = input.NumCols() / 5;
635  // Check dimensions.
636  KALDI_ASSERT(input.NumCols() % 5 == 0);
637  KALDI_ASSERT(params.NumRows() == 3);
638  KALDI_ASSERT(params.NumCols() == cell_dim);
639  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
640  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
641  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
642  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
643  KALDI_ASSERT(self_repair_config.Dim() == 10);
644  KALDI_ASSERT(count_in >= 0);
645  if (input_deriv != NULL) {
646  KALDI_ASSERT(SameDim(input, *input_deriv));
647  }
648  if (params_deriv == NULL) {
649  KALDI_ASSERT(value_sum_out == NULL);
650  KALDI_ASSERT(deriv_sum_out == NULL);
651  KALDI_ASSERT(self_repair_sum_out == NULL);
652  } else {
653  KALDI_ASSERT(value_sum_out != NULL);
654  KALDI_ASSERT(deriv_sum_out != NULL);
655  KALDI_ASSERT(self_repair_sum_out != NULL);
656  KALDI_ASSERT(SameDim(params, *params_deriv));
657  KALDI_ASSERT(value_sum_out->NumRows() == 5);
658  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
659  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
660  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
661  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
662  }
663 
664 
665 #if HAVE_CUDA == 1
666  if (CuDevice::Instantiate().Enabled()) {
667  Timer tim;
668  // Each thread block is working on 1 row of the data.
669  // It's best that cell dim is a multiple of CU1DBLOCK
670 
671 
672  // Use 2D block (8x32 threads) as we need to compute column sum.
673  // Use 1D grid to cover the data matrix width `cell_dim`.
674  const int kWarpSize = 32;
675  dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
676 // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x),
677 // n_blocks(num_rows, dimBlock.y));
678 // if (dimGrid.x * dimGrid.y > 1024) {
679 // dimGrid.y = std::max(1024 / dimGrid.x, 1);
680 // }
681  dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
682  if (input_deriv == NULL) {
683  if (params_deriv == NULL) {
684  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows,
685  input.Data(), input.Stride(), params.Data(),
686  params.Stride(), output_deriv.Data(),
687  output_deriv.Stride(), deriv_sum_in.Data(),
688  deriv_sum_in.Stride(),
689  self_repair_config.Data(), count_in + 1,
690  NULL,
691  0,
692  NULL,
693  0,
694  NULL,
695  0,
696  NULL,
697  0,
698  NULL,
699  0);
700 
701  } else {
702  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows,
703  input.Data(), input.Stride(), params.Data(),
704  params.Stride(), output_deriv.Data(),
705  output_deriv.Stride(), deriv_sum_in.Data(),
706  deriv_sum_in.Stride(),
707  self_repair_config.Data(), count_in + 1,
708  NULL,
709  0, params_deriv->Data(),
710  params_deriv->Stride(),
711  value_sum_out->Data(),
712  value_sum_out->Stride(),
713  deriv_sum_out->Data(),
714  deriv_sum_out->Stride(),
715  self_repair_sum_out->Data(),
716  self_repair_sum_out->Stride());
717  }
718  } else {
719  if (params_deriv == NULL) {
720  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows,
721  input.Data(), input.Stride(), params.Data(),
722  params.Stride(), output_deriv.Data(),
723  output_deriv.Stride(), deriv_sum_in.Data(),
724  deriv_sum_in.Stride(),
725  self_repair_config.Data(), count_in + 1,
726  input_deriv->Data(), input_deriv->Stride(),
727  NULL,
728  0, NULL, 0, NULL, 0, NULL, 0);
729  } else {
730  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows,
731  input.Data(), input.Stride(), params.Data(),
732  params.Stride(), output_deriv.Data(),
733  output_deriv.Stride(), deriv_sum_in.Data(),
734  deriv_sum_in.Stride(),
735  self_repair_config.Data(), count_in + 1,
736  input_deriv->Data(), input_deriv->Stride(),
737  params_deriv->Data(),
738  params_deriv->Stride(),
739  value_sum_out->Data(),
740  value_sum_out->Stride(),
741  deriv_sum_out->Data(),
742  deriv_sum_out->Stride(),
743  self_repair_sum_out->Data(),
744  self_repair_sum_out->Stride());
745  }
746  }
747 
748  CU_SAFE_CALL(cudaGetLastError());
749 
750  CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
751  } else
752 #endif
753  {
754  CpuBackpropLstmNonlinearity(input.Mat(), params.Mat(), output_deriv.Mat(),
755  deriv_sum_in.Mat(), self_repair_config.Vec(),
756  count_in, &(input_deriv->Mat()),
757  &(params_deriv->Mat()), &(value_sum_out->Mat()),
758  &(deriv_sum_out->Mat()),
759  &(self_repair_sum_out->Mat()));
760  }
761 }
MatrixIndexT NumCols() const
Definition: cu-matrix.h:196
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
const MatrixBase< Real > & Mat() const
Definition: cu-matrix.h:614
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:195
template void CpuBackpropLstmNonlinearity(const MatrixBase< double > &input, const MatrixBase< double > &params, const MatrixBase< double > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< double > &self_repair_config, double count_in, MatrixBase< double > *input_deriv, MatrixBase< double > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< double > *self_repair_sum_out)
#define CU1DBLOCK
Definition: cu-matrixdim.h:63
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
MatrixIndexT Stride() const
Definition: cu-matrix.h:197
const Real * Data() const
Return data pointer (const).
Definition: cu-matrix.h:605
template void kaldi::cu::BackpropLstmNonlinearity ( const CuMatrixBase< float > &  input,
const CuMatrixBase< float > &  params,
const CuMatrixBase< float > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< float > &  self_repair_config,
double  count_in,
CuMatrixBase< float > *  input_deriv,
CuMatrixBase< float > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< float > *  self_repair_sum_out 
)
template void kaldi::cu::BackpropLstmNonlinearity ( const CuMatrixBase< double > &  input,
const CuMatrixBase< double > &  params,
const CuMatrixBase< double > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< double > &  self_repair_config,
double  count_in,
CuMatrixBase< double > *  input_deriv,
CuMatrixBase< double > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< double > *  self_repair_sum_out 
)
void ComputeLstmNonlinearity ( const CuMatrixBase< Real > &  input,
const CuMatrixBase< Real > &  params,
CuMatrixBase< Real > *  output 
)

This is a special-purpose function used by class LstmNonlinearityComponent to do its forward propagation.

It computes the core part of the LSTM nonlinearity. Refer to class LstmNonlinearityComponent in ../nnet3/nnet-simple-component.h for more context.

Parameters
[in] input: A matrix, of dimension N by 5C (i.e. its num-cols must be a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}).
[in] params: A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}.
[out] output: A matrix, of dimension N by 2C. The quantities c_t and m_t respectively are put there (in two blocks of column-dimension C), according to the following equations:

i_t = Sigmoid(i_part + w_{ic} * c_{t-1})
f_t = Sigmoid(f_part + w_{fc} * c_{t-1})
c_t = f_t * c_{t-1} + i_t * Tanh(c_part)
o_t = Sigmoid(o_part + w_{oc} * c_t)
m_t = o_t * Tanh(c_t)

Definition at line 355 of file cu-math.cc.

References CpuComputeLstmNonlinearity(), CU1DBLOCK, CuMatrixBase< Real >::Data(), Timer::Elapsed(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::Stride().

Referenced by LstmNonlinearityComponent::Propagate(), kaldi::UnitTestCuMathComputeLstmNonlinearity(), and kaldi::UnitTestLstmNonlinearity().

357  {
358  int32 num_rows = input.NumRows();
359  int32 cell_dim = input.NumCols() / 5;
360  KALDI_ASSERT(output->NumRows() == num_rows);
361  KALDI_ASSERT(input.NumCols() % 5 == 0);
362  KALDI_ASSERT(params.NumRows() == 3);
363  KALDI_ASSERT(params.NumCols() == cell_dim);
364  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
365 
366 #if HAVE_CUDA == 1
367  if (CuDevice::Instantiate().Enabled()) {
368  Timer tim;
369 
370  // Each thread block is working on 1 row of the data.
371  // It's best that cell dim is a multiple of CU1DBLOCK
372  dim3 dimBlock(CU1DBLOCK);
373  dim3 dimGrid(num_rows);
374 
375  cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(),
376  params.Data(), params.Stride(), output->Stride(),
377  cell_dim, num_rows, output->Data());
378  CU_SAFE_CALL(cudaGetLastError());
379 
380  CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
381  } else
382 #endif
383  {
384  CpuComputeLstmNonlinearity(input.Mat(), params.Mat(), &output->Mat());
385  }
386 }
template void CpuComputeLstmNonlinearity(const MatrixBase< double > &input_mat, const MatrixBase< double > &params_mat, MatrixBase< double > *output)
#define CU1DBLOCK
Definition: cu-matrixdim.h:63
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::ComputeLstmNonlinearity ( const CuMatrixBase< float > &  input,
const CuMatrixBase< float > &  params,
CuMatrixBase< float > *  output 
)
template void kaldi::cu::ComputeLstmNonlinearity ( const CuMatrixBase< double > &  input,
const CuMatrixBase< double > &  params,
CuMatrixBase< double > *  output 
)
void Copy ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< Real > *  tgt 
)

Copies elements from src into tgt as given by copy_from_indices.

The matrices src and tgt must have the same dimensions and the dimension of copy_from_indices must equal the number of columns in the src matrix. As a result, tgt(i, j) == src(i, copy_from_indices[j]). Also see CuMatrix::CopyCols(), which is more general.

Definition at line 173 of file cu-math.cc.

References CU2DBLOCK, CuArray< T >::Data(), CuMatrixBase< Real >::Data(), CuArray< T >::Dim(), CuMatrixBase< Real >::Dim(), Timer::Elapsed(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::NumRows().

Referenced by CopyComponent::PropagateFnc(), kaldi::TestClusterUtilsVector(), and kaldi::UnitTestCuMathCopy().

174  {
175 
176  KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
177  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
178 
179  #if HAVE_CUDA == 1
180  if (CuDevice::Instantiate().Enabled()) {
181  Timer tim;
182 
183  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
184  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
185 
186  cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
187  copy_from_indices.Data(), tgt->Dim(), src.Dim());
188  CU_SAFE_CALL(cudaGetLastError());
189 
190  CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
191  } else
192  #endif
193  {
194  // expand in CPU
195  const MatrixBase<Real> &srcmat = src.Mat();
196  const int32 *copy_from_indicesvec = copy_from_indices.Data();
197  int32 dim = copy_from_indices.Dim();
198  MatrixBase<Real> &tgtmat = tgt->Mat();
199  //
200  for(int32 r = 0; r < tgtmat.NumRows(); r++) {
201  for(int32 c = 0; c < dim; c++) {
202  tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
203  }
204  }
205  }
206 }
const T * Data() const
Get raw pointer.
Definition: cu-array.h:65
#define CU2DBLOCK
Definition: cu-matrixdim.h:67
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
MatrixIndexT Dim() const
Return the vector dimension.
Definition: cu-array.h:62
template void kaldi::cu::Copy ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Copy ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< double > *  tgt 
)
void CpuBackpropLstmNonlinearity ( const MatrixBase< Real > &  input,
const MatrixBase< Real > &  params,
const MatrixBase< Real > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< Real > &  self_repair_config,
double  count_in,
MatrixBase< Real > *  input_deriv,
MatrixBase< Real > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< Real > *  self_repair_sum_out 
)

Definition at line 406 of file cu-math.cc.

References count, VectorBase< Real >::Dim(), rnnlm::i, KALDI_ASSERT, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), kaldi::SameDim(), ScalarSigmoid(), and ScalarTanh().

Referenced by BackpropLstmNonlinearity(), and kaldi::UnitTestBackpropLstmNonlinearity().

416  {
417  int32 num_rows = input.NumRows();
418  int32 cell_dim = input.NumCols() / 5;
419  // Check dimensions.
420  KALDI_ASSERT(input.NumCols() % 5 == 0);
421  KALDI_ASSERT(params.NumRows() == 3);
422  KALDI_ASSERT(params.NumCols() == cell_dim);
423  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
424  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
425  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
426  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
427  KALDI_ASSERT(self_repair_config.Dim() == 10);
428  KALDI_ASSERT(count_in >= 0);
429  if (input_deriv != NULL) {
430  KALDI_ASSERT(SameDim(input, *input_deriv));
431  }
432  if (params_deriv == NULL) {
433  KALDI_ASSERT(value_sum_out == NULL);
434  KALDI_ASSERT(deriv_sum_out == NULL);
435  KALDI_ASSERT(self_repair_sum_out == NULL);
436  } else {
437  KALDI_ASSERT(value_sum_out != NULL);
438  KALDI_ASSERT(deriv_sum_out != NULL);
439  KALDI_ASSERT(self_repair_sum_out != NULL);
440  KALDI_ASSERT(SameDim(params, *params_deriv));
441  KALDI_ASSERT(value_sum_out->NumRows() == 5);
442  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
443  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
444  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
445  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
446  }
447 
448  const MatrixBase<Real> &input_mat = input;
449  const MatrixBase<Real> &params_mat = params;
450  const MatrixBase<Real> &output_deriv_mat = output_deriv;
451  const MatrixBase<double> &deriv_sum_in_mat = deriv_sum_in;
452  const VectorBase<Real> &sr_config = self_repair_config;
453  MatrixBase<Real> *input_deriv_mat = (
454  input_deriv == NULL ? NULL : input_deriv);
455  MatrixBase<Real> *params_deriv_mat = NULL;
456  MatrixBase<Real> *self_repair_sum_out_mat = NULL;
457  MatrixBase<double> *value_sum_out_mat = NULL;
458  MatrixBase<double> *deriv_sum_out_mat = NULL;
459  if (params_deriv != NULL) {
460  params_deriv_mat = params_deriv;
461  value_sum_out_mat = value_sum_out;
462  deriv_sum_out_mat = deriv_sum_out;
463  self_repair_sum_out_mat = self_repair_sum_out;
464  }
465 
466 
467  // We add 1.0 (i.e. a small value) to the count to avoid division by zero.
468  Real count = 1.0 + count_in;
469  for (int32 c = 0; c < cell_dim; c++) {
470  // parameters
471  Real w_ic = params_mat(0, c);
472  Real w_fc = params_mat(1, c);
473  Real w_oc = params_mat(2, c);
474  // derivative sums w.r.t. parameters.
475  Real w_ic_deriv_sum = 0.0;
476  Real w_fc_deriv_sum = 0.0;
477  Real w_oc_deriv_sum = 0.0;
478 
479  // average derivatives, for self-repair.
480  // The 5 nonlinearities that are subject to self-repair are written as:
481  // Sigmoid(i_t_input), Sigmoid(f_t_input),
482  // Tanh(c_part), Sigmoid(o_t_input), Tanh(c_t)
483  Real i_t_self_repair = (
484  deriv_sum_in_mat(0, c) / count < sr_config(0) ? sr_config(5) : 0.0);
485  Real f_t_self_repair = (
486  deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0);
487  Real c_part_self_repair = (
488  deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0);
489  Real o_t_self_repair = (
490  deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0);
491  Real c_t_self_repair = (
492  deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0);
493  // Note on how we add self-repair for sigmoids/tanh's. If self-repair
494  // is activated for this unit, then...
495  // For sigmoids we'd add -self_repair_scale * (2 * sigmoid(x) - 1.0)
496  // ... to the input-deriv;
497  // For tanh's we'd add -self_repair_scale * tanh(x)
498  // If self-repair is not activated, the 'self_repair' scales are set to zero.
499 
500  // The following variables are for the accumulation of stats on the
501  // sigmoid and tanh units.
502  Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0;
503  Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0;
504  Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0;
505  Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0;
506  Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0;
507 
508 
509  for (int32 r = 0; r < num_rows; r++) {
510  Real i_part = input_mat(r, c),
511  f_part = input_mat(r, c + cell_dim),
512  c_part = input_mat(r, c + 2 * cell_dim),
513  o_part = input_mat(r, c + 3 * cell_dim),
514  c_prev = input_mat(r, c + 4 * cell_dim);
515  // For greater clarity, we give some of the quantities in the
516  // forward equations their own names.
517  Real i_t_input = i_part + w_ic * c_prev,
518  i_t = ScalarSigmoid(i_t_input),
519  f_t_input = f_part + w_fc * c_prev,
520  f_t = ScalarSigmoid(f_t_input),
521  tanh_c_part = ScalarTanh(c_part),
522  c_t = f_t * c_prev + i_t * tanh_c_part,
523  o_t_input = o_part + w_oc * c_t,
524  o_t = ScalarSigmoid(o_t_input),
525  tanh_c_t = ScalarTanh(c_t);
526  // we'd also compute, in the forward pass,
527  // m_t = o_t * tanh_c_t;
528  // but this variable is not needed.
529 
530  // Accumulate nonlinearity value and derivative stats.
531  // Note:
532  // tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)
533  // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
534  i_t_value_sum += i_t;
535  i_t_deriv_sum += i_t * (1.0F - i_t);
536  f_t_value_sum += f_t;
537  f_t_deriv_sum += f_t * (1.0F - f_t);
538  c_part_value_sum += tanh_c_part;
539  c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part;
540  o_t_value_sum += o_t;
541  o_t_deriv_sum += o_t * (1.0F - o_t);
542  c_t_value_sum += tanh_c_t;
543  c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t;
544 
545 
546  // the derivative of the objective function w.r.t. a particular quantity
547  // will be written by prepending "d" to the name.
548  // We compute these derivatives in the reverse of the order in which
549  // we computed the original quantities.
550  // dc_t_out is the part of the derivative w.r.t. c_t that
551  // comes directly from the output of this function.
552  Real dc_t_out = output_deriv_mat(r, c);
553  Real dm_t = output_deriv_mat(r, c + cell_dim);
554  Real dtanh_c_t = o_t * dm_t;
555  Real do_t = tanh_c_t * dm_t;
556  Real do_t_input = (o_t * (1.0F - o_t) * do_t
557  - (2.0F * o_t - 1.0F) * o_t_self_repair);
558  Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out
559  + do_t_input * w_oc) - tanh_c_t * c_t_self_repair;
560  Real dtanh_c_part = i_t * dc_t;
561  Real df_t = dc_t * c_prev;
562  Real df_t_input = (df_t * f_t * (1.0F - f_t)
563  - (2.0F * f_t - 1.0F) * f_t_self_repair);
564  Real di_t = dc_t * tanh_c_part;
565  Real di_t_input = (di_t * i_t * (1.0F - i_t)
566  - (2.0F * i_t - 1.0F) * i_t_self_repair);
567 
568  w_ic_deriv_sum += c_prev * di_t_input;
569  w_fc_deriv_sum += c_prev * df_t_input;
570  w_oc_deriv_sum += c_t * do_t_input;
571 
572  Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * dc_t;
573  Real do_part = do_t_input;
574  Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part
575  - tanh_c_part * c_part_self_repair);
576  Real df_part = df_t_input;
577  Real di_part = di_t_input;
578 
579  if (input_deriv_mat != NULL) {
580  (*input_deriv_mat)(r, c) = di_part;
581  (*input_deriv_mat)(r, c + cell_dim) = df_part;
582  (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part;
583  (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part;
584  (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev;
585  }
586  }
587 
588  if (params_deriv != NULL) {
589  // note: for optimizing things you can assume that params_deriv and
590  // input_deriv_mat are non-NULL (i.e. all the output matrices are
591  // non-NULL). The situations when some of the output matrices are NULL
592  // does not happen often (mainly only in testing code).
593 
594  (*params_deriv_mat)(0, c) = w_ic_deriv_sum;
595  (*params_deriv_mat)(1, c) = w_fc_deriv_sum;
596  (*params_deriv_mat)(2, c) = w_oc_deriv_sum;
597 
598  (*value_sum_out_mat)(0, c) += i_t_value_sum;
599  (*value_sum_out_mat)(1, c) += f_t_value_sum;
600  (*value_sum_out_mat)(2, c) += c_part_value_sum;
601  (*value_sum_out_mat)(3, c) += o_t_value_sum;
602  (*value_sum_out_mat)(4, c) += c_t_value_sum;
603 
604  // need to update self_repair_sum_out before deriv_sum_out, because
605  // deriv_sum_out and deriv_sum_in might point to the same memory.
606  for (int32 i = 0; i < 5; i++)
607  (*self_repair_sum_out_mat)(i, c) =
608  (deriv_sum_in_mat(i, c) / count < sr_config(i) ? num_rows : 0);
609 
610  (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum;
611  (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum;
612  (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum;
613  (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum;
614  (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum;
615  }
616  }
617 }
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
const size_t count
static Real ScalarTanh(Real a)
Definition: cu-math.cc:306
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
static Real ScalarSigmoid(Real a)
Definition: cu-math.cc:296
template void kaldi::cu::CpuBackpropLstmNonlinearity ( const MatrixBase< float > &  input,
const MatrixBase< float > &  params,
const MatrixBase< float > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< float > &  self_repair_config,
double  count_in,
MatrixBase< float > *  input_deriv,
MatrixBase< float > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< float > *  self_repair_sum_out 
)
template void kaldi::cu::CpuBackpropLstmNonlinearity ( const MatrixBase< double > &  input,
const MatrixBase< double > &  params,
const MatrixBase< double > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< double > &  self_repair_config,
double  count_in,
MatrixBase< double > *  input_deriv,
MatrixBase< double > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< double > *  self_repair_sum_out 
)
void CpuComputeLstmNonlinearity ( const MatrixBase< Real > &  input_mat,
const MatrixBase< Real > &  params_mat,
MatrixBase< Real > *  output 
)

Definition at line 317 of file cu-math.cc.

References MatrixBase< Real >::Data(), KALDI_ASSERT, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), MatrixBase< Real >::RowData(), ScalarSigmoid(), ScalarTanh(), and MatrixBase< Real >::Stride().

Referenced by ComputeLstmNonlinearity(), and kaldi::UnitTestCuMathComputeLstmNonlinearity().

319  {
320  int32 num_rows = input_mat.NumRows();
321  int32 cell_dim = input_mat.NumCols() / 5;
322  KALDI_ASSERT(output->NumRows() == num_rows);
323  KALDI_ASSERT(input_mat.NumCols() % 5 == 0);
324  KALDI_ASSERT(params_mat.NumRows() == 3);
325  KALDI_ASSERT(params_mat.NumCols() == cell_dim);
326  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
327 
328  MatrixBase<Real> &output_mat = *output;
329  const Real *params_data = params_mat.Data();
330  int32 params_stride = params_mat.Stride();
331  for (int32 r = 0; r < num_rows; r++) {
332  const Real *input_row = input_mat.RowData(r);
333  Real *output_row = output_mat.RowData(r);
334  for (int32 c = 0; c < cell_dim; c++) {
335  Real i_part = input_row[c];
336  Real f_part = input_row[c + cell_dim];
337  Real c_part = input_row[c + 2 * cell_dim];
338  Real o_part = input_row[c + 3 * cell_dim];
339  Real c_prev = input_row[c + 4 * cell_dim];
340  Real w_ic = params_data[c];
341  Real w_fc = params_data[c + params_stride];
342  Real w_oc = params_data[c + params_stride * 2];
343  Real i_t = ScalarSigmoid(i_part + w_ic * c_prev);
344  Real f_t = ScalarSigmoid(f_part + w_fc * c_prev);
345  Real c_t = f_t * c_prev + i_t * ScalarTanh(c_part);
346  Real o_t = ScalarSigmoid(o_part + w_oc * c_t);
347  Real m_t = o_t * ScalarTanh(c_t);
348  output_row[c] = c_t;
349  output_row[c + cell_dim] = m_t;
350  }
351  }
352 }
static Real ScalarTanh(Real a)
Definition: cu-math.cc:306
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
static Real ScalarSigmoid(Real a)
Definition: cu-math.cc:296
template void kaldi::cu::CpuComputeLstmNonlinearity ( const MatrixBase< float > &  input_mat,
const MatrixBase< float > &  params_mat,
MatrixBase< float > *  output 
)
template void kaldi::cu::CpuComputeLstmNonlinearity ( const MatrixBase< double > &  input_mat,
const MatrixBase< double > &  params_mat,
MatrixBase< double > *  output 
)
void kaldi::cu::Group2norm ( const CuMatrixBase< Real > &  src,
CuMatrixBase< Real > *  dest,
int32  group_stride 
)
void NormalizePerRow ( const CuMatrixBase< Real > &  in,
const Real  target_rms,
const bool  add_log_stddev,
CuMatrixBase< Real > *  out 
)

This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity.

It's a little more complicated than you might expect because of the 'self-repair' mechanism that we use to prevent the sigmoid and tanh nonlinearities oversaturating, and because of the average-activation and average-derivative stats that we store for these nonlinearities (these stats are used both to control the self-repair mechanism, and for diagnostic purposes).

Because the forward pass computes various intermediate values that are not output, this function actually has to do the same computations as the forward pass before it actually does the backprop.

Parameters
[in] input: The same as in ComputeLstmNonlinearity(). A matrix, of dimension N by 5C (i.e. its num-cols must be a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}).
[in] params: The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}.
[in] output_deriv: A matrix, of dimension N by 2C, containing the derivative of the objective function we're backpropagating, w.r.t. the quantities c_t and m_t (in two blocks of column-dimension C).
[in] deriv_sum_in: This is used in the self-repair code to identify oversaturated nonlinearities. It is a matrix, of dimension 5 by C, corresponding to the totals of the derivatives of the 5 sigmoid and tanh nonlinearities, in the order they appear in the equations in the documentation of ComputeLstmNonlinearity(). Respectively, they appear in the equations for (i_t, f_t, c_t, o_t, m_t). This will be divided by 'count_in' to get the average derivative value so far, for each of the nonlinearities.
[in] self_repair_config: A vector of dimension 10, containing the configuration of the self-repair to be used for the 5 nonlinearities. The first 5 elements are the self_repair_lower_threshold values (typically 0.05 for sigmoid and 0.2 for tanh), and the next 5 elements are the corresponding self-repair-scales (typically 10^-5).
[in] count_in: The data-count that corresponds to the stats in 'deriv_sum_in' at entry to the function. This function should tolerate the count being zero (in that case, it is free to do the self-repair or not, as this should only happen on the 1st minibatch of each training job).
[out] input_deriv: May be NULL; if not, this function writes, to this location, the backpropagated derivative of the objective function w.r.t. the 'input' matrix. This matrix should have the same dimension as 'input' i.e. N by 5C. In addition to the regular backpropagated derivative, the output will include small values relating to 'self-repair'.
[out] params_deriv: May be NULL; if not, this is where this function *writes* [not adds] the backpropagated derivative of the objective function w.r.t. 'params'; it should have the same dimension as 'params' (3 by C). (This matrix will then be processed by the natural gradient code and added to the appropriate copy of the parameter matrix, outside this function).
[out] value_sum_out: Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C. This function *adds* to this location the total value of each of the sigmoid/tanh nonlinearities that it computes (this is for diagnostic purposes).
[out] deriv_sum_out: Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C; this function *adds* to this location the total of the derivative of each of the sigmoid/tanh nonlinearities that it computes (this is for diagnostic purposes and to control the self-repair). This function should tolerate the case when 'deriv_sum_out' points to the same data as 'deriv_sum_in'.
[out] self_repair_sum_out: Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C; this function *writes* to this location the sum of the number of times the self-repair code was activated (integer values 0 <= k <= N). This will be processed outside this function into self-repair stats for diagnostics.

Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square equals 1.0.

The output is y_i = scale * x_i, and we want the RMS value of the y_i to equal target_rms, so y^t y = D * target_rms^2 (if y is one row of the input). We need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). There is also flooring involved, to avoid division-by-zero problems. It's important for the backprop that the floor's square root is exactly representable as float. If add_log_stddev is true, log(max(epsi, sqrt(x^t x / D))) is an extra dimension of the output.

Definition at line 246 of file cu-math.cc.

References CuMatrixBase< Real >::CopyColFromVec(), CuMatrixBase< Real >::CopyFromMat(), CU1DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), Timer::Elapsed(), KALDI_ASSERT, kaldi::kNoTrans, CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), and CuMatrixBase< Real >::Stride().

Referenced by NormalizeComponent::Propagate(), and kaldi::UnitTestCuMathNormalizePerRow().

247  {  // Scales each row of 'in' into *out so its RMS equals target_rms; optionally appends a log-stddev column.
248  const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
249  if (add_log_stddev) {
250  KALDI_ASSERT(in.NumRows() == out->NumRows());
251  KALDI_ASSERT(in.NumCols() + 1 == out->NumCols());  // one extra output column holds the log-stddev
252  } else {
253  KALDI_ASSERT(SameDim(in, *out));
254  }
255 
256 #if HAVE_CUDA == 1
257  if (CuDevice::Instantiate().Enabled()) {
258  Timer tim;
259  size_t dimBlock = CU1DBLOCK;
260  size_t dimGrid = out->NumRows();  // grid size equals the number of rows
261  cuda_normalize_per_row(dimGrid, dimBlock, out->Data(), out->Stride(),
262  in.Data(), in.Dim(), target_rms, add_log_stddev);
263  CU_SAFE_CALL(cudaGetLastError());
264  CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
265  } else
266 #endif
267  {
268  CuSubMatrix<Real> out_no_log(*out, 0, out->NumRows(), 0, in.NumCols());  // the first in.NumCols() columns of *out (excludes any log-stddev column)
269  if (in.Data() != out_no_log.Data())  // skip the copy when in and out already share the same data pointer
270  out_no_log.CopyFromMat(in);
271  CuVector<Real> in_norm(in.NumRows());  // one value per row
272  Real d_scaled = in.NumCols() * target_rms * target_rms;  // D * target_rms^2
273  in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);  // presumably in_norm(r) = x^t x / d_scaled for row x -- confirm against CuVectorBase::AddDiagMat2
274  in_norm.ApplyFloor(kSquaredNormFloor);  // floor before the negative power below, so zero rows do not divide by zero
275  in_norm.ApplyPow(-0.5);  // in_norm now holds the per-row scale
276  out_no_log.MulRowsVec(in_norm);
277  if (add_log_stddev) {
278  in_norm.ApplyLog();  // log(scale)
279  in_norm.Scale(-1.0);  // -log(scale)
280  in_norm.Add(log(target_rms));  // log(target_rms) - log(scale), i.e. the log of the (floored) row stddev
281  out->CopyColFromVec(in_norm, in.NumCols());  // written to column index in.NumCols(), the extra column of *out
282  }
283  }
284 }
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
#define CU1DBLOCK
Definition: cu-matrixdim.h:63
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::NormalizePerRow ( const CuMatrixBase< float > &  in,
const float  target_rms,
const bool  add_log_stddev,
CuMatrixBase< float > *  out 
)
template void kaldi::cu::NormalizePerRow ( const CuMatrixBase< double > &  in,
const double  target_rms,
const bool  add_log_stddev,
CuMatrixBase< double > *  out 
)
void Randomize ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< Real > *  tgt 
)

Copies a permutation of src into tgt.

The row permutation is specified in copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). src and tgt must have the same dimensions, the dimension of copy_from_idx must not exceed the number of rows in tgt, and all elements of copy_from_idx must be in [0, src.NumRows()-1].

Definition at line 80 of file cu-math.cc.

References CuArray< T >::Data(), CuMatrixBase< Real >::Data(), CuArray< T >::Dim(), CuMatrixBase< Real >::Dim(), Timer::Elapsed(), rnnlm::i, KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), MatrixBase< Real >::Row(), and MatrixDim_::rows.

Referenced by MatrixRandomizer::Randomize(), and kaldi::UnitTestCuMathRandomize().

82  {  // Copies rows of src, selected by copy_from_idx, into the leading rows of *tgt.
83 
84  KALDI_ASSERT(src.NumCols() == tgt->NumCols());
85  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
86  KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());  // fewer indices than rows is allowed
87 
88  #if HAVE_CUDA == 1
89  if (CuDevice::Instantiate().Enabled()) {
90  Timer tim;
91 
92  /*
93  Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
94  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
95  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
96  */
97 
98  /*
99  * Let's use blocksize 4 x 128 (512 threads/block)
100  * and extend the randomizable matrices to: col 4*65535, row 128*65535
101  * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
102  */
103  dim3 dimBlock(4, 128);
104  dim3 dimGrid(n_blocks(tgt->NumCols(), 4), n_blocks(copy_from_idx.Dim(), 128));  // grid sized by the number of indices, not by tgt->NumRows()
105  /*
106  */
107 
108  MatrixDim dimsrc = src.Dim(); dimsrc.rows=copy_from_idx.Dim();  // row count passed to the kernel is overridden with the number of indices
109  MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();  // same override for the target dimensions
110 
111  cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
112  copy_from_idx.Data(), dimtgt, dimsrc);
113  CU_SAFE_CALL(cudaGetLastError());
114 
115  CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
116  } else
117  #endif
118  {
119  // randomize in CPU
120  const MatrixBase<Real> &srcmat = src.Mat();
121  const int32 *copy_from_idxvec = copy_from_idx.Data();
122  MatrixBase<Real> &tgtmat = tgt->Mat();
123  for(int32 i=0; i<copy_from_idx.Dim(); i++) {  // only the first copy_from_idx.Dim() rows of tgt are written
124  tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));  // tgt row i <- src row copy_from_idx[i]
125  }
126  }
127 }
const T * Data() const
Get raw pointer.
Definition: cu-array.h:65
int32_cuda rows
Definition: cu-matrixdim.h:53
Structure containing size of the matrix plus stride.
Definition: cu-matrixdim.h:52
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
MatrixIndexT Dim() const
Return the vector dimension.
Definition: cu-array.h:62
template void kaldi::cu::Randomize ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Randomize ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< double > *  tgt 
)
void RegularizeL1 ( CuMatrixBase< Real > *  weight,
CuMatrixBase< Real > *  gradient,
Real  l1_penalty,
Real  learning_rate 
)

RegularizeL1 is a gradient step with l1 regularization added to the gradient.

We don't let the value cross over zero from positive to negative or vice versa, in a single step. If an element tries to cross zero and is stopped, we zero the gradient. (Dan: not sure why).

Definition at line 37 of file cu-math.cc.

References CU2DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), Timer::Elapsed(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), MatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), and CuMatrixBase< Real >::Stride().

Referenced by LinearTransform::Update(), and AffineTransform::Update().

37  {  // Applies the L1 penalty to each weight, zeroing the weight and its gradient when the step would flip the weight's sign.
38  KALDI_ASSERT(SameDim(*weight, *grad));
39 #if HAVE_CUDA == 1
40  if (CuDevice::Instantiate().Enabled()) {
41  Timer tim;
42 
43  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
44  dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));
45 
46  cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
47  weight->Dim(), grad->Stride());
48  CU_SAFE_CALL(cudaGetLastError());
49 
50  CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
51  } else
52  #endif
53  {
54  MatrixBase<Real> &weight2 = weight->Mat();
55  MatrixBase<Real> &grad2 = grad->Mat();
56  for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
57  for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {
58 
59  if(weight2(r,c)==0.0) continue; // skip L1 if the weight is exactly zero
60 
61  Real l1_signed = l1;  // penalty carrying the same sign as the weight
62  if (weight2(r, c) < 0.0)
63  l1_signed = -l1;
64 
65  Real before = weight2(r, c);
66  Real after = weight2(r, c) - lr*grad2(r, c) - l1_signed;  // value after gradient step plus penalty; used only for the sign test below
67  if ((after > 0.0) ^ (before > 0.0)) {  // true when exactly one of before/after is positive, i.e. the update would cross zero
68  weight2(r, c) = 0.0;
69  grad2(r, c) = 0.0;
70  } else {
71  weight2(r, c) -= l1_signed;  // only the L1 term is applied here; NOTE(review): the lr*grad step is presumably applied by the caller -- confirm
72  }
73  }
74  }
75  }
76 }
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
int32 MatrixIndexT
Definition: matrix-common.h:96
#define CU2DBLOCK
Definition: cu-matrixdim.h:67
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::RegularizeL1 ( CuMatrixBase< float > *  weight,
CuMatrixBase< float > *  grad,
float  l1,
float  lr 
)
template void kaldi::cu::RegularizeL1 ( CuMatrixBase< double > *  weight,
CuMatrixBase< double > *  grad,
double  l1,
double  lr 
)
static Real kaldi::cu::ScalarSigmoid ( Real  a)
inlinestatic

Definition at line 296 of file cu-math.cc.

References kaldi::Exp().

Referenced by CpuBackpropLstmNonlinearity(), and CpuComputeLstmNonlinearity().

296  {  // Logistic sigmoid 1 / (1 + exp(-a)), arranged so that Exp() only ever receives a non-positive argument.
297  if (a > Real(0)) {
298  return Real(1) / (Real(1) + Exp(-a));  // here -a < 0
299  } else {
300  Real x = Exp(a);  // here a <= 0
301  return x / (x + Real(1));  // algebraically equal to 1 / (1 + exp(-a))
302  }
303 }
double Exp(double x)
Definition: kaldi-math.h:83
static Real kaldi::cu::ScalarTanh ( Real  a)
inlinestatic

Definition at line 306 of file cu-math.cc.

References kaldi::Exp().

Referenced by CpuBackpropLstmNonlinearity(), and CpuComputeLstmNonlinearity().

306  {  // tanh(a), arranged so that Exp() only ever receives a non-positive argument.
307  if (a > Real(0)) {
308  Real inv_expa = Exp(-a);
309  return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa);  // inv_expa^2 == exp(-2a); equals (1 - exp(-2a)) / (1 + exp(-2a))
310  } else {
311  Real expa = Exp(a);
312  return Real(1) - Real(2) / (Real(1) + expa * expa);  // expa^2 == exp(2a); equals (exp(2a) - 1) / (exp(2a) + 1)
313  }
314 }
double Exp(double x)
Definition: kaldi-math.h:83
void Splice ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< Real > *  tgt 
)

Splice concatenates frames of src as specified in frame_offsets into tgt.

The number of rows in tgt must equal the number of rows in src, and it must be that tgt.NumCols() == src.NumCols() * frame_offsets.Dim(). As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the general case where i in [0..src.NumRows()-1], k in [0..frame_offsets.Dim()-1], j in [0..src.NumCols()-1] and n_cols = src.NumCols(). If i + frame_offsets[k] is greater than or equal to the number of rows in src, or less than 0, then the right side of the equation is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid an index out of bounds.

Definition at line 132 of file cu-math.cc.

References CU2DBLOCK, CuArray< T >::Data(), CuMatrixBase< Real >::Data(), CuArray< T >::Dim(), CuMatrixBase< Real >::Dim(), Timer::Elapsed(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), MatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), CuMatrixBase< Real >::NumRows(), and MatrixBase< Real >::RowData().

Referenced by Component::NewComponentOfType(), Splice::PropagateFnc(), and kaldi::UnitTestCuMathSplice().

133  {  // For each row r of tgt, concatenates the src rows r + frame_offsets[k] (clamped to valid rows) side by side.
134 
135  KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());  // tgt holds one src-width column-block per offset
136  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
137 
138  #if HAVE_CUDA == 1
139  if (CuDevice::Instantiate().Enabled()) {
140  Timer tim;
141 
142  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
143  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
144 
145  cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
146  frame_offsets.Data(), tgt->Dim(), src.Dim());
147  CU_SAFE_CALL(cudaGetLastError());
148 
149  CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
150  } else
151  #endif
152  {
153  // expand in CPU
154  const MatrixBase<Real> &srcmat = src.Mat();
155  const int32 *frame_offsetvec = frame_offsets.Data();
156  int32 dim = frame_offsets.Dim();  // number of offsets, i.e. number of column-blocks per tgt row
157  MatrixBase<Real> &tgtmat = tgt->Mat();
158  //
159  for(int32 r=0; r < tgtmat.NumRows(); r++) {
160  for(int32 off=0; off < dim; off++) {
161  int32 r_off = r + frame_offsetvec[off];  // source row for this offset, before clamping
162  if(r_off < 0) r_off = 0;  // clamp to the first row
163  if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;  // clamp to the last row
164  memcpy(tgtmat.RowData(r)+off*srcmat.NumCols(),srcmat.RowData(r_off),sizeof(Real)*srcmat.NumCols());  // copy one whole src row into column-block 'off' of tgt row r
165  }
166  }
167  }
168 }
const T * Data() const
Get raw pointer.
Definition: cu-array.h:65
#define CU2DBLOCK
Definition: cu-matrixdim.h:67
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
MatrixIndexT Dim() const
Return the vector dimension.
Definition: cu-array.h:62
template void kaldi::cu::Splice ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Splice ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< double > *  tgt 
)