All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
kaldi::cu Namespace Reference

Functions

template<typename Real >
void RegularizeL1 (CuMatrixBase< Real > *weight, CuMatrixBase< Real > *gradient, Real l1_penalty, Real learning_rate)
 RegularizeL1 is a gradient step with l1 regularization added to the gradient. More...
 
template<typename Real >
void Randomize (const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< Real > *tgt)
 Copies a permutation of src into tgt. More...
 
template<typename Real >
void Splice (const CuMatrixBase< Real > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< Real > *tgt)
 Splice concatenates frames of src as specified in frame_offsets into tgt. More...
 
template<typename Real >
void Copy (const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
 Copies elements from src into tgt as given by copy_from_indices. More...
 
template<typename Real >
void EnsureNonzero (const CuMatrixBase< Real > &src, Real epsilon, CuMatrixBase< Real > *dest)
 This function requires that src and dest have the same dimension and epsilon > 0. More...
 
template void RegularizeL1 (CuMatrixBase< float > *weight, CuMatrixBase< float > *grad, float l1, float lr)
 
template void RegularizeL1 (CuMatrixBase< double > *weight, CuMatrixBase< double > *grad, double l1, double lr)
 
template void Splice (const CuMatrixBase< float > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< float > *tgt)
 
template void Splice (const CuMatrixBase< double > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< double > *tgt)
 
template void Copy (const CuMatrixBase< float > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< float > *tgt)
 
template void Copy (const CuMatrixBase< double > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< double > *tgt)
 
template void Randomize (const CuMatrixBase< float > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< float > *tgt)
 
template void Randomize (const CuMatrixBase< double > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< double > *tgt)
 
template<typename Real >
void NormalizePerRow (const CuMatrixBase< Real > &in, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *out)
 Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square equals 1.0. More...
 
template void NormalizePerRow (const CuMatrixBase< float > &in, const float target_rms, const bool add_log_stddev, CuMatrixBase< float > *out)
 
template void NormalizePerRow (const CuMatrixBase< double > &in, const double target_rms, const bool add_log_stddev, CuMatrixBase< double > *out)
 
template<typename Real >
void DiffNormalizePerRow (const CuMatrixBase< Real > &in_value, const CuMatrixBase< Real > &out_deriv, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *in_deriv)
 
template void DiffNormalizePerRow (const CuMatrixBase< float > &in_value, const CuMatrixBase< float > &out_deriv, const float target_rms, const bool add_log_stddev, CuMatrixBase< float > *in_deriv)
 
template void DiffNormalizePerRow (const CuMatrixBase< double > &in_value, const CuMatrixBase< double > &out_deriv, const double target_rms, const bool add_log_stddev, CuMatrixBase< double > *in_deriv)
 
template<typename Real >
static Real ScalarSigmoid (Real a)
 
template<typename Real >
static Real ScalarTanh (Real a)
 
template<typename Real >
void CpuComputeLstmNonlinearity (const MatrixBase< Real > &input_mat, const MatrixBase< Real > &params_mat, MatrixBase< Real > *output)
 
template<typename Real >
void ComputeLstmNonlinearity (const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, CuMatrixBase< Real > *output)
 This is a special-purpose function used by class LstmNonlinearityComponent to do its forward propagation. More...
 
template void CpuComputeLstmNonlinearity (const MatrixBase< float > &input_mat, const MatrixBase< float > &params_mat, MatrixBase< float > *output)
 
template void CpuComputeLstmNonlinearity (const MatrixBase< double > &input_mat, const MatrixBase< double > &params_mat, MatrixBase< double > *output)
 
template void ComputeLstmNonlinearity (const CuMatrixBase< float > &input, const CuMatrixBase< float > &params, CuMatrixBase< float > *output)
 
template void ComputeLstmNonlinearity (const CuMatrixBase< double > &input, const CuMatrixBase< double > &params, CuMatrixBase< double > *output)
 
template<typename Real >
void CpuBackpropLstmNonlinearity (const MatrixBase< Real > &input, const MatrixBase< Real > &params, const MatrixBase< Real > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< Real > &self_repair_config, double count_in, MatrixBase< Real > *input_deriv, MatrixBase< Real > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< Real > *self_repair_sum_out)
 
template<typename Real >
void BackpropLstmNonlinearity (const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, const CuMatrixBase< Real > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< Real > &self_repair_config, double count_in, CuMatrixBase< Real > *input_deriv, CuMatrixBase< Real > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< Real > *self_repair_sum_out)
 This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity. More...
 
template<typename Real >
void EnsureNonzero (const CuVectorBase< Real > &src, Real epsilon, CuVectorBase< Real > *dest)
 Vector version of EnsureNonzero, see matrix version for documentation. More...
 
template void EnsureNonzero (const CuMatrixBase< float > &src, float epsilon, CuMatrixBase< float > *dest)
 
template void EnsureNonzero (const CuMatrixBase< double > &src, double epsilon, CuMatrixBase< double > *dest)
 
template void EnsureNonzero (const CuVectorBase< float > &src, float epsilon, CuVectorBase< float > *dest)
 
template void EnsureNonzero (const CuVectorBase< double > &src, double epsilon, CuVectorBase< double > *dest)
 
template void CpuBackpropLstmNonlinearity (const MatrixBase< float > &input, const MatrixBase< float > &params, const MatrixBase< float > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< float > &self_repair_config, double count_in, MatrixBase< float > *input_deriv, MatrixBase< float > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< float > *self_repair_sum_out)
 
template void CpuBackpropLstmNonlinearity (const MatrixBase< double > &input, const MatrixBase< double > &params, const MatrixBase< double > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< double > &self_repair_config, double count_in, MatrixBase< double > *input_deriv, MatrixBase< double > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< double > *self_repair_sum_out)
 
template void BackpropLstmNonlinearity (const CuMatrixBase< float > &input, const CuMatrixBase< float > &params, const CuMatrixBase< float > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< float > &self_repair_config, double count_in, CuMatrixBase< float > *input_deriv, CuMatrixBase< float > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< float > *self_repair_sum_out)
 
template void BackpropLstmNonlinearity (const CuMatrixBase< double > &input, const CuMatrixBase< double > &params, const CuMatrixBase< double > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< double > &self_repair_config, double count_in, CuMatrixBase< double > *input_deriv, CuMatrixBase< double > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< double > *self_repair_sum_out)
 

Function Documentation

void BackpropLstmNonlinearity ( const CuMatrixBase< Real > &  input,
const CuMatrixBase< Real > &  params,
const CuMatrixBase< Real > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< Real > &  self_repair_config,
double  count_in,
CuMatrixBase< Real > *  input_deriv,
CuMatrixBase< Real > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< Real > *  self_repair_sum_out 
)

This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity.

It's a little more complicated than you might expect because of the 'self-repair' mechanism that we use to prevent the sigmoid and tanh nonlinearities oversaturating, and because of the average-activation and average-derivative stats that we store for these nonlinearities (these stats are used both to control the self-repair mechanism, and for diagnostic purposes).

Because the forward pass computes various intermediate values that are not output, this function actually has to do the same computations as the forward pass before it actually does the backprop.

Parameters
[in] input: The same as in ComputeLstmNonlinearity(). A matrix, of dimension N by 5C (i.e. its num-cols must be a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). This function will also accept input of dimension N by 5C + 3, and the three final elements will be interpreted as scaling factors on i_t, f_t and o_t (useful as per-frame dropout masks).
[in] params: The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}.
[in] output_deriv: A matrix, of dimension N by 2C, containing the derivative of the objective function we're backpropagating, w.r.t. the quantities c_t and m_t (in two blocks of column-dimension C).
[in] deriv_sum_in: This is used in the self-repair code to identify oversaturated nonlinearities. It is a matrix, of dimension 5 by C, corresponding to the totals of the derivatives of the 5 sigmoid and tanh nonlinearities, in the order they appear in the equations in the documentation of ComputeLstmNonlinearity(). Respectively, they appear in the equations for (i_t, f_t, c_t, o_t, m_t). This will be divided by 'count_in' to get the average derivative value so far, for each of the nonlinearities.
[in] self_repair_config: A vector of dimension 10, containing the configuration of the self-repair to be used for the 5 nonlinearities. The first 5 elements are the self_repair_lower_threshold values (typically 0.05 for sigmoid and 0.2 for tanh), and the next 5 elements are the corresponding self-repair-scales (typically 10^-5).
[in] count_in: The data-count that corresponds to the stats in 'deriv_sum_in' at entry to the function. This function should tolerate the count being zero (in that case, it is free to do the self-repair or not, as this should only happen on the 1st minibatch of each training job).
[out] input_deriv: May be NULL; if not, this function writes, to this location, the backpropagated derivative of the objective function w.r.t. the 'input' matrix. This matrix should have the same dimension as 'input'. In addition to the regular backpropagated derivative, the output will include small values relating to 'self-repair'. If the input is of column-dimension 5C + 3 (i.e. we are using dropout masks), the derivatives w.r.t. the dropout masks will not be set; they will retain their value prior to this function call.
[out] params_deriv: May be NULL; if not, this is where this function *writes* [not adds] the backpropagated derivative of the objective function w.r.t. 'params'; it should have the same dimension as 'params' (3 by C). (This matrix will then be processed by the natural gradient code and added to the appropriate copy of the parameter matrix, outside this function).
[out] value_sum_out: Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C. This function *adds* to this location the total value of each of the sigmoid/tanh nonlinearities that it computes (this is for diagnostic purposes).
[out] deriv_sum_out: Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C; this function *adds* to this location the total of the derivative of each of the sigmoid/tanh nonlinearities that it computes (this is for diagnostic purposes and to control the self-repair). This function should tolerate the case when 'deriv_sum_out' points to the same data as 'deriv_sum_in'.
[out] self_repair_sum_out: Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C; this function *writes* to this location the sum of the number of times the self-repair code was activated (integer values 0 <= k <= N). This will be processed outside this function into self-repair stats for diagnostics.

Definition at line 768 of file cu-math.cc.

References CpuBackpropLstmNonlinearity(), CU1DBLOCK, CuVectorBase< Real >::Data(), CuMatrixBase< Real >::Data(), CuVectorBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), CuMatrixBase< Real >::Stride(), and CuVectorBase< Real >::Vec().

Referenced by LstmNonlinearityComponent::Backprop(), kaldi::UnitTestBackpropLstmNonlinearity(), and kaldi::UnitTestLstmNonlinearity().

778  {
779  int32 num_rows = input.NumRows(),
780  cell_dim = input.NumCols() / 5,
781  input_cols = input.NumCols();
782  // Check dimensions.
783  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3);
784  KALDI_ASSERT(params.NumRows() == 3);
785  KALDI_ASSERT(params.NumCols() == cell_dim);
786  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
787  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
788  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
789  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
790  KALDI_ASSERT(self_repair_config.Dim() == 10);
791  if (input_deriv != NULL) {
792  KALDI_ASSERT(SameDim(input, *input_deriv));
793  }
794  if (params_deriv == NULL) {
795  KALDI_ASSERT(value_sum_out == NULL);
796  KALDI_ASSERT(deriv_sum_out == NULL);
797  KALDI_ASSERT(self_repair_sum_out == NULL);
798  } else {
799  KALDI_ASSERT(value_sum_out != NULL);
800  KALDI_ASSERT(deriv_sum_out != NULL);
801  KALDI_ASSERT(self_repair_sum_out != NULL);
802  KALDI_ASSERT(SameDim(params, *params_deriv));
803  KALDI_ASSERT(value_sum_out->NumRows() == 5);
804  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
805  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
806  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
807  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
808  }
809 
810 
811 #if HAVE_CUDA == 1
812  if (CuDevice::Instantiate().Enabled()) {
813  CuTimer tim;
814  // Each thread block is working on 1 row of the data.
815  // It's best that cell dim is a multiple of CU1DBLOCK
816 
817  int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
818 
819  // Use 2D block (8x32 threads) as we need to compute column sum.
820  // Use 1D grid to cover the data matrix width `cell_dim`.
821  const int kWarpSize = 32;
822  dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
823 // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x),
824 // n_blocks(num_rows, dimBlock.y));
825 // if (dimGrid.x * dimGrid.y > 1024) {
826 // dimGrid.y = std::max(1024 / dimGrid.x, 1);
827 // }
828  dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
829  if (input_deriv == NULL) {
830  if (params_deriv == NULL) {
831  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
832  have_dropout_mask, num_rows,
833  input.Data(), input.Stride(), params.Data(),
834  params.Stride(), output_deriv.Data(),
835  output_deriv.Stride(), deriv_sum_in.Data(),
836  deriv_sum_in.Stride(),
837  self_repair_config.Data(), count_in + 1,
838  NULL,
839  0,
840  NULL,
841  0,
842  NULL,
843  0,
844  NULL,
845  0,
846  NULL,
847  0);
848 
849  } else {
850  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
851  have_dropout_mask, num_rows,
852  input.Data(), input.Stride(), params.Data(),
853  params.Stride(), output_deriv.Data(),
854  output_deriv.Stride(), deriv_sum_in.Data(),
855  deriv_sum_in.Stride(),
856  self_repair_config.Data(), count_in + 1,
857  NULL,
858  0, params_deriv->Data(),
859  params_deriv->Stride(),
860  value_sum_out->Data(),
861  value_sum_out->Stride(),
862  deriv_sum_out->Data(),
863  deriv_sum_out->Stride(),
864  self_repair_sum_out->Data(),
865  self_repair_sum_out->Stride());
866  }
867  } else {
868  if (params_deriv == NULL) {
869  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
870  have_dropout_mask, num_rows,
871  input.Data(), input.Stride(), params.Data(),
872  params.Stride(), output_deriv.Data(),
873  output_deriv.Stride(), deriv_sum_in.Data(),
874  deriv_sum_in.Stride(),
875  self_repair_config.Data(), count_in + 1,
876  input_deriv->Data(), input_deriv->Stride(),
877  NULL,
878  0, NULL, 0, NULL, 0, NULL, 0);
879  } else {
880  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
881  have_dropout_mask, num_rows,
882  input.Data(), input.Stride(), params.Data(),
883  params.Stride(), output_deriv.Data(),
884  output_deriv.Stride(), deriv_sum_in.Data(),
885  deriv_sum_in.Stride(),
886  self_repair_config.Data(), count_in + 1,
887  input_deriv->Data(), input_deriv->Stride(),
888  params_deriv->Data(),
889  params_deriv->Stride(),
890  value_sum_out->Data(),
891  value_sum_out->Stride(),
892  deriv_sum_out->Data(),
893  deriv_sum_out->Stride(),
894  self_repair_sum_out->Data(),
895  self_repair_sum_out->Stride());
896  }
897  }
898 
899  CU_SAFE_CALL(cudaGetLastError());
900 
901  CuDevice::Instantiate().AccuProfile(__func__, tim);
902  } else
903 #endif
904  {
905  CpuBackpropLstmNonlinearity(input.Mat(), params.Mat(), output_deriv.Mat(),
906  deriv_sum_in.Mat(), self_repair_config.Vec(),
907  count_in, &(input_deriv->Mat()),
908  &(params_deriv->Mat()), &(value_sum_out->Mat()),
909  &(deriv_sum_out->Mat()),
910  &(self_repair_sum_out->Mat()));
911  }
912 }
MatrixIndexT NumCols() const
Definition: cu-matrix.h:215
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
const MatrixBase< Real > & Mat() const
Definition: cu-matrix.h:698
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:214
template void CpuBackpropLstmNonlinearity(const MatrixBase< double > &input, const MatrixBase< double > &params, const MatrixBase< double > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< double > &self_repair_config, double count_in, MatrixBase< double > *input_deriv, MatrixBase< double > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< double > *self_repair_sum_out)
#define CU1DBLOCK
Definition: cu-matrixdim.h:63
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
MatrixIndexT Stride() const
Definition: cu-matrix.h:216
const Real * Data() const
Return data pointer (const).
Definition: cu-matrix.h:689
template void kaldi::cu::BackpropLstmNonlinearity ( const CuMatrixBase< float > &  input,
const CuMatrixBase< float > &  params,
const CuMatrixBase< float > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< float > &  self_repair_config,
double  count_in,
CuMatrixBase< float > *  input_deriv,
CuMatrixBase< float > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< float > *  self_repair_sum_out 
)
template void kaldi::cu::BackpropLstmNonlinearity ( const CuMatrixBase< double > &  input,
const CuMatrixBase< double > &  params,
const CuMatrixBase< double > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< double > &  self_repair_config,
double  count_in,
CuMatrixBase< double > *  input_deriv,
CuMatrixBase< double > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< double > *  self_repair_sum_out 
)
void ComputeLstmNonlinearity ( const CuMatrixBase< Real > &  input,
const CuMatrixBase< Real > &  params,
CuMatrixBase< Real > *  output 
)

This is a special-purpose function used by class LstmNonlinearityComponent to do its forward propagation.

It computes the core part of the LSTM nonlinearity. Refer to class LstmNonlinearityComponent in ../nnet3/nnet-simple-component.h for more context.

Parameters
[in] input: A matrix, of dimension N by 5C (i.e. its num-cols must be a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). This function will also accept input of dimension N by 5C + 3, and the three final elements will be used as scaling factors on i_t, f_t and o_t (useful as per-frame dropout masks).
[in] params: A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}.
[out] output: A matrix, of dimension N by 2C. The quantities c_t and m_t respectively are put there (in two blocks of column-dimension C), according to the following equations:

i_t = Sigmoid(i_part + w_{ic}*c_{t-1})
f_t = Sigmoid(f_part + w_{fc}*c_{t-1})
c_t = f_t*c_{t-1} + i_t * Tanh(c_part)
o_t = Sigmoid(o_part + w_{oc}*c_t)
m_t = o_t * Tanh(c_t)

Note on dropout: if the dropout mask is provided, let the mask values be i_t_mask, f_t_mask and o_t_mask (for each matrix row, these are scalars while i_t, f_t and o_t are of dimension C, because this is 'per-frame' dropout as described in http://www.danielpovey.com/files/2017_interspeech_dropout.pdf). Then the modification to the equations above consists of replacing 'i_t' with 'i_t_mask * i_t' in the RHS of the equations above, and the same type of change for f_t and o_t.

Definition at line 489 of file cu-math.cc.

References CpuComputeLstmNonlinearity(), CU1DBLOCK, CuMatrixBase< Real >::Data(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::Stride().

Referenced by LstmNonlinearityComponent::Propagate(), kaldi::UnitTestCuMathComputeLstmNonlinearity(), and kaldi::UnitTestLstmNonlinearity().

491  {
492  int32 num_rows = input.NumRows(),
493  input_cols = input.NumCols(),
494  cell_dim = input_cols / 5;
495  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
496  KALDI_ASSERT(output->NumRows() == num_rows);
497  KALDI_ASSERT(params.NumRows() == 3);
498  KALDI_ASSERT(params.NumCols() == cell_dim);
499  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
500 
501 #if HAVE_CUDA == 1
502  if (CuDevice::Instantiate().Enabled()) {
503  CuTimer tim;
504 
505  int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
506 
507  // Each thread block is working on 1 row of the data.
508  // It's best that cell dim is a multiple of CU1DBLOCK
509  dim3 dimBlock(CU1DBLOCK);
510  dim3 dimGrid(num_rows);
511 
512  cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(),
513  params.Data(), params.Stride(), output->Stride(),
514  cell_dim, have_dropout_mask, num_rows, output->Data());
515  CU_SAFE_CALL(cudaGetLastError());
516 
517  CuDevice::Instantiate().AccuProfile(__func__, tim);
518  } else
519 #endif
520  {
521  CpuComputeLstmNonlinearity(input.Mat(), params.Mat(), &output->Mat());
522  }
523 }
template void CpuComputeLstmNonlinearity(const MatrixBase< double > &input_mat, const MatrixBase< double > &params_mat, MatrixBase< double > *output)
#define CU1DBLOCK
Definition: cu-matrixdim.h:63
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::ComputeLstmNonlinearity ( const CuMatrixBase< float > &  input,
const CuMatrixBase< float > &  params,
CuMatrixBase< float > *  output 
)
template void kaldi::cu::ComputeLstmNonlinearity ( const CuMatrixBase< double > &  input,
const CuMatrixBase< double > &  params,
CuMatrixBase< double > *  output 
)
void Copy ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< Real > *  tgt 
)

Copies elements from src into tgt as given by copy_from_indices.

The matrices src and tgt must have the same dimensions and the dimension of copy_from_indices must equal the number of columns in the src matrix. As a result, tgt(i, j) == src(i, copy_from_indices[j]). Also see CuMatrix::CopyCols(), which is more general.

Definition at line 173 of file cu-math.cc.

References CU2DBLOCK, CuArrayBase< T >::Data(), CuMatrixBase< Real >::Data(), CuArrayBase< T >::Dim(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::NumRows().

Referenced by CopyComponent::PropagateFnc(), kaldi::TestClusterUtilsVector(), and kaldi::UnitTestCuMathCopy().

174  {
175 
176  KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
177  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
178 
179  #if HAVE_CUDA == 1
180  if (CuDevice::Instantiate().Enabled()) {
181  CuTimer tim;
182 
183  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
184  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
185 
186  cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
187  copy_from_indices.Data(), tgt->Dim(), src.Dim());
188  CU_SAFE_CALL(cudaGetLastError());
189 
190  CuDevice::Instantiate().AccuProfile(__func__, tim);
191  } else
192  #endif
193  {
194  // expand in CPU
195  const MatrixBase<Real> &srcmat = src.Mat();
196  const int32 *copy_from_indicesvec = copy_from_indices.Data();
197  int32 dim = copy_from_indices.Dim();
198  MatrixBase<Real> &tgtmat = tgt->Mat();
199  //
200  for(int32 r = 0; r < tgtmat.NumRows(); r++) {
201  for(int32 c = 0; c < dim; c++) {
202  tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
203  }
204  }
205  }
206 }
const T * Data() const
Get raw pointer.
Definition: cu-array.h:52
#define CU2DBLOCK
Definition: cu-matrixdim.h:67
MatrixIndexT Dim() const
Return the vector dimension.
Definition: cu-array.h:49
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::Copy ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Copy ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< double > *  tgt 
)
void CpuBackpropLstmNonlinearity ( const MatrixBase< Real > &  input,
const MatrixBase< Real > &  params,
const MatrixBase< Real > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< Real > &  self_repair_config,
double  count_in,
MatrixBase< Real > *  input_deriv,
MatrixBase< Real > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< Real > *  self_repair_sum_out 
)

Definition at line 543 of file cu-math.cc.

References count, VectorBase< Real >::Dim(), rnnlm::i, KALDI_ASSERT, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), kaldi::SameDim(), ScalarSigmoid(), and ScalarTanh().

Referenced by BackpropLstmNonlinearity(), and kaldi::UnitTestBackpropLstmNonlinearity().

553  {
554  int32 num_rows = input.NumRows(),
555  input_cols = input
556  .NumCols(),
557  cell_dim = input.NumCols() / 5;
558  // Check dimensions.
559  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
560  KALDI_ASSERT(params.NumRows() == 3);
561  KALDI_ASSERT(params.NumCols() == cell_dim);
562  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
563  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
564  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
565  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
566  KALDI_ASSERT(self_repair_config.Dim() == 10);
567  if (input_deriv != NULL) {
568  KALDI_ASSERT(SameDim(input, *input_deriv));
569  }
570  if (params_deriv == NULL) {
571  KALDI_ASSERT(value_sum_out == NULL);
572  KALDI_ASSERT(deriv_sum_out == NULL);
573  KALDI_ASSERT(self_repair_sum_out == NULL);
574  } else {
575  KALDI_ASSERT(value_sum_out != NULL);
576  KALDI_ASSERT(deriv_sum_out != NULL);
577  KALDI_ASSERT(self_repair_sum_out != NULL);
578  KALDI_ASSERT(SameDim(params, *params_deriv));
579  KALDI_ASSERT(value_sum_out->NumRows() == 5);
580  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
581  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
582  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
583  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
584  }
585 
586  const MatrixBase<Real> &input_mat = input;
587  const MatrixBase<Real> &params_mat = params;
588  const MatrixBase<Real> &output_deriv_mat = output_deriv;
589  const MatrixBase<double> &deriv_sum_in_mat = deriv_sum_in;
590  const VectorBase<Real> &sr_config = self_repair_config;
591  MatrixBase<Real> *input_deriv_mat = (
592  input_deriv == NULL ? NULL : input_deriv);
593  MatrixBase<Real> *params_deriv_mat = NULL;
594  MatrixBase<Real> *self_repair_sum_out_mat = NULL;
595  MatrixBase<double> *value_sum_out_mat = NULL;
596  MatrixBase<double> *deriv_sum_out_mat = NULL;
597  if (params_deriv != NULL) {
598  params_deriv_mat = params_deriv;
599  value_sum_out_mat = value_sum_out;
600  deriv_sum_out_mat = deriv_sum_out;
601  self_repair_sum_out_mat = self_repair_sum_out;
602  }
603 
604 
605  // We add 1.0 (i.e. a small value) to the count to avoid division by zero.
606  Real count = 1.0 + count_in;
607  for (int32 c = 0; c < cell_dim; c++) {
608  // parameters
609  Real w_ic = params_mat(0, c);
610  Real w_fc = params_mat(1, c);
611  Real w_oc = params_mat(2, c);
612  // derivative sums w.r.t. parameters.
613  Real w_ic_deriv_sum = 0.0;
614  Real w_fc_deriv_sum = 0.0;
615  Real w_oc_deriv_sum = 0.0;
616 
617  // average derivatives, for self-repair.
618  // The 5 nonlinearities that are subject to self-repair are written as:
619  // Sigmoid(i_t_input), Sigmoid(f_t_input),
620  // Tanh(c_part), Sigmoid(o_t_input), Tanh(c_t)
621  Real i_t_self_repair = (
622  deriv_sum_in_mat(0, c) / count < sr_config(0) ? sr_config(5) : 0.0);
623  Real f_t_self_repair = (
624  deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0);
625  Real c_part_self_repair = (
626  deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0);
627  Real o_t_self_repair = (
628  deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0);
629  Real c_t_self_repair = (
630  deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0);
631  // Note on how we add self-repair for sigmoids/tanh's. If self-repair
632  // is activated for this unit, then...
633  // For sigmoids we'd add -self_repair_scale * (2 * sigmoid(x) - 1.0)
634  // ... to the input-deriv;
635  // For tanh's we'd add -self_repair_scale * tanh(x)
636  // If self-repair is not activated, the 'self_repair' scales are set to zero.
637 
638  // The following variables are for the accumulation of stats on the
639  // sigmoid and tanh units.
640  Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0;
641  Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0;
642  Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0;
643  Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0;
644  Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0;
645 
646 
647  for (int32 r = 0; r < num_rows; r++) {
648  Real i_part = input_mat(r, c),
649  f_part = input_mat(r, c + cell_dim),
650  c_part = input_mat(r, c + 2 * cell_dim),
651  o_part = input_mat(r, c + 3 * cell_dim),
652  c_prev = input_mat(r, c + 4 * cell_dim);
653 
654  Real i_scale = (input_cols == cell_dim * 5 ? 1.0 :
655  input_mat(r, cell_dim * 5)),
656  f_scale = (input_cols == cell_dim * 5 ? 1.0 :
657  input_mat(r, cell_dim * 5 + 1)),
658  o_scale = (input_cols == cell_dim * 5 ? 1.0 :
659  input_mat(r, cell_dim * 5 + 2));
660 
661  // For greater clarity, we give some of the quantities in the
662  // forward equations their own names.
663  Real i_t_input = i_part + w_ic * c_prev,
664  i_t = ScalarSigmoid(i_t_input),
665  f_t_input = f_part + w_fc * c_prev,
666  f_t = ScalarSigmoid(f_t_input),
667  tanh_c_part = ScalarTanh(c_part),
668  c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part,
669  o_t_input = o_part + w_oc * c_t,
670  o_t = ScalarSigmoid(o_t_input),
671  tanh_c_t = ScalarTanh(c_t);
672  // we'd also compute, in the forward pass,
673  // m_t = o_t * tanh_c_t;
674  // but this variable is not needed.
675 
676  // Accumulate nonlinearity value and derivative stats.
677  // Note:
678  // tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)
679  // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
680  i_t_value_sum += i_t;
681  i_t_deriv_sum += i_t * (1.0F - i_t);
682  f_t_value_sum += f_t;
683  f_t_deriv_sum += f_t * (1.0F - f_t);
684  c_part_value_sum += tanh_c_part;
685  c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part;
686  o_t_value_sum += o_t;
687  o_t_deriv_sum += o_t * (1.0F - o_t);
688  c_t_value_sum += tanh_c_t;
689  c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t;
690 
691 
692  // the derivative of the objective function w.r.t. a particular quantity
693  // will be written by prepending "d" to the name.
694  // We compute these derivatives in the reverse of the order in which
695  // we computed the original quantities.
696  // dc_t_out is the part of the derivative w.r.t. c_t that
697  // comes directly from the output of this function.
698  Real dc_t_out = output_deriv_mat(r, c);
699  Real dm_t = output_deriv_mat(r, c + cell_dim);
700  Real dtanh_c_t = o_t * o_scale * dm_t;
701  Real do_t = o_scale * tanh_c_t * dm_t;
702  Real do_t_input = (o_t * (1.0F - o_t) * do_t
703  - (2.0F * o_t - 1.0F) * o_t_self_repair);
704  Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out
705  + do_t_input * w_oc) - tanh_c_t * c_t_self_repair;
706  Real dtanh_c_part = i_t * i_scale * dc_t;
707  Real df_t = dc_t * f_scale * c_prev;
708  Real df_t_input = ((df_t * f_t * (1.0F - f_t)
709  - (2.0F * f_t - 1.0F) * f_t_self_repair));
710  Real di_t = dc_t * i_scale * tanh_c_part;
711  Real di_t_input = ((di_t * i_t * (1.0F - i_t)
712  - (2.0F * i_t - 1.0F) * i_t_self_repair));
713 
714  w_ic_deriv_sum += c_prev * di_t_input;
715  w_fc_deriv_sum += c_prev * df_t_input;
716  w_oc_deriv_sum += c_t * do_t_input;
717 
718  Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t;
719  Real do_part = do_t_input;
720  Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part
721  - tanh_c_part * c_part_self_repair);
722  Real df_part = df_t_input;
723  Real di_part = di_t_input;
724 
725  if (input_deriv_mat != NULL) {
726  (*input_deriv_mat)(r, c) = di_part;
727  (*input_deriv_mat)(r, c + cell_dim) = df_part;
728  (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part;
729  (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part;
730  (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev;
731  }
732  }
733 
734  if (params_deriv != NULL) {
735  // note: for optimizing things you can assume that params_deriv and
736  // input_deriv_mat are non-NULL (i.e. all the output matrices are
737  // non-NULL). The situations when some of the output matrices are NULL
738  // does not happen often (mainly only in testing code).
739 
740  (*params_deriv_mat)(0, c) = w_ic_deriv_sum;
741  (*params_deriv_mat)(1, c) = w_fc_deriv_sum;
742  (*params_deriv_mat)(2, c) = w_oc_deriv_sum;
743 
744  (*value_sum_out_mat)(0, c) += i_t_value_sum;
745  (*value_sum_out_mat)(1, c) += f_t_value_sum;
746  (*value_sum_out_mat)(2, c) += c_part_value_sum;
747  (*value_sum_out_mat)(3, c) += o_t_value_sum;
748  (*value_sum_out_mat)(4, c) += c_t_value_sum;
749 
750  // need to update self_repair_sum_out before deriv_sum_out, because
751  // deriv_sum_out and deriv_sum_in might point to the same memory.
752  for (int32 i = 0; i < 5; i++)
753  (*self_repair_sum_out_mat)(i, c) =
754  (deriv_sum_in_mat(i, c) / count < sr_config(i) ? num_rows : 0);
755 
756  (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum;
757  (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum;
758  (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum;
759  (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum;
760  (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum;
761  }
762  }
763 }
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
const size_t count
static Real ScalarTanh(Real a)
Definition: cu-math.cc:434
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
static Real ScalarSigmoid(Real a)
Definition: cu-math.cc:424
template void kaldi::cu::CpuBackpropLstmNonlinearity ( const MatrixBase< float > &  input,
const MatrixBase< float > &  params,
const MatrixBase< float > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< float > &  self_repair_config,
double  count_in,
MatrixBase< float > *  input_deriv,
MatrixBase< float > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< float > *  self_repair_sum_out 
)
template void kaldi::cu::CpuBackpropLstmNonlinearity ( const MatrixBase< double > &  input,
const MatrixBase< double > &  params,
const MatrixBase< double > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< double > &  self_repair_config,
double  count_in,
MatrixBase< double > *  input_deriv,
MatrixBase< double > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< double > *  self_repair_sum_out 
)
void CpuComputeLstmNonlinearity ( const MatrixBase< Real > &  input_mat,
const MatrixBase< Real > &  params_mat,
MatrixBase< Real > *  output 
)

Definition at line 445 of file cu-math.cc.

References MatrixBase< Real >::Data(), KALDI_ASSERT, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), MatrixBase< Real >::RowData(), ScalarSigmoid(), ScalarTanh(), and MatrixBase< Real >::Stride().

Referenced by ComputeLstmNonlinearity(), and kaldi::UnitTestCuMathComputeLstmNonlinearity().

447  {  // CPU forward pass of the LSTM nonlinearity: each output row gets c_t in its first cell_dim columns and m_t in the next cell_dim.
448  int32 num_rows = input_mat.NumRows(),
449  input_cols = input_mat.NumCols(),
450  cell_dim = input_cols / 5;  // integer division: both accepted widths (5*cell_dim and 5*cell_dim + 3) yield the same cell_dim
451  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
452  KALDI_ASSERT(output->NumRows() == num_rows);
453  KALDI_ASSERT(params_mat.NumRows() == 3);
454  KALDI_ASSERT(params_mat.NumCols() == cell_dim);
455  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
456 
457  MatrixBase<Real> &output_mat = *output;
458  const Real *params_data = params_mat.Data();
459  int32 params_stride = params_mat.Stride();
460  for (int32 r = 0; r < num_rows; r++) {
461  const Real *input_row = input_mat.RowData(r);
462  // i_scale and f_scale relate to dropout, they will normally be 1.0.
463  Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]),
464  f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]),
465  o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]);  // the three scales are read from the 3 trailing columns when present, else default to 1.0
466 
467  Real *output_row = output_mat.RowData(r);
468  for (int32 c = 0; c < cell_dim; c++) {
469  Real i_part = input_row[c];  // the input row is laid out as five consecutive cell_dim-wide slices: i, f, c, o, c_prev
470  Real f_part = input_row[c + cell_dim];
471  Real c_part = input_row[c + 2 * cell_dim];
472  Real o_part = input_row[c + 3 * cell_dim];
473  Real c_prev = input_row[c + 4 * cell_dim];
474  Real w_ic = params_data[c];  // per-cell weights, one per params_mat row (offsets of 0, 1 and 2 strides from Data())
475  Real w_fc = params_data[c + params_stride];
476  Real w_oc = params_data[c + params_stride * 2];
477  Real i_t = ScalarSigmoid(i_part + w_ic * c_prev);
478  Real f_t = ScalarSigmoid(f_part + w_fc * c_prev);
479  Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part);
480  Real o_t = ScalarSigmoid(o_part + w_oc * c_t);  // the output gate uses the freshly computed c_t, not c_prev
481  Real m_t = o_t * o_scale * ScalarTanh(c_t);
482  output_row[c] = c_t;
483  output_row[c + cell_dim] = m_t;
484  }
485  }
486 }
static Real ScalarTanh(Real a)
Definition: cu-math.cc:434
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
static Real ScalarSigmoid(Real a)
Definition: cu-math.cc:424
template void kaldi::cu::CpuComputeLstmNonlinearity ( const MatrixBase< float > &  input_mat,
const MatrixBase< float > &  params_mat,
MatrixBase< float > *  output 
)
template void kaldi::cu::CpuComputeLstmNonlinearity ( const MatrixBase< double > &  input_mat,
const MatrixBase< double > &  params_mat,
MatrixBase< double > *  output 
)
void DiffNormalizePerRow ( const CuMatrixBase< Real > &  in_value,
const CuMatrixBase< Real > &  out_deriv,
const Real  target_rms,
const bool  add_log_stddev,
CuMatrixBase< Real > *  in_deriv 
)

Definition at line 349 of file cu-math.cc.

References CuVectorBase< Real >::AddDiagMat2(), CuVectorBase< Real >::AddDiagMatMat(), CuMatrixBase< Real >::AddDiagVecMat(), CuVectorBase< Real >::ApplyFloor(), CU1DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), kaldi::kNoTrans, kaldi::kTrans, kaldi::kUndefined, CuMatrixBase< Real >::MulRowsVec(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::Stride().

Referenced by NormalizeComponent::Backprop(), and kaldi::UnitTestCuDiffNormalizePerRow().

352  {  // Derivative counterpart of the per-row normalization: updates *in_deriv from in_value and out_deriv (CUDA kernel if enabled, otherwise the matrix-op path below).
353  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
354 #if HAVE_CUDA == 1
355  if (CuDevice::Instantiate().Enabled()) {
356  CuTimer tim;
357  size_t dimBlock = CU1DBLOCK;
358  size_t dimGrid = in_deriv->NumRows();  // grid size equals the number of rows
359  cuda_diff_normalize_per_row(dimGrid, dimBlock, in_deriv->Data(),
360  in_deriv->Stride(), in_value.Data(),
361  in_value.Dim(), out_deriv.Data(),
362  out_deriv.Stride(), target_rms, add_log_stddev);
363  CU_SAFE_CALL(cudaGetLastError());
364  CuDevice::Instantiate().AccuProfile(__func__, tim);
365  } else
366 #endif
367  {
368  const CuSubMatrix<Real> out_deriv_no_log(out_deriv, 0, out_deriv.NumRows(),
369  0, in_value.NumCols());  // out_deriv restricted to its first in_value.NumCols() columns (drops the extra log-stddev column, if any)
370  CuVector<Real> dot_products(out_deriv.NumRows());
371  dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value,
372  kTrans, 0.0);  // presumably diag(out_deriv_no_log * in_value^T), i.e. one dot product per row
373  CuVector<Real> in_norm(in_value.NumRows());
374  Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
375  in_norm.AddDiagMat2(1.0, in_value, kNoTrans, 0.0);  // presumably the per-row squared norm x^T x
376 
377  if (add_log_stddev) {
378  CuVector<Real> log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1
379  out_deriv_for_stddev(out_deriv.NumRows(), kUndefined);
380  // f = log(sqrt(max(epsi, x^T x / D)))
381  // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0.
382  // we don't compute this exactly below for the case when x^T x is very
383  // small, but we do make sure that the deriv isn't infinity when the input
384  // is zero.
385  log_stddev_deriv.ApplyFloor(in_value.NumCols() * kSquaredNormFloor);
386  log_stddev_deriv.ApplyPow(-1.0);
387  out_deriv_for_stddev.CopyColFromMat(out_deriv, (out_deriv.NumCols() - 1));  // the last column of out_deriv is used as the log-stddev derivative
388  log_stddev_deriv.MulElements(out_deriv_for_stddev);
389  if (in_deriv)
390  in_deriv->AddDiagVecMat(1.0, log_stddev_deriv, in_value, kNoTrans, 1.0);
391  }
392  in_norm.Scale(1.0 / d_scaled);
393  in_norm.ApplyFloor(kSquaredNormFloor);
394  in_norm.ApplyPow(-0.5);  // in_norm is now max(floor, x^T x / d_scaled)^-0.5 per row, the same scale the forward pass applies
395  if (in_deriv) {
396  if (in_deriv->Data() != out_deriv_no_log.Data())  // pointer comparison distinguishes the in-place case (in_deriv aliasing out_deriv)
397  in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans, 1.0);
398  else
399  in_deriv->MulRowsVec(in_norm);
400  in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);  // 1/sqrt(floor) is the value a floored entry takes after ApplyPow(-0.5); presumably such entries become 0 so floored rows get no correction term -- confirm ReplaceValue semantics
401  in_norm.ApplyPow(3.0);
402  dot_products.MulElements(in_norm);
403 
404  in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value, kNoTrans,
405  1.0);
406  }
407  }
408 }
#define CU1DBLOCK
Definition: cu-matrixdim.h:63
template void kaldi::cu::DiffNormalizePerRow ( const CuMatrixBase< float > &  in_value,
const CuMatrixBase< float > &  out_deriv,
const float  target_rms,
const bool  add_log_stddev,
CuMatrixBase< float > *  in_deriv 
)
template void kaldi::cu::DiffNormalizePerRow ( const CuMatrixBase< double > &  in_value,
const CuMatrixBase< double > &  out_deriv,
const double  target_rms,
const bool  add_log_stddev,
CuMatrixBase< double > *  in_deriv 
)
void EnsureNonzero ( const CuMatrixBase< Real > &  src,
Real  epsilon,
CuMatrixBase< Real > *  dest 
)

This function requires that src and dest have the same dimension and epsilon > 0.

It copies src to dest while ensuring that the values are bounded away from zero by at least epsilon:

y = x if fabs(x) >= epsilon;
epsilon if 0 <= x < epsilon;
-epsilon if -epsilon < x < 0.

Definition at line 209 of file cu-math.cc.

References CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), CuMatrixBase< Real >::RowData(), kaldi::SameDim(), and CuMatrixBase< Real >::Stride().

Referenced by ScaleAndOffsetComponent::BackpropInternal(), EnsureNonzero(), ScaleAndOffsetComponent::PropagateInternal(), and kaldi::UnitTestEnsureNonzero().

211  {  // Copies src into dest, moving every element with |x| < epsilon out to +epsilon or -epsilon.
212  KALDI_ASSERT(SameDim(*dest, src) && epsilon > 0.0);
213 #if HAVE_CUDA == 1
214  if (CuDevice::Instantiate().Enabled()) {
215  CuTimer tim;
216  dim3 dimGrid, dimBlock;
217  GetBlockSizesForSimpleMatrixOperation(src.NumRows(), src.NumCols(),
218  &dimGrid, &dimBlock);
219  cuda_ensure_nonzero(dimGrid, dimBlock, src.Data(), src.Dim(),
220  epsilon, dest->Stride(), dest->Data());
221  CU_SAFE_CALL(cudaGetLastError());
222  CuDevice::Instantiate().AccuProfile(__func__, tim);
223  } else
224 #endif
225  {
226  int32 num_rows = src.NumRows(), num_cols = src.NumCols();
227  for (int32 r = 0; r < num_rows; r++) {
228  const Real *src_data = src.RowData(r);
229  Real *dest_data = dest->RowData(r);
230  for (int32 c = 0; c < num_cols; c++) {
231  Real x = src_data[c], y;
232  if (x <= -epsilon || x >= epsilon) y = x;  // already at least epsilon away from zero: copied unchanged
233  else if (x >= 0.0) y = epsilon;  // 0 <= x < epsilon, so an exact zero maps to +epsilon
234  else y = -epsilon;  // -epsilon < x < 0
235  dest_data[c] = y;
236  }
237  }
238  }
239 }
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
void EnsureNonzero ( const CuVectorBase< Real > &  src,
Real  epsilon,
CuVectorBase< Real > *  dest 
)

Vector version of EnsureNonzero, see matrix version for documentation.

Definition at line 915 of file cu-math.cc.

References CuVectorBase< Real >::Data(), CuVectorBase< Real >::Dim(), EnsureNonzero(), and KALDI_ASSERT.

917  {  // Vector overload: wraps src and dest as 1-row matrices and delegates to the matrix version.
918  KALDI_ASSERT(src.Dim() == dest->Dim());
919  int32 dim = src.Dim();
920  // fake it with a 1-row matrix.
921  CuSubMatrix<Real> src_mat(src.Data(), 1, dim, dim),  // arguments after the data pointer are presumably (rows, cols, stride) -- confirm against the CuSubMatrix constructor
922  dest_mat(dest->Data(), 1, dim, dim);
923  EnsureNonzero(src_mat, epsilon, &dest_mat);  // the epsilon > 0 check is performed by the matrix version
924 }
template void EnsureNonzero(const CuVectorBase< double > &src, double epsilon, CuVectorBase< double > *dest)
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::EnsureNonzero ( const CuMatrixBase< float > &  src,
float  epsilon,
CuMatrixBase< float > *  dest 
)
template void kaldi::cu::EnsureNonzero ( const CuMatrixBase< double > &  src,
double  epsilon,
CuMatrixBase< double > *  dest 
)
template void kaldi::cu::EnsureNonzero ( const CuVectorBase< float > &  src,
float  epsilon,
CuVectorBase< float > *  dest 
)
template void kaldi::cu::EnsureNonzero ( const CuVectorBase< double > &  src,
double  epsilon,
CuVectorBase< double > *  dest 
)
void NormalizePerRow ( const CuMatrixBase< Real > &  in,
const Real  target_rms,
const bool  add_log_stddev,
CuMatrixBase< Real > *  out 
)

The normalize nonlinearity rescales each row of activations so that its root-mean-square equals target_rms.

The output is y_i = scale * x_i, and we want the RMS value of the y_i to equal target_rms, so y^T y = D * target_rms^2 (where y is one row of the output and D is the number of columns of the input). Hence we need scale = 1.0 / sqrt(x^T x / (D * target_rms^2)). There is also flooring involved, to avoid division-by-zero problems. It is important for the backprop that the floor's square root is exactly representable as a float. If add_log_stddev is true, log(max(epsi, sqrt(x^T x / D))) is appended as an extra dimension of the output.

Definition at line 280 of file cu-math.cc.

References CuMatrixBase< Real >::CopyColFromVec(), CuMatrixBase< Real >::CopyFromMat(), CU1DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, kaldi::kNoTrans, CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), and CuMatrixBase< Real >::Stride().

Referenced by NormalizeComponent::Propagate(), and kaldi::UnitTestCuMathNormalizePerRow().

281  {  // Scales each row of `in` to RMS target_rms and writes it to *out; with add_log_stddev, one extra output column holds a log-stddev value.
282  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
283  if (add_log_stddev) {
284  KALDI_ASSERT(in.NumRows() == out->NumRows());
285  KALDI_ASSERT(in.NumCols() + 1 == out->NumCols());  // room for exactly one extra column
286  } else {
287  KALDI_ASSERT(SameDim(in, *out));
288  }
289 
290 #if HAVE_CUDA == 1
291  if (CuDevice::Instantiate().Enabled()) {
292  CuTimer tim;
293  size_t dimBlock = CU1DBLOCK;
294  size_t dimGrid = out->NumRows();  // grid size equals the number of rows
295  cuda_normalize_per_row(dimGrid, dimBlock, out->Data(), out->Stride(),
296  in.Data(), in.Dim(), target_rms, add_log_stddev);
297  CU_SAFE_CALL(cudaGetLastError());
298  CuDevice::Instantiate().AccuProfile(__func__, tim);
299  } else
300 #endif
301  {
302  CuSubMatrix<Real> out_no_log(*out, 0, out->NumRows(), 0, in.NumCols());  // the part of *out that receives the normalized values (excludes the extra column)
303  if (in.Data() != out_no_log.Data())  // skip the copy when the operation is in place
304  out_no_log.CopyFromMat(in);
305  CuVector<Real> in_norm(in.NumRows());
306  Real d_scaled = in.NumCols() * target_rms * target_rms;
307  in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);  // presumably x^T x / (D * target_rms^2) per row
308  in_norm.ApplyFloor(kSquaredNormFloor);
309  in_norm.ApplyPow(-0.5);  // in_norm is now the per-row scale
310  out_no_log.MulRowsVec(in_norm);
311  if (add_log_stddev) {
312  in_norm.ApplyLog();
313  in_norm.Scale(-1.0);
314  in_norm.Add(log(target_rms));  // -log(scale) + log(target_rms), which equals log(sqrt(x^T x / D)) whenever the floor was not applied
315  out->CopyColFromVec(in_norm, in.NumCols());  // written to the column just past the normalized data
316  }
317  }
318 }
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
#define CU1DBLOCK
Definition: cu-matrixdim.h:63
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::NormalizePerRow ( const CuMatrixBase< float > &  in,
const float  target_rms,
const bool  add_log_stddev,
CuMatrixBase< float > *  out 
)
template void kaldi::cu::NormalizePerRow ( const CuMatrixBase< double > &  in,
const double  target_rms,
const bool  add_log_stddev,
CuMatrixBase< double > *  out 
)
void Randomize ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< Real > *  tgt 
)

Copies a permutation of src into tgt.

The row permutation is specified in copy_from_idx such that tgt.Row(r) == src.Row(copy_from_idx[r]). src and tgt must have the same dimensions, the dimension of copy_from_idx must not exceed the number of rows in tgt, and all elements of copy_from_idx must be in [0, src.NumRows()-1].

Definition at line 80 of file cu-math.cc.

References CuArrayBase< T >::Data(), CuMatrixBase< Real >::Data(), CuArrayBase< T >::Dim(), CuMatrixBase< Real >::Dim(), rnnlm::i, KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), MatrixBase< Real >::Row(), and MatrixDim_::rows.

Referenced by MatrixRandomizer::Randomize(), and kaldi::UnitTestCuMathRandomize().

82  {  // Row gather: for each i < copy_from_idx.Dim(), row i of tgt becomes row copy_from_idx[i] of src.
83 
84  KALDI_ASSERT(src.NumCols() == tgt->NumCols());
85  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
86  KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());  // only an upper bound is enforced; the index values themselves are not range-checked here
87 
88  #if HAVE_CUDA == 1
89  if (CuDevice::Instantiate().Enabled()) {
90  CuTimer tim;
91 
92  /*
93  Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
94  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
95  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
96  */
97 
98  /*
99  * Let's use blocksize 4 x 128 (512 threads/block)
100  * and extend the randomizable matrices to: col 4*65535, row 128*65535
101  * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
102  */
103  dim3 dimBlock(4, 128);
104  dim3 dimGrid(n_blocks(tgt->NumCols(), 4), n_blocks(copy_from_idx.Dim(), 128));
105  /*
106  */
107 
108  MatrixDim dimsrc = src.Dim(); dimsrc.rows=copy_from_idx.Dim();  // the row count handed to the kernel is overridden with the number of indices
109  MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();
110 
111  cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
112  copy_from_idx.Data(), dimtgt, dimsrc);
113  CU_SAFE_CALL(cudaGetLastError());
114 
115  CuDevice::Instantiate().AccuProfile(__func__, tim);
116  } else
117  #endif
118  {
119  // randomize in CPU
120  const MatrixBase<Real> &srcmat = src.Mat();
121  const int32 *copy_from_idxvec = copy_from_idx.Data();
122  MatrixBase<Real> &tgtmat = tgt->Mat();
123  for(int32 i=0; i<copy_from_idx.Dim(); i++) {  // rows of tgt at index >= copy_from_idx.Dim() are left untouched
124  tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
125  }
126  }
127 }
int32_cuda rows
Definition: cu-matrixdim.h:53
const T * Data() const
Get raw pointer.
Definition: cu-array.h:52
Structure containing size of the matrix plus stride.
Definition: cu-matrixdim.h:52
MatrixIndexT Dim() const
Return the vector dimension.
Definition: cu-array.h:49
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::Randomize ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Randomize ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< double > *  tgt 
)
void RegularizeL1 ( CuMatrixBase< Real > *  weight,
CuMatrixBase< Real > *  gradient,
Real  l1_penalty,
Real  learning_rate 
)

RegularizeL1 is a gradient step with l1 regularization added to the gradient.

We don't let the value cross over zero from positive to negative or vice versa, in a single step. If an element tries to cross zero and is stopped, we zero the gradient. (Dan: not sure why).

Definition at line 37 of file cu-math.cc.

References CU2DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), MatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), and CuMatrixBase< Real >::Stride().

Referenced by LinearTransform::Update(), and AffineTransform::Update().

37  {  // Applies the L1 shrinkage term to *weight in place; where a weight would change sign, both the weight and its gradient entry are zeroed.
38  KALDI_ASSERT(SameDim(*weight, *grad));
39 #if HAVE_CUDA == 1
40  if (CuDevice::Instantiate().Enabled()) {
41  CuTimer tim;
42 
43  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
44  dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));
45 
46  cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
47  weight->Dim(), grad->Stride());
48  CU_SAFE_CALL(cudaGetLastError());
49 
50  CuDevice::Instantiate().AccuProfile(__func__, tim);
51  } else
52  #endif
53  {
54  MatrixBase<Real> &weight2 = weight->Mat();
55  MatrixBase<Real> &grad2 = grad->Mat();
56  for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
57  for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {
58 
59  if(weight2(r,c)==0.0) continue; // skip L1 if zero weight!
60 
61  Real l1_signed = l1;  // the penalty carries the sign of the weight, so subtracting it moves the weight towards zero
62  if (weight2(r, c) < 0.0)
63  l1_signed = -l1;
64 
65  Real before = weight2(r, c);
66  Real after = weight2(r, c) - lr*grad2(r, c) - l1_signed;  // hypothetical value after gradient step plus L1; used only for the crossing test below
67  if ((after > 0.0) ^ (before > 0.0)) {  // true when exactly one of the two values is positive
68  weight2(r, c) = 0.0;
69  grad2(r, c) = 0.0;
70  } else {
71  weight2(r, c) -= l1_signed;  // NOTE(review): only the L1 term is applied here, not lr*grad -- presumably the caller applies the gradient step; confirm
72  }
73  }
74  }
75  }
76 }
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
int32 MatrixIndexT
Definition: matrix-common.h:98
#define CU2DBLOCK
Definition: cu-matrixdim.h:67
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::RegularizeL1 ( CuMatrixBase< float > *  weight,
CuMatrixBase< float > *  grad,
float  l1,
float  lr 
)
template void kaldi::cu::RegularizeL1 ( CuMatrixBase< double > *  weight,
CuMatrixBase< double > *  grad,
double  l1,
double  lr 
)
static Real kaldi::cu::ScalarSigmoid ( Real  a)
inlinestatic

Definition at line 424 of file cu-math.cc.

References kaldi::Exp().

Referenced by CpuBackpropLstmNonlinearity(), and CpuComputeLstmNonlinearity().

424  {  // Logistic sigmoid 1 / (1 + exp(-a)), written so that Exp() is only ever called with a non-positive argument.
425  if (a > Real(0)) {
426  return Real(1) / (Real(1) + Exp(-a));  // a > 0, so -a < 0
427  } else {
428  Real x = Exp(a);  // a <= 0
429  return x / (x + Real(1));  // algebraically the same function: exp(a) / (exp(a) + 1)
430  }
431 }
double Exp(double x)
Definition: kaldi-math.h:83
static Real kaldi::cu::ScalarTanh ( Real  a)
inlinestatic

Definition at line 434 of file cu-math.cc.

References kaldi::Exp().

Referenced by CpuBackpropLstmNonlinearity(), and CpuComputeLstmNonlinearity().

434  {  // Hyperbolic tangent, written so that Exp() is only ever called with a non-positive argument.
435  if (a > Real(0)) {
436  Real inv_expa = Exp(-a);  // a > 0, so -a < 0
437  return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa);  // -1 + 2 / (1 + exp(-2a)) == tanh(a)
438  } else {
439  Real expa = Exp(a);  // a <= 0
440  return Real(1) - Real(2) / (Real(1) + expa * expa);  // 1 - 2 / (1 + exp(2a)) == tanh(a)
441  }
442 }
double Exp(double x)
Definition: kaldi-math.h:83
void Splice ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< Real > *  tgt 
)

Splice concatenates frames of src as specified in frame_offsets into tgt.

tgt must have the same number of rows as src, and it must hold that tgt.NumCols() == src.NumCols() * frame_offsets.Dim(). As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the general case where i in [0..src.NumRows()-1], k in [0..frame_offsets.Dim()-1], j in [0..src.NumCols()-1] and n_cols = src.NumCols(). If i + frame_offsets[k] is greater than or equal to the number of rows in src, or less than 0, then the right side of the equation is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid an index out of bounds.

Definition at line 132 of file cu-math.cc.

References CU2DBLOCK, CuArrayBase< T >::Data(), CuMatrixBase< Real >::Data(), CuArrayBase< T >::Dim(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), MatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), CuMatrixBase< Real >::NumRows(), and MatrixBase< Real >::RowData().

Referenced by Component::NewComponentOfType(), Splice::PropagateFnc(), and kaldi::UnitTestCuMathSplice().

133  {  // For every row r of tgt, concatenates the src rows r + frame_offsets[k] (clamped to the valid row range) side by side.
134 
135  KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
136  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
137 
138  #if HAVE_CUDA == 1
139  if (CuDevice::Instantiate().Enabled()) {
140  CuTimer tim;
141 
142  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
143  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
144 
145  cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
146  frame_offsets.Data(), tgt->Dim(), src.Dim());
147  CU_SAFE_CALL(cudaGetLastError());
148 
149  CuDevice::Instantiate().AccuProfile(__func__, tim);
150  } else
151  #endif
152  {
153  // expand in CPU
154  const MatrixBase<Real> &srcmat = src.Mat();
155  const int32 *frame_offsetvec = frame_offsets.Data();
156  int32 dim = frame_offsets.Dim();
157  MatrixBase<Real> &tgtmat = tgt->Mat();
158  //
159  for(int32 r=0; r < tgtmat.NumRows(); r++) {
160  for(int32 off=0; off < dim; off++) {
161  int32 r_off = r + frame_offsetvec[off];
162  if(r_off < 0) r_off = 0;  // clamp to the first row
163  if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;  // clamp to the last row
164  memcpy(tgtmat.RowData(r)+off*srcmat.NumCols(),srcmat.RowData(r_off),sizeof(Real)*srcmat.NumCols());  // copies one whole src row into the off-th column slice of tgt row r
165  }
166  }
167  }
168 }
const T * Data() const
Get raw pointer.
Definition: cu-array.h:52
#define CU2DBLOCK
Definition: cu-matrixdim.h:67
MatrixIndexT Dim() const
Return the vector dimension.
Definition: cu-array.h:49
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
template void kaldi::cu::Splice ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Splice ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< double > *  tgt 
)