kaldi::cu Namespace Reference

Functions

template<typename Real >
void RegularizeL1 (CuMatrixBase< Real > *weight, CuMatrixBase< Real > *gradient, Real l1_penalty, Real learning_rate)
 RegularizeL1 is a gradient step with l1 regularization added to the gradient. More...
 
template<typename Real >
void Randomize (const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< Real > *tgt)
 Copies a permutation of src into tgt. More...
 
template<typename Real >
void Splice (const CuMatrixBase< Real > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< Real > *tgt)
 Splice concatenates frames of src as specified in frame_offsets into tgt. More...
 
template<typename Real >
void Copy (const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
 Copies elements from src into tgt as given by copy_from_indices. More...
 
template void RegularizeL1 (CuMatrixBase< float > *weight, CuMatrixBase< float > *grad, float l1, float lr)
 
template void RegularizeL1 (CuMatrixBase< double > *weight, CuMatrixBase< double > *grad, double l1, double lr)
 
template void Splice (const CuMatrixBase< float > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< float > *tgt)
 
template void Splice (const CuMatrixBase< double > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< double > *tgt)
 
template void Copy (const CuMatrixBase< float > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< float > *tgt)
 
template void Copy (const CuMatrixBase< double > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< double > *tgt)
 
template void Randomize (const CuMatrixBase< float > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< float > *tgt)
 
template void Randomize (const CuMatrixBase< double > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< double > *tgt)
 
template<typename Real >
void NormalizePerRow (const CuMatrixBase< Real > &in, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *out)
 Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square equals 1.0. More...
 
template void NormalizePerRow (const CuMatrixBase< float > &in, const float target_rms, const bool add_log_stddev, CuMatrixBase< float > *out)
 
template void NormalizePerRow (const CuMatrixBase< double > &in, const double target_rms, const bool add_log_stddev, CuMatrixBase< double > *out)
 
template<typename Real >
void DiffNormalizePerRow (const CuMatrixBase< Real > &in_value, const CuMatrixBase< Real > &out_deriv, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *in_deriv)
 
template void DiffNormalizePerRow (const CuMatrixBase< float > &in_value, const CuMatrixBase< float > &out_deriv, const float target_rms, const bool add_log_stddev, CuMatrixBase< float > *in_deriv)
 
template void DiffNormalizePerRow (const CuMatrixBase< double > &in_value, const CuMatrixBase< double > &out_deriv, const double target_rms, const bool add_log_stddev, CuMatrixBase< double > *in_deriv)
 
template<typename Real >
static Real ScalarSigmoid (Real a)
 
template<typename Real >
static Real ScalarTanh (Real a)
 
template<typename Real >
void CpuComputeLstmNonlinearity (const MatrixBase< Real > &input_mat, const MatrixBase< Real > &params_mat, MatrixBase< Real > *output)
 
template<typename Real >
void ComputeLstmNonlinearity (const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, CuMatrixBase< Real > *output)
 This is a special-purpose function used by the class LstmNonlinearityComponent to do its forward propagation. More...
 
template void CpuComputeLstmNonlinearity (const MatrixBase< float > &input_mat, const MatrixBase< float > &params_mat, MatrixBase< float > *output)
 
template void CpuComputeLstmNonlinearity (const MatrixBase< double > &input_mat, const MatrixBase< double > &params_mat, MatrixBase< double > *output)
 
template void ComputeLstmNonlinearity (const CuMatrixBase< float > &input, const CuMatrixBase< float > &params, CuMatrixBase< float > *output)
 
template void ComputeLstmNonlinearity (const CuMatrixBase< double > &input, const CuMatrixBase< double > &params, CuMatrixBase< double > *output)
 
template<typename Real >
void CpuBackpropLstmNonlinearity (const MatrixBase< Real > &input, const MatrixBase< Real > &params, const MatrixBase< Real > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< Real > &self_repair_config, double count_in, MatrixBase< Real > *input_deriv, MatrixBase< Real > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< Real > *self_repair_sum_out)
 
template<typename Real >
void BackpropLstmNonlinearity (const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, const CuMatrixBase< Real > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< Real > &self_repair_config, double count_in, CuMatrixBase< Real > *input_deriv, CuMatrixBase< Real > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< Real > *self_repair_sum_out)
 This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity. More...
 
template void CpuBackpropLstmNonlinearity (const MatrixBase< float > &input, const MatrixBase< float > &params, const MatrixBase< float > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< float > &self_repair_config, double count_in, MatrixBase< float > *input_deriv, MatrixBase< float > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< float > *self_repair_sum_out)
 
template void CpuBackpropLstmNonlinearity (const MatrixBase< double > &input, const MatrixBase< double > &params, const MatrixBase< double > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< double > &self_repair_config, double count_in, MatrixBase< double > *input_deriv, MatrixBase< double > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< double > *self_repair_sum_out)
 
template void BackpropLstmNonlinearity (const CuMatrixBase< float > &input, const CuMatrixBase< float > &params, const CuMatrixBase< float > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< float > &self_repair_config, double count_in, CuMatrixBase< float > *input_deriv, CuMatrixBase< float > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< float > *self_repair_sum_out)
 
template void BackpropLstmNonlinearity (const CuMatrixBase< double > &input, const CuMatrixBase< double > &params, const CuMatrixBase< double > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< double > &self_repair_config, double count_in, CuMatrixBase< double > *input_deriv, CuMatrixBase< double > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< double > *self_repair_sum_out)
 
template<typename Real >
void Group2norm (const CuMatrixBase< Real > &src, CuMatrixBase< Real > *dest, int32 group_stride)
 

Function Documentation

void BackpropLstmNonlinearity ( const CuMatrixBase< Real > &  input,
const CuMatrixBase< Real > &  params,
const CuMatrixBase< Real > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< Real > &  self_repair_config,
double  count_in,
CuMatrixBase< Real > *  input_deriv,
CuMatrixBase< Real > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< Real > *  self_repair_sum_out 
)

This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity.

It's a little more complicated than you might expect because of the 'self-repair' mechanism that we use to prevent the sigmoid and tanh nonlinearities from oversaturating, and because of the average-activation and average-derivative stats that we store for these nonlinearities (these stats are used both to control the self-repair mechanism and for diagnostic purposes).

Because the forward pass computes various intermediate values that are not output, this function has to repeat the computations of the forward pass before it can do the backprop.

Parameters
[in] input  The same as in ComputeLstmNonlinearity(). A matrix, of dimension N by 5C (i.e. its num-cols must be a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). This function will also accept input of dimension N by 5C + 3, and the three final elements will be interpreted as scaling factors on i_t, f_t and o_t (useful as per-frame dropout masks).
[in] params  The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}.
[in] output_deriv  A matrix, of dimension N by 2C, containing the derivative of the objective function we're backpropagating, w.r.t. the quantities c_t and m_t (in two blocks of column-dimension C).
[in] deriv_sum_in  This is used in the self-repair code to identify oversaturated nonlinearities. It is a matrix, of dimension 5 by C, corresponding to the totals of the derivatives of the 5 sigmoid and tanh nonlinearities, in the order they appear in the equations in the documentation of ComputeLstmNonlinearity(); respectively, they appear in the equations for (i_t, f_t, c_t, o_t, m_t). This will be divided by 'count_in' to get the average derivative value so far, for each of the nonlinearities.
[in] self_repair_config  A vector of dimension 10, containing the configuration of the self-repair to be used for the 5 nonlinearities. The first 5 elements are the self_repair_lower_threshold values (typically 0.05 for sigmoid and 0.2 for tanh), and the next 5 elements are the corresponding self-repair-scales (typically 10^-5).
[in] count_in  The data-count that corresponds to the stats in 'deriv_sum_in' at entry to the function. This function should tolerate the count being zero (in that case, it is free to do the self-repair or not, as this should only happen on the 1st minibatch of each training job).
[out] input_deriv  May be NULL; if not, this function writes, to this location, the backpropagated derivative of the objective function w.r.t. the 'input' matrix. This matrix should have the same dimension as 'input'. In addition to the regular backpropagated derivative, the output will include small values relating to 'self-repair'. If the input is of column-dimension 5C + 3 (i.e. we are using dropout masks), the derivatives w.r.t. the dropout masks will not be set; they will retain their value prior to this function call.
[out] params_deriv  May be NULL; if not, this is where this function *writes* [not adds] the backpropagated derivative of the objective function w.r.t. 'params'; it should have the same dimension as 'params' (3 by C). (This matrix will then be processed by the natural gradient code and added to the appropriate copy of the parameter matrix, outside this function.)
[out] value_sum_out  Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C. This function *adds* to this location the total value of each of the sigmoid/tanh nonlinearities that it computes (this is for diagnostic purposes).
[out] deriv_sum_out  Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C; this function *adds* to this location the total of the derivative of each of the sigmoid/tanh nonlinearities that it computes (this is for diagnostic purposes and to control the self-repair). This function should tolerate the case when 'deriv_sum_out' points to the same data as 'deriv_sum_in'.
[out] self_repair_sum_out  Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C; this function *writes* to this location the sum of the number of times the self-repair code was activated (integer values 0 <= k <= N). This will be processed outside this function into self-repair stats for diagnostics.
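
For orientation, here is a minimal call-site sketch; it is not taken from the library, the sizes N and C are arbitrary placeholders, and it assumes the surrounding code lives in namespace kaldi so that CuMatrix, CuVector, BaseFloat and cu:: resolve as usual:

    int32 N = 64, C = 512;  // hypothetical minibatch rows and cell dimension
    CuMatrix<BaseFloat> input(N, 5 * C), params(3, C), output_deriv(N, 2 * C);
    input.SetRandn(); params.SetRandn(); output_deriv.SetRandn();
    CuMatrix<double> deriv_sum_in(5, C);          // accumulated derivative stats
    CuVector<BaseFloat> self_repair_config(10);   // 5 thresholds, then 5 scales
    double count_in = 0.0;                        // e.g. on the first minibatch
    CuMatrix<BaseFloat> input_deriv(N, 5 * C), params_deriv(3, C);
    CuMatrix<double> value_sum_out(5, C), deriv_sum_out(5, C);
    CuMatrix<BaseFloat> self_repair_sum_out(5, C);
    cu::BackpropLstmNonlinearity(input, params, output_deriv, deriv_sum_in,
                                 self_repair_config, count_in,
                                 &input_deriv, &params_deriv, &value_sum_out,
                                 &deriv_sum_out, &self_repair_sum_out);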

Definition at line 735 of file cu-math.cc.

References CpuBackpropLstmNonlinearity(), CU1DBLOCK, CuVectorBase< Real >::Data(), CuMatrixBase< Real >::Data(), CuVectorBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), CuMatrixBase< Real >::Stride(), and CuVectorBase< Real >::Vec().

Referenced by LstmNonlinearityComponent::Backprop(), kaldi::UnitTestBackpropLstmNonlinearity(), and kaldi::UnitTestLstmNonlinearity().

745  {
746  int32 num_rows = input.NumRows(),
747  cell_dim = input.NumCols() / 5,
748  input_cols = input.NumCols();
749  // Check dimensions.
750  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3);
751  KALDI_ASSERT(params.NumRows() == 3);
752  KALDI_ASSERT(params.NumCols() == cell_dim);
753  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
754  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
755  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
756  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
757  KALDI_ASSERT(self_repair_config.Dim() == 10);
758  KALDI_ASSERT(count_in >= 0);
759  if (input_deriv != NULL) {
760  KALDI_ASSERT(SameDim(input, *input_deriv));
761  }
762  if (params_deriv == NULL) {
763  KALDI_ASSERT(value_sum_out == NULL);
764  KALDI_ASSERT(deriv_sum_out == NULL);
765  KALDI_ASSERT(self_repair_sum_out == NULL);
766  } else {
767  KALDI_ASSERT(value_sum_out != NULL);
768  KALDI_ASSERT(deriv_sum_out != NULL);
769  KALDI_ASSERT(self_repair_sum_out != NULL);
770  KALDI_ASSERT(SameDim(params, *params_deriv));
771  KALDI_ASSERT(value_sum_out->NumRows() == 5);
772  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
773  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
774  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
775  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
776  }
777 
778 
779 #if HAVE_CUDA == 1
780  if (CuDevice::Instantiate().Enabled()) {
781  CuTimer tim;
782  // Each thread block is working on 1 row of the data.
783  // It's best that cell dim is a multiple of CU1DBLOCK
784 
785  int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
786 
787  // Use 2D block (8x32 threads) as we need to compute column sum.
788  // Use 1D grid to cover the data matrix width `cell_dim`.
789  const int kWarpSize = 32;
790  dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
791 // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x),
792 // n_blocks(num_rows, dimBlock.y));
793 // if (dimGrid.x * dimGrid.y > 1024) {
794 // dimGrid.y = std::max(1024 / dimGrid.x, 1);
795 // }
796  dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
797  if (input_deriv == NULL) {
798  if (params_deriv == NULL) {
799  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
800  have_dropout_mask, num_rows,
801  input.Data(), input.Stride(), params.Data(),
802  params.Stride(), output_deriv.Data(),
803  output_deriv.Stride(), deriv_sum_in.Data(),
804  deriv_sum_in.Stride(),
805  self_repair_config.Data(), count_in + 1,
806  NULL,
807  0,
808  NULL,
809  0,
810  NULL,
811  0,
812  NULL,
813  0,
814  NULL,
815  0);
816 
817  } else {
818  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
819  have_dropout_mask, num_rows,
820  input.Data(), input.Stride(), params.Data(),
821  params.Stride(), output_deriv.Data(),
822  output_deriv.Stride(), deriv_sum_in.Data(),
823  deriv_sum_in.Stride(),
824  self_repair_config.Data(), count_in + 1,
825  NULL,
826  0, params_deriv->Data(),
827  params_deriv->Stride(),
828  value_sum_out->Data(),
829  value_sum_out->Stride(),
830  deriv_sum_out->Data(),
831  deriv_sum_out->Stride(),
832  self_repair_sum_out->Data(),
833  self_repair_sum_out->Stride());
834  }
835  } else {
836  if (params_deriv == NULL) {
837  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
838  have_dropout_mask, num_rows,
839  input.Data(), input.Stride(), params.Data(),
840  params.Stride(), output_deriv.Data(),
841  output_deriv.Stride(), deriv_sum_in.Data(),
842  deriv_sum_in.Stride(),
843  self_repair_config.Data(), count_in + 1,
844  input_deriv->Data(), input_deriv->Stride(),
845  NULL,
846  0, NULL, 0, NULL, 0, NULL, 0);
847  } else {
848  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
849  have_dropout_mask, num_rows,
850  input.Data(), input.Stride(), params.Data(),
851  params.Stride(), output_deriv.Data(),
852  output_deriv.Stride(), deriv_sum_in.Data(),
853  deriv_sum_in.Stride(),
854  self_repair_config.Data(), count_in + 1,
855  input_deriv->Data(), input_deriv->Stride(),
856  params_deriv->Data(),
857  params_deriv->Stride(),
858  value_sum_out->Data(),
859  value_sum_out->Stride(),
860  deriv_sum_out->Data(),
861  deriv_sum_out->Stride(),
862  self_repair_sum_out->Data(),
863  self_repair_sum_out->Stride());
864  }
865  }
866 
867  CU_SAFE_CALL(cudaGetLastError());
868 
869  CuDevice::Instantiate().AccuProfile(__func__, tim);
870  } else
871 #endif
872  {
873  CpuBackpropLstmNonlinearity(input.Mat(), params.Mat(), output_deriv.Mat(),
874  deriv_sum_in.Mat(), self_repair_config.Vec(),
875  count_in, &(input_deriv->Mat()),
876  &(params_deriv->Mat()), &(value_sum_out->Mat()),
877  &(deriv_sum_out->Mat()),
878  &(self_repair_sum_out->Mat()));
879  }
880 }
template void kaldi::cu::BackpropLstmNonlinearity ( const CuMatrixBase< float > &  input,
const CuMatrixBase< float > &  params,
const CuMatrixBase< float > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< float > &  self_repair_config,
double  count_in,
CuMatrixBase< float > *  input_deriv,
CuMatrixBase< float > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< float > *  self_repair_sum_out 
)
template void kaldi::cu::BackpropLstmNonlinearity ( const CuMatrixBase< double > &  input,
const CuMatrixBase< double > &  params,
const CuMatrixBase< double > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< double > &  self_repair_config,
double  count_in,
CuMatrixBase< double > *  input_deriv,
CuMatrixBase< double > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< double > *  self_repair_sum_out 
)
void ComputeLstmNonlinearity ( const CuMatrixBase< Real > &  input,
const CuMatrixBase< Real > &  params,
CuMatrixBase< Real > *  output 
)

This is a special-purpose function used by the class LstmNonlinearityComponent to do its forward propagation.

It computes the core part of the LSTM nonlinearity. Refer to class LstmNonlinearityComponent in ../nnet3/nnet-simple-component.h for more context.

Parameters
[in] input  A matrix, of dimension N by 5C (i.e. its num-cols must be a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). This function will also accept input of dimension N by 5C + 3, and the three final elements will be used as scaling factors on i_t, f_t and o_t (useful as per-frame dropout masks).
[in] params  A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}.
[out] output  A matrix, of dimension N by 2C. The quantities c_t and m_t respectively are put there (in two blocks of column-dimension C), according to the following equations:

i_t = Sigmoid(i_part + w_{ic}*c_{t-1})
f_t = Sigmoid(f_part + w_{fc}*c_{t-1})
c_t = f_t*c_{t-1} + i_t * Tanh(c_part)
o_t = Sigmoid(o_part + w_{oc}*c_t)
m_t = o_t * Tanh(c_t)

Note on dropout: if the dropout mask is provided, let the mask values be i_t_mask, f_t_mask and o_t_mask (for each matrix row, these are scalars while i_t, f_t and o_t are of dimension C, because this is 'per-frame' dropout as described in http://www.danielpovey.com/files/2017_interspeech_dropout.pdf). Then the modification to the equations above consists of replacing 'i_t' with 'i_t_mask * i_t' in the RHS of the equations above, and the same type of change for f_t and o_t.
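
As a rough usage sketch (not from the library; N and C are placeholder sizes, and the snippet assumes it sits inside namespace kaldi):

    int32 N = 64, C = 512;                 // hypothetical rows (frames) and cell dim
    CuMatrix<BaseFloat> input(N, 5 * C);   // blocks: i_part, f_part, c_part, o_part, c_{t-1}
    CuMatrix<BaseFloat> params(3, C);      // rows: w_{ic}, w_{fc}, w_{oc}
    CuMatrix<BaseFloat> output(N, 2 * C);  // will hold c_t in the first C cols, m_t in the next C
    input.SetRandn();
    params.SetRandn();
    cu::ComputeLstmNonlinearity(input, params, &output);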

Definition at line 455 of file cu-math.cc.

References CpuComputeLstmNonlinearity(), CU1DBLOCK, CuMatrixBase< Real >::Data(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::Stride().

Referenced by LstmNonlinearityComponent::Propagate(), kaldi::UnitTestCuMathComputeLstmNonlinearity(), and kaldi::UnitTestLstmNonlinearity().

457  {
458  int32 num_rows = input.NumRows(),
459  input_cols = input.NumCols(),
460  cell_dim = input_cols / 5;
461  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
462  KALDI_ASSERT(output->NumRows() == num_rows);
463  KALDI_ASSERT(params.NumRows() == 3);
464  KALDI_ASSERT(params.NumCols() == cell_dim);
465  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
466 
467 #if HAVE_CUDA == 1
468  if (CuDevice::Instantiate().Enabled()) {
469  CuTimer tim;
470 
471  int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
472 
473  // Each thread block is working on 1 row of the data.
474  // It's best that cell dim is a multiple of CU1DBLOCK
475  dim3 dimBlock(CU1DBLOCK);
476  dim3 dimGrid(num_rows);
477 
478  cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(),
479  params.Data(), params.Stride(), output->Stride(),
480  cell_dim, have_dropout_mask, num_rows, output->Data());
481  CU_SAFE_CALL(cudaGetLastError());
482 
483  CuDevice::Instantiate().AccuProfile(__func__, tim);
484  } else
485 #endif
486  {
487  CpuComputeLstmNonlinearity(input.Mat(), params.Mat(), &output->Mat());
488  }
489 }
template void kaldi::cu::ComputeLstmNonlinearity ( const CuMatrixBase< float > &  input,
const CuMatrixBase< float > &  params,
CuMatrixBase< float > *  output 
)
template void kaldi::cu::ComputeLstmNonlinearity ( const CuMatrixBase< double > &  input,
const CuMatrixBase< double > &  params,
CuMatrixBase< double > *  output 
)
void Copy ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< Real > *  tgt 
)

Copies elements from src into tgt as given by copy_from_indices.

The matrices src and tgt must have the same number of rows, and the dimension of copy_from_indices must equal the number of columns of tgt. As a result, tgt(i, j) == src(i, copy_from_indices[j]). Also see CuMatrix::CopyCols(), which is more general.
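
A small illustrative sketch (the index values are chosen arbitrarily; it assumes namespace kaldi and the CuArray constructor from std::vector):

    CuMatrix<BaseFloat> src(10, 4);
    src.SetRandn();
    std::vector<int32> indices = {0, 0, 2, 3};     // column 0 of src is duplicated in tgt
    CuArray<int32> copy_from_indices(indices);
    CuMatrix<BaseFloat> tgt(src.NumRows(), static_cast<int32>(indices.size()));
    cu::Copy(src, copy_from_indices, &tgt);        // tgt(i, j) == src(i, indices[j])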

Definition at line 173 of file cu-math.cc.

References CU2DBLOCK, CuArray< T >::Data(), CuMatrixBase< Real >::Data(), CuArray< T >::Dim(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::NumRows().

Referenced by CopyComponent::PropagateFnc(), kaldi::TestClusterUtilsVector(), and kaldi::UnitTestCuMathCopy().

174  {
175 
176  KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
177  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
178 
179  #if HAVE_CUDA == 1
180  if (CuDevice::Instantiate().Enabled()) {
181  CuTimer tim;
182 
183  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
184  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
185 
186  cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
187  copy_from_indices.Data(), tgt->Dim(), src.Dim());
188  CU_SAFE_CALL(cudaGetLastError());
189 
190  CuDevice::Instantiate().AccuProfile(__func__, tim);
191  } else
192  #endif
193  {
194  // expand in CPU
195  const MatrixBase<Real> &srcmat = src.Mat();
196  const int32 *copy_from_indicesvec = copy_from_indices.Data();
197  int32 dim = copy_from_indices.Dim();
198  MatrixBase<Real> &tgtmat = tgt->Mat();
199  //
200  for(int32 r = 0; r < tgtmat.NumRows(); r++) {
201  for(int32 c = 0; c < dim; c++) {
202  tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
203  }
204  }
205  }
206 }
template void kaldi::cu::Copy ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Copy ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< double > *  tgt 
)
void CpuBackpropLstmNonlinearity ( const MatrixBase< Real > &  input,
const MatrixBase< Real > &  params,
const MatrixBase< Real > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< Real > &  self_repair_config,
double  count_in,
MatrixBase< Real > *  input_deriv,
MatrixBase< Real > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< Real > *  self_repair_sum_out 
)

Definition at line 509 of file cu-math.cc.

References count, VectorBase< Real >::Dim(), rnnlm::i, KALDI_ASSERT, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), kaldi::SameDim(), ScalarSigmoid(), and ScalarTanh().

Referenced by BackpropLstmNonlinearity(), and kaldi::UnitTestBackpropLstmNonlinearity().

519  {
520  int32 num_rows = input.NumRows(),
521  input_cols = input
522  .NumCols(),
523  cell_dim = input.NumCols() / 5;
524  // Check dimensions.
525  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
526  KALDI_ASSERT(params.NumRows() == 3);
527  KALDI_ASSERT(params.NumCols() == cell_dim);
528  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
529  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
530  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
531  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
532  KALDI_ASSERT(self_repair_config.Dim() == 10);
533  KALDI_ASSERT(count_in >= 0);
534  if (input_deriv != NULL) {
535  KALDI_ASSERT(SameDim(input, *input_deriv));
536  }
537  if (params_deriv == NULL) {
538  KALDI_ASSERT(value_sum_out == NULL);
539  KALDI_ASSERT(deriv_sum_out == NULL);
540  KALDI_ASSERT(self_repair_sum_out == NULL);
541  } else {
542  KALDI_ASSERT(value_sum_out != NULL);
543  KALDI_ASSERT(deriv_sum_out != NULL);
544  KALDI_ASSERT(self_repair_sum_out != NULL);
545  KALDI_ASSERT(SameDim(params, *params_deriv));
546  KALDI_ASSERT(value_sum_out->NumRows() == 5);
547  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
548  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
549  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
550  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
551  }
552 
553  const MatrixBase<Real> &input_mat = input;
554  const MatrixBase<Real> &params_mat = params;
555  const MatrixBase<Real> &output_deriv_mat = output_deriv;
556  const MatrixBase<double> &deriv_sum_in_mat = deriv_sum_in;
557  const VectorBase<Real> &sr_config = self_repair_config;
558  MatrixBase<Real> *input_deriv_mat = (
559  input_deriv == NULL ? NULL : input_deriv);
560  MatrixBase<Real> *params_deriv_mat = NULL;
561  MatrixBase<Real> *self_repair_sum_out_mat = NULL;
562  MatrixBase<double> *value_sum_out_mat = NULL;
563  MatrixBase<double> *deriv_sum_out_mat = NULL;
564  if (params_deriv != NULL) {
565  params_deriv_mat = params_deriv;
566  value_sum_out_mat = value_sum_out;
567  deriv_sum_out_mat = deriv_sum_out;
568  self_repair_sum_out_mat = self_repair_sum_out;
569  }
570 
571 
572  // We add 1.0 (i.e. a small value) to the count to avoid division by zero.
573  Real count = 1.0 + count_in;
574  for (int32 c = 0; c < cell_dim; c++) {
575  // parameters
576  Real w_ic = params_mat(0, c);
577  Real w_fc = params_mat(1, c);
578  Real w_oc = params_mat(2, c);
579  // derivative sums w.r.t. parameters.
580  Real w_ic_deriv_sum = 0.0;
581  Real w_fc_deriv_sum = 0.0;
582  Real w_oc_deriv_sum = 0.0;
583 
584  // average derivatives, for self-repair.
585  // The 5 nonlinearities that are subject to self-repair are written as:
586  // Sigmoid(i_t_input), Sigmoid(f_t_input),
587  // Tanh(c_part), Sigmoid(o_t_input), Tanh(c_t)
588  Real i_t_self_repair = (
589  deriv_sum_in_mat(0, c) / count < sr_config(0) ? sr_config(5) : 0.0);
590  Real f_t_self_repair = (
591  deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0);
592  Real c_part_self_repair = (
593  deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0);
594  Real o_t_self_repair = (
595  deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0);
596  Real c_t_self_repair = (
597  deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0);
598  // Note on how we add self-repair for sigmoids/tanh's. If self-repair
599  // is activated for this unit, then...
600  // For sigmoids we'd add -self_repair_scale * (2 * sigmoid(x) - 1.0)
601  // ... to the input-deriv;
602  // For tanh's we'd add -self_repair_scale * tanh(x)
603  // If self-repair is not activated, the 'self_repair' scales are set to zero.
604 
605  // The following variables are for the accumulation of stats on the
606  // sigmoid and tanh units.
607  Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0;
608  Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0;
609  Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0;
610  Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0;
611  Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0;
612 
613 
614  for (int32 r = 0; r < num_rows; r++) {
615  Real i_part = input_mat(r, c),
616  f_part = input_mat(r, c + cell_dim),
617  c_part = input_mat(r, c + 2 * cell_dim),
618  o_part = input_mat(r, c + 3 * cell_dim),
619  c_prev = input_mat(r, c + 4 * cell_dim);
620 
621  Real i_scale = (input_cols == cell_dim * 5 ? 1.0 :
622  input_mat(r, cell_dim * 5)),
623  f_scale = (input_cols == cell_dim * 5 ? 1.0 :
624  input_mat(r, cell_dim * 5 + 1)),
625  o_scale = (input_cols == cell_dim * 5 ? 1.0 :
626  input_mat(r, cell_dim * 5 + 2));
627 
628  // For greater clarity, we give some of the quantities in the
629  // forward equations their own names.
630  Real i_t_input = i_part + w_ic * c_prev,
631  i_t = ScalarSigmoid(i_t_input),
632  f_t_input = f_part + w_fc * c_prev,
633  f_t = ScalarSigmoid(f_t_input),
634  tanh_c_part = ScalarTanh(c_part),
635  c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part,
636  o_t_input = o_part + w_oc * c_t,
637  o_t = ScalarSigmoid(o_t_input),
638  tanh_c_t = ScalarTanh(c_t);
639  // we'd also compute, in the forward pass,
640  // m_t = o_t * tanh_c_t;
641  // but this variable is not needed.
642 
643  // Accumulate nonlinearity value and derivative stats.
644  // Note:
645  // tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)
646  // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
647  i_t_value_sum += i_t;
648  i_t_deriv_sum += i_t * (1.0F - i_t);
649  f_t_value_sum += f_t;
650  f_t_deriv_sum += f_t * (1.0F - f_t);
651  c_part_value_sum += tanh_c_part;
652  c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part;
653  o_t_value_sum += o_t;
654  o_t_deriv_sum += o_t * (1.0F - o_t);
655  c_t_value_sum += tanh_c_t;
656  c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t;
657 
658 
659  // the derivative of the objective function w.r.t. a particular quantity
660  // will be written by prepending "d" to the name.
661  // We compute these derivatives in the reverse of the order in which
662  // we computed the original quantities.
663  // dc_t_out is the part of the derivative w.r.t. c_t that
664  // comes directly from the output of this function.
665  Real dc_t_out = output_deriv_mat(r, c);
666  Real dm_t = output_deriv_mat(r, c + cell_dim);
667  Real dtanh_c_t = o_t * o_scale * dm_t;
668  Real do_t = o_scale * tanh_c_t * dm_t;
669  Real do_t_input = (o_t * (1.0F - o_t) * do_t
670  - (2.0F * o_t - 1.0F) * o_t_self_repair);
671  Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out
672  + do_t_input * w_oc) - tanh_c_t * c_t_self_repair;
673  Real dtanh_c_part = i_t * i_scale * dc_t;
674  Real df_t = dc_t * f_scale * c_prev;
675  Real df_t_input = ((df_t * f_t * (1.0F - f_t)
676  - (2.0F * f_t - 1.0F) * f_t_self_repair));
677  Real di_t = dc_t * i_scale * tanh_c_part;
678  Real di_t_input = ((di_t * i_t * (1.0F - i_t)
679  - (2.0F * i_t - 1.0F) * i_t_self_repair));
680 
681  w_ic_deriv_sum += c_prev * di_t_input;
682  w_fc_deriv_sum += c_prev * df_t_input;
683  w_oc_deriv_sum += c_t * do_t_input;
684 
685  Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t;
686  Real do_part = do_t_input;
687  Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part
688  - tanh_c_part * c_part_self_repair);
689  Real df_part = df_t_input;
690  Real di_part = di_t_input;
691 
692  if (input_deriv_mat != NULL) {
693  (*input_deriv_mat)(r, c) = di_part;
694  (*input_deriv_mat)(r, c + cell_dim) = df_part;
695  (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part;
696  (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part;
697  (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev;
698  }
699  }
700 
701  if (params_deriv != NULL) {
702  // note: for optimizing things you can assume that params_deriv and
703  // input_deriv_mat are non-NULL (i.e. all the output matrices are
704  // non-NULL). The situations when some of the output matrices are NULL
705  // does not happen often (mainly only in testing code).
706 
707  (*params_deriv_mat)(0, c) = w_ic_deriv_sum;
708  (*params_deriv_mat)(1, c) = w_fc_deriv_sum;
709  (*params_deriv_mat)(2, c) = w_oc_deriv_sum;
710 
711  (*value_sum_out_mat)(0, c) += i_t_value_sum;
712  (*value_sum_out_mat)(1, c) += f_t_value_sum;
713  (*value_sum_out_mat)(2, c) += c_part_value_sum;
714  (*value_sum_out_mat)(3, c) += o_t_value_sum;
715  (*value_sum_out_mat)(4, c) += c_t_value_sum;
716 
717  // need to update self_repair_sum_out before deriv_sum_out, because
718  // deriv_sum_out and deriv_sum_in might point to the same memory.
719  for (int32 i = 0; i < 5; i++)
720  (*self_repair_sum_out_mat)(i, c) =
721  (deriv_sum_in_mat(i, c) / count < sr_config(i) ? num_rows : 0);
722 
723  (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum;
724  (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum;
725  (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum;
726  (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum;
727  (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum;
728  }
729  }
730 }
template void kaldi::cu::CpuBackpropLstmNonlinearity ( const MatrixBase< float > &  input,
const MatrixBase< float > &  params,
const MatrixBase< float > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< float > &  self_repair_config,
double  count_in,
MatrixBase< float > *  input_deriv,
MatrixBase< float > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< float > *  self_repair_sum_out 
)
template void kaldi::cu::CpuBackpropLstmNonlinearity ( const MatrixBase< double > &  input,
const MatrixBase< double > &  params,
const MatrixBase< double > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< double > &  self_repair_config,
double  count_in,
MatrixBase< double > *  input_deriv,
MatrixBase< double > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< double > *  self_repair_sum_out 
)
void CpuComputeLstmNonlinearity ( const MatrixBase< Real > &  input_mat,
const MatrixBase< Real > &  params_mat,
MatrixBase< Real > *  output 
)

Definition at line 411 of file cu-math.cc.

References MatrixBase< Real >::Data(), KALDI_ASSERT, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), MatrixBase< Real >::RowData(), ScalarSigmoid(), ScalarTanh(), and MatrixBase< Real >::Stride().

Referenced by ComputeLstmNonlinearity(), and kaldi::UnitTestCuMathComputeLstmNonlinearity().

413  {
414  int32 num_rows = input_mat.NumRows(),
415  input_cols = input_mat.NumCols(),
416  cell_dim = input_cols / 5;
417  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
418  KALDI_ASSERT(output->NumRows() == num_rows);
419  KALDI_ASSERT(params_mat.NumRows() == 3);
420  KALDI_ASSERT(params_mat.NumCols() == cell_dim);
421  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
422 
423  MatrixBase<Real> &output_mat = *output;
424  const Real *params_data = params_mat.Data();
425  int32 params_stride = params_mat.Stride();
426  for (int32 r = 0; r < num_rows; r++) {
427  const Real *input_row = input_mat.RowData(r);
428  // i_scale and f_scale relate to dropout, they will normally be 1.0.
429  Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]),
430  f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]),
431  o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]);
432 
433  Real *output_row = output_mat.RowData(r);
434  for (int32 c = 0; c < cell_dim; c++) {
435  Real i_part = input_row[c];
436  Real f_part = input_row[c + cell_dim];
437  Real c_part = input_row[c + 2 * cell_dim];
438  Real o_part = input_row[c + 3 * cell_dim];
439  Real c_prev = input_row[c + 4 * cell_dim];
440  Real w_ic = params_data[c];
441  Real w_fc = params_data[c + params_stride];
442  Real w_oc = params_data[c + params_stride * 2];
443  Real i_t = ScalarSigmoid(i_part + w_ic * c_prev);
444  Real f_t = ScalarSigmoid(f_part + w_fc * c_prev);
445  Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part);
446  Real o_t = ScalarSigmoid(o_part + w_oc * c_t);
447  Real m_t = o_t * o_scale * ScalarTanh(c_t);
448  output_row[c] = c_t;
449  output_row[c + cell_dim] = m_t;
450  }
451  }
452 }
template void kaldi::cu::CpuComputeLstmNonlinearity ( const MatrixBase< float > &  input_mat,
const MatrixBase< float > &  params_mat,
MatrixBase< float > *  output 
)
template void kaldi::cu::CpuComputeLstmNonlinearity ( const MatrixBase< double > &  input_mat,
const MatrixBase< double > &  params_mat,
MatrixBase< double > *  output 
)
void DiffNormalizePerRow ( const CuMatrixBase< Real > &  in_value,
const CuMatrixBase< Real > &  out_deriv,
const Real  target_rms,
const bool  add_log_stddev,
CuMatrixBase< Real > *  in_deriv 
)

Definition at line 315 of file cu-math.cc.

References CuVectorBase< Real >::AddDiagMat2(), CuVectorBase< Real >::AddDiagMatMat(), CuMatrixBase< Real >::AddDiagVecMat(), CuVectorBase< Real >::ApplyFloor(), CU1DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), kaldi::kNoTrans, kaldi::kTrans, kaldi::kUndefined, CuMatrixBase< Real >::MulRowsVec(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::Stride().

Referenced by NormalizeComponent::Backprop(), and kaldi::UnitTestCuDiffNormalizePerRow().

318  {
319  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
320 #if HAVE_CUDA == 1
321  if (CuDevice::Instantiate().Enabled()) {
322  CuTimer tim;
323  size_t dimBlock = CU1DBLOCK;
324  size_t dimGrid = in_deriv->NumRows();
325  cuda_diff_normalize_per_row(dimGrid, dimBlock, in_deriv->Data(),
326  in_deriv->Stride(), in_value.Data(),
327  in_value.Dim(), out_deriv.Data(),
328  out_deriv.Stride(), target_rms, add_log_stddev);
329  CU_SAFE_CALL(cudaGetLastError());
330  CuDevice::Instantiate().AccuProfile(__func__, tim);
331  } else
332 #endif
333  {
334  const CuSubMatrix<Real> out_deriv_no_log(out_deriv, 0, out_deriv.NumRows(),
335  0, in_value.NumCols());
336  CuVector<Real> dot_products(out_deriv.NumRows());
337  dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value,
338  kTrans, 0.0);
339  CuVector<Real> in_norm(in_value.NumRows());
340  Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
341  in_norm.AddDiagMat2(1.0, in_value, kNoTrans, 0.0);
342 
343  if (add_log_stddev) {
344  CuVector<Real> log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1
345  out_deriv_for_stddev(out_deriv.NumRows(), kUndefined);
346  // f = log(sqrt(max(epsi, x^T x / D)))
347  // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0.
348  // we don't compute this exactly below for the case when x^T x is very
349  // small, but we do make sure that the deriv isn't infinity when the input
350  // is zero.
351  log_stddev_deriv.ApplyFloor(in_value.NumCols() * kSquaredNormFloor);
352  log_stddev_deriv.ApplyPow(-1.0);
353  out_deriv_for_stddev.CopyColFromMat(out_deriv, (out_deriv.NumCols() - 1));
354  log_stddev_deriv.MulElements(out_deriv_for_stddev);
355  if (in_deriv)
356  in_deriv->AddDiagVecMat(1.0, log_stddev_deriv, in_value, kNoTrans, 1.0);
357  }
358  in_norm.Scale(1.0 / d_scaled);
359  in_norm.ApplyFloor(kSquaredNormFloor);
360  in_norm.ApplyPow(-0.5);
361  if (in_deriv) {
362  if (in_deriv->Data() != out_deriv_no_log.Data())
363  in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans, 1.0);
364  else
365  in_deriv->MulRowsVec(in_norm);
366  in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
367  in_norm.ApplyPow(3.0);
368  dot_products.MulElements(in_norm);
369 
370  in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value, kNoTrans,
371  1.0);
372  }
373  }
374 }
template void kaldi::cu::DiffNormalizePerRow ( const CuMatrixBase< float > &  in_value,
const CuMatrixBase< float > &  out_deriv,
const float  target_rms,
const bool  add_log_stddev,
CuMatrixBase< float > *  in_deriv 
)
template void kaldi::cu::DiffNormalizePerRow ( const CuMatrixBase< double > &  in_value,
const CuMatrixBase< double > &  out_deriv,
const double  target_rms,
const bool  add_log_stddev,
CuMatrixBase< double > *  in_deriv 
)
void kaldi::cu::Group2norm ( const CuMatrixBase< Real > &  src,
CuMatrixBase< Real > *  dest,
int32  group_stride 
)
void NormalizePerRow ( const CuMatrixBase< Real > &  in,
const Real  target_rms,
const bool  add_log_stddev,
CuMatrixBase< Real > *  out 
)

Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square equals 1.0.

The output is y_i = scale * x_i, and we want the RMS value of the y_i to equal target_rms, so y^T y = D * target_rms^2 (if y is one row of the input). Thus we need scale = 1.0 / sqrt(x^T x / (D * target_rms^2)). There is also flooring involved, to avoid division-by-zero problems. It's important for the backprop that the floor's square root is exactly representable as a float. If add_log_stddev_ is true, log(max(epsi, sqrt(x^T x / D))) is appended as an extra dimension of the output.
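
A brief usage sketch (the sizes are placeholders, not from the source; it assumes namespace kaldi):

    CuMatrix<BaseFloat> in(100, 40);
    in.SetRandn();
    bool add_log_stddev = true;
    // With add_log_stddev == true the output gets one extra column holding
    // log(max(epsi, sqrt(x^T x / D))) for each row.
    CuMatrix<BaseFloat> out(in.NumRows(), in.NumCols() + (add_log_stddev ? 1 : 0));
    BaseFloat target_rms = 1.0;
    cu::NormalizePerRow(in, target_rms, add_log_stddev, &out);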

Definition at line 246 of file cu-math.cc.

References CuMatrixBase< Real >::CopyColFromVec(), CuMatrixBase< Real >::CopyFromMat(), CU1DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, kaldi::kNoTrans, CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), and CuMatrixBase< Real >::Stride().

Referenced by NormalizeComponent::Propagate(), and kaldi::UnitTestCuMathNormalizePerRow().

247  {
248  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
249  if (add_log_stddev) {
250  KALDI_ASSERT(in.NumRows() == out->NumRows());
251  KALDI_ASSERT(in.NumCols() + 1 == out->NumCols());
252  } else {
253  KALDI_ASSERT(SameDim(in, *out));
254  }
255 
256 #if HAVE_CUDA == 1
257  if (CuDevice::Instantiate().Enabled()) {
258  CuTimer tim;
259  size_t dimBlock = CU1DBLOCK;
260  size_t dimGrid = out->NumRows();
261  cuda_normalize_per_row(dimGrid, dimBlock, out->Data(), out->Stride(),
262  in.Data(), in.Dim(), target_rms, add_log_stddev);
263  CU_SAFE_CALL(cudaGetLastError());
264  CuDevice::Instantiate().AccuProfile(__func__, tim);
265  } else
266 #endif
267  {
268  CuSubMatrix<Real> out_no_log(*out, 0, out->NumRows(), 0, in.NumCols());
269  if (in.Data() != out_no_log.Data())
270  out_no_log.CopyFromMat(in);
271  CuVector<Real> in_norm(in.NumRows());
272  Real d_scaled = in.NumCols() * target_rms * target_rms;
273  in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
274  in_norm.ApplyFloor(kSquaredNormFloor);
275  in_norm.ApplyPow(-0.5);
276  out_no_log.MulRowsVec(in_norm);
277  if (add_log_stddev) {
278  in_norm.ApplyLog();
279  in_norm.Scale(-1.0);
280  in_norm.Add(log(target_rms));
281  out->CopyColFromVec(in_norm, in.NumCols());
282  }
283  }
284 }
template void kaldi::cu::NormalizePerRow ( const CuMatrixBase< float > &  in,
const float  target_rms,
const bool  add_log_stddev,
CuMatrixBase< float > *  out 
)
template void kaldi::cu::NormalizePerRow ( const CuMatrixBase< double > &  in,
const double  target_rms,
const bool  add_log_stddev,
CuMatrixBase< double > *  out 
)
void Randomize ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< Real > *  tgt 
)

Copies a permutation of src into tgt.

The row permutation is specified in copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The dimension of copy_from_idx must equal the number of rows in tgt and src, and all elements in the vector must be in [0, src.NumRows()-1].
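
For illustration, a sketch that shuffles the rows of a matrix; the permutation is built on the host, the sizes are placeholders, and it assumes namespace kaldi and the CuArray constructor from std::vector:

    int32 num_rows = 100, num_cols = 20;
    CuMatrix<BaseFloat> src(num_rows, num_cols), tgt(num_rows, num_cols);
    src.SetRandn();
    std::vector<int32> perm(num_rows);
    for (int32 i = 0; i < num_rows; i++) perm[i] = i;
    std::shuffle(perm.begin(), perm.end(), std::mt19937(1234));  // <algorithm>, <random>
    CuArray<int32> copy_from_idx(perm);
    cu::Randomize(src, copy_from_idx, &tgt);   // tgt.Row(r) == src.Row(perm[r])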

Definition at line 80 of file cu-math.cc.

References CuArray< T >::Data(), CuMatrixBase< Real >::Data(), CuArray< T >::Dim(), CuMatrixBase< Real >::Dim(), rnnlm::i, KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), MatrixBase< Real >::Row(), and MatrixDim_::rows.

Referenced by MatrixRandomizer::Randomize(), and kaldi::UnitTestCuMathRandomize().

82  {
83 
84  KALDI_ASSERT(src.NumCols() == tgt->NumCols());
85  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
86  KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());
87 
88  #if HAVE_CUDA == 1
89  if (CuDevice::Instantiate().Enabled()) {
90  CuTimer tim;
91 
92  /*
93  Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
94  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
95  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
96  */
97 
98  /*
99  * Let's use blocksize 4 x 128 (512 threads/block)
100  * and extend the randomizable matrices to: col 4*65535, row 128*65535
101  * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
102  */
103  dim3 dimBlock(4, 128);
104  dim3 dimGrid(n_blocks(tgt->NumCols(), 4), n_blocks(copy_from_idx.Dim(), 128));
105  /*
106  */
107 
108  MatrixDim dimsrc = src.Dim(); dimsrc.rows=copy_from_idx.Dim();
109  MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();
110 
111  cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
112  copy_from_idx.Data(), dimtgt, dimsrc);
113  CU_SAFE_CALL(cudaGetLastError());
114 
115  CuDevice::Instantiate().AccuProfile(__func__, tim);
116  } else
117  #endif
118  {
119  // randomize in CPU
120  const MatrixBase<Real> &srcmat = src.Mat();
121  const int32 *copy_from_idxvec = copy_from_idx.Data();
122  MatrixBase<Real> &tgtmat = tgt->Mat();
123  for(int32 i=0; i<copy_from_idx.Dim(); i++) {
124  tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
125  }
126  }
127 }
template void kaldi::cu::Randomize ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Randomize ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< double > *  tgt 
)
void RegularizeL1 ( CuMatrixBase< Real > *  weight,
CuMatrixBase< Real > *  gradient,
Real  l1_penalty,
Real  learning_rate 
)

RegularizeL1 is a gradient step with l1 regularization added to the gradient.

We don't let the value cross over zero from positive to negative or vice versa, in a single step. If an element tries to cross zero and is stopped, we zero the gradient. (Dan: not sure why).
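
A minimal sketch of a call; the penalty and learning rate are arbitrary illustrative values, and the snippet assumes namespace kaldi:

    CuMatrix<BaseFloat> weight(512, 256), gradient(512, 256);
    weight.SetRandn();
    gradient.SetRandn();
    BaseFloat l1_penalty = 1e-5, learning_rate = 1e-3;
    // Adjusts 'weight' for the l1 term; any element that would cross zero in
    // this step is clamped to zero and its gradient entry is zeroed as well.
    cu::RegularizeL1(&weight, &gradient, l1_penalty, learning_rate);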

Definition at line 37 of file cu-math.cc.

References CU2DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), MatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), and CuMatrixBase< Real >::Stride().

Referenced by LinearTransform::Update(), and AffineTransform::Update().

37  {
38  KALDI_ASSERT(SameDim(*weight, *grad));
39 #if HAVE_CUDA == 1
40  if (CuDevice::Instantiate().Enabled()) {
41  CuTimer tim;
42 
43  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
44  dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));
45 
46  cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
47  weight->Dim(), grad->Stride());
48  CU_SAFE_CALL(cudaGetLastError());
49 
50  CuDevice::Instantiate().AccuProfile(__func__, tim);
51  } else
52  #endif
53  {
54  MatrixBase<Real> &weight2 = weight->Mat();
55  MatrixBase<Real> &grad2 = grad->Mat();
56  for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
57  for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {
58 
59  if(weight2(r,c)==0.0) continue; // skip L1 if zero weight!
60 
61  Real l1_signed = l1;
62  if (weight2(r, c) < 0.0)
63  l1_signed = -l1;
64 
65  Real before = weight2(r, c);
66  Real after = weight2(r, c) - lr*grad2(r, c) - l1_signed;
67  if ((after > 0.0) ^ (before > 0.0)) {
68  weight2(r, c) = 0.0;
69  grad2(r, c) = 0.0;
70  } else {
71  weight2(r, c) -= l1_signed;
72  }
73  }
74  }
75  }
76 }
template void kaldi::cu::RegularizeL1 ( CuMatrixBase< float > *  weight,
CuMatrixBase< float > *  grad,
float  l1,
float  lr 
)
template void kaldi::cu::RegularizeL1 ( CuMatrixBase< double > *  weight,
CuMatrixBase< double > *  grad,
double  l1,
double  lr 
)
static Real kaldi::cu::ScalarSigmoid ( Real  a)
inline, static

Definition at line 390 of file cu-math.cc.

References kaldi::Exp().

Referenced by CpuBackpropLstmNonlinearity(), and CpuComputeLstmNonlinearity().

390  {
391  if (a > Real(0)) {
392  return Real(1) / (Real(1) + Exp(-a));
393  } else {
394  Real x = Exp(a);
395  return x / (x + Real(1));
396  }
397 }
static Real kaldi::cu::ScalarTanh ( Real  a)
inline, static

Definition at line 400 of file cu-math.cc.

References kaldi::Exp().

Referenced by CpuBackpropLstmNonlinearity(), and CpuComputeLstmNonlinearity().

400  {
401  if (a > Real(0)) {
402  Real inv_expa = Exp(-a);
403  return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa);
404  } else {
405  Real expa = Exp(a);
406  return Real(1) - Real(2) / (Real(1) + expa * expa);
407  }
408 }
void Splice ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< Real > *  tgt 
)

Splice concatenates frames of src as specified in frame_offsets into tgt.

The number of rows of tgt must equal the number of rows of src, and it must be that tgt.NumCols() == src.NumCols() * frame_offsets.Dim(). As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the general case where i in [0..src.NumRows()-1], k in [0..frame_offsets.Dim()-1], j in [0..src.NumCols()-1] and n_cols = src.NumCols(). If i + frame_offsets[k] is greater than the last row index of src or less than 0, then the right side of the equation is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid an index out of bounds.
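
A short sketch of the usual left/right-context splicing; the offsets and sizes are placeholders, and it assumes namespace kaldi and the CuArray constructor from std::vector:

    CuMatrix<BaseFloat> src(100, 13);              // e.g. 100 frames of 13-dim features
    src.SetRandn();
    std::vector<int32> offsets = {-1, 0, 1};       // one frame of left and right context
    CuArray<int32> frame_offsets(offsets);
    CuMatrix<BaseFloat> tgt(src.NumRows(),
                            src.NumCols() * static_cast<int32>(offsets.size()));
    cu::Splice(src, frame_offsets, &tgt);
    // tgt(i, k*13 + j) == src(i + offsets[k], j), with the row index clamped
    // to [0, src.NumRows()-1] at the edges.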

Definition at line 132 of file cu-math.cc.

References CU2DBLOCK, CuArray< T >::Data(), CuMatrixBase< Real >::Data(), CuArray< T >::Dim(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), MatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), CuMatrixBase< Real >::NumRows(), and MatrixBase< Real >::RowData().

Referenced by Component::NewComponentOfType(), Splice::PropagateFnc(), and kaldi::UnitTestCuMathSplice().

133  {
134 
135  KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
136  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
137 
138  #if HAVE_CUDA == 1
139  if (CuDevice::Instantiate().Enabled()) {
140  CuTimer tim;
141 
142  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
143  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
144 
145  cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
146  frame_offsets.Data(), tgt->Dim(), src.Dim());
147  CU_SAFE_CALL(cudaGetLastError());
148 
149  CuDevice::Instantiate().AccuProfile(__func__, tim);
150  } else
151  #endif
152  {
153  // expand in CPU
154  const MatrixBase<Real> &srcmat = src.Mat();
155  const int32 *frame_offsetvec = frame_offsets.Data();
156  int32 dim = frame_offsets.Dim();
157  MatrixBase<Real> &tgtmat = tgt->Mat();
158  //
159  for(int32 r=0; r < tgtmat.NumRows(); r++) {
160  for(int32 off=0; off < dim; off++) {
161  int32 r_off = r + frame_offsetvec[off];
162  if(r_off < 0) r_off = 0;
163  if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;
164  memcpy(tgtmat.RowData(r)+off*srcmat.NumCols(),srcmat.RowData(r_off),sizeof(Real)*srcmat.NumCols());
165  }
166  }
167  }
168 }
template void kaldi::cu::Splice ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Splice ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< double > *  tgt 
)