kaldi::cu Namespace Reference

Functions

template<typename Real >
void RegularizeL1 (CuMatrixBase< Real > *weight, CuMatrixBase< Real > *gradient, Real l1_penalty, Real learning_rate)
 RegularizeL1 is a gradient step with l1 regularization added to the gradient. More...
 
template<typename Real >
void Randomize (const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< Real > *tgt)
 Copies a permutation of src into tgt. More...
 
template<typename Real >
void Splice (const CuMatrixBase< Real > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< Real > *tgt)
 Splice concatenates frames of src as specified in frame_offsets into tgt. More...
 
template<typename Real >
void Copy (const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
 Copies elements from src into tgt as given by copy_from_indices. More...
 
template void RegularizeL1 (CuMatrixBase< float > *weight, CuMatrixBase< float > *grad, float l1, float lr)
 
template void RegularizeL1 (CuMatrixBase< double > *weight, CuMatrixBase< double > *grad, double l1, double lr)
 
template void Splice (const CuMatrixBase< float > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< float > *tgt)
 
template void Splice (const CuMatrixBase< double > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< double > *tgt)
 
template void Copy (const CuMatrixBase< float > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< float > *tgt)
 
template void Copy (const CuMatrixBase< double > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< double > *tgt)
 
template void Randomize (const CuMatrixBase< float > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< float > *tgt)
 
template void Randomize (const CuMatrixBase< double > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< double > *tgt)
 
template<typename Real >
void NormalizePerRow (const CuMatrixBase< Real > &in, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *out)
 Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square equals 1.0. More...
 
template void NormalizePerRow (const CuMatrixBase< float > &in, const float target_rms, const bool add_log_stddev, CuMatrixBase< float > *out)
 
template void NormalizePerRow (const CuMatrixBase< double > &in, const double target_rms, const bool add_log_stddev, CuMatrixBase< double > *out)
 
template<typename Real >
void DiffNormalizePerRow (const CuMatrixBase< Real > &in_value, const CuMatrixBase< Real > &out_deriv, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *in_deriv)
 
template void DiffNormalizePerRow (const CuMatrixBase< float > &in_value, const CuMatrixBase< float > &out_deriv, const float target_rms, const bool add_log_stddev, CuMatrixBase< float > *in_deriv)
 
template void DiffNormalizePerRow (const CuMatrixBase< double > &in_value, const CuMatrixBase< double > &out_deriv, const double target_rms, const bool add_log_stddev, CuMatrixBase< double > *in_deriv)
 
template<typename Real >
static Real ScalarSigmoid (Real a)
 
template<typename Real >
static Real ScalarTanh (Real a)
 
template<typename Real >
void CpuComputeLstmNonlinearity (const MatrixBase< Real > &input_mat, const MatrixBase< Real > &params_mat, MatrixBase< Real > *output)
 
template<typename Real >
void ComputeLstmNonlinearity (const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, CuMatrixBase< Real > *output)
 This is a special-purpose function used by the class LstmNonlinearityComponent to do its forward propagation. More...
 
template void CpuComputeLstmNonlinearity (const MatrixBase< float > &input_mat, const MatrixBase< float > &params_mat, MatrixBase< float > *output)
 
template void CpuComputeLstmNonlinearity (const MatrixBase< double > &input_mat, const MatrixBase< double > &params_mat, MatrixBase< double > *output)
 
template void ComputeLstmNonlinearity (const CuMatrixBase< float > &input, const CuMatrixBase< float > &params, CuMatrixBase< float > *output)
 
template void ComputeLstmNonlinearity (const CuMatrixBase< double > &input, const CuMatrixBase< double > &params, CuMatrixBase< double > *output)
 
template<typename Real >
void CpuBackpropLstmNonlinearity (const MatrixBase< Real > &input, const MatrixBase< Real > &params, const MatrixBase< Real > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< Real > &self_repair_config, double count_in, MatrixBase< Real > *input_deriv, MatrixBase< Real > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< Real > *self_repair_sum_out)
 
template<typename Real >
void BackpropLstmNonlinearity (const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, const CuMatrixBase< Real > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< Real > &self_repair_config, double count_in, CuMatrixBase< Real > *input_deriv, CuMatrixBase< Real > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< Real > *self_repair_sum_out)
 This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity. More...
 
template void CpuBackpropLstmNonlinearity (const MatrixBase< float > &input, const MatrixBase< float > &params, const MatrixBase< float > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< float > &self_repair_config, double count_in, MatrixBase< float > *input_deriv, MatrixBase< float > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< float > *self_repair_sum_out)
 
template void CpuBackpropLstmNonlinearity (const MatrixBase< double > &input, const MatrixBase< double > &params, const MatrixBase< double > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< double > &self_repair_config, double count_in, MatrixBase< double > *input_deriv, MatrixBase< double > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< double > *self_repair_sum_out)
 
template void BackpropLstmNonlinearity (const CuMatrixBase< float > &input, const CuMatrixBase< float > &params, const CuMatrixBase< float > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< float > &self_repair_config, double count_in, CuMatrixBase< float > *input_deriv, CuMatrixBase< float > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< float > *self_repair_sum_out)
 
template void BackpropLstmNonlinearity (const CuMatrixBase< double > &input, const CuMatrixBase< double > &params, const CuMatrixBase< double > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< double > &self_repair_config, double count_in, CuMatrixBase< double > *input_deriv, CuMatrixBase< double > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< double > *self_repair_sum_out)
 
template<typename Real >
void Group2norm (const CuMatrixBase< Real > &src, CuMatrixBase< Real > *dest, int32 group_stride)
 

Function Documentation

void BackpropLstmNonlinearity ( const CuMatrixBase< Real > &  input,
const CuMatrixBase< Real > &  params,
const CuMatrixBase< Real > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< Real > &  self_repair_config,
double  count_in,
CuMatrixBase< Real > *  input_deriv,
CuMatrixBase< Real > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< Real > *  self_repair_sum_out 
)

This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity.

It's a little more complicated than you might expect because of the 'self-repair' mechanism that we use to prevent the sigmoid and tanh nonlinearities from oversaturating, and because of the average-activation and average-derivative stats that we store for these nonlinearities (these stats are used both to control the self-repair mechanism and for diagnostic purposes).

Because the forward pass computes various intermediate values that are not output, this function has to repeat the computations of the forward pass before it can do the backprop.

Parameters
[in] input  The same as in ComputeLstmNonlinearity(). A matrix, of dimension N by 5C (i.e. its num-cols must be a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). This function will also accept input of dimension N by 5C + 3, and the three final elements will be interpreted as scaling factors on i_t, f_t and o_t (useful as per-frame dropout masks).
[in] params  The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}.
[in] output_deriv  A matrix, of dimension N by 2C, containing the derivative of the objective function we're backpropagating, w.r.t. the quantities c_t and m_t (in two blocks of column-dimension C).
[in] deriv_sum_in  This is used in the self-repair code to identify oversaturated nonlinearities. It is a matrix, of dimension 5 by C, corresponding to the totals of the derivatives of the 5 sigmoid and tanh nonlinearities, in the order they appear in the equations in the documentation of ComputeLstmNonlinearity(); respectively, they appear in the equations for (i_t, f_t, c_t, o_t, m_t). This will be divided by 'count_in' to get the average derivative value so far, for each of the nonlinearities.
[in] self_repair_config  A vector of dimension 10, containing the configuration of the self-repair to be used for the 5 nonlinearities. The first 5 elements are the self_repair_lower_threshold values (typically 0.05 for sigmoid and 0.2 for tanh), and the next 5 elements are the corresponding self-repair-scales (typically 10^-5).
[in] count_in  The data-count that corresponds to the stats in 'deriv_sum_in' at entry to the function. This function should tolerate the count being zero (in that case, it is free to do the self-repair or not, as this should only happen on the 1st minibatch of each training job).
[out] input_deriv  May be NULL; if not, this function writes, to this location, the backpropagated derivative of the objective function w.r.t. the 'input' matrix. This matrix should have the same dimension as 'input'. In addition to the regular backpropagated derivative, the output will include small values relating to 'self-repair'. If the input is of column-dimension 5C + 3 (i.e. we are using dropout masks), the derivatives w.r.t. the dropout masks will not be set; they will retain their value prior to this function call.
[out] params_deriv  May be NULL; if not, this is where this function *writes* [not adds] the backpropagated derivative of the objective function w.r.t. 'params'; it should have the same dimension as 'params' (3 by C). (This matrix will then be processed by the natural gradient code and added to the appropriate copy of the parameter matrix, outside this function.)
[out] value_sum_out  Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C. This function *adds* to this location the total value of each of the sigmoid/tanh nonlinearities that it computes (this is for diagnostic purposes).
[out] deriv_sum_out  Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C; this function *adds* to this location the total of the derivative of each of the sigmoid/tanh nonlinearities that it computes (this is for diagnostic purposes and to control the self-repair). This function should tolerate the case when 'deriv_sum_out' points to the same data as 'deriv_sum_in'.
[out] self_repair_sum_out  Must be NULL if params_deriv is NULL; if not, a matrix of dimension 5 by C; this function *writes* to this location the sum of the number of times the self-repair code was activated (integer values 0 <= k <= N). This will be processed outside this function into self-repair stats for diagnostics.
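
For orientation, here is a minimal call-site sketch; it is not taken from the library, the sizes N and C are arbitrary placeholders, and it assumes the surrounding code lives in namespace kaldi so that CuMatrix, CuVector, BaseFloat and cu:: resolve as usual:

    int32 N = 64, C = 512;  // hypothetical minibatch rows and cell dimension
    CuMatrix<BaseFloat> input(N, 5 * C), params(3, C), output_deriv(N, 2 * C);
    input.SetRandn(); params.SetRandn(); output_deriv.SetRandn();
    CuMatrix<double> deriv_sum_in(5, C);          // accumulated derivative stats
    CuVector<BaseFloat> self_repair_config(10);   // 5 thresholds, then 5 scales
    double count_in = 0.0;                        // e.g. on the first minibatch
    CuMatrix<BaseFloat> input_deriv(N, 5 * C), params_deriv(3, C);
    CuMatrix<double> value_sum_out(5, C), deriv_sum_out(5, C);
    CuMatrix<BaseFloat> self_repair_sum_out(5, C);
    cu::BackpropLstmNonlinearity(input, params, output_deriv, deriv_sum_in,
                                 self_repair_config, count_in,
                                 &input_deriv, &params_deriv, &value_sum_out,
                                 &deriv_sum_out, &self_repair_sum_out);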

Definition at line 735 of file cu-math.cc.

References CpuBackpropLstmNonlinearity(), CU1DBLOCK, CuVectorBase< Real >::Data(), CuMatrixBase< Real >::Data(), CuVectorBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), CuMatrixBase< Real >::Stride(), and CuVectorBase< Real >::Vec().

Referenced by LstmNonlinearityComponent::Backprop(), kaldi::UnitTestBackpropLstmNonlinearity(), and kaldi::UnitTestLstmNonlinearity().

745  {
746  int32 num_rows = input.NumRows(),
747  cell_dim = input.NumCols() / 5,
748  input_cols = input.NumCols();
749  // Check dimensions.
750  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3);
751  KALDI_ASSERT(params.NumRows() == 3);
752  KALDI_ASSERT(params.NumCols() == cell_dim);
753  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
754  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
755  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
756  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
757  KALDI_ASSERT(self_repair_config.Dim() == 10);
758  KALDI_ASSERT(count_in >= 0);
759  if (input_deriv != NULL) {
760  KALDI_ASSERT(SameDim(input, *input_deriv));
761  }
762  if (params_deriv == NULL) {
763  KALDI_ASSERT(value_sum_out == NULL);
764  KALDI_ASSERT(deriv_sum_out == NULL);
765  KALDI_ASSERT(self_repair_sum_out == NULL);
766  } else {
767  KALDI_ASSERT(value_sum_out != NULL);
768  KALDI_ASSERT(deriv_sum_out != NULL);
769  KALDI_ASSERT(self_repair_sum_out != NULL);
770  KALDI_ASSERT(SameDim(params, *params_deriv));
771  KALDI_ASSERT(value_sum_out->NumRows() == 5);
772  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
773  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
774  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
775  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
776  }
777 
778 
779 #if HAVE_CUDA == 1
780  if (CuDevice::Instantiate().Enabled()) {
781  CuTimer tim;
782  // Each thread block is working on 1 row of the data.
783  // It's best that cell dim is a multiple of CU1DBLOCK
784 
785  int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
786 
787  // Use 2D block (8x32 threads) as we need to compute column sum.
788  // Use 1D grid to cover the data matrix width `cell_dim`.
789  const int kWarpSize = 32;
790  dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
791 // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x),
792 // n_blocks(num_rows, dimBlock.y));
793 // if (dimGrid.x * dimGrid.y > 1024) {
794 // dimGrid.y = std::max(1024 / dimGrid.x, 1);
795 // }
796  dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
797  if (input_deriv == NULL) {
798  if (params_deriv == NULL) {
799  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
800  have_dropout_mask, num_rows,
801  input.Data(), input.Stride(), params.Data(),
802  params.Stride(), output_deriv.Data(),
803  output_deriv.Stride(), deriv_sum_in.Data(),
804  deriv_sum_in.Stride(),
805  self_repair_config.Data(), count_in + 1,
806  NULL,
807  0,
808  NULL,
809  0,
810  NULL,
811  0,
812  NULL,
813  0,
814  NULL,
815  0);
816 
817  } else {
818  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
819  have_dropout_mask, num_rows,
820  input.Data(), input.Stride(), params.Data(),
821  params.Stride(), output_deriv.Data(),
822  output_deriv.Stride(), deriv_sum_in.Data(),
823  deriv_sum_in.Stride(),
824  self_repair_config.Data(), count_in + 1,
825  NULL,
826  0, params_deriv->Data(),
827  params_deriv->Stride(),
828  value_sum_out->Data(),
829  value_sum_out->Stride(),
830  deriv_sum_out->Data(),
831  deriv_sum_out->Stride(),
832  self_repair_sum_out->Data(),
833  self_repair_sum_out->Stride());
834  }
835  } else {
836  if (params_deriv == NULL) {
837  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
838  have_dropout_mask, num_rows,
839  input.Data(), input.Stride(), params.Data(),
840  params.Stride(), output_deriv.Data(),
841  output_deriv.Stride(), deriv_sum_in.Data(),
842  deriv_sum_in.Stride(),
843  self_repair_config.Data(), count_in + 1,
844  input_deriv->Data(), input_deriv->Stride(),
845  NULL,
846  0, NULL, 0, NULL, 0, NULL, 0);
847  } else {
848  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
849  have_dropout_mask, num_rows,
850  input.Data(), input.Stride(), params.Data(),
851  params.Stride(), output_deriv.Data(),
852  output_deriv.Stride(), deriv_sum_in.Data(),
853  deriv_sum_in.Stride(),
854  self_repair_config.Data(), count_in + 1,
855  input_deriv->Data(), input_deriv->Stride(),
856  params_deriv->Data(),
857  params_deriv->Stride(),
858  value_sum_out->Data(),
859  value_sum_out->Stride(),
860  deriv_sum_out->Data(),
861  deriv_sum_out->Stride(),
862  self_repair_sum_out->Data(),
863  self_repair_sum_out->Stride());
864  }
865  }
866 
867  CU_SAFE_CALL(cudaGetLastError());
868 
869  CuDevice::Instantiate().AccuProfile(__func__, tim);
870  } else
871 #endif
872  {
873  CpuBackpropLstmNonlinearity(input.Mat(), params.Mat(), output_deriv.Mat(),
874  deriv_sum_in.Mat(), self_repair_config.Vec(),
875  count_in, &(input_deriv->Mat()),
876  &(params_deriv->Mat()), &(value_sum_out->Mat()),
877  &(deriv_sum_out->Mat()),
878  &(self_repair_sum_out->Mat()));
879  }
880 }
template void kaldi::cu::BackpropLstmNonlinearity ( const CuMatrixBase< float > &  input,
const CuMatrixBase< float > &  params,
const CuMatrixBase< float > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< float > &  self_repair_config,
double  count_in,
CuMatrixBase< float > *  input_deriv,
CuMatrixBase< float > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< float > *  self_repair_sum_out 
)
template void kaldi::cu::BackpropLstmNonlinearity ( const CuMatrixBase< double > &  input,
const CuMatrixBase< double > &  params,
const CuMatrixBase< double > &  output_deriv,
const CuMatrixBase< double > &  deriv_sum_in,
const CuVectorBase< double > &  self_repair_config,
double  count_in,
CuMatrixBase< double > *  input_deriv,
CuMatrixBase< double > *  params_deriv,
CuMatrixBase< double > *  value_sum_out,
CuMatrixBase< double > *  deriv_sum_out,
CuMatrixBase< double > *  self_repair_sum_out 
)
void ComputeLstmNonlinearity ( const CuMatrixBase< Real > &  input,
const CuMatrixBase< Real > &  params,
CuMatrixBase< Real > *  output 
)

This is a special-purpose function used by the class LstmNonlinearityComponent to do its forward propagation.

It computes the core part of the LSTM nonlinearity. Refer to class LstmNonlinearityComponent in ../nnet3/nnet-simple-component.h for more context.

Parameters
[in] input  A matrix, of dimension N by 5C (i.e. its num-cols must be a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). This function will also accept input of dimension N by 5C + 3, and the three final elements will be used as scaling factors on i_t, f_t and o_t (useful as per-frame dropout masks).
[in] params  A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}.
[out] output  A matrix, of dimension N by 2C. The quantities c_t and m_t respectively are put there (in two blocks of column-dimension C), according to the following equations:

i_t = Sigmoid(i_part + w_{ic}*c_{t-1})
f_t = Sigmoid(f_part + w_{fc}*c_{t-1})
c_t = f_t*c_{t-1} + i_t * Tanh(c_part)
o_t = Sigmoid(o_part + w_{oc}*c_t)
m_t = o_t * Tanh(c_t)

Note on dropout: if the dropout mask is provided, let the mask values be i_t_mask, f_t_mask and o_t_mask (for each matrix row, these are scalars while i_t, f_t and o_t are of dimension C, because this is 'per-frame' dropout as described in http://www.danielpovey.com/files/2017_interspeech_dropout.pdf). Then the modification to the equations above consists of replacing 'i_t' with 'i_t_mask * i_t' in the RHS of the equations above, and the same type of change for f_t and o_t.
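
As a rough usage sketch (not from the library; N and C are placeholder sizes, and the snippet assumes it sits inside namespace kaldi):

    int32 N = 64, C = 512;                 // hypothetical rows (frames) and cell dim
    CuMatrix<BaseFloat> input(N, 5 * C);   // blocks: i_part, f_part, c_part, o_part, c_{t-1}
    CuMatrix<BaseFloat> params(3, C);      // rows: w_{ic}, w_{fc}, w_{oc}
    CuMatrix<BaseFloat> output(N, 2 * C);  // will hold c_t in the first C cols, m_t in the next C
    input.SetRandn();
    params.SetRandn();
    cu::ComputeLstmNonlinearity(input, params, &output);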

Definition at line 455 of file cu-math.cc.

References CpuComputeLstmNonlinearity(), CU1DBLOCK, CuMatrixBase< Real >::Data(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::Stride().

Referenced by LstmNonlinearityComponent::Propagate(), kaldi::UnitTestCuMathComputeLstmNonlinearity(), and kaldi::UnitTestLstmNonlinearity().

457  {
458  int32 num_rows = input.NumRows(),
459  input_cols = input.NumCols(),
460  cell_dim = input_cols / 5;
461  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
462  KALDI_ASSERT(output->NumRows() == num_rows);
463  KALDI_ASSERT(params.NumRows() == 3);
464  KALDI_ASSERT(params.NumCols() == cell_dim);
465  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
466 
467 #if HAVE_CUDA == 1
468  if (CuDevice::Instantiate().Enabled()) {
469  CuTimer tim;
470 
471  int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
472 
473  // Each thread block is working on 1 row of the data.
474  // It's best that cell dim is a multiple of CU1DBLOCK
475  dim3 dimBlock(CU1DBLOCK);
476  dim3 dimGrid(num_rows);
477 
478  cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(),
479  params.Data(), params.Stride(), output->Stride(),
480  cell_dim, have_dropout_mask, num_rows, output->Data());
481  CU_SAFE_CALL(cudaGetLastError());
482 
483  CuDevice::Instantiate().AccuProfile(__func__, tim);
484  } else
485 #endif
486  {
487  CpuComputeLstmNonlinearity(input.Mat(), params.Mat(), &output->Mat());
488  }
489 }
template void kaldi::cu::ComputeLstmNonlinearity ( const CuMatrixBase< float > &  input,
const CuMatrixBase< float > &  params,
CuMatrixBase< float > *  output 
)
template void kaldi::cu::ComputeLstmNonlinearity ( const CuMatrixBase< double > &  input,
const CuMatrixBase< double > &  params,
CuMatrixBase< double > *  output 
)
void Copy ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< Real > *  tgt 
)

Copies elements from src into tgt as given by copy_from_indices.

The matrices src and tgt must have the same number of rows, and the dimension of copy_from_indices must equal the number of columns of tgt. As a result, tgt(i, j) == src(i, copy_from_indices[j]). Also see CuMatrix::CopyCols(), which is more general.
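
A small illustrative sketch (the index values are chosen arbitrarily; it assumes namespace kaldi and the CuArray constructor from std::vector):

    CuMatrix<BaseFloat> src(10, 4);
    src.SetRandn();
    std::vector<int32> indices = {0, 0, 2, 3};     // column 0 of src is duplicated in tgt
    CuArray<int32> copy_from_indices(indices);
    CuMatrix<BaseFloat> tgt(src.NumRows(), static_cast<int32>(indices.size()));
    cu::Copy(src, copy_from_indices, &tgt);        // tgt(i, j) == src(i, indices[j])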

Definition at line 173 of file cu-math.cc.

References CU2DBLOCK, CuArray< T >::Data(), CuMatrixBase< Real >::Data(), CuArray< T >::Dim(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::NumRows().

Referenced by CopyComponent::PropagateFnc(), kaldi::TestClusterUtilsVector(), and kaldi::UnitTestCuMathCopy().

174  {
175 
176  KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
177  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
178 
179  #if HAVE_CUDA == 1
180  if (CuDevice::Instantiate().Enabled()) {
181  CuTimer tim;
182 
183  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
184  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
185 
186  cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
187  copy_from_indices.Data(), tgt->Dim(), src.Dim());
188  CU_SAFE_CALL(cudaGetLastError());
189 
190  CuDevice::Instantiate().AccuProfile(__func__, tim);
191  } else
192  #endif
193  {
194  // expand in CPU
195  const MatrixBase<Real> &srcmat = src.Mat();
196  const int32 *copy_from_indicesvec = copy_from_indices.Data();
197  int32 dim = copy_from_indices.Dim();
198  MatrixBase<Real> &tgtmat = tgt->Mat();
199  //
200  for(int32 r = 0; r < tgtmat.NumRows(); r++) {
201  for(int32 c = 0; c < dim; c++) {
202  tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
203  }
204  }
205  }
206 }
template void kaldi::cu::Copy ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Copy ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  copy_from_indices,
CuMatrixBase< double > *  tgt 
)
void CpuBackpropLstmNonlinearity ( const MatrixBase< Real > &  input,
const MatrixBase< Real > &  params,
const MatrixBase< Real > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< Real > &  self_repair_config,
double  count_in,
MatrixBase< Real > *  input_deriv,
MatrixBase< Real > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< Real > *  self_repair_sum_out 
)

Definition at line 509 of file cu-math.cc.

References count, VectorBase< Real >::Dim(), rnnlm::i, KALDI_ASSERT, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), kaldi::SameDim(), ScalarSigmoid(), and ScalarTanh().

Referenced by BackpropLstmNonlinearity(), and kaldi::UnitTestBackpropLstmNonlinearity().

519  {
520  int32 num_rows = input.NumRows(),
521  input_cols = input
522  .NumCols(),
523  cell_dim = input.NumCols() / 5;
524  // Check dimensions.
525  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
526  KALDI_ASSERT(params.NumRows() == 3);
527  KALDI_ASSERT(params.NumCols() == cell_dim);
528  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
529  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
530  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
531  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
532  KALDI_ASSERT(self_repair_config.Dim() == 10);
533  KALDI_ASSERT(count_in >= 0);
534  if (input_deriv != NULL) {
535  KALDI_ASSERT(SameDim(input, *input_deriv));
536  }
537  if (params_deriv == NULL) {
538  KALDI_ASSERT(value_sum_out == NULL);
539  KALDI_ASSERT(deriv_sum_out == NULL);
540  KALDI_ASSERT(self_repair_sum_out == NULL);
541  } else {
542  KALDI_ASSERT(value_sum_out != NULL);
543  KALDI_ASSERT(deriv_sum_out != NULL);
544  KALDI_ASSERT(self_repair_sum_out != NULL);
545  KALDI_ASSERT(SameDim(params, *params_deriv));
546  KALDI_ASSERT(value_sum_out->NumRows() == 5);
547  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
548  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
549  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
550  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
551  }
552 
553  const MatrixBase<Real> &input_mat = input;
554  const MatrixBase<Real> &params_mat = params;
555  const MatrixBase<Real> &output_deriv_mat = output_deriv;
556  const MatrixBase<double> &deriv_sum_in_mat = deriv_sum_in;
557  const VectorBase<Real> &sr_config = self_repair_config;
558  MatrixBase<Real> *input_deriv_mat = (
559  input_deriv == NULL ? NULL : input_deriv);
560  MatrixBase<Real> *params_deriv_mat = NULL;
561  MatrixBase<Real> *self_repair_sum_out_mat = NULL;
562  MatrixBase<double> *value_sum_out_mat = NULL;
563  MatrixBase<double> *deriv_sum_out_mat = NULL;
564  if (params_deriv != NULL) {
565  params_deriv_mat = params_deriv;
566  value_sum_out_mat = value_sum_out;
567  deriv_sum_out_mat = deriv_sum_out;
568  self_repair_sum_out_mat = self_repair_sum_out;
569  }
570 
571 
572  // We add 1.0 (i.e. a small value) to the count to avoid division by zero.
573  Real count = 1.0 + count_in;
574  for (int32 c = 0; c < cell_dim; c++) {
575  // parameters
576  Real w_ic = params_mat(0, c);
577  Real w_fc = params_mat(1, c);
578  Real w_oc = params_mat(2, c);
579  // derivative sums w.r.t. parameters.
580  Real w_ic_deriv_sum = 0.0;
581  Real w_fc_deriv_sum = 0.0;
582  Real w_oc_deriv_sum = 0.0;
583 
584  // average derivatives, for self-repair.
585  // The 5 nonlinearities that are subject to self-repair are written as:
586  // Sigmoid(i_t_input), Sigmoid(f_t_input),
587  // Tanh(c_part), Sigmoid(o_t_input), Tanh(c_t)
588  Real i_t_self_repair = (
589  deriv_sum_in_mat(0, c) / count < sr_config(0) ? sr_config(5) : 0.0);
590  Real f_t_self_repair = (
591  deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0);
592  Real c_part_self_repair = (
593  deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0);
594  Real o_t_self_repair = (
595  deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0);
596  Real c_t_self_repair = (
597  deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0);
598  // Note on how we add self-repair for sigmoids/tanh's. If self-repair
599  // is activated for this unit, then...
600  // For sigmoids we'd add -self_repair_scale * (2 * sigmoid(x) - 1.0)
601  // ... to the input-deriv;
602  // For tanh's we'd add -self_repair_scale * tanh(x)
603  // If self-repair is not activated, the 'self_repair' scales are set to zero.
604 
605  // The following variables are for the accumulation of stats on the
606  // sigmoid and tanh units.
607  Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0;
608  Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0;
609  Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0;
610  Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0;
611  Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0;
612 
613 
614  for (int32 r = 0; r < num_rows; r++) {
615  Real i_part = input_mat(r, c),
616  f_part = input_mat(r, c + cell_dim),
617  c_part = input_mat(r, c + 2 * cell_dim),
618  o_part = input_mat(r, c + 3 * cell_dim),
619  c_prev = input_mat(r, c + 4 * cell_dim);
620 
621  Real i_scale = (input_cols == cell_dim * 5 ? 1.0 :
622  input_mat(r, cell_dim * 5)),
623  f_scale = (input_cols == cell_dim * 5 ? 1.0 :
624  input_mat(r, cell_dim * 5 + 1)),
625  o_scale = (input_cols == cell_dim * 5 ? 1.0 :
626  input_mat(r, cell_dim * 5 + 2));
627 
628  // For greater clarity, we give some of the quantities in the
629  // forward equations their own names.
630  Real i_t_input = i_part + w_ic * c_prev,
631  i_t = ScalarSigmoid(i_t_input),
632  f_t_input = f_part + w_fc * c_prev,
633  f_t = ScalarSigmoid(f_t_input),
634  tanh_c_part = ScalarTanh(c_part),
635  c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part,
636  o_t_input = o_part + w_oc * c_t,
637  o_t = ScalarSigmoid(o_t_input),
638  tanh_c_t = ScalarTanh(c_t);
639  // we'd also compute, in the forward pass,
640  // m_t = o_t * tanh_c_t;
641  // but this variable is not needed.
642 
643  // Accumulate nonlinearity value and derivative stats.
644  // Note:
645  // tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)
646  // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
647  i_t_value_sum += i_t;
648  i_t_deriv_sum += i_t * (1.0F - i_t);
649  f_t_value_sum += f_t;
650  f_t_deriv_sum += f_t * (1.0F - f_t);
651  c_part_value_sum += tanh_c_part;
652  c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part;
653  o_t_value_sum += o_t;
654  o_t_deriv_sum += o_t * (1.0F - o_t);
655  c_t_value_sum += tanh_c_t;
656  c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t;
657 
658 
659  // the derivative of the objective function w.r.t. a particular quantity
660  // will be written by prepending "d" to the name.
661  // We compute these derivatives in the reverse of the order in which
662  // we computed the original quantities.
663  // dc_t_out is the part of the derivative w.r.t. c_t that
664  // comes directly from the output of this function.
665  Real dc_t_out = output_deriv_mat(r, c);
666  Real dm_t = output_deriv_mat(r, c + cell_dim);
667  Real dtanh_c_t = o_t * o_scale * dm_t;
668  Real do_t = o_scale * tanh_c_t * dm_t;
669  Real do_t_input = (o_t * (1.0F - o_t) * do_t
670  - (2.0F * o_t - 1.0F) * o_t_self_repair);
671  Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out
672  + do_t_input * w_oc) - tanh_c_t * c_t_self_repair;
673  Real dtanh_c_part = i_t * i_scale * dc_t;
674  Real df_t = dc_t * f_scale * c_prev;
675  Real df_t_input = ((df_t * f_t * (1.0F - f_t)
676  - (2.0F * f_t - 1.0F) * f_t_self_repair));
677  Real di_t = dc_t * i_scale * tanh_c_part;
678  Real di_t_input = ((di_t * i_t * (1.0F - i_t)
679  - (2.0F * i_t - 1.0F) * i_t_self_repair));
680 
681  w_ic_deriv_sum += c_prev * di_t_input;
682  w_fc_deriv_sum += c_prev * df_t_input;
683  w_oc_deriv_sum += c_t * do_t_input;
684 
685  Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t;
686  Real do_part = do_t_input;
687  Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part
688  - tanh_c_part * c_part_self_repair);
689  Real df_part = df_t_input;
690  Real di_part = di_t_input;
691 
692  if (input_deriv_mat != NULL) {
693  (*input_deriv_mat)(r, c) = di_part;
694  (*input_deriv_mat)(r, c + cell_dim) = df_part;
695  (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part;
696  (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part;
697  (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev;
698  }
699  }
700 
701  if (params_deriv != NULL) {
702  // note: for optimizing things you can assume that params_deriv and
703  // input_deriv_mat are non-NULL (i.e. all the output matrices are
704  // non-NULL). The situations when some of the output matrices are NULL
705  // does not happen often (mainly only in testing code).
706 
707  (*params_deriv_mat)(0, c) = w_ic_deriv_sum;
708  (*params_deriv_mat)(1, c) = w_fc_deriv_sum;
709  (*params_deriv_mat)(2, c) = w_oc_deriv_sum;
710 
711  (*value_sum_out_mat)(0, c) += i_t_value_sum;
712  (*value_sum_out_mat)(1, c) += f_t_value_sum;
713  (*value_sum_out_mat)(2, c) += c_part_value_sum;
714  (*value_sum_out_mat)(3, c) += o_t_value_sum;
715  (*value_sum_out_mat)(4, c) += c_t_value_sum;
716 
717  // need to update self_repair_sum_out before deriv_sum_out, because
718  // deriv_sum_out and deriv_sum_in might point to the same memory.
719  for (int32 i = 0; i < 5; i++)
720  (*self_repair_sum_out_mat)(i, c) =
721  (deriv_sum_in_mat(i, c) / count < sr_config(i) ? num_rows : 0);
722 
723  (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum;
724  (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum;
725  (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum;
726  (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum;
727  (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum;
728  }
729  }
730 }
template void kaldi::cu::CpuBackpropLstmNonlinearity ( const MatrixBase< float > &  input,
const MatrixBase< float > &  params,
const MatrixBase< float > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< float > &  self_repair_config,
double  count_in,
MatrixBase< float > *  input_deriv,
MatrixBase< float > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< float > *  self_repair_sum_out 
)
template void kaldi::cu::CpuBackpropLstmNonlinearity ( const MatrixBase< double > &  input,
const MatrixBase< double > &  params,
const MatrixBase< double > &  output_deriv,
const MatrixBase< double > &  deriv_sum_in,
const VectorBase< double > &  self_repair_config,
double  count_in,
MatrixBase< double > *  input_deriv,
MatrixBase< double > *  params_deriv,
MatrixBase< double > *  value_sum_out,
MatrixBase< double > *  deriv_sum_out,
MatrixBase< double > *  self_repair_sum_out 
)
void CpuComputeLstmNonlinearity ( const MatrixBase< Real > &  input_mat,
const MatrixBase< Real > &  params_mat,
MatrixBase< Real > *  output 
)

Definition at line 411 of file cu-math.cc.

References MatrixBase< Real >::Data(), KALDI_ASSERT, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), MatrixBase< Real >::RowData(), ScalarSigmoid(), ScalarTanh(), and MatrixBase< Real >::Stride().

Referenced by ComputeLstmNonlinearity(), and kaldi::UnitTestCuMathComputeLstmNonlinearity().

413  {
414  int32 num_rows = input_mat.NumRows(),
415  input_cols = input_mat.NumCols(),
416  cell_dim = input_cols / 5;
417  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
418  KALDI_ASSERT(output->NumRows() == num_rows);
419  KALDI_ASSERT(params_mat.NumRows() == 3);
420  KALDI_ASSERT(params_mat.NumCols() == cell_dim);
421  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
422 
423  MatrixBase<Real> &output_mat = *output;
424  const Real *params_data = params_mat.Data();
425  int32 params_stride = params_mat.Stride();
426  for (int32 r = 0; r < num_rows; r++) {
427  const Real *input_row = input_mat.RowData(r);
428  // i_scale and f_scale relate to dropout, they will normally be 1.0.
429  Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]),
430  f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]),
431  o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]);
432 
433  Real *output_row = output_mat.RowData(r);
434  for (int32 c = 0; c < cell_dim; c++) {
435  Real i_part = input_row[c];
436  Real f_part = input_row[c + cell_dim];
437  Real c_part = input_row[c + 2 * cell_dim];
438  Real o_part = input_row[c + 3 * cell_dim];
439  Real c_prev = input_row[c + 4 * cell_dim];
440  Real w_ic = params_data[c];
441  Real w_fc = params_data[c + params_stride];
442  Real w_oc = params_data[c + params_stride * 2];
443  Real i_t = ScalarSigmoid(i_part + w_ic * c_prev);
444  Real f_t = ScalarSigmoid(f_part + w_fc * c_prev);
445  Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part);
446  Real o_t = ScalarSigmoid(o_part + w_oc * c_t);
447  Real m_t = o_t * o_scale * ScalarTanh(c_t);
448  output_row[c] = c_t;
449  output_row[c + cell_dim] = m_t;
450  }
451  }
452 }
template void kaldi::cu::CpuComputeLstmNonlinearity ( const MatrixBase< float > &  input_mat,
const MatrixBase< float > &  params_mat,
MatrixBase< float > *  output 
)
template void kaldi::cu::CpuComputeLstmNonlinearity ( const MatrixBase< double > &  input_mat,
const MatrixBase< double > &  params_mat,
MatrixBase< double > *  output 
)
void DiffNormalizePerRow ( const CuMatrixBase< Real > &  in_value,
const CuMatrixBase< Real > &  out_deriv,
const Real  target_rms,
const bool  add_log_stddev,
CuMatrixBase< Real > *  in_deriv 
)

Definition at line 315 of file cu-math.cc.

References CuVectorBase< Real >::AddDiagMat2(), CuVectorBase< Real >::AddDiagMatMat(), CuMatrixBase< Real >::AddDiagVecMat(), CuVectorBase< Real >::ApplyFloor(), CU1DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), kaldi::kNoTrans, kaldi::kTrans, kaldi::kUndefined, CuMatrixBase< Real >::MulRowsVec(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::Stride().

Referenced by NormalizeComponent::Backprop(), and kaldi::UnitTestCuDiffNormalizePerRow().

318  {
319  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
320 #if HAVE_CUDA == 1
321  if (CuDevice::Instantiate().Enabled()) {
322  CuTimer tim;
323  size_t dimBlock = CU1DBLOCK;
324  size_t dimGrid = in_deriv->NumRows();
325  cuda_diff_normalize_per_row(dimGrid, dimBlock, in_deriv->Data(),
326  in_deriv->Stride(), in_value.Data(),
327  in_value.Dim(), out_deriv.Data(),
328  out_deriv.Stride(), target_rms, add_log_stddev);
329  CU_SAFE_CALL(cudaGetLastError());
330  CuDevice::Instantiate().AccuProfile(__func__, tim);
331  } else
332 #endif
333  {
334  const CuSubMatrix<Real> out_deriv_no_log(out_deriv, 0, out_deriv.NumRows(),
335  0, in_value.NumCols());
336  CuVector<Real> dot_products(out_deriv.NumRows());
337  dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value,
338  kTrans, 0.0);
339  CuVector<Real> in_norm(in_value.NumRows());
340  Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
341  in_norm.AddDiagMat2(1.0, in_value, kNoTrans, 0.0);
342 
343  if (add_log_stddev) {
344  CuVector<Real> log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1
345  out_deriv_for_stddev(out_deriv.NumRows(), kUndefined);
346  // f = log(sqrt(max(epsi, x^T x / D)))
347  // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0.
348  // we don't compute this exactly below for the case when x^T x is very
349  // small, but we do make sure that the deriv isn't infinity when the input
350  // is zero.
351  log_stddev_deriv.ApplyFloor(in_value.NumCols() * kSquaredNormFloor);
352  log_stddev_deriv.ApplyPow(-1.0);
353  out_deriv_for_stddev.CopyColFromMat(out_deriv, (out_deriv.NumCols() - 1));
354  log_stddev_deriv.MulElements(out_deriv_for_stddev);
355  if (in_deriv)
356  in_deriv->AddDiagVecMat(1.0, log_stddev_deriv, in_value, kNoTrans, 1.0);
357  }
358  in_norm.Scale(1.0 / d_scaled);
359  in_norm.ApplyFloor(kSquaredNormFloor);
360  in_norm.ApplyPow(-0.5);
361  if (in_deriv) {
362  if (in_deriv->Data() != out_deriv_no_log.Data())
363  in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans, 1.0);
364  else
365  in_deriv->MulRowsVec(in_norm);
366  in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
367  in_norm.ApplyPow(3.0);
368  dot_products.MulElements(in_norm);
369 
370  in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value, kNoTrans,
371  1.0);
372  }
373  }
374 }
template void kaldi::cu::DiffNormalizePerRow ( const CuMatrixBase< float > &  in_value,
const CuMatrixBase< float > &  out_deriv,
const float  target_rms,
const bool  add_log_stddev,
CuMatrixBase< float > *  in_deriv 
)
template void kaldi::cu::DiffNormalizePerRow ( const CuMatrixBase< double > &  in_value,
const CuMatrixBase< double > &  out_deriv,
const double  target_rms,
const bool  add_log_stddev,
CuMatrixBase< double > *  in_deriv 
)
void kaldi::cu::Group2norm ( const CuMatrixBase< Real > &  src,
CuMatrixBase< Real > *  dest,
int32  group_stride 
)
void NormalizePerRow ( const CuMatrixBase< Real > &  in,
const Real  target_rms,
const bool  add_log_stddev,
CuMatrixBase< Real > *  out 
)

Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square equals 1.0.

The output is y_i = scale * x_i, and we want the RMS value of the y_i to equal target_rms, so y^T y = D * target_rms^2 (if y is one row of the input). Thus we need scale = 1.0 / sqrt(x^T x / (D * target_rms^2)). There is also flooring involved, to avoid division-by-zero problems. It's important for the backprop that the floor's square root is exactly representable as a float. If add_log_stddev_ is true, log(max(epsi, sqrt(x^T x / D))) is appended as an extra dimension of the output.
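
A brief usage sketch (the sizes are placeholders, not from the source; it assumes namespace kaldi):

    CuMatrix<BaseFloat> in(100, 40);
    in.SetRandn();
    bool add_log_stddev = true;
    // With add_log_stddev == true the output gets one extra column holding
    // log(max(epsi, sqrt(x^T x / D))) for each row.
    CuMatrix<BaseFloat> out(in.NumRows(), in.NumCols() + (add_log_stddev ? 1 : 0));
    BaseFloat target_rms = 1.0;
    cu::NormalizePerRow(in, target_rms, add_log_stddev, &out);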

Definition at line 246 of file cu-math.cc.

References CuMatrixBase< Real >::CopyColFromVec(), CuMatrixBase< Real >::CopyFromMat(), CU1DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, kaldi::kNoTrans, CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), and CuMatrixBase< Real >::Stride().

Referenced by NormalizeComponent::Propagate(), and kaldi::UnitTestCuMathNormalizePerRow().

247  {
248  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
249  if (add_log_stddev) {
250  KALDI_ASSERT(in.NumRows() == out->NumRows());
251  KALDI_ASSERT(in.NumCols() + 1 == out->NumCols());
252  } else {
253  KALDI_ASSERT(SameDim(in, *out));
254  }
255 
256 #if HAVE_CUDA == 1
257  if (CuDevice::Instantiate().Enabled()) {
258  CuTimer tim;
259  size_t dimBlock = CU1DBLOCK;
260  size_t dimGrid = out->NumRows();
261  cuda_normalize_per_row(dimGrid, dimBlock, out->Data(), out->Stride(),
262  in.Data(), in.Dim(), target_rms, add_log_stddev);
263  CU_SAFE_CALL(cudaGetLastError());
264  CuDevice::Instantiate().AccuProfile(__func__, tim);
265  } else
266 #endif
267  {
268  CuSubMatrix<Real> out_no_log(*out, 0, out->NumRows(), 0, in.NumCols());
269  if (in.Data() != out_no_log.Data())
270  out_no_log.CopyFromMat(in);
271  CuVector<Real> in_norm(in.NumRows());
272  Real d_scaled = in.NumCols() * target_rms * target_rms;
273  in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
274  in_norm.ApplyFloor(kSquaredNormFloor);
275  in_norm.ApplyPow(-0.5);
276  out_no_log.MulRowsVec(in_norm);
277  if (add_log_stddev) {
278  in_norm.ApplyLog();
279  in_norm.Scale(-1.0);
280  in_norm.Add(log(target_rms));
281  out->CopyColFromVec(in_norm, in.NumCols());
282  }
283  }
284 }
template void kaldi::cu::NormalizePerRow ( const CuMatrixBase< float > &  in,
const float  target_rms,
const bool  add_log_stddev,
CuMatrixBase< float > *  out 
)
template void kaldi::cu::NormalizePerRow ( const CuMatrixBase< double > &  in,
const double  target_rms,
const bool  add_log_stddev,
CuMatrixBase< double > *  out 
)
void Randomize ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< Real > *  tgt 
)

Copies a permutation of src into tgt.

The row permutation is specified in copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The dimension of copy_from_idx must equal the number of rows in tgt and src, and all elements in the vector must be in [0, src.NumRows()-1].
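
For illustration, a sketch that shuffles the rows of a matrix; the permutation is built on the host, the sizes are placeholders, and it assumes namespace kaldi and the CuArray constructor from std::vector:

    int32 num_rows = 100, num_cols = 20;
    CuMatrix<BaseFloat> src(num_rows, num_cols), tgt(num_rows, num_cols);
    src.SetRandn();
    std::vector<int32> perm(num_rows);
    for (int32 i = 0; i < num_rows; i++) perm[i] = i;
    std::shuffle(perm.begin(), perm.end(), std::mt19937(1234));  // <algorithm>, <random>
    CuArray<int32> copy_from_idx(perm);
    cu::Randomize(src, copy_from_idx, &tgt);   // tgt.Row(r) == src.Row(perm[r])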

Definition at line 80 of file cu-math.cc.

References CuArray< T >::Data(), CuMatrixBase< Real >::Data(), CuArray< T >::Dim(), CuMatrixBase< Real >::Dim(), rnnlm::i, KALDI_ASSERT, CuMatrixBase< Real >::Mat(), CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), MatrixBase< Real >::Row(), and MatrixDim_::rows.

Referenced by MatrixRandomizer::Randomize(), and kaldi::UnitTestCuMathRandomize().

82  {
83 
84  KALDI_ASSERT(src.NumCols() == tgt->NumCols());
85  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
86  KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());
87 
88  #if HAVE_CUDA == 1
89  if (CuDevice::Instantiate().Enabled()) {
90  CuTimer tim;
91 
92  /*
93  Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
94  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
95  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
96  */
97 
98  /*
99  * Let's use blocksize 4 x 128 (512 threads/block)
100  * and extend the randomizable matrices to: col 4*65535, row 128*65535
101  * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
102  */
103  dim3 dimBlock(4, 128);
104  dim3 dimGrid(n_blocks(tgt->NumCols(), 4), n_blocks(copy_from_idx.Dim(), 128));
105  /*
106  */
107 
108  MatrixDim dimsrc = src.Dim(); dimsrc.rows=copy_from_idx.Dim();
109  MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();
110 
111  cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
112  copy_from_idx.Data(), dimtgt, dimsrc);
113  CU_SAFE_CALL(cudaGetLastError());
114 
115  CuDevice::Instantiate().AccuProfile(__func__, tim);
116  } else
117  #endif
118  {
119  // randomize in CPU
120  const MatrixBase<Real> &srcmat = src.Mat();
121  const int32 *copy_from_idxvec = copy_from_idx.Data();
122  MatrixBase<Real> &tgtmat = tgt->Mat();
123  for(int32 i=0; i<copy_from_idx.Dim(); i++) {
124  tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
125  }
126  }
127 }
template void kaldi::cu::Randomize ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Randomize ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  copy_from_idx,
CuMatrixBase< double > *  tgt 
)
void RegularizeL1 ( CuMatrixBase< Real > *  weight,
CuMatrixBase< Real > *  gradient,
Real  l1_penalty,
Real  learning_rate 
)

RegularizeL1 is a gradient step with l1 regularization added to the gradient.

We don't let the value cross over zero from positive to negative or vice versa, in a single step. If an element tries to cross zero and is stopped, we zero the gradient. (Dan: not sure why).
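
A minimal sketch of a call; the penalty and learning rate are arbitrary illustrative values, and the snippet assumes namespace kaldi:

    CuMatrix<BaseFloat> weight(512, 256), gradient(512, 256);
    weight.SetRandn();
    gradient.SetRandn();
    BaseFloat l1_penalty = 1e-5, learning_rate = 1e-3;
    // Adjusts 'weight' for the l1 term; any element that would cross zero in
    // this step is clamped to zero and its gradient entry is zeroed as well.
    cu::RegularizeL1(&weight, &gradient, l1_penalty, learning_rate);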

Definition at line 37 of file cu-math.cc.

References CU2DBLOCK, CuMatrixBase< Real >::Data(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), MatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), CuMatrixBase< Real >::NumRows(), kaldi::SameDim(), and CuMatrixBase< Real >::Stride().

Referenced by LinearTransform::Update(), and AffineTransform::Update().

37  {
38  KALDI_ASSERT(SameDim(*weight, *grad));
39 #if HAVE_CUDA == 1
40  if (CuDevice::Instantiate().Enabled()) {
41  CuTimer tim;
42 
43  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
44  dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));
45 
46  cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
47  weight->Dim(), grad->Stride());
48  CU_SAFE_CALL(cudaGetLastError());
49 
50  CuDevice::Instantiate().AccuProfile(__func__, tim);
51  } else
52  #endif
53  {
54  MatrixBase<Real> &weight2 = weight->Mat();
55  MatrixBase<Real> &grad2 = grad->Mat();
56  for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
57  for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {
58 
59  if(weight2(r,c)==0.0) continue; // skip L1 if zero weight!
60 
61  Real l1_signed = l1;
62  if (weight2(r, c) < 0.0)
63  l1_signed = -l1;
64 
65  Real before = weight2(r, c);
66  Real after = weight2(r, c) - lr*grad2(r, c) - l1_signed;
67  if ((after > 0.0) ^ (before > 0.0)) {
68  weight2(r, c) = 0.0;
69  grad2(r, c) = 0.0;
70  } else {
71  weight2(r, c) -= l1_signed;
72  }
73  }
74  }
75  }
76 }
template void kaldi::cu::RegularizeL1 ( CuMatrixBase< float > *  weight,
CuMatrixBase< float > *  grad,
float  l1,
float  lr 
)
template void kaldi::cu::RegularizeL1 ( CuMatrixBase< double > *  weight,
CuMatrixBase< double > *  grad,
double  l1,
double  lr 
)
static Real kaldi::cu::ScalarSigmoid ( Real  a)
inline, static

Definition at line 390 of file cu-math.cc.

References kaldi::Exp().

Referenced by CpuBackpropLstmNonlinearity(), and CpuComputeLstmNonlinearity().

390  {
391  if (a > Real(0)) {
392  return Real(1) / (Real(1) + Exp(-a));
393  } else {
394  Real x = Exp(a);
395  return x / (x + Real(1));
396  }
397 }
static Real kaldi::cu::ScalarTanh ( Real  a)
inline, static

Definition at line 400 of file cu-math.cc.

References kaldi::Exp().

Referenced by CpuBackpropLstmNonlinearity(), and CpuComputeLstmNonlinearity().

400  {
401  if (a > Real(0)) {
402  Real inv_expa = Exp(-a);
403  return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa);
404  } else {
405  Real expa = Exp(a);
406  return Real(1) - Real(2) / (Real(1) + expa * expa);
407  }
408 }
void Splice ( const CuMatrixBase< Real > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< Real > *  tgt 
)

Splice concatenates frames of src as specified in frame_offsets into tgt.

The number of rows of tgt must equal the number of rows of src, and it must be that tgt.NumCols() == src.NumCols() * frame_offsets.Dim(). As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the general case where i in [0..src.NumRows()-1], k in [0..frame_offsets.Dim()-1], j in [0..src.NumCols()-1] and n_cols = src.NumCols(). If i + frame_offsets[k] is greater than the last row index of src or less than 0, then the right side of the equation is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid an index out of bounds.
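
A short sketch of the usual left/right-context splicing; the offsets and sizes are placeholders, and it assumes namespace kaldi and the CuArray constructor from std::vector:

    CuMatrix<BaseFloat> src(100, 13);              // e.g. 100 frames of 13-dim features
    src.SetRandn();
    std::vector<int32> offsets = {-1, 0, 1};       // one frame of left and right context
    CuArray<int32> frame_offsets(offsets);
    CuMatrix<BaseFloat> tgt(src.NumRows(),
                            src.NumCols() * static_cast<int32>(offsets.size()));
    cu::Splice(src, frame_offsets, &tgt);
    // tgt(i, k*13 + j) == src(i + offsets[k], j), with the row index clamped
    // to [0, src.NumRows()-1] at the edges.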

Definition at line 132 of file cu-math.cc.

References CU2DBLOCK, CuArray< T >::Data(), CuMatrixBase< Real >::Data(), CuArray< T >::Dim(), CuMatrixBase< Real >::Dim(), KALDI_ASSERT, CuMatrixBase< Real >::Mat(), MatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), CuMatrixBase< Real >::NumRows(), and MatrixBase< Real >::RowData().

Referenced by Component::NewComponentOfType(), Splice::PropagateFnc(), and kaldi::UnitTestCuMathSplice().

133  {
134 
135  KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
136  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
137 
138  #if HAVE_CUDA == 1
139  if (CuDevice::Instantiate().Enabled()) {
140  CuTimer tim;
141 
142  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
143  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
144 
145  cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
146  frame_offsets.Data(), tgt->Dim(), src.Dim());
147  CU_SAFE_CALL(cudaGetLastError());
148 
149  CuDevice::Instantiate().AccuProfile(__func__, tim);
150  } else
151  #endif
152  {
153  // expand in CPU
154  const MatrixBase<Real> &srcmat = src.Mat();
155  const int32 *frame_offsetvec = frame_offsets.Data();
156  int32 dim = frame_offsets.Dim();
157  MatrixBase<Real> &tgtmat = tgt->Mat();
158  //
159  for(int32 r=0; r < tgtmat.NumRows(); r++) {
160  for(int32 off=0; off < dim; off++) {
161  int32 r_off = r + frame_offsetvec[off];
162  if(r_off < 0) r_off = 0;
163  if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;
164  memcpy(tgtmat.RowData(r)+off*srcmat.NumCols(),srcmat.RowData(r_off),sizeof(Real)*srcmat.NumCols());
165  }
166  }
167  }
168 }
template void kaldi::cu::Splice ( const CuMatrixBase< float > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< float > *  tgt 
)
template void kaldi::cu::Splice ( const CuMatrixBase< double > &  src,
const CuArray< int32 > &  frame_offsets,
CuMatrixBase< double > *  tgt 
)