36 template<
typename Real>
    40   if (CuDevice::Instantiate().Enabled()) {
    46     cuda_regularize_l1(dimGrid, dimBlock, weight->
Data(), grad->
Data(), l1, lr,
    48     CU_SAFE_CALL(cudaGetLastError());
    50     CuDevice::Instantiate().AccuProfile(__func__, tim);
    59         if(weight2(r,c)==0.0) 
continue; 
    62         if (weight2(r, c) < 0.0)
    65         Real before = weight2(r, c);
    66         Real after = weight2(r, c) - lr*grad2(r, c) - l1_signed;
    67         if ((after > 0.0) ^ (before > 0.0)) {
    71           weight2(r, c) -= l1_signed;
    79 template<
typename Real>
    89   if (CuDevice::Instantiate().Enabled()) {
   103     dim3 dimBlock(4, 128);
   104     dim3 dimGrid(n_blocks(tgt->
NumCols(), 4), n_blocks(copy_from_idx.
Dim(), 128));
   111     cuda_randomize(dimGrid, dimBlock, tgt->
Data(), src.
Data(),
   112                    copy_from_idx.
Data(), dimtgt, dimsrc);
   113     CU_SAFE_CALL(cudaGetLastError());
   115     CuDevice::Instantiate().AccuProfile(__func__, tim);
   121     const int32 *copy_from_idxvec = copy_from_idx.
Data();
   124       tgtmat.
Row(
i).CopyFromVec(srcmat.
Row(copy_from_idxvec[
i]));
   131 template<
typename Real>
   139   if (CuDevice::Instantiate().Enabled()) {
   145     cuda_splice(dimGrid, dimBlock, tgt->
Data(), src.
Data(),
   147     CU_SAFE_CALL(cudaGetLastError());
   149     CuDevice::Instantiate().AccuProfile(__func__, tim);
   155     const int32 *frame_offsetvec = frame_offsets.
Data();
   160       for(
int32 off=0; off < dim; off++) {
   161         int32 r_off = r + frame_offsetvec[off];
   162         if(r_off < 0) r_off = 0;
   172 template<
typename Real>
   180   if (CuDevice::Instantiate().Enabled()) {
   186     cuda_copy(dimGrid, dimBlock, tgt->
Data(), src.
Data(),
   187               copy_from_indices.
Data(), tgt->
Dim(), src.
Dim());
   188     CU_SAFE_CALL(cudaGetLastError());
   190     CuDevice::Instantiate().AccuProfile(__func__, tim);
   196     const int32 *copy_from_indicesvec = copy_from_indices.
Data();
   197     int32 dim = copy_from_indices.
Dim();
   201       for(
int32 c = 0; c < dim; c++) {
   202         tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
   208 template <
typename Real>
   214   if (CuDevice::Instantiate().Enabled()) {
   216     dim3 dimGrid, dimBlock;
   217     GetBlockSizesForSimpleMatrixOperation(src.
NumRows(), src.
NumCols(),
   218                                           &dimGrid, &dimBlock);
   219     cuda_ensure_nonzero(dimGrid, dimBlock, src.
Data(), src.
Dim(),
   221     CU_SAFE_CALL(cudaGetLastError());
   222     CuDevice::Instantiate().AccuProfile(__func__, tim);
   227     for (
int32 r = 0; r < num_rows; r++) {
   228       const Real *src_data = src.
RowData(r);
   229       Real *dest_data = dest->
RowData(r);
   230       for (
int32 c = 0; c < num_cols; c++) {
   231         Real x = src_data[c], y;
   232         if (x <= -epsilon || x >= epsilon) y = x;
   233         else if (x >= 0.0) y = epsilon;
   279 template<
typename Real>
   282   const Real kSquaredNormFloor = 1.3552527156068805425e-20; 
   283   if (add_log_stddev) {
   291   if (CuDevice::Instantiate().Enabled()) {
   294     size_t dimGrid = out->
NumRows();
   295     cuda_normalize_per_row(dimGrid, dimBlock, out->
Data(), out->
Stride(),
   296                            in.
Data(), in.
Dim(), target_rms, add_log_stddev);
   297     CU_SAFE_CALL(cudaGetLastError());
   298     CuDevice::Instantiate().AccuProfile(__func__, tim);
   303     if (in.
Data() != out_no_log.Data())
   306     Real d_scaled = in.
NumCols() * target_rms * target_rms;
   307     in_norm.AddDiagMat2(1.0 / d_scaled, in, 
kNoTrans, 0.0);
   308     in_norm.ApplyFloor(kSquaredNormFloor);
   309     in_norm.ApplyPow(-0.5);
   310     out_no_log.MulRowsVec(in_norm);
   311     if (add_log_stddev) {
   314       in_norm.Add(log(target_rms));
   348 template<
typename Real>
   351                          const Real target_rms, 
const bool add_log_stddev,
   353   const Real kSquaredNormFloor = 1.3552527156068805425e-20; 
   355   if (CuDevice::Instantiate().Enabled()) {
   358     size_t dimGrid = in_deriv->
NumRows();
   359     cuda_diff_normalize_per_row(dimGrid, dimBlock, in_deriv->
Data(),
   361                                 in_value.
Dim(), out_deriv.
Data(),
   362                                 out_deriv.
Stride(), target_rms, add_log_stddev);
   363     CU_SAFE_CALL(cudaGetLastError());
   364     CuDevice::Instantiate().AccuProfile(__func__, tim);
   374     Real d_scaled = (in_value.
NumCols() * target_rms * target_rms);
   377     if (add_log_stddev) {
   386       log_stddev_deriv.ApplyPow(-1.0);
   387       out_deriv_for_stddev.CopyColFromMat(out_deriv, (out_deriv.
NumCols() - 1));
   388       log_stddev_deriv.MulElements(out_deriv_for_stddev);
   392     in_norm.Scale(1.0 / d_scaled);
   393     in_norm.ApplyFloor(kSquaredNormFloor);
   394     in_norm.ApplyPow(-0.5);
   396       if (in_deriv->
Data() != out_deriv_no_log.Data())
   400       in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
   401       in_norm.ApplyPow(3.0);
   402       dot_products.MulElements(in_norm);
   413                          const float target_rms, 
const bool add_log_stddev,
   418                          const double target_rms, 
const bool add_log_stddev,
   423 template<
typename Real>
   426     return Real(1) / (Real(1) + 
Exp(-a));
   429     return x / (x + Real(1));
   433 template<
typename Real>
   436     Real inv_expa = 
Exp(-a);
   437     return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa);
   440     return Real(1) - Real(2) / (Real(1) + expa * expa);
   444 template<
typename Real>
   449       input_cols = input_mat.
NumCols(),
   450         cell_dim = input_cols / 5;
   451   KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
   458   const Real *params_data = params_mat.
Data();
   460   for (
int32 r = 0; r < num_rows; r++) {
   461     const Real *input_row = input_mat.
RowData(r);
   463     Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]),
   464          f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]),
   465          o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]);
   467     Real *output_row = output_mat.RowData(r);
   468     for (
int32 c = 0; c < cell_dim; c++) {
   469       Real i_part = input_row[c];
   470       Real f_part = input_row[c + cell_dim];
   471       Real c_part = input_row[c + 2 * cell_dim];
   472       Real o_part = input_row[c + 3 * cell_dim];
   473       Real c_prev = input_row[c + 4 * cell_dim];
   474       Real w_ic = params_data[c];
   475       Real w_fc = params_data[c + params_stride];
   476       Real w_oc = params_data[c + params_stride * 2];
   479       Real c_t = f_t * f_scale * c_prev + i_t * i_scale * 
ScalarTanh(c_part);
   483       output_row[c + cell_dim] = m_t;
   488 template<
typename Real>
   494         cell_dim = input_cols / 5;
   495   KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
   502   if (CuDevice::Instantiate().Enabled()) {
   505     int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
   510     dim3 dimGrid(num_rows);
   512     cuda_lstm_nonlinearity(dimGrid, dimBlock, input.
Data(), input.
Stride(),
   514                            cell_dim, have_dropout_mask, num_rows, output->
Data());
   515     CU_SAFE_CALL(cudaGetLastError());
   517     CuDevice::Instantiate().AccuProfile(__func__, tim);
   542 template<
typename Real>
   557         cell_dim = input.
NumCols() / 5;
   559   KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
   567   if (input_deriv != NULL) {
   570   if (params_deriv == NULL) {
   592       input_deriv == NULL ? NULL : input_deriv);
   597   if (params_deriv != NULL) {
   598     params_deriv_mat = params_deriv;
   599     value_sum_out_mat = value_sum_out;
   600     deriv_sum_out_mat = deriv_sum_out;
   601     self_repair_sum_out_mat = self_repair_sum_out;
   606   Real 
count = 1.0 + count_in;
   607   for (
int32 c = 0; c < cell_dim; c++) {
   609     Real w_ic = params_mat(0, c);
   610     Real w_fc = params_mat(1, c);
   611     Real w_oc = params_mat(2, c);
   613     Real w_ic_deriv_sum = 0.0;
   614     Real w_fc_deriv_sum = 0.0;
   615     Real w_oc_deriv_sum = 0.0;
   621     Real i_t_self_repair = (
   622         deriv_sum_in_mat(0, c) / count < sr_config(0) ? sr_config(5) : 0.0);
   623     Real f_t_self_repair = (
   624         deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0);
   625     Real c_part_self_repair = (
   626         deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0);
   627     Real o_t_self_repair = (
   628         deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0);
   629     Real c_t_self_repair = (
   630         deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0);
   640     Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0;
   641     Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0;
   642     Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0;
   643     Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0;
   644     Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0;
   647     for (
int32 r = 0; r < num_rows; r++) {
   648       Real i_part = input_mat(r, c),
   649           f_part = input_mat(r, c + cell_dim),
   650           c_part = input_mat(r, c + 2 * cell_dim),
   651           o_part = input_mat(r, c + 3 * cell_dim),
   652           c_prev = input_mat(r, c + 4 * cell_dim);
   654       Real i_scale = (input_cols == cell_dim * 5 ? 1.0 :
   655                       input_mat(r, cell_dim * 5)),
   656            f_scale = (input_cols == cell_dim * 5 ? 1.0 :
   657                       input_mat(r, cell_dim * 5 + 1)),
   658            o_scale = (input_cols == cell_dim * 5 ? 1.0 :
   659                       input_mat(r, cell_dim * 5 + 2));
   663       Real i_t_input = i_part + w_ic * c_prev,
   665           f_t_input = f_part + w_fc * c_prev,
   668           c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part,
   669           o_t_input = o_part + w_oc * c_t,
   680       i_t_value_sum += i_t;
   681       i_t_deriv_sum += i_t * (1.0F - i_t);
   682       f_t_value_sum += f_t;
   683       f_t_deriv_sum += f_t * (1.0F - f_t);
   684       c_part_value_sum += tanh_c_part;
   685       c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part;
   686       o_t_value_sum += o_t;
   687       o_t_deriv_sum += o_t * (1.0F - o_t);
   688       c_t_value_sum += tanh_c_t;
   689       c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t;
   698       Real dc_t_out = output_deriv_mat(r, c);
   699       Real dm_t = output_deriv_mat(r, c + cell_dim);
   700       Real dtanh_c_t = o_t * o_scale * dm_t;
   701       Real do_t = o_scale * tanh_c_t * dm_t;
   702       Real do_t_input = (o_t * (1.0F - o_t) * do_t
   703           - (2.0F * o_t - 1.0F) * o_t_self_repair);
   704       Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out
   705           + do_t_input * w_oc) - tanh_c_t * c_t_self_repair;
   706       Real dtanh_c_part = i_t * i_scale * dc_t;
   707       Real df_t = dc_t * f_scale * c_prev;
   708       Real df_t_input = ((df_t * f_t * (1.0F - f_t)
   709                           - (2.0F * f_t - 1.0F) * f_t_self_repair));
   710       Real di_t = dc_t * i_scale * tanh_c_part;
   711       Real di_t_input = ((di_t * i_t * (1.0F - i_t)
   712                           - (2.0F * i_t - 1.0F) * i_t_self_repair));
   714       w_ic_deriv_sum += c_prev * di_t_input;
   715       w_fc_deriv_sum += c_prev * df_t_input;
   716       w_oc_deriv_sum += c_t * do_t_input;
   718       Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t;
   719       Real do_part = do_t_input;
   720       Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part
   721           - tanh_c_part * c_part_self_repair);
   722       Real df_part = df_t_input;
   723       Real di_part = di_t_input;
   725       if (input_deriv_mat != NULL) {
   726         (*input_deriv_mat)(r, c) = di_part;
   727         (*input_deriv_mat)(r, c + cell_dim) = df_part;
   728         (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part;
   729         (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part;
   730         (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev;
   734     if (params_deriv != NULL) {
   740       (*params_deriv_mat)(0, c) = w_ic_deriv_sum;
   741       (*params_deriv_mat)(1, c) = w_fc_deriv_sum;
   742       (*params_deriv_mat)(2, c) = w_oc_deriv_sum;
   744       (*value_sum_out_mat)(0, c) += i_t_value_sum;
   745       (*value_sum_out_mat)(1, c) += f_t_value_sum;
   746       (*value_sum_out_mat)(2, c) += c_part_value_sum;
   747       (*value_sum_out_mat)(3, c) += o_t_value_sum;
   748       (*value_sum_out_mat)(4, c) += c_t_value_sum;
   753         (*self_repair_sum_out_mat)(
i, c) =
   754             (deriv_sum_in_mat(
i, c) / count < sr_config(
i) ? num_rows : 0);
   756       (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum;
   757       (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum;
   758       (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum;
   759       (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum;
   760       (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum;
   767 template<
typename Real>
   780         cell_dim = input.
NumCols() / 5,
   783   KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3);
   791   if (input_deriv != NULL) {
   794   if (params_deriv == NULL) {
   812   if (CuDevice::Instantiate().Enabled()) {
   817     int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
   821     const int kWarpSize = 32;
   822     dim3 dimBlock(kWarpSize, 
CU1DBLOCK / kWarpSize);
   828     dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
   829     if (input_deriv == NULL) {
   830       if (params_deriv == NULL) {
   831         cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
   832                                     have_dropout_mask, num_rows,
   837                                     self_repair_config.
Data(), count_in + 1,
   850         cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
   851                                     have_dropout_mask, num_rows,
   856                                     self_repair_config.
Data(), count_in + 1,
   858                                     0, params_deriv->
Data(),
   860                                     value_sum_out->
Data(),
   862                                     deriv_sum_out->
Data(),
   864                                     self_repair_sum_out->
Data(),
   865                                     self_repair_sum_out->
Stride());
   868       if (params_deriv == NULL) {
   869         cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
   870                                     have_dropout_mask, num_rows,
   875                                     self_repair_config.
Data(), count_in + 1,
   878                                     0, NULL, 0, NULL, 0, NULL, 0);
   880         cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
   881                                     have_dropout_mask, num_rows,
   886                                     self_repair_config.
Data(), count_in + 1,
   888                                     params_deriv->
Data(),
   890                                     value_sum_out->
Data(),
   892                                     deriv_sum_out->
Data(),
   894                                     self_repair_sum_out->
Data(),
   895                                     self_repair_sum_out->
Stride());
   899     CU_SAFE_CALL(cudaGetLastError());
   901     CuDevice::Instantiate().AccuProfile(__func__, tim);
   906                                 deriv_sum_in.
Mat(), self_repair_config.
Vec(),
   907                                 count_in, &(input_deriv->
Mat()),
   908                                 &(params_deriv->
Mat()), &(value_sum_out->
Mat()),
   909                                 &(deriv_sum_out->
Mat()),
   910                                 &(self_repair_sum_out->
Mat()));
   914 template <
typename Real>
   922       dest_mat(dest->
Data(), 1, dim, dim);
 const MatrixBase< Real > & Mat() const
 
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
 
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
 
MatrixIndexT Stride() const
 
void CpuComputeLstmNonlinearity(const MatrixBase< Real > &input_mat, const MatrixBase< Real > &params_mat, MatrixBase< Real > *output)
 
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix). 
 
Base class which provides matrix operations not involving resizing or allocation. ...
 
const Real * Data() const
Gives pointer to raw data (const). 
 
Structure containing size of the matrix plus stride. 
 
void Randomize(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< Real > *tgt)
Copies a permutation of src into tgt. 
 
Real * RowData(MatrixIndexT i)
Returns pointer to data for one row (non-const) 
 
void AddDiagMat2(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType trans, Real beta)
Add the diagonal of a matrix times itself: *this = diag(M M^T) + beta * *this (if trans == kNoTrans)...
 
const T * Data() const
Get raw pointer. 
 
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=NULL)
 
void CopyColFromVec(const CuVectorBase< Real > &v, const MatrixIndexT col)
Copy vector into specific column of matrix. 
 
void BackpropLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, const CuMatrixBase< Real > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< Real > &self_repair_config, double count_in, CuMatrixBase< Real > *input_deriv, CuMatrixBase< Real > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< Real > *self_repair_sum_out)
This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity. 
 
void AddDiagMatMat(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const CuMatrixBase< Real > &N, MatrixTransposeType transN, Real beta=1.0)
Add the diagonal of a matrix product: *this = diag(M N), assuming the "trans" arguments are both kNoT...
 
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
 
MatrixIndexT Stride() const
Stride (distance in memory between each row). Will be >= NumCols. 
 
void DiffNormalizePerRow(const CuMatrixBase< Real > &in_value, const CuMatrixBase< Real > &out_deriv, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *in_deriv)
 
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const]. 
 
This class is used for a piece of a CuMatrix. 
 
MatrixIndexT Dim() const
Returns the dimension of the vector. 
 
const Real * Data() const
Return data pointer (const). 
 
static Real ScalarTanh(Real a)
 
void EnsureNonzero(const CuMatrixBase< Real > &src, Real epsilon, CuMatrixBase< Real > *dest)
This function requires that src and dest have the same dimension and epsilon > 0. ...
 
void Splice(const CuMatrixBase< Real > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< Real > *tgt)
Splice concatenates frames of src as specified in frame_offsets into tgt. 
 
Matrix for CUDA computing. 
 
MatrixIndexT NumCols() const
 
const VectorBase< Real > & Vec() const
 
#define KALDI_ASSERT(cond)
 
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix). 
 
Real * Data()
Returns a pointer to the start of the vector's data. 
 
MatrixIndexT NumRows() const
Dimensions. 
 
Provides a vector abstraction class. 
 
MatrixIndexT Dim() const
Return the vector dimension. 
 
void NormalizePerRow(const CuMatrixBase< Real > &in, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *out)
Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square ...
 
static Real ScalarSigmoid(Real a)
 
void MulRowsVec(const CuVectorBase< Real > &scale)
scale i'th row by scale[i] 
 
void RegularizeL1(CuMatrixBase< Real > *weight, CuMatrixBase< Real > *grad, Real l1, Real lr)
RegularizeL1 is a gradient step with l1 regularization added to the gradient. 
 
MatrixIndexT Dim() const
Dimensions. 
 
Vector for CUDA computing. 
 
void AddDiagVecMat(const Real alpha, const CuVectorBase< Real > &v, const CuMatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T]. 
 
const Real * RowData(MatrixIndexT r) const
Get raw row pointer (const). 
 
void Copy(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
Copies elements from src into tgt as given by copy_from_indices. 
 
void CpuBackpropLstmNonlinearity(const MatrixBase< Real > &input, const MatrixBase< Real > &params, const MatrixBase< Real > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< Real > &self_repair_config, double count_in, MatrixBase< Real > *input_deriv, MatrixBase< Real > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< Real > *self_repair_sum_out)
 
void ComputeLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, CuMatrixBase< Real > *output)
this is a special-purpose function used by class LstmNonlinearityComponent, to do its forward propaga...