36 template<
typename Real>
40 if (CuDevice::Instantiate().Enabled()) {
46 cuda_regularize_l1(dimGrid, dimBlock, weight->
Data(), grad->
Data(), l1, lr,
48 CU_SAFE_CALL(cudaGetLastError());
50 CuDevice::Instantiate().AccuProfile(__func__, tim);
59 if(weight2(r,c)==0.0)
continue;
62 if (weight2(r, c) < 0.0)
65 Real before = weight2(r, c);
66 Real after = weight2(r, c) - lr*grad2(r, c) - l1_signed;
67 if ((after > 0.0) ^ (before > 0.0)) {
71 weight2(r, c) -= l1_signed;
79 template<
typename Real>
89 if (CuDevice::Instantiate().Enabled()) {
103 dim3 dimBlock(4, 128);
104 dim3 dimGrid(n_blocks(tgt->
NumCols(), 4), n_blocks(copy_from_idx.
Dim(), 128));
111 cuda_randomize(dimGrid, dimBlock, tgt->
Data(), src.
Data(),
112 copy_from_idx.
Data(), dimtgt, dimsrc);
113 CU_SAFE_CALL(cudaGetLastError());
115 CuDevice::Instantiate().AccuProfile(__func__, tim);
121 const int32 *copy_from_idxvec = copy_from_idx.
Data();
124 tgtmat.
Row(
i).CopyFromVec(srcmat.
Row(copy_from_idxvec[
i]));
131 template<
typename Real>
139 if (CuDevice::Instantiate().Enabled()) {
145 cuda_splice(dimGrid, dimBlock, tgt->
Data(), src.
Data(),
147 CU_SAFE_CALL(cudaGetLastError());
149 CuDevice::Instantiate().AccuProfile(__func__, tim);
155 const int32 *frame_offsetvec = frame_offsets.
Data();
160 for(
int32 off=0; off < dim; off++) {
161 int32 r_off = r + frame_offsetvec[off];
162 if(r_off < 0) r_off = 0;
172 template<
typename Real>
180 if (CuDevice::Instantiate().Enabled()) {
186 cuda_copy(dimGrid, dimBlock, tgt->
Data(), src.
Data(),
187 copy_from_indices.
Data(), tgt->
Dim(), src.
Dim());
188 CU_SAFE_CALL(cudaGetLastError());
190 CuDevice::Instantiate().AccuProfile(__func__, tim);
196 const int32 *copy_from_indicesvec = copy_from_indices.
Data();
197 int32 dim = copy_from_indices.
Dim();
201 for(
int32 c = 0; c < dim; c++) {
202 tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
208 template <
typename Real>
214 if (CuDevice::Instantiate().Enabled()) {
216 dim3 dimGrid, dimBlock;
217 GetBlockSizesForSimpleMatrixOperation(src.
NumRows(), src.
NumCols(),
218 &dimGrid, &dimBlock);
219 cuda_ensure_nonzero(dimGrid, dimBlock, src.
Data(), src.
Dim(),
221 CU_SAFE_CALL(cudaGetLastError());
222 CuDevice::Instantiate().AccuProfile(__func__, tim);
227 for (
int32 r = 0; r < num_rows; r++) {
228 const Real *src_data = src.
RowData(r);
229 Real *dest_data = dest->
RowData(r);
230 for (
int32 c = 0; c < num_cols; c++) {
231 Real x = src_data[c], y;
232 if (x <= -epsilon || x >= epsilon) y = x;
233 else if (x >= 0.0) y = epsilon;
279 template<
typename Real>
282 const Real kSquaredNormFloor = 1.3552527156068805425e-20;
283 if (add_log_stddev) {
291 if (CuDevice::Instantiate().Enabled()) {
294 size_t dimGrid = out->
NumRows();
295 cuda_normalize_per_row(dimGrid, dimBlock, out->
Data(), out->
Stride(),
296 in.
Data(), in.
Dim(), target_rms, add_log_stddev);
297 CU_SAFE_CALL(cudaGetLastError());
298 CuDevice::Instantiate().AccuProfile(__func__, tim);
303 if (in.
Data() != out_no_log.Data())
306 Real d_scaled = in.
NumCols() * target_rms * target_rms;
307 in_norm.AddDiagMat2(1.0 / d_scaled, in,
kNoTrans, 0.0);
308 in_norm.ApplyFloor(kSquaredNormFloor);
309 in_norm.ApplyPow(-0.5);
310 out_no_log.MulRowsVec(in_norm);
311 if (add_log_stddev) {
314 in_norm.Add(log(target_rms));
348 template<
typename Real>
351 const Real target_rms,
const bool add_log_stddev,
353 const Real kSquaredNormFloor = 1.3552527156068805425e-20;
355 if (CuDevice::Instantiate().Enabled()) {
358 size_t dimGrid = in_deriv->
NumRows();
359 cuda_diff_normalize_per_row(dimGrid, dimBlock, in_deriv->
Data(),
361 in_value.
Dim(), out_deriv.
Data(),
362 out_deriv.
Stride(), target_rms, add_log_stddev);
363 CU_SAFE_CALL(cudaGetLastError());
364 CuDevice::Instantiate().AccuProfile(__func__, tim);
374 Real d_scaled = (in_value.
NumCols() * target_rms * target_rms);
377 if (add_log_stddev) {
386 log_stddev_deriv.ApplyPow(-1.0);
387 out_deriv_for_stddev.CopyColFromMat(out_deriv, (out_deriv.
NumCols() - 1));
388 log_stddev_deriv.MulElements(out_deriv_for_stddev);
392 in_norm.Scale(1.0 / d_scaled);
393 in_norm.ApplyFloor(kSquaredNormFloor);
394 in_norm.ApplyPow(-0.5);
396 if (in_deriv->
Data() != out_deriv_no_log.Data())
400 in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
401 in_norm.ApplyPow(3.0);
402 dot_products.MulElements(in_norm);
413 const float target_rms,
const bool add_log_stddev,
418 const double target_rms,
const bool add_log_stddev,
423 template<
typename Real>
426 return Real(1) / (Real(1) +
Exp(-a));
429 return x / (x + Real(1));
433 template<
typename Real>
436 Real inv_expa =
Exp(-a);
437 return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa);
440 return Real(1) - Real(2) / (Real(1) + expa * expa);
444 template<
typename Real>
449 input_cols = input_mat.
NumCols(),
450 cell_dim = input_cols / 5;
451 KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
458 const Real *params_data = params_mat.
Data();
460 for (
int32 r = 0; r < num_rows; r++) {
461 const Real *input_row = input_mat.
RowData(r);
463 Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]),
464 f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]),
465 o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]);
467 Real *output_row = output_mat.RowData(r);
468 for (
int32 c = 0; c < cell_dim; c++) {
469 Real i_part = input_row[c];
470 Real f_part = input_row[c + cell_dim];
471 Real c_part = input_row[c + 2 * cell_dim];
472 Real o_part = input_row[c + 3 * cell_dim];
473 Real c_prev = input_row[c + 4 * cell_dim];
474 Real w_ic = params_data[c];
475 Real w_fc = params_data[c + params_stride];
476 Real w_oc = params_data[c + params_stride * 2];
479 Real c_t = f_t * f_scale * c_prev + i_t * i_scale *
ScalarTanh(c_part);
483 output_row[c + cell_dim] = m_t;
488 template<
typename Real>
494 cell_dim = input_cols / 5;
495 KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
502 if (CuDevice::Instantiate().Enabled()) {
505 int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
510 dim3 dimGrid(num_rows);
512 cuda_lstm_nonlinearity(dimGrid, dimBlock, input.
Data(), input.
Stride(),
514 cell_dim, have_dropout_mask, num_rows, output->
Data());
515 CU_SAFE_CALL(cudaGetLastError());
517 CuDevice::Instantiate().AccuProfile(__func__, tim);
542 template<
typename Real>
557 cell_dim = input.
NumCols() / 5;
559 KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
567 if (input_deriv != NULL) {
570 if (params_deriv == NULL) {
592 input_deriv == NULL ? NULL : input_deriv);
597 if (params_deriv != NULL) {
598 params_deriv_mat = params_deriv;
599 value_sum_out_mat = value_sum_out;
600 deriv_sum_out_mat = deriv_sum_out;
601 self_repair_sum_out_mat = self_repair_sum_out;
606 Real
count = 1.0 + count_in;
607 for (
int32 c = 0; c < cell_dim; c++) {
609 Real w_ic = params_mat(0, c);
610 Real w_fc = params_mat(1, c);
611 Real w_oc = params_mat(2, c);
613 Real w_ic_deriv_sum = 0.0;
614 Real w_fc_deriv_sum = 0.0;
615 Real w_oc_deriv_sum = 0.0;
621 Real i_t_self_repair = (
622 deriv_sum_in_mat(0, c) / count < sr_config(0) ? sr_config(5) : 0.0);
623 Real f_t_self_repair = (
624 deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0);
625 Real c_part_self_repair = (
626 deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0);
627 Real o_t_self_repair = (
628 deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0);
629 Real c_t_self_repair = (
630 deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0);
640 Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0;
641 Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0;
642 Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0;
643 Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0;
644 Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0;
647 for (
int32 r = 0; r < num_rows; r++) {
648 Real i_part = input_mat(r, c),
649 f_part = input_mat(r, c + cell_dim),
650 c_part = input_mat(r, c + 2 * cell_dim),
651 o_part = input_mat(r, c + 3 * cell_dim),
652 c_prev = input_mat(r, c + 4 * cell_dim);
654 Real i_scale = (input_cols == cell_dim * 5 ? 1.0 :
655 input_mat(r, cell_dim * 5)),
656 f_scale = (input_cols == cell_dim * 5 ? 1.0 :
657 input_mat(r, cell_dim * 5 + 1)),
658 o_scale = (input_cols == cell_dim * 5 ? 1.0 :
659 input_mat(r, cell_dim * 5 + 2));
663 Real i_t_input = i_part + w_ic * c_prev,
665 f_t_input = f_part + w_fc * c_prev,
668 c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part,
669 o_t_input = o_part + w_oc * c_t,
680 i_t_value_sum += i_t;
681 i_t_deriv_sum += i_t * (1.0F - i_t);
682 f_t_value_sum += f_t;
683 f_t_deriv_sum += f_t * (1.0F - f_t);
684 c_part_value_sum += tanh_c_part;
685 c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part;
686 o_t_value_sum += o_t;
687 o_t_deriv_sum += o_t * (1.0F - o_t);
688 c_t_value_sum += tanh_c_t;
689 c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t;
698 Real dc_t_out = output_deriv_mat(r, c);
699 Real dm_t = output_deriv_mat(r, c + cell_dim);
700 Real dtanh_c_t = o_t * o_scale * dm_t;
701 Real do_t = o_scale * tanh_c_t * dm_t;
702 Real do_t_input = (o_t * (1.0F - o_t) * do_t
703 - (2.0F * o_t - 1.0F) * o_t_self_repair);
704 Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out
705 + do_t_input * w_oc) - tanh_c_t * c_t_self_repair;
706 Real dtanh_c_part = i_t * i_scale * dc_t;
707 Real df_t = dc_t * f_scale * c_prev;
708 Real df_t_input = ((df_t * f_t * (1.0F - f_t)
709 - (2.0F * f_t - 1.0F) * f_t_self_repair));
710 Real di_t = dc_t * i_scale * tanh_c_part;
711 Real di_t_input = ((di_t * i_t * (1.0F - i_t)
712 - (2.0F * i_t - 1.0F) * i_t_self_repair));
714 w_ic_deriv_sum += c_prev * di_t_input;
715 w_fc_deriv_sum += c_prev * df_t_input;
716 w_oc_deriv_sum += c_t * do_t_input;
718 Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t;
719 Real do_part = do_t_input;
720 Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part
721 - tanh_c_part * c_part_self_repair);
722 Real df_part = df_t_input;
723 Real di_part = di_t_input;
725 if (input_deriv_mat != NULL) {
726 (*input_deriv_mat)(r, c) = di_part;
727 (*input_deriv_mat)(r, c + cell_dim) = df_part;
728 (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part;
729 (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part;
730 (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev;
734 if (params_deriv != NULL) {
740 (*params_deriv_mat)(0, c) = w_ic_deriv_sum;
741 (*params_deriv_mat)(1, c) = w_fc_deriv_sum;
742 (*params_deriv_mat)(2, c) = w_oc_deriv_sum;
744 (*value_sum_out_mat)(0, c) += i_t_value_sum;
745 (*value_sum_out_mat)(1, c) += f_t_value_sum;
746 (*value_sum_out_mat)(2, c) += c_part_value_sum;
747 (*value_sum_out_mat)(3, c) += o_t_value_sum;
748 (*value_sum_out_mat)(4, c) += c_t_value_sum;
753 (*self_repair_sum_out_mat)(
i, c) =
754 (deriv_sum_in_mat(
i, c) / count < sr_config(
i) ? num_rows : 0);
756 (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum;
757 (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum;
758 (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum;
759 (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum;
760 (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum;
767 template<
typename Real>
780 cell_dim = input.
NumCols() / 5,
783 KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3);
791 if (input_deriv != NULL) {
794 if (params_deriv == NULL) {
812 if (CuDevice::Instantiate().Enabled()) {
817 int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
821 const int kWarpSize = 32;
822 dim3 dimBlock(kWarpSize,
CU1DBLOCK / kWarpSize);
828 dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
829 if (input_deriv == NULL) {
830 if (params_deriv == NULL) {
831 cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
832 have_dropout_mask, num_rows,
837 self_repair_config.
Data(), count_in + 1,
850 cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
851 have_dropout_mask, num_rows,
856 self_repair_config.
Data(), count_in + 1,
858 0, params_deriv->
Data(),
860 value_sum_out->
Data(),
862 deriv_sum_out->
Data(),
864 self_repair_sum_out->
Data(),
865 self_repair_sum_out->
Stride());
868 if (params_deriv == NULL) {
869 cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
870 have_dropout_mask, num_rows,
875 self_repair_config.
Data(), count_in + 1,
878 0, NULL, 0, NULL, 0, NULL, 0);
880 cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
881 have_dropout_mask, num_rows,
886 self_repair_config.
Data(), count_in + 1,
888 params_deriv->
Data(),
890 value_sum_out->
Data(),
892 deriv_sum_out->
Data(),
894 self_repair_sum_out->
Data(),
895 self_repair_sum_out->
Stride());
899 CU_SAFE_CALL(cudaGetLastError());
901 CuDevice::Instantiate().AccuProfile(__func__, tim);
906 deriv_sum_in.
Mat(), self_repair_config.
Vec(),
907 count_in, &(input_deriv->
Mat()),
908 &(params_deriv->
Mat()), &(value_sum_out->
Mat()),
909 &(deriv_sum_out->
Mat()),
910 &(self_repair_sum_out->
Mat()));
914 template <
typename Real>
922 dest_mat(dest->
Data(), 1, dim, dim);
const MatrixBase< Real > & Mat() const
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
MatrixIndexT Stride() const
void CpuComputeLstmNonlinearity(const MatrixBase< Real > &input_mat, const MatrixBase< Real > &params_mat, MatrixBase< Real > *output)
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Base class which provides matrix operations not involving resizing or allocation. ...
const Real * Data() const
Gives pointer to raw data (const).
Structure containing size of the matrix plus stride.
void Randomize(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< Real > *tgt)
Copies a permutation of src into tgt.
Real * RowData(MatrixIndexT i)
Returns pointer to data for one row (non-const)
void AddDiagMat2(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType trans, Real beta)
Add the diagonal of a matrix times itself: *this = diag(M M^T) + beta * *this (if trans == kNoTrans)...
const T * Data() const
Get raw pointer.
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=NULL)
void CopyColFromVec(const CuVectorBase< Real > &v, const MatrixIndexT col)
Copy vector into specific column of matrix.
void BackpropLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, const CuMatrixBase< Real > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< Real > &self_repair_config, double count_in, CuMatrixBase< Real > *input_deriv, CuMatrixBase< Real > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< Real > *self_repair_sum_out)
This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity.
void AddDiagMatMat(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const CuMatrixBase< Real > &N, MatrixTransposeType transN, Real beta=1.0)
Add the diagonal of a matrix product: *this = diag(M N), assuming the "trans" arguments are both kNoT...
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
MatrixIndexT Stride() const
Stride (distance in memory between each row). Will be >= NumCols.
void DiffNormalizePerRow(const CuMatrixBase< Real > &in_value, const CuMatrixBase< Real > &out_deriv, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *in_deriv)
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
This class is used for a piece of a CuMatrix.
MatrixIndexT Dim() const
Returns the dimension of the vector.
const Real * Data() const
Return data pointer (const).
static Real ScalarTanh(Real a)
void EnsureNonzero(const CuMatrixBase< Real > &src, Real epsilon, CuMatrixBase< Real > *dest)
This function requires that src and dest have the same dimension and epsilon > 0. ...
void Splice(const CuMatrixBase< Real > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< Real > *tgt)
Splice concatenates frames of src as specified in frame_offsets into tgt.
Matrix for CUDA computing.
MatrixIndexT NumCols() const
const VectorBase< Real > & Vec() const
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Real * Data()
Returns a pointer to the start of the vector's data.
MatrixIndexT NumRows() const
Dimensions.
Provides a vector abstraction class.
MatrixIndexT Dim() const
Return the vector dimension.
void NormalizePerRow(const CuMatrixBase< Real > &in, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *out)
Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square ...
static Real ScalarSigmoid(Real a)
void MulRowsVec(const CuVectorBase< Real > &scale)
scale i'th row by scale[i]
void RegularizeL1(CuMatrixBase< Real > *weight, CuMatrixBase< Real > *grad, Real l1, Real lr)
RegularizeL1 is a gradient step with l1 regularization added to the gradient.
MatrixIndexT Dim() const
Dimensions.
Vector for CUDA computing.
void AddDiagVecMat(const Real alpha, const CuVectorBase< Real > &v, const CuMatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T].
const Real * RowData(MatrixIndexT r) const
Get raw row pointer (const).
void Copy(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
Copies elements from src into tgt as given by copy_from_indices.
void CpuBackpropLstmNonlinearity(const MatrixBase< Real > &input, const MatrixBase< Real > &params, const MatrixBase< Real > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< Real > &self_repair_config, double count_in, MatrixBase< Real > *input_deriv, MatrixBase< Real > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< Real > *self_repair_sum_out)
void ComputeLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, CuMatrixBase< Real > *output)
this is a special-purpose function used by class LstmNonlinearityComponent, to do its forward propaga...