35  using namespace kaldi;

45  template<typename Real>
55    std::vector<int32> copy_from_idx_vec;
58      copy_from_idx_vec.push_back(Rand() % n_rows);
64      for (int32 j = 0; j < n_columns; j++) {
65        Real src_val = src(copy_from_idx_vec.at(i), j);
66        Real tgt_val = tgt(i, j);
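
The assertions at lines 65-66 pin down the contract of Randomize() (declared near the end of this page): row i of tgt must equal row copy_from_idx[i] of src. A minimal CPU sketch of that contract, using plain std::vector instead of Kaldi's matrix types (ReferenceRandomize and SimpleMat are illustrative names, not Kaldi API):

#include <cassert>
#include <cstddef>
#include <vector>

typedef std::vector<std::vector<float> > SimpleMat;

// tgt row i must be a copy of src row copy_from_idx[i].
SimpleMat ReferenceRandomize(const SimpleMat &src,
                             const std::vector<int> &copy_from_idx) {
  SimpleMat tgt(copy_from_idx.size());
  for (std::size_t i = 0; i < copy_from_idx.size(); i++) {
    assert(copy_from_idx[i] >= 0 &&
           copy_from_idx[i] < static_cast<int>(src.size()));
    tgt[i] = src[copy_from_idx[i]];  // whole-row copy, as checked at line 65 above
  }
  return tgt;
}
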

72  template<typename Real>
84      Real src = x_cpu(r, c), dest = y_cpu(r, c);
85      if (src <= -epsilon || src >= epsilon) {
87      } else if (src >= 0) {
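
EnsureNonzero() is checked element by element; pieced together from the branches above, the rule it must satisfy is the following (EnsureNonzeroScalar is a hypothetical name used only for illustration):

// Scalar form of the rule asserted at lines 84-87: values at least epsilon away
// from zero pass through unchanged, anything smaller is pushed out to
// +epsilon or -epsilon according to its sign.
template <typename Real>
Real EnsureNonzeroScalar(Real src, Real epsilon) {
  if (src <= -epsilon || src >= epsilon)
    return src;        // magnitude already >= epsilon
  else if (src >= 0)
    return epsilon;    // small non-negative value -> +epsilon
  else
    return -epsilon;   // small negative value     -> -epsilon
}
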

96  template<typename Real>
106   std::vector<int32> copy_from_idx_vec;
108   for (int32 i = 0; i < n_columns; i++) {
109     copy_from_idx_vec.push_back(Rand() % n_columns);
114   for (int32 i = 0; i < n_rows; i++) {
115     for (int32 j = 0; j < n_columns; j++) {
116       Real src_val = src(i, copy_from_idx_vec.at(j));
117       Real tgt_val = tgt(i, j);
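
Where Randomize() permutes rows, Copy() gathers columns: line 116 asserts tgt(i, j) == src(i, copy_from_idx[j]). A plain-C++ sketch of that reference (illustrative names again, not Kaldi API):

#include <cstddef>
#include <vector>

typedef std::vector<std::vector<float> > SimpleMat;

// tgt(i, j) = src(i, copy_from_idx[j]) -- a per-column gather.
SimpleMat ReferenceCopy(const SimpleMat &src,
                        const std::vector<int> &copy_from_idx) {
  SimpleMat tgt(src.size(), std::vector<float>(copy_from_idx.size()));
  for (std::size_t i = 0; i < src.size(); i++)
    for (std::size_t j = 0; j < copy_from_idx.size(); j++)
      tgt[i][j] = src[i][copy_from_idx[j]];
  return tgt;
}
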

123 template<typename Real>
132   std::vector<int32> frame_offsets_vec;
138   for (int32 i = 0; i < n_frame_offsets; i++) {
139     frame_offsets_vec.push_back(Rand() % 2 * n_columns - n_columns);
147   for (int32 i = 0; i < n_rows; i++) {
148     for (int32 k = 0; k < n_frame_offsets; k++) {
149       for (int32 j = 0; j < n_columns; j++) {
151         if (i + frame_offsets_vec.at(k) >= n_rows) {
152           src_val = src_copy(n_rows - 1, j);
153         } else if (i + frame_offsets_vec.at(k) <= 0) {
154           src_val = src_copy(0, j);
156           src_val = src_copy(i + frame_offsets_vec.at(k), j);
158         Real tgt_val = tgt_copy(i, k * n_columns + j);
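
Splice() builds each output row by concatenating, for every entry of frame_offsets, the source row at i + offset, clamping at the first and last rows exactly as the branches at lines 151-156 do. A CPU sketch of that semantics (hypothetical helper, not Kaldi API):

#include <cstddef>
#include <vector>

typedef std::vector<std::vector<float> > SimpleMat;

// tgt row i is the concatenation, over all offsets k, of src row (i + offset[k]),
// clamped to the first/last row at the edges.
SimpleMat ReferenceSplice(const SimpleMat &src, const std::vector<int> &offsets) {
  int n_rows = static_cast<int>(src.size());
  int n_cols = n_rows > 0 ? static_cast<int>(src[0].size()) : 0;
  SimpleMat tgt(n_rows, std::vector<float>(offsets.size() * n_cols));
  for (int i = 0; i < n_rows; i++)
    for (std::size_t k = 0; k < offsets.size(); k++) {
      int r = i + offsets[k];
      if (r >= n_rows) r = n_rows - 1;
      else if (r <= 0) r = 0;
      for (int j = 0; j < n_cols; j++)
        tgt[i][k * n_cols + j] = src[r][j];
    }
  return tgt;
}
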

165 template<typename Real>
167   for (int i = 0; i < 3; i++) {
171     Matrix<Real> Hinput(num_rows, 5 * cell_dim + dropout_dim);
188   for (int i = 16; i <= 1024; i *= 2) {
201     for (; tim.Elapsed() < time_in_secs; iter++)
205     KALDI_LOG << "For ComputeLstmNonlinearity"
206               << (sizeof(Real)==8 ? "<double>" : "<float>") << ", for dim = "
207               << i << ", speed was " << gflops << " gigaflops";

214   for (int32 loop = 0; loop < 10; loop++) {
219     dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3);
238         output_deriv(num_rows, cell_dim * 2);
242       output_deriv.ColRange(test_output * cell_dim, cell_dim).SetRandn();
255         value_sum(5, cell_dim);
258         self_repair_sum(5, cell_dim),
259         input_deriv(num_rows, 5 * cell_dim + dropout_dim),
260         params_deriv(3, cell_dim);
262     double count_in = 0.0;
266                              self_repair_config, count_in,
267                              &input_deriv, &params_deriv,
268                              &value_sum, &deriv_sum, &self_repair_sum);
275         measured_objf_change(test_dim);
277     for (int32 i = 0; i < test_dim; i++) {
279           delta_params(3, cell_dim);
280       if (test_input >= 0) {
281         delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn();
282         delta_input.Scale(delta);
284       if (test_params >= 0) {
285         delta_params.Row(test_params).SetRandn();
286         delta_params.Scale(delta);
293       perturbed_input.AddMat(1.0, delta_input);
296       perturbed_params.AddMat(1.0, delta_params);
302       objf_change = new_objf - baseline_objf;
303       measured_objf_change(i) = objf_change;
305     KALDI_LOG << "LSTM nonlinearity test: num_rows=" << num_rows
306               << ", cell_dim=" << cell_dim
307               << ", dropout_dim=" << dropout_dim
308               << ", test_input=" << test_input
309               << ", test_params=" << test_params
310               << ", test_output=" << test_output
311               << ", predicted_objf_change=" << predicted_objf_change
312               << ", measured_objf_change=" << measured_objf_change;
315       KALDI_ERR << "LSTM nonlinearity test failed.";

320 template<typename Real>
322   for (int i = 0; i < 3; i++) {
325     dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3);
328     Matrix<Real> hinput(num_rows, 5 * cell_dim + dropout_dim);
334     Matrix<Real> hinput_deriv(num_rows, 5 * cell_dim + dropout_dim);
345       count_in = Rand() % num_rows;
366                                   hderiv_sum_in, hself_repair_config,
373                               dself_repair_config, count_in,
381                                   hderiv_sum_in, hself_repair_config,
383                                   &hparams_deriv, &hvalue_sum_out,
384                                   &hderiv_sum_out, &hself_repair_sum_out);
386                               dself_repair_config, count_in,
388                               &dvalue_sum_out, &dderiv_sum_out,
389                               &dself_repair_sum_out);
392                                   hderiv_sum_in, hself_repair_config,
393                                   count_in, &hinput_deriv,
399                               dself_repair_config, count_in, &dinput_deriv,
406                                   hderiv_sum_in, hself_repair_config,
407                                   count_in, &hinput_deriv, &hparams_deriv,
408                                   &hvalue_sum_out, &hderiv_sum_out,
409                                   &hself_repair_sum_out);
411                               dself_repair_config, count_in, &dinput_deriv,
412                               &dparams_deriv, &dvalue_sum_out,
413                               &dderiv_sum_out, &dself_repair_sum_out);
419     Matrix<Real> hdself_repair_sum_out(dself_repair_sum_out);
431     AssertEqual(hself_repair_sum_out, hdself_repair_sum_out);
434   for (int i = 16; i <= 2048; i *= 2) {
458     count_in = Rand() % num_rows;
462     for (; tim.Elapsed() < time_in_secs; iter++)
464                                 self_repair_config, count_in, &input_deriv,
465                                 &params_deriv, &value_sum_out,
466                                 &deriv_sum_out, &self_repair_sum_out);
470     KALDI_LOG << "For BackpropLstmNonlinearity"
471               << (sizeof(Real) == 8 ? "<double>" : "<float>") << ", for dim = "
472               << i << ", speed was " << gflops << " gigaflops";

478 template<typename Real>
482   int row = 10 + Rand() % 40;
483   int col = 10 + Rand() % 50;
494   Real target_rms = 0.3456;
495   bool add_log_stddev = true;
496   const Real kSquaredNormFloor = 1.35525271560688e-20;
505   Real target_rms = 0.3456;
507   if (in.Data() != out_no_log.Data())
510   Real d_scaled = in.NumCols() * target_rms * target_rms;
511   in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
512   in_norm.ApplyFloor(kSquaredNormFloor);
513   in_norm.ApplyPow(-0.5);
514   out_no_log.MulRowsVec(in_norm);
515   if (add_log_stddev) {
518     in_norm.Add(log(target_rms));
527   for (int dim = 16; dim <= 1024; dim *= 2) {
534     for (; tim.Elapsed() < time_in_secs; iter++) {
540     KALDI_LOG << "For CuMath::NormalizePerRow"
541               << (sizeof(Real)==8 ? "<double>" : "<float>") << ", for dim = "
542               << dim << ", speed was " << gflops << " gigaflops.";

549 template<typename Real>
565   Real target_rms = 0.3456;
566   bool add_log_stddev = false;
567   const Real kSquaredNormFloor = 1.35525271560688e-20;
576   Real target_rms = 0.3456;
578   Real d_scaled = in.NumCols() * target_rms * target_rms;
579   in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
580   in_norm.ApplyFloor(kSquaredNormFloor);
581   in_norm.ApplyPow(-0.5);

592 template<typename Real>
595   int row = 10 + Rand() % 40;
596   int col = 10 + Rand() % 50;
613   Real target_rms = 0.3456;
614   bool add_log_stddev = true;
615   const Real kSquaredNormFloor = 1.3552527156068805425e-20;
632   Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
634   if (add_log_stddev) {
643     log_stddev_deriv.ApplyPow(-1.0);
644     out_deriv_for_stddev.CopyColFromMat(out_deriv,
646     log_stddev_deriv.MulElements(out_deriv_for_stddev);
651   in_norm.Scale(1.0 / d_scaled);
652   in_norm.ApplyFloor(kSquaredNormFloor);
653   in_norm.ApplyPow(-0.5);
655   if (in_deriv->Data() != out_deriv_no_log.Data())
660   in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
661   in_norm.ApplyPow(3.0);
662   dot_products.MulElements(in_norm);
664   in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value,
673   for (int dim = 16; dim <= 1024; dim *= 2) {
680     for (; tim.Elapsed() < time_in_secs; iter++) {
685     KALDI_LOG << "For CuMath::DiffNormalizePerRow"
686               << (sizeof(Real)==8 ? "<double>" : "<float>")
687               << ", for dim = " << dim << ", speed was " << gflops

696   if (CuDevice::Instantiate().DoublePrecisionSupported())
699   UnitTestCuMathComputeLstmNonlinearity<Real>();
700   UnitTestCuMathRandomize<Real>();
701   UnitTestCuMathSplice<Real>();
702   UnitTestCuMathCopy<Real>();
704   UnitTestEnsureNonzero<Real>();
705   UnitTestBackpropLstmNonlinearity<Real>();
706   UnitTestCuMathNormalizePerRow<Real>();
707   UnitTestCuMathNormalizePerRow_v2<Real>();
708   UnitTestCuDiffNormalizePerRow<Real>();
718   for (; loop < 2; loop++) {
719     CuDevice::Instantiate().SetDebugStrideMode(true);
721       CuDevice::Instantiate().SelectGpuId("no");
723       CuDevice::Instantiate().SelectGpuId("yes");
726     kaldi::CudaMathUnitTest<float>();
729     if (CuDevice::Instantiate().DoublePrecisionSupported()) {
730       kaldi::CudaMathUnitTest<double>();
732       KALDI_WARN << "Double precision not supported";
735     kaldi::CudaMathUnitTest<float>();
739       KALDI_LOG << "Tests without GPU use succeeded.";
741       KALDI_LOG << "Tests with GPU use (if available) succeeded.";
744   CuDevice::Instantiate().PrintProfile();
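
main() runs the whole suite twice: once with SelectGpuId("no") so every CuMatrix operation takes the CPU fallback path, and once with SelectGpuId("yes") so the CUDA kernels are exercised; SetDebugStrideMode(true) makes the GPU allocator choose unusual row strides to flush out stride bugs. Stripped to a skeleton, the driver looks roughly like this (reconstructed from the lines above, with the usual HAVE_CUDA guards assumed; not the verbatim file):

int main() {
  using namespace kaldi;
  int32 loop = 0;
  for (; loop < 2; loop++) {
#if HAVE_CUDA == 1
    CuDevice::Instantiate().SetDebugStrideMode(true);
    if (loop == 0)
      CuDevice::Instantiate().SelectGpuId("no");   // force the CPU fallback path
    else
      CuDevice::Instantiate().SelectGpuId("yes");  // exercise the CUDA kernels
#endif
#if HAVE_CUDA == 1
    kaldi::CudaMathUnitTest<float>();
    if (CuDevice::Instantiate().DoublePrecisionSupported()) {
      kaldi::CudaMathUnitTest<double>();
    } else {
      KALDI_WARN << "Double precision not supported";
    }
#else
    kaldi::CudaMathUnitTest<float>();
#endif
    if (loop == 0)
      KALDI_LOG << "Tests without GPU use succeeded.";
    else
      KALDI_LOG << "Tests with GPU use (if available) succeeded.";
  }
#if HAVE_CUDA == 1
  CuDevice::Instantiate().PrintProfile();
#endif
  return 0;
}

The entries that follow are the Doxygen cross-references for the symbols used in this listing.
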
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
void CopyColFromVec(const VectorBase< Real > &v, const MatrixIndexT col)
Copy vector into specific column of matrix.
void CopyFromVec(const std::vector< T > &src)
This function resizes if needed.
void UnitTestLstmNonlinearity()
void CpuComputeLstmNonlinearity(const MatrixBase< Real > &input_mat, const MatrixBase< Real > ¶ms_mat, MatrixBase< Real > *output)
const CuSubVector< Real > Row(MatrixIndexT i) const
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Base class which provides matrix operations not involving resizing or allocation. ...
const Real * Data() const
Gives pointer to raw data (const).
void Randomize(const CuMatrixBase< Real > &src, const CuArray< int32 > ©_from_idx, CuMatrixBase< Real > *tgt)
Copies a permutation of src into tgt.
static void UnitTestBackpropLstmNonlinearity()
static void UnitTestCuMathCopy()
void AddDiagMat2(Real alpha, const MatrixBase< Real > &M, MatrixTransposeType trans=kNoTrans, Real beta=1.0)
Add the diagonal of a matrix times itself: *this = diag(M M^T) + beta * *this (if trans == kNoTrans)...
void AddMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType trans=kNoTrans)
*this += alpha * A
A class for storing matrices.
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
void CopyFromMat(const MatrixBase< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Copy given matrix. (no resize is done).
static void UnitTestCuMathNormalizePerRow()
static void UnitTestCuDiffNormalizePerRow()
static void UnitTestCuMathComputeLstmNonlinearity()
void BackpropLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > ¶ms, const CuMatrixBase< Real > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< Real > &self_repair_config, double count_in, CuMatrixBase< Real > *input_deriv, CuMatrixBase< Real > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< Real > *self_repair_sum_out)
This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity.
void SetVerboseLevel(int32 i)
This should be rarely used, except by programs using Kaldi as library; command-line programs set the ...
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=nullptr)
Applies floor to all elements.
static void UnitTestCuMathRandomize()
void DiffNormalizePerRow(const CuMatrixBase< Real > &in_value, const CuMatrixBase< Real > &out_deriv, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *in_deriv)
static void UnitTestEnsureNonzero()
void Scale(Real alpha)
Multiply each element with a scalar value.
void SetRandn()
Sets to random values of a normal distribution.
Real TraceMatMat(const MatrixBase< Real > &A, const MatrixBase< Real > &B, MatrixTransposeType trans)
We need to declare this here as it will be a friend function.
int Rand(struct RandomState *state)
void SetRandn()
Set vector to random normally-distributed noise.
void MulRowsVec(const VectorBase< Real > &scale)
Equivalent to (*this) = diag(scale) * (*this).
CuSubMatrix< Real > ColRange(const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
void EnsureNonzero(const CuMatrixBase< Real > &src, Real epsilon, CuMatrixBase< Real > *dest)
This function requires that src and dest have the same dimension and epsilon > 0. ...
void Splice(const CuMatrixBase< Real > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< Real > *tgt)
Splice concatenates frames of src as specified in frame_offsets into tgt.
Matrix for CUDA computing.
MatrixIndexT NumCols() const
A class representing a vector.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
static void AssertEqual(float a, float b, float relative_tolerance=0.001)
assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))
void AddDiagMatMat(Real alpha, const MatrixBase< Real > &M, MatrixTransposeType transM, const MatrixBase< Real > &N, MatrixTransposeType transN, Real beta=1.0)
Add the diagonal of a matrix product: *this = diag(M N), assuming the "trans" arguments are both kNoT...
void AddDiagVecMat(const Real alpha, const VectorBase< Real > &v, const MatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T].
MatrixIndexT NumRows() const
Dimensions.
void NormalizePerRow(const CuMatrixBase< Real > &in, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *out)
Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square ...
void ApplyFloor(Real floor_val)
static void UnitTestCuMathNormalizePerRow_v2()
double Elapsed() const
Returns time in seconds.
Sub-matrix representation.
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
static void UnitTestCuMathSplice()
int32 RandInt(int32 min_val, int32 max_val, struct RandomState *state)
void Copy(const CuMatrixBase< Real > &src, const CuArray< int32 > ©_from_indices, CuMatrixBase< Real > *tgt)
Copies elements from src into tgt as given by copy_from_indices.
void CpuBackpropLstmNonlinearity(const MatrixBase< Real > &input, const MatrixBase< Real > ¶ms, const MatrixBase< Real > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< Real > &self_repair_config, double count_in, MatrixBase< Real > *input_deriv, MatrixBase< Real > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< Real > *self_repair_sum_out)
void ComputeLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > ¶ms, CuMatrixBase< Real > *output)
this is a special-purpose function used by class LstmNonlinearityComponent, to do its forward propaga...