cu-math-test.cc
Go to the documentation of this file.
1 // cudamatrix/cu-math-test.cc
2 
3 // Copyright 2013 Johns Hopkins University (Author: David Snyder)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 
21 #include <iostream>
22 #include <vector>
23 #include <cstdlib>
24 
25 #include "base/kaldi-common.h"
26 #include "util/common-utils.h"
28 #include "cudamatrix/cu-math.h"
29 #include "cudamatrix/cu-array.h"
30 
31 #if defined(_MSC_VER)
32 #include <time.h>
33 #endif
34 
35 using namespace kaldi;
36 
37 
38 namespace kaldi {
39 
40 
41 /*
42  * Unit tests
43  */
44 
45 template<typename Real>
46 static void UnitTestCuMathRandomize() {
47  int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200;
48  CuMatrix<Real> src(M, N);
49  CuMatrix<Real> tgt(M, N);
50  CuArray<int32> copy_from_idx;
51 
52  src.SetRandn();
53  int32 n_rows = src.NumRows();
54  int32 n_columns = src.NumCols();
55  std::vector<int32> copy_from_idx_vec;
56 
57  for (int32 i = 0; i < n_rows; i++) {
58  copy_from_idx_vec.push_back(Rand() % n_rows);
59  }
60  copy_from_idx.CopyFromVec(copy_from_idx_vec);
61  cu::Randomize(src, copy_from_idx, &tgt);
62 
63  for (int32 i = 0; i < n_rows; i++) {
64  for (int32 j = 0; j < n_columns; j++) {
65  Real src_val = src(copy_from_idx_vec.at(i), j);
66  Real tgt_val = tgt(i, j);
67  AssertEqual(src_val, tgt_val);
68  }
69  }
70 }
71 
72 template<typename Real>
73 static void UnitTestEnsureNonzero() {
74  int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200;
75  Real epsilon = 0.1;
76  CuMatrix<Real> x(M, N);
77  x.SetRandn();
78  CuMatrix<Real> y(M, N, kUndefined);
79  cu::EnsureNonzero(x, epsilon, &y);
80  Matrix<Real> x_cpu(x);
81  Matrix<Real> y_cpu(y);
82  for (int32 i = 0; i < 30; i++) {
83  int32 r = RandInt(0, M-1), c = RandInt(0, N-1);
84  Real src = x_cpu(r, c), dest = y_cpu(r, c);
85  if (src <= -epsilon || src >= epsilon) {
86  KALDI_ASSERT(src == dest);
87  } else if (src >= 0) {
88  KALDI_ASSERT(dest == epsilon);
89  } else {
90  KALDI_ASSERT(dest == -epsilon);
91  }
92  }
93 }
94 
95 
96 template<typename Real>
97 static void UnitTestCuMathCopy() {
98  int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200;
99  CuMatrix<Real> src(M, N);
100  CuMatrix<Real> tgt(M, N);
101  CuArray<int32> copy_from_idx;
102 
103  src.SetRandn();
104  int32 n_rows = src.NumRows();
105  int32 n_columns = src.NumCols();
106  std::vector<int32> copy_from_idx_vec;
107 
108  for (int32 i = 0; i < n_columns; i++) {
109  copy_from_idx_vec.push_back(Rand() % n_columns);
110  }
111  copy_from_idx.CopyFromVec(copy_from_idx_vec);
112  cu::Copy(src, copy_from_idx, &tgt);
113 
114  for (int32 i = 0; i < n_rows; i++) {
115  for (int32 j = 0; j < n_columns; j++) {
116  Real src_val = src(i, copy_from_idx_vec.at(j));
117  Real tgt_val = tgt(i, j);
118  AssertEqual(src_val, tgt_val);
119  }
120  }
121 }
122 
123 template<typename Real>
124 static void UnitTestCuMathSplice() {
125  int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200;
126  CuMatrix<Real> src(M, N);
127  CuArray<int32> frame_offsets;
128 
129  src.SetRandn();
130  int32 n_rows = src.NumRows();
131  int32 n_columns = src.NumCols();
132  std::vector<int32> frame_offsets_vec;
133 
134  // The number of columns of tgt is rows(src)
135  // times n_frame_offsets, so we keep n_frame_offsets
136  // reasonably small (2 <= n <= 6).
137  int32 n_frame_offsets = Rand() % 7 + 2;
138  for (int32 i = 0; i < n_frame_offsets; i++) {
139  frame_offsets_vec.push_back(Rand() % 2 * n_columns - n_columns);
140  }
141 
142  CuMatrix<Real> tgt(M, N * n_frame_offsets);
143  frame_offsets.CopyFromVec(frame_offsets_vec);
144  cu::Splice(src, frame_offsets, &tgt);
145 
146  Matrix<Real> src_copy(src), tgt_copy(tgt);
147  for (int32 i = 0; i < n_rows; i++) {
148  for (int32 k = 0; k < n_frame_offsets; k++) {
149  for (int32 j = 0; j < n_columns; j++) {
150  Real src_val;
151  if (i + frame_offsets_vec.at(k) >= n_rows) {
152  src_val = src_copy(n_rows-1, j);
153  } else if (i + frame_offsets_vec.at(k) <= 0) {
154  src_val = src_copy(0, j);
155  } else {
156  src_val = src_copy(i + frame_offsets_vec.at(k), j);
157  }
158  Real tgt_val = tgt_copy(i, k * n_columns + j);
159  AssertEqual(src_val, tgt_val);
160  }
161  }
162  }
163 }
164 
165 template<typename Real>
167  for (int i = 0; i < 3; i++) {
168  int32 num_rows = 1 + Rand() % 100;
169  int32 cell_dim = 1 + Rand() % 2000;
170  int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3);
171  Matrix<Real> Hinput(num_rows, 5 * cell_dim + dropout_dim);
172  Matrix<Real> Hparams(3, cell_dim);
173  Matrix<Real> Houtput(num_rows, 2 * cell_dim);
174  Hinput.SetRandn();
175  Hparams.SetRandn();
176 
177  CuMatrix<Real> Dinput(Hinput);
178  CuMatrix<Real> Dparams(Hparams);
179  CuMatrix<Real> Doutput(Houtput);
180 
181  cu::CpuComputeLstmNonlinearity(Hinput, Hparams, &Houtput);
182  cu::ComputeLstmNonlinearity(Dinput, Dparams, &Doutput);
183 
184  Matrix<Real> HDoutput(Doutput);
185  AssertEqual(Houtput, HDoutput);
186  }
187 
188  for (int i = 16; i <= 1024; i *= 2) {
189  BaseFloat time_in_secs = 0.025;
190  int32 num_rows = i;
191  int32 cell_dim = i;
192  int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3);
193  CuMatrix<Real> input(num_rows, 5 * cell_dim + dropout_dim);
194  CuMatrix<Real> params(3, cell_dim);
195  CuMatrix<Real> output(num_rows, 2 * cell_dim);
196  input.SetRandn();
197  params.SetRandn();
198 
199  Timer tim;
200  int32 iter = 0;
201  for (; tim.Elapsed() < time_in_secs; iter++)
202  cu::ComputeLstmNonlinearity(input, params, &output);
203 
204  BaseFloat gflops = ((BaseFloat) i * i * iter) / (tim.Elapsed() * 1.0e+09);
205  KALDI_LOG << "For ComputeLstmNonlinearity"
206  << (sizeof(Real)==8 ? "<double>" : "<float>") << ", for dim = "
207  << i << ", speed was " << gflops << " gigaflops";
208  if (tim.Elapsed() > 0.05)
209  break;
210  }
211 }
212 
214  for (int32 loop = 0; loop < 10; loop++) {
215 
216  // problem dimensions.
217  int32 num_rows = RandInt(5, 20),
218  cell_dim = RandInt(2, 200),
219  dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3);
220 
221  // Pick the (input or params block), and output block, for which we'll
222  // spot-check the derivative values. This will give us test failures
223  // that are fine-grained enough to assist debugging.
224  int32 test_input = RandInt(0, 4),
225  test_params = RandInt(0, 2),
226  test_output = RandInt(0, 1);
227 
228  // set one of test_input or test_params to -1, meaning we're not testing that
229  // thing. only test one at a time.
230  if (RandInt(0, 1) == 0)
231  test_input = -1;
232  else
233  test_params = -1;
234 
235 
236  CuMatrix<BaseFloat> input(num_rows, cell_dim * 5 + dropout_dim),
237  params(3, cell_dim),
238  output_deriv(num_rows, cell_dim * 2);
239  input.SetRandn();
240  params.SetRandn();
241  // set just one block of the output deriv to a random value.
242  output_deriv.ColRange(test_output * cell_dim, cell_dim).SetRandn();
243 
244 
245 
246  CuMatrix<BaseFloat> output(num_rows, cell_dim * 2);
247 
248  cu::ComputeLstmNonlinearity(input, params, &output);
249 
250  BaseFloat baseline_objf = TraceMatMat(output, output_deriv, kTrans);
251 
252  // not really testing self repair here... will debug it when we actually run
253  // it, by looking at the diagnostics.
254  CuMatrix<double> deriv_sum(5, cell_dim),
255  value_sum(5, cell_dim);
256  CuVector<BaseFloat> self_repair_config(10.0); // leave at zero... we don't really test this here.
258  self_repair_sum(5, cell_dim),
259  input_deriv(num_rows, 5 * cell_dim + dropout_dim),
260  params_deriv(3, cell_dim);
261 
262  double count_in = 0.0;
263 
264  // get derivative w.r.t. input and params, which we are testing.
265  cu::BackpropLstmNonlinearity(input, params, output_deriv, deriv_sum,
266  self_repair_config, count_in,
267  &input_deriv, &params_deriv,
268  &value_sum, &deriv_sum, &self_repair_sum);
269 
270 
271  int32 test_dim = 5; // number of separate offsets we add while testing the
272  // derivatives... reduces randomness in test.
273  BaseFloat delta = 1.0e-03;
274  Vector<BaseFloat> predicted_objf_change(test_dim),
275  measured_objf_change(test_dim);
276 
277  for (int32 i = 0; i < test_dim; i++) {
278  CuMatrix<BaseFloat> delta_input(num_rows, 5 * cell_dim + dropout_dim),
279  delta_params(3, cell_dim);
280  if (test_input >= 0) {
281  delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn();
282  delta_input.Scale(delta);
283  }
284  if (test_params >= 0) {
285  delta_params.Row(test_params).SetRandn();
286  delta_params.Scale(delta);
287  }
288 
289  predicted_objf_change(i) = TraceMatMat(delta_input, input_deriv, kTrans) +
290  TraceMatMat(delta_params, params_deriv, kTrans);
291 
292  CuMatrix<BaseFloat> perturbed_input(input);
293  perturbed_input.AddMat(1.0, delta_input);
294 
295  CuMatrix<BaseFloat> perturbed_params(params);
296  perturbed_params.AddMat(1.0, delta_params);
297 
298  CuMatrix<BaseFloat> perturbed_output(num_rows, 2 * cell_dim);
299  cu::ComputeLstmNonlinearity(perturbed_input, perturbed_params,
300  &perturbed_output);
301  BaseFloat new_objf = TraceMatMat(perturbed_output, output_deriv, kTrans),
302  objf_change = new_objf - baseline_objf;
303  measured_objf_change(i) = objf_change;
304  }
305  KALDI_LOG << "LSTM nonlinearity test: num_rows=" << num_rows
306  << ", cell_dim=" << cell_dim
307  << ", dropout_dim=" << dropout_dim
308  << ", test_input=" << test_input
309  << ", test_params=" << test_params
310  << ", test_output=" << test_output
311  << ", predicted_objf_change=" << predicted_objf_change
312  << ", measured_objf_change=" << measured_objf_change;
313 
314  if (!ApproxEqual(predicted_objf_change, measured_objf_change, BaseFloat(0.1F))) {
315  KALDI_ERR << "LSTM nonlinearity test failed.";
316  }
317  }
318 }
319 
320 template<typename Real>
322  for (int i = 0; i < 3; i++) {
323  int32 num_rows = 1 + Rand() % 200;
324  int32 cell_dim = 1 + Rand() % 2000,
325  dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3);
326 // KALDI_LOG << num_rows << ", " << cell_dim;
327 
328  Matrix<Real> hinput(num_rows, 5 * cell_dim + dropout_dim);
329  Matrix<Real> hparams(3, cell_dim);
330  Matrix<Real> houtput_deriv(num_rows, 2 * cell_dim);
331  Matrix<double> hderiv_sum_in(5, cell_dim);
332  Vector<Real> hself_repair_config(10);
333  double count_in;
334  Matrix<Real> hinput_deriv(num_rows, 5 * cell_dim + dropout_dim);
335  Matrix<Real> hparams_deriv(3, cell_dim);
336  Matrix<double> hvalue_sum_out(5, cell_dim);
337  Matrix<double> hderiv_sum_out(5, cell_dim);
338  Matrix<Real> hself_repair_sum_out(5, cell_dim);
339 
340  hinput.SetRandn();
341  hparams.SetRandn();
342  houtput_deriv.SetRandn();
343  hderiv_sum_in.SetRandn();
344  hself_repair_config.SetRandn();
345  count_in = Rand() % num_rows;
346 
347  hinput_deriv.SetRandn();
348  hparams_deriv.SetRandn();
349  hvalue_sum_out.SetRandn();
350  hderiv_sum_out.SetRandn();
351  hself_repair_sum_out.SetRandn();
352 
353  CuMatrix<Real> dinput(hinput);
354  CuMatrix<Real> dparams(hparams);
355  CuMatrix<Real> doutput_deriv(houtput_deriv);
356  CuMatrix<double> dderiv_sum_in(hderiv_sum_in);
357  CuVector<Real> dself_repair_config(hself_repair_config);
358 
359  CuMatrix<Real> dinput_deriv(hinput_deriv);
360  CuMatrix<Real> dparams_deriv(hparams_deriv);
361  CuMatrix<double> dvalue_sum_out(hvalue_sum_out);
362  CuMatrix<double> dderiv_sum_out(hderiv_sum_out);
363  CuMatrix<Real> dself_repair_sum_out(hself_repair_sum_out);
364 
365  cu::CpuBackpropLstmNonlinearity(hinput, hparams, houtput_deriv,
366  hderiv_sum_in, hself_repair_config,
367  count_in, (MatrixBase<Real>*) NULL,
368  (MatrixBase<Real>*) NULL,
369  (MatrixBase<double>*) NULL,
370  (MatrixBase<double>*) NULL,
371  (MatrixBase<Real>*) NULL);
372  cu::BackpropLstmNonlinearity(dinput, dparams, doutput_deriv, dderiv_sum_in,
373  dself_repair_config, count_in,
374  (CuMatrixBase<Real>*) NULL,
375  (CuMatrixBase<Real>*) NULL,
376  (CuMatrixBase<double>*) NULL,
377  (CuMatrixBase<double>*) NULL,
378  (CuMatrixBase<Real>*) NULL);
379 
380  cu::CpuBackpropLstmNonlinearity(hinput, hparams, houtput_deriv,
381  hderiv_sum_in, hself_repair_config,
382  count_in, (MatrixBase<Real>*) NULL,
383  &hparams_deriv, &hvalue_sum_out,
384  &hderiv_sum_out, &hself_repair_sum_out);
385  cu::BackpropLstmNonlinearity(dinput, dparams, doutput_deriv, dderiv_sum_in,
386  dself_repair_config, count_in,
387  (CuMatrixBase<Real>*) NULL, &dparams_deriv,
388  &dvalue_sum_out, &dderiv_sum_out,
389  &dself_repair_sum_out);
390 
391  cu::CpuBackpropLstmNonlinearity(hinput, hparams, houtput_deriv,
392  hderiv_sum_in, hself_repair_config,
393  count_in, &hinput_deriv,
394  (MatrixBase<Real>*) NULL,
395  (MatrixBase<double>*) NULL,
396  (MatrixBase<double>*) NULL,
397  (MatrixBase<Real>*) NULL);
398  cu::BackpropLstmNonlinearity(dinput, dparams, doutput_deriv, dderiv_sum_in,
399  dself_repair_config, count_in, &dinput_deriv,
400  (CuMatrixBase<Real>*) NULL,
401  (CuMatrixBase<double>*) NULL,
402  (CuMatrixBase<double>*) NULL,
403  (CuMatrixBase<Real>*) NULL);
404 
405  cu::CpuBackpropLstmNonlinearity(hinput, hparams, houtput_deriv,
406  hderiv_sum_in, hself_repair_config,
407  count_in, &hinput_deriv, &hparams_deriv,
408  &hvalue_sum_out, &hderiv_sum_out,
409  &hself_repair_sum_out);
410  cu::BackpropLstmNonlinearity(dinput, dparams, doutput_deriv, dderiv_sum_in,
411  dself_repair_config, count_in, &dinput_deriv,
412  &dparams_deriv, &dvalue_sum_out,
413  &dderiv_sum_out, &dself_repair_sum_out);
414 
415  Matrix<Real> hdinput_deriv(dinput_deriv);
416  Matrix<Real> hdparams_deriv(dparams_deriv);
417  Matrix<double> hdvalue_sum_out(dvalue_sum_out);
418  Matrix<double> hdderiv_sum_out(dderiv_sum_out);
419  Matrix<Real> hdself_repair_sum_out(dself_repair_sum_out);
420 
421 // KALDI_LOG<< "input_deriv" << hinput_deriv << "d" << hdinput_deriv;
422 // KALDI_LOG<< "hparams_deriv" << hparams_deriv << "d" << hdparams_deriv;
423 // KALDI_LOG<< "hvalue_sum_out" << hvalue_sum_out << "d" << hdvalue_sum_out;
424 // KALDI_LOG<< "hderiv_sum_out" << hderiv_sum_out << "d" << hdderiv_sum_out;
425 // KALDI_LOG<< "hself_repair_sum_out" << hself_repair_sum_out << "d" << hdself_repair_sum_out;
426 
427  AssertEqual(hinput_deriv, hdinput_deriv);
428  AssertEqual(hparams_deriv, hdparams_deriv);
429  AssertEqual(hvalue_sum_out, hdvalue_sum_out);
430  AssertEqual(hderiv_sum_out, hdderiv_sum_out);
431  AssertEqual(hself_repair_sum_out, hdself_repair_sum_out);
432  }
433 
434  for (int i = 16; i <= 2048; i *= 2) {
435  BaseFloat time_in_secs = 0.025;
436  int32 num_rows = i;
437  int32 cell_dim = i;
438  int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3);
439 
440  CuMatrix<Real> input(num_rows, 5 * cell_dim + dropout_dim);
441  CuMatrix<Real> params(3, cell_dim);
442  CuMatrix<Real> output_deriv(num_rows, 2 * cell_dim);
443  CuMatrix<double> deriv_sum_in(5, cell_dim);
444  CuVector<Real> self_repair_config(10);
445  double count_in;
446 
447  CuMatrix<Real> input_deriv(num_rows, 5 * cell_dim + dropout_dim);
448  CuMatrix<Real> params_deriv(3, cell_dim);
449  CuMatrix<double> value_sum_out(5, cell_dim);
450  CuMatrix<double> deriv_sum_out(5, cell_dim);
451  CuMatrix<Real> self_repair_sum_out(5, cell_dim);
452 
453  input.SetRandn();
454  params.SetRandn();
455  output_deriv.SetRandn();
456  deriv_sum_in.SetRandn();
457  self_repair_config.SetRandn();
458  count_in = Rand() % num_rows;
459 
460  Timer tim;
461  int32 iter = 0;
462  for (; tim.Elapsed() < time_in_secs; iter++)
463  cu::BackpropLstmNonlinearity(input, params, output_deriv, deriv_sum_in,
464  self_repair_config, count_in, &input_deriv,
465  &params_deriv, &value_sum_out,
466  &deriv_sum_out, &self_repair_sum_out);
467 
468 
469  BaseFloat gflops = ((BaseFloat) i * i * iter) / (tim.Elapsed() * 1.0e+09);
470  KALDI_LOG << "For BackpropLstmNonlinearity"
471  << (sizeof(Real) == 8 ? "<double>" : "<float>") << ", for dim = "
472  << i << ", speed was " << gflops << " gigaflops";
473  if (tim.Elapsed() > 0.05)
474  break;
475  }
476 }
477 
478 template<typename Real>
480 
481  for (int32 i = 0; i < 2; i++) {
482  int row = 10 + Rand() % 40;
483  int col = 10 + Rand() % 50;
484 
485  Matrix<Real> Hi(row,col);
486  Matrix<Real> Ho(row,col+1);
487  Hi.SetRandn();
488  Hi.Scale(5.0);
489 
490  CuMatrix<Real> Di(row, col);
491  CuMatrix<Real> Do(row, col+1);
492  Di.CopyFromMat(Hi);
493 
494  Real target_rms = 0.3456;
495  bool add_log_stddev = true;
496  const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
497 
498  //gpu
499  cu::NormalizePerRow(Di, target_rms, add_log_stddev, &Do);
500 
501  //cpu
502  {
503  MatrixBase<Real>& in(Hi);
504  MatrixBase<Real>& out(Ho);
505  Real target_rms=0.3456;
506  SubMatrix<Real> out_no_log(out, 0, out.NumRows(), 0, in.NumCols());
507  if (in.Data() != out_no_log.Data())
508  out_no_log.CopyFromMat(in);
509  Vector<Real> in_norm(in.NumRows());
510  Real d_scaled = in.NumCols() * target_rms * target_rms;
511  in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
512  in_norm.ApplyFloor(kSquaredNormFloor);
513  in_norm.ApplyPow(-0.5);
514  out_no_log.MulRowsVec(in_norm);
515  if (add_log_stddev) {
516  in_norm.ApplyLog();
517  in_norm.Scale(-1.0);
518  in_norm.Add(log(target_rms));
519  out.CopyColFromVec(in_norm, in.NumCols());
520  }
521  }
522 
523  Matrix<Real> Ho2(Do);
524  AssertEqual(Ho,Ho2,0.00001);
525  }
526 
527  for (int dim = 16; dim <= 1024; dim *= 2) {
528  BaseFloat time_in_secs = 0.025;
529  CuMatrix<Real> M(dim, dim), N(dim, dim + 1);
530  M.SetRandn();
531  N.SetRandn();
532  Timer tim;
533  int32 iter = 0;
534  for (; tim.Elapsed() < time_in_secs; iter++) {
535  cu::NormalizePerRow(M, Real(1), true, &N);
536  }
537 
538  BaseFloat gflops = ((BaseFloat) dim * dim * iter)
539  / (tim.Elapsed() * 1.0e+09);
540  KALDI_LOG << "For CuMath::NormalizePerRow"
541  << (sizeof(Real)==8?"<double>":"<float>") << ", for dim = "
542  << dim << ", speed was " << gflops << " gigaflops.";
543  if (tim.Elapsed() > 0.05)
544  break;
545  }
546 }
547 
548 
549 template<typename Real>
551 
552  int row = 128;
553  int col = 1024;
554 
555  Matrix<Real> Hi(row,col);
556  Matrix<Real> Ho(row,col);
557  Hi.SetRandn();
558  Hi.Scale(5.0);
559  Hi.ApplyFloor(0.0); // like ReLU,
560 
561  CuMatrix<Real> Di(row, col);
562  CuMatrix<Real> Do(row, col);
563  Di.CopyFromMat(Hi);
564 
565  Real target_rms = 0.3456;
566  bool add_log_stddev = false;
567  const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
568 
569  //gpu
570  cu::NormalizePerRow(Di, target_rms, add_log_stddev, &Do);
571 
572  //cpu
573  {
574  MatrixBase<Real>& in(Hi);
575  MatrixBase<Real>& out(Ho);
576  Real target_rms=0.3456;
577  Vector<Real> in_norm(in.NumRows());
578  Real d_scaled = in.NumCols() * target_rms * target_rms;
579  in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
580  in_norm.ApplyFloor(kSquaredNormFloor);
581  in_norm.ApplyPow(-0.5);
582  out.CopyFromMat(in);
583  out.MulRowsVec(in_norm);
584  }
585 
586  Matrix<Real> Ho2(Do);
587  // here the BUG was detected (by processing big-enough matrix),
588  AssertEqual(Ho,Ho2,0.00001);
589 }
590 
591 
592 template<typename Real>
594  for (int32 i = 0; i < 2; i++) {
595  int row = 10 + Rand() % 40;
596  int col = 10 + Rand() % 50;
597 
598  Matrix<Real> Hi(row, col);
599  Matrix<Real> Ho(row, col + 1);
600  Matrix<Real> Hid(row, col);
601  Matrix<Real> Hod(row, col + 1);
602  Hi.SetRandn();
603  Hod.SetRandn();
604  Hi.Scale(5.0);
605 
606  CuMatrix<Real> Di(row, col);
607  CuMatrix<Real> Do(row, col + 1);
608  CuMatrix<Real> Did(row, col);
609  CuMatrix<Real> Dod(row, col + 1);
610  Di.CopyFromMat(Hi);
611  Dod.CopyFromMat(Hod);
612 
613  Real target_rms = 0.3456;
614  bool add_log_stddev = true;
615  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
616 
617  //gpu
618  cu::DiffNormalizePerRow(Di, Dod, target_rms, add_log_stddev, &Did);
619 
620  //cpu
621  {
622  MatrixBase<Real>* in_deriv = &Hid;
623  MatrixBase<Real>& out_deriv(Hod);
624  MatrixBase<Real>& in_value(Hi);
625 
626  const SubMatrix<Real> out_deriv_no_log(out_deriv, 0, out_deriv.NumRows(),
627  0, in_value.NumCols());
628  Vector<Real> dot_products(out_deriv.NumRows());
629  dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value,
630  kTrans, 0.0);
631  Vector<Real> in_norm(in_value.NumRows());
632  Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
633  in_norm.AddDiagMat2(1.0, in_value, kNoTrans, 0.0);
634  if (add_log_stddev) {
635  Vector<Real> log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1
636  out_deriv_for_stddev(out_deriv.NumRows(), kUndefined);
637  // f = log(sqrt(max(epsi, x^T x / D)))
638  // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0.
639  // we don't compute this exactly below for the case when x^2 x is very
640  // small, but we do make sure that the deriv isn't infinity when the input
641  // is zero.
642  log_stddev_deriv.ApplyFloor(in_value.NumCols() * kSquaredNormFloor);
643  log_stddev_deriv.ApplyPow(-1.0);
644  out_deriv_for_stddev.CopyColFromMat(out_deriv,
645  (out_deriv.NumCols() - 1));
646  log_stddev_deriv.MulElements(out_deriv_for_stddev);
647  if (in_deriv)
648  in_deriv->AddDiagVecMat(1.0, log_stddev_deriv, in_value, kNoTrans,
649  1.0);
650  }
651  in_norm.Scale(1.0 / d_scaled);
652  in_norm.ApplyFloor(kSquaredNormFloor);
653  in_norm.ApplyPow(-0.5);
654  if (in_deriv) {
655  if (in_deriv->Data() != out_deriv_no_log.Data())
656  in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans,
657  1.0);
658  else
659  in_deriv->MulRowsVec(in_norm);
660  in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
661  in_norm.ApplyPow(3.0);
662  dot_products.MulElements(in_norm);
663 
664  in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value,
665  kNoTrans, 1.0);
666  }
667 
668  Matrix<Real> Hid2(Did);
669  AssertEqual(Hid, Hid2, 0.00001);
670  }
671  }
672 
673  for (int dim = 16; dim <= 1024; dim *= 2) {
674  BaseFloat time_in_secs = 0.025;
675  CuMatrix<Real> id(dim, dim), iv(dim, dim), od(dim, dim + 1);
676  iv.SetRandn();
677  od.SetRandn();
678  Timer tim;
679  int32 iter = 0;
680  for (; tim.Elapsed() < time_in_secs; iter++) {
681  cu::DiffNormalizePerRow(iv, od, Real(0.456), true, &id);
682  }
683  BaseFloat fdim = dim;
684  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
685  KALDI_LOG << "For CuMath::DiffNormalizePerRow"
686  << (sizeof(Real)==8?"<double>":"<float>")
687  << ", for dim = " << dim << ", speed was " << gflops
688  << " gigaflops.";
689  }
690 }
691 
692 
693 
694 template<typename Real> void CudaMathUnitTest() {
695 #if HAVE_CUDA == 1
696  if (CuDevice::Instantiate().DoublePrecisionSupported())
697 #endif
698 
699  UnitTestCuMathComputeLstmNonlinearity<Real>();
700  UnitTestCuMathRandomize<Real>();
701  UnitTestCuMathSplice<Real>();
702  UnitTestCuMathCopy<Real>();
704  UnitTestEnsureNonzero<Real>();
705  UnitTestBackpropLstmNonlinearity<Real>();
706  UnitTestCuMathNormalizePerRow<Real>();
707  UnitTestCuMathNormalizePerRow_v2<Real>();
708  UnitTestCuDiffNormalizePerRow<Real>();
709 }
710 
711 } // namespace kaldi
712 
713 
714 int main() {
715  SetVerboseLevel(1);
716  int32 loop = 0;
717 #if HAVE_CUDA == 1
718  for (; loop < 2; loop++) {
719  CuDevice::Instantiate().SetDebugStrideMode(true);
720  if (loop == 0)
721  CuDevice::Instantiate().SelectGpuId("no"); // 0 means no GPU
722  else
723  CuDevice::Instantiate().SelectGpuId("yes"); // 1 .. automatic selection
724 #endif
725  srand(time(NULL));
726  kaldi::CudaMathUnitTest<float>();
727 
728 #if HAVE_CUDA == 1
729  if (CuDevice::Instantiate().DoublePrecisionSupported()) {
730  kaldi::CudaMathUnitTest<double>();
731  } else {
732  KALDI_WARN << "Double precision not supported";
733  }
734 #else
735  kaldi::CudaMathUnitTest<float>();
736 #endif
737 
738  if (loop == 0)
739  KALDI_LOG << "Tests without GPU use succeeded.";
740  else
741  KALDI_LOG << "Tests with GPU use (if available) succeeded.";
742 #if HAVE_CUDA == 1
743  } // No for loop if 'HAVE_CUDA != 1',
744  CuDevice::Instantiate().PrintProfile();
745 #endif
746  return 0;
747 }
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void CopyColFromVec(const VectorBase< Real > &v, const MatrixIndexT col)
Copy vector into specific column of matrix.
void CopyFromVec(const std::vector< T > &src)
This function resizes if needed.
Definition: cu-array-inl.h:120
void UnitTestLstmNonlinearity()
void CpuComputeLstmNonlinearity(const MatrixBase< Real > &input_mat, const MatrixBase< Real > &params_mat, MatrixBase< Real > *output)
Definition: cu-math.cc:445
const CuSubVector< Real > Row(MatrixIndexT i) const
Definition: cu-matrix.h:670
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
const Real * Data() const
Gives pointer to raw data (const).
Definition: kaldi-matrix.h:79
void Randomize(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_idx, CuMatrixBase< Real > *tgt)
Copies a permutation of src into tgt.
Definition: cu-math.cc:80
static void UnitTestBackpropLstmNonlinearity()
static void UnitTestCuMathCopy()
Definition: cu-math-test.cc:97
void AddDiagMat2(Real alpha, const MatrixBase< Real > &M, MatrixTransposeType trans=kNoTrans, Real beta=1.0)
Add the diagonal of a matrix times itself: *this = diag(M M^T) + beta * *this (if trans == kNoTrans)...
kaldi::int32 int32
void AddMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType trans=kNoTrans)
*this += alpha * A
Definition: cu-matrix.cc:954
A class for storing matrices.
Definition: kaldi-matrix.h:823
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
void CopyFromMat(const MatrixBase< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Copy given matrix. (no resize is done).
static void UnitTestCuMathNormalizePerRow()
static void UnitTestCuDiffNormalizePerRow()
static void UnitTestCuMathComputeLstmNonlinearity()
void BackpropLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, const CuMatrixBase< Real > &output_deriv, const CuMatrixBase< double > &deriv_sum_in, const CuVectorBase< Real > &self_repair_config, double count_in, CuMatrixBase< Real > *input_deriv, CuMatrixBase< Real > *params_deriv, CuMatrixBase< double > *value_sum_out, CuMatrixBase< double > *deriv_sum_out, CuMatrixBase< Real > *self_repair_sum_out)
This function does the 'backward' pass corresponding to the function ComputeLstmNonlinearity.
Definition: cu-math.cc:768
void SetVerboseLevel(int32 i)
This should be rarely used, except by programs using Kaldi as library; command-line programs set the ...
Definition: kaldi-error.h:64
void Scale(Real value)
Definition: cu-matrix.cc:644
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=nullptr)
Applies floor to all elements.
Definition: kaldi-vector.h:149
static void UnitTestCuMathRandomize()
Definition: cu-math-test.cc:46
float BaseFloat
Definition: kaldi-types.h:29
void DiffNormalizePerRow(const CuMatrixBase< Real > &in_value, const CuMatrixBase< Real > &out_deriv, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *in_deriv)
Definition: cu-math.cc:349
static void UnitTestEnsureNonzero()
Definition: cu-math-test.cc:73
void CudaMathUnitTest()
void Scale(Real alpha)
Multiply each element with a scalar value.
void SetRandn()
Sets to random values of a normal distribution.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
Real TraceMatMat(const MatrixBase< Real > &A, const MatrixBase< Real > &B, MatrixTransposeType trans)
We need to declare this here as it will be a friend function.
int Rand(struct RandomState *state)
Definition: kaldi-math.cc:45
void SetRandn()
Set vector to random normally-distributed noise.
void MulRowsVec(const VectorBase< Real > &scale)
Equivalent to (*this) = diag(scale) * (*this).
CuSubMatrix< Real > ColRange(const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
Definition: cu-matrix.h:665
void EnsureNonzero(const CuMatrixBase< Real > &src, Real epsilon, CuMatrixBase< Real > *dest)
This function requires that src and dest have the same dimension and epsilon > 0. ...
Definition: cu-math.cc:209
void Splice(const CuMatrixBase< Real > &src, const CuArray< int32 > &frame_offsets, CuMatrixBase< Real > *tgt)
Splice concatenates frames of src as specified in frame_offsets into tgt.
Definition: cu-math.cc:132
Matrix for CUDA computing.
Definition: matrix-common.h:69
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
static void AssertEqual(float a, float b, float relative_tolerance=0.001)
assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))
Definition: kaldi-math.h:276
void AddDiagMatMat(Real alpha, const MatrixBase< Real > &M, MatrixTransposeType transM, const MatrixBase< Real > &N, MatrixTransposeType transN, Real beta=1.0)
Add the diagonal of a matrix product: *this = diag(M N), assuming the "trans" arguments are both kNoT...
void AddDiagVecMat(const Real alpha, const VectorBase< Real > &v, const MatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T].
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215
void NormalizePerRow(const CuMatrixBase< Real > &in, const Real target_rms, const bool add_log_stddev, CuMatrixBase< Real > *out)
Normalize nonlinearity modifies the vector of activations by scaling it so that the root-mean-square ...
Definition: cu-math.cc:280
void ApplyFloor(Real floor_val)
Definition: kaldi-matrix.h:354
static void UnitTestCuMathNormalizePerRow_v2()
#define KALDI_LOG
Definition: kaldi-error.h:153
double Elapsed() const
Returns time in seconds.
Definition: timer.h:74
Sub-matrix representation.
Definition: kaldi-matrix.h:988
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
Definition: kaldi-math.h:265
static void UnitTestCuMathSplice()
int main()
int32 RandInt(int32 min_val, int32 max_val, struct RandomState *state)
Definition: kaldi-math.cc:95
void Copy(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
Copies elements from src into tgt as given by copy_from_indices.
Definition: cu-math.cc:173
void CpuBackpropLstmNonlinearity(const MatrixBase< Real > &input, const MatrixBase< Real > &params, const MatrixBase< Real > &output_deriv, const MatrixBase< double > &deriv_sum_in, const VectorBase< Real > &self_repair_config, double count_in, MatrixBase< Real > *input_deriv, MatrixBase< Real > *params_deriv, MatrixBase< double > *value_sum_out, MatrixBase< double > *deriv_sum_out, MatrixBase< Real > *self_repair_sum_out)
Definition: cu-math.cc:543
void ComputeLstmNonlinearity(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &params, CuMatrixBase< Real > *output)
this is a special-purpose function used by class LstmNonlinearityComponent, to do its forward propaga...
Definition: cu-math.cc:489