// nnet3/nnet-normalize-component.cc

// Copyright 2015-2017  Johns Hopkins University (author: Daniel Povey)
//                2015  Guoguo Chen
//                2015  Daniel Galvez

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include <iterator>
#include <sstream>
#include <algorithm>
#include <iomanip>
#include "nnet3/nnet-normalize-component.h"
#include "nnet3/nnet-parse.h"
#include "cudamatrix/cu-math.h"

namespace kaldi {
namespace nnet3 {

const BaseFloat NormalizeComponent::kSquaredNormFloor =
    pow(2.0, NormalizeComponent::kExpSquaredNormFloor);

NormalizeComponent::NormalizeComponent(const NormalizeComponent &other):
    input_dim_(other.input_dim_), block_dim_(other.block_dim_),
    target_rms_(other.target_rms_),
    add_log_stddev_(other.add_log_stddev_) { }

void NormalizeComponent::InitFromConfig(ConfigLine *cfl) {
  input_dim_ = 0;
  add_log_stddev_ = false;
  target_rms_ = 1.0;
  bool ok = cfl->GetValue("dim", &input_dim_) ||
      cfl->GetValue("input-dim", &input_dim_);
  block_dim_ = input_dim_;
  cfl->GetValue("block-dim", &block_dim_);
  cfl->GetValue("target-rms", &target_rms_);
  cfl->GetValue("add-log-stddev", &add_log_stddev_);
  if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || target_rms_ <= 0.0 ||
      block_dim_ <= 0 || input_dim_ % block_dim_ != 0)
    KALDI_ERR << "Invalid initializer for layer of type "
              << Type() << ": \"" << cfl->WholeLine() << "\"";
}
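
// For orientation, an example of a config line that reaches this initializer
// from an nnet3 config file (the name and values here are only illustrative):
//   component name=norm1 type=NormalizeComponent dim=512 target-rms=0.5 add-log-stddev=true
// 'block-dim', if supplied, must divide 'dim'; it defaults to 'dim'.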

void NormalizeComponent::Read(std::istream &is, bool binary) {
  std::string token;
  ReadToken(is, binary, &token);
  if (token == "<NormalizeComponent>") {
    ReadToken(is, binary, &token);
  }
  KALDI_ASSERT(token == "<Dim>" || token == "<InputDim>");
  ReadBasicType(is, binary, &input_dim_);  // Read dimension.
  ReadToken(is, binary, &token);
  if (token == "<BlockDim>") {
    ReadBasicType(is, binary, &block_dim_);
    ReadToken(is, binary, &token);
  } else {
    block_dim_ = input_dim_;
  }
  // Read target_rms_, if it is present.
  if (token == "<TargetRms>") {
    ReadBasicType(is, binary, &target_rms_);
    ReadToken(is, binary, &token);
  }
  // Read add_log_stddev_, if it is present.
  if (token == "<AddLogStddev>") {
    ReadBasicType(is, binary, &add_log_stddev_);
    ReadToken(is, binary, &token);
  } else {
    add_log_stddev_ = false;
  }
  if (token == "<ValueAvg>") {
    // back-compatibility code.
    CuVector<double> temp;
    temp.Read(is, binary);
    ExpectToken(is, binary, "<DerivAvg>");
    temp.Read(is, binary);
    ExpectToken(is, binary, "<Count>");
    double count;
    ReadBasicType(is, binary, &count);
    ReadToken(is, binary, &token);
  }
  KALDI_ASSERT(token == "</NormalizeComponent>");
}

void NormalizeComponent::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<NormalizeComponent>");
  WriteToken(os, binary, "<InputDim>");
  WriteBasicType(os, binary, input_dim_);
  if (block_dim_ != input_dim_) {
    WriteToken(os, binary, "<BlockDim>");
    WriteBasicType(os, binary, block_dim_);
  }
  WriteToken(os, binary, "<TargetRms>");
  WriteBasicType(os, binary, target_rms_);
  WriteToken(os, binary, "<AddLogStddev>");
  WriteBasicType(os, binary, add_log_stddev_);
  WriteToken(os, binary, "</NormalizeComponent>");
}
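
// For reference when following the back-compatibility logic in Read() above:
// the text form produced by Write() looks like this sketch (the numbers are
// an example; in text mode WriteBasicType() renders bools as T/F):
//   <NormalizeComponent> <InputDim> 512 <TargetRms> 1 <AddLogStddev> F </NormalizeComponent>
// Older models may instead begin with <Dim>, omit <TargetRms> or
// <AddLogStddev>, or carry obsolete <ValueAvg>/<DerivAvg>/<Count> stats, all
// of which Read() accepts.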

std::string NormalizeComponent::Info() const {
  std::ostringstream stream;
  stream << Type() << ", input-dim=" << InputDim()
         << ", output-dim=" << OutputDim() << ", target-rms=" << target_rms_
         << ", add-log-stddev=" << std::boolalpha << add_log_stddev_;
  if (block_dim_ != input_dim_)
    stream << ", block-dim=" << block_dim_;
  return stream.str();
}

// The output is y_i = scale * x_i, and we want the RMS value of the y_i to
// equal target_rms, so y^t y = D * target_rms^2 (if y is one row of the
// input).  This requires scale = 1.0 / sqrt(x^t x / (D * target_rms^2)).
// There is also flooring involved, to avoid division-by-zero problems.  It's
// important for the backprop that the floor's square root is exactly
// representable as float.
// If add_log_stddev_ is true, log(max(epsilon, sqrt(x^t x / D)))
// is an extra dimension of the output.
void* NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                                    const CuMatrixBase<BaseFloat> &in,
                                    CuMatrixBase<BaseFloat> *out) const {
  KALDI_ASSERT(in.NumCols() == InputDim() && out->NumCols() == OutputDim() &&
               in.NumRows() == out->NumRows());
  if (block_dim_ != input_dim_) {
    int32 num_blocks = input_dim_ / block_dim_,
        new_num_rows = in.NumRows() * num_blocks,
        output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0);
    KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols());
    CuSubMatrix<BaseFloat> in_reshaped(in.Data(), new_num_rows,
                                       block_dim_, block_dim_),
        out_reshaped(out->Data(), new_num_rows,
                     output_block_dim, output_block_dim);
    cu::NormalizePerRow(in_reshaped, target_rms_, add_log_stddev_,
                        &out_reshaped);
  } else {
    cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out);
  }
  return NULL;
}
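
// A minimal CPU sketch (illustrative only, not called anywhere) of what
// cu::NormalizePerRow() does to a single row when add-log-stddev is false;
// it assumes <cmath> and <vector>, which the headers above pull in
// transitively:
static void NormalizeRowSketch(std::vector<BaseFloat> *row,
                               BaseFloat target_rms,
                               BaseFloat squared_norm_floor) {
  double sumsq = 0.0;  // x^t x
  for (BaseFloat v : *row) sumsq += v * v;
  // p = x^t x / (D * target_rms^2), floored so the scale stays finite.
  double p = sumsq / (row->size() * target_rms * target_rms);
  BaseFloat scale = 1.0 / std::sqrt(std::max<double>(squared_norm_floor, p));
  // After this, the row has RMS value target_rms (unless the floor fired).
  for (BaseFloat &v : *row) v *= scale;
}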

/*
  A note on the derivative of NormalizeComponent...
  Let both row_in and row_out be vectors of dimension D.
  Let p = row_in^T row_in / (D * target_rms^2), and let
  f = 1.0 / sqrt(max(kSquaredNormFloor, p)); we compute row_out as:
    row_out = f row_in.
  Suppose we have a quantity deriv_out which is the derivative
  of the objective function w.r.t. row_out.  We want to compute
  deriv_in, the derivative of the objective function w.r.t. row_in.
  Let the objective function be F.  One term is obvious: we have
    deriv_in = f deriv_out + ....
  Next we have to take into account the derivative that gets back-propagated
  through f.  Obviously, dF/df = deriv_out^T row_in.
  And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5})
            = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3),
  and dp/d(row_in) = 2/(D * target_rms^2) row_in  [it's vector-valued].
  So this term in dF/d(row_in) equals:
    dF/df df/dp dp/d(row_in)
      = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3)
        (deriv_out^T row_in) row_in
  So
    deriv_in = f deriv_out
             + (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -f^3 / (D * target_rms^2))
               (deriv_out^T row_in) row_in

  If add_log_stddev_ is true, deriv_in gets one more term, from the extra
  output dimension g = log(sqrt(row_in^T row_in / D)): by the chain rule,
  dF/d(row_in_i) += dF/dg * dg/d(row_in_i), where
  dg/d(row_in_i) = row_in_i / (row_in^T row_in).
*/
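
// A finite-difference check of the un-floored deriv_in formula above
// (illustrative only, not called anywhere; assumes <cmath> and <cstdio>):
static void CheckNormalizeDerivSketch() {
  const int D = 3;
  const double target_rms = 1.0, delta = 1.0e-6;
  double x[D] = {0.3, -1.2, 0.7}, dout[D] = {0.5, 0.25, -1.0};
  double xx = 0.0, dx = 0.0;  // xx = x^T x, dx = deriv_out^T x.
  for (int i = 0; i < D; i++) { xx += x[i] * x[i]; dx += dout[i] * x[i]; }
  double f = 1.0 / std::sqrt(xx / (D * target_rms * target_rms));
  // analytic (un-floored case):
  //   deriv_in[0] = f dout[0] - f^3 / (D target_rms^2) (dout^T x) x[0].
  double analytic =
      f * dout[0] - f * f * f / (D * target_rms * target_rms) * dx * x[0];
  // numeric: central difference of F(x) = dout^T (f(x) x) w.r.t. x[0].
  double Fp = 0.0, Fm = 0.0;
  for (int s = -1; s <= 1; s += 2) {
    double xs[D] = { x[0] + s * delta, x[1], x[2] }, ss = 0.0;
    for (int i = 0; i < D; i++) ss += xs[i] * xs[i];
    double fs = 1.0 / std::sqrt(ss / (D * target_rms * target_rms)), Fv = 0.0;
    for (int i = 0; i < D; i++) Fv += dout[i] * fs * xs[i];
    (s < 0 ? Fm : Fp) = Fv;
  }
  std::printf("analytic %g vs numeric %g\n", analytic, (Fp - Fm) / (2 * delta));
}
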
void NormalizeComponent::Backprop(const std::string &debug_info,
                                  const ComponentPrecomputedIndexes *indexes,
                                  const CuMatrixBase<BaseFloat> &in_value,
                                  const CuMatrixBase<BaseFloat> &,  // out_value
                                  const CuMatrixBase<BaseFloat> &out_deriv,
                                  void *memo,
                                  Component *to_update,
                                  CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("NormalizeComponent::Backprop");
  if (!in_deriv)
    return;
  if (block_dim_ != input_dim_) {
    int32 num_blocks = input_dim_ / block_dim_,
        new_num_rows = in_value.NumRows() * num_blocks,
        output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0);
    KALDI_ASSERT(in_value.Stride() == in_value.NumCols() &&
                 out_deriv.Stride() == out_deriv.NumCols() &&
                 in_deriv->Stride() == in_deriv->NumCols());
    CuSubMatrix<BaseFloat> in_value_reshaped(in_value.Data(), new_num_rows,
                                             block_dim_, block_dim_),
        out_deriv_reshaped(out_deriv.Data(), new_num_rows,
                           output_block_dim, output_block_dim),
        in_deriv_reshaped(in_deriv->Data(), new_num_rows,
                          block_dim_, block_dim_);
    cu::DiffNormalizePerRow(in_value_reshaped, out_deriv_reshaped, target_rms_,
                            add_log_stddev_, &in_deriv_reshaped);
  } else {
    cu::DiffNormalizePerRow(in_value, out_deriv, target_rms_, add_log_stddev_,
                            in_deriv);
  }
}

void BatchNormComponent::ComputeDerived() {
  if (!test_mode_) {
    offset_.Resize(0);
    scale_.Resize(0);
    return;
  }

  if (count_ == 0.0) {
    KALDI_WARN << "Test-mode is set but there is no data count.  "
        "Creating random counts.  This is NOT A PROBLEM if the message "
        "appears in unit-tests or in compute_prob_*.0.log.  If you see this "
        "elsewhere, something is very wrong.";
    count_ = 1.0;
    stats_sum_.SetRandn();
    stats_sumsq_.SetRandn();
    stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0);
  }

  offset_.Resize(block_dim_);
  scale_.Resize(block_dim_);
  offset_.CopyFromVec(stats_sum_);
  offset_.Scale(-1.0 / count_);
  // now offset_ is -mean.
  scale_.CopyFromVec(stats_sumsq_);
  scale_.Scale(1.0 / count_);
  scale_.AddVecVec(-1.0, offset_, offset_, 1.0);
  // now scale_ is the variance (note: offset_ * offset_ == mean^2).
  // Mathematically the ApplyFloor statement should be a no-op; it is there
  // in case of numerical roundoff.
  scale_.ApplyFloor(0.0);
  scale_.Add(epsilon_);
  BaseFloat power = -0.5;
  scale_.ApplyPow(power);
  // now scale_ = (max(variance, 0) + epsilon)^power.
  // next, multiply by the target RMS (normally 1.0).
  scale_.Scale(target_rms_);
  offset_.MulElements(scale_);
  // now offset_ is -(scale * mean).
}
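
// To summarize the net effect of ComputeDerived(): in test mode, Propagate()
// applies y = scale_ * x + offset_, and with the values computed above,
//   mean    = stats_sum_ / count_,  var = stats_sumsq_ / count_ - mean^2,
//   scale_  = target_rms_ / sqrt(var + epsilon_),
//   offset_ = -scale_ * mean,
// so y = target_rms_ * (x - mean) / sqrt(var + epsilon_), which is the
// standard batch-norm inference transform.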

void BatchNormComponent::SetTestMode(bool test_mode) {
  test_mode_ = test_mode;
  ComputeDerived();
}

void BatchNormComponent::Check() const {
  KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 &&
               epsilon_ > 0.0 && target_rms_ > 0.0);
}

BatchNormComponent::BatchNormComponent(const BatchNormComponent &other):
    dim_(other.dim_), block_dim_(other.block_dim_),
    epsilon_(other.epsilon_), target_rms_(other.target_rms_),
    test_mode_(other.test_mode_), count_(other.count_),
    stats_sum_(other.stats_sum_), stats_sumsq_(other.stats_sumsq_) {
  ComputeDerived();
  Check();
}


std::string BatchNormComponent::Info() const {
  std::ostringstream stream;
  stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_
         << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_
         << ", count=" << count_
         << ", test-mode=" << (test_mode_ ? "true" : "false");
  if (count_ > 0) {
    Vector<BaseFloat> mean(stats_sum_), var(stats_sumsq_);
    mean.Scale(1.0 / count_);
    var.Scale(1.0 / count_);
    // subtract mean^2 from var.
    var.AddVecVec(-1.0, mean, mean, 1.0);
    var.ApplyFloor(0.0);
    var.ApplyPow(0.5);  // make it the stddev.
    stream << ", data-mean=" << SummarizeVector(mean)
           << ", data-stddev=" << SummarizeVector(var);
  }
  return stream.str();
}

void BatchNormComponent::InitFromConfig(ConfigLine *cfl) {
  dim_ = -1;
  block_dim_ = -1;
  epsilon_ = 1.0e-03;
  target_rms_ = 1.0;
  test_mode_ = false;
  bool ok = cfl->GetValue("dim", &dim_);
  cfl->GetValue("block-dim", &block_dim_);
  cfl->GetValue("epsilon", &epsilon_);
  cfl->GetValue("target-rms", &target_rms_);
  cfl->GetValue("test-mode", &test_mode_);
  if (!ok || dim_ <= 0) {
    KALDI_ERR << "BatchNormComponent must have 'dim' specified, and > 0";
  }
  if (block_dim_ == -1)
    block_dim_ = dim_;
  if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 &&
        epsilon_ > 0 && target_rms_ > 0))
    KALDI_ERR << "Invalid configuration in BatchNormComponent.";
  if (cfl->HasUnusedValues())
    KALDI_ERR << "Could not process these elements in initializer: "
              << cfl->UnusedValues();
  count_ = 0;
  stats_sum_.Resize(block_dim_);
  stats_sumsq_.Resize(block_dim_);
  if (test_mode_) {
    ComputeDerived();
  }
}
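
// An illustrative config line for this initializer (the name and values are
// only an example):
//   component name=bn1 type=BatchNormComponent dim=625 block-dim=125
// 'block-dim', if given, must divide 'dim'; the input is then reshaped so
// that statistics are shared across the dim / block-dim blocks of each row.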


/*
  BATCHNORM_MATH

  This comment describes the equations involved in batch normalization, and
  derives the forward and back-propagation.

  This is all dimension-by-dimension, so we just imagine the inputs
  are scalars x(i), for i=0 .. n-1.

  FORWARD PASS:

  Let 'power' be a constant, equal to -0.5 for regular batch-norm.

  To simplify the math we (conceptually, not physically) do the normalization in
  two stages: first mean, then variance, so we have x(i) -> y(i) -> z(i).

  The name 'rscale' means 'raw scale', meaning the scale before including
  target-rms.  Later we'll define 'scale = target-rms * rscale', to make some
  of the actual computations slightly more efficient.

  Define:   mean = 1/I * sum_i x(i)
            y(i) = x(i) - mean

            var = 1/I \sum_i y(i)^2
            rscale = (var + epsilon)^power    <---- for regular batchnorm, power == -0.5.
            z(i) = target-rms * rscale * y(i)


  Most of the rest of this comment derives how to compute the derivatives.  If
  you just want the formulas, please skip to the string 'BACKWARD PASS' below.

  We'll use a notation where an apostrophe on something means (the derivative of
  the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on.
  We are given z'(i).  Propagating the derivatives backward:

    rscale' = (sum_i y(i) z'(i)) * target-rms
            = (sum_i z(i) z'(i)) / rscale

  [ note: d(rscale)/d(var) = power * (var + epsilon)^{power - 1}
                           = power * rscale^{(power-1)/power} ]

    var' = rscale' * power * rscale^{(power-1)/power}
         = power * (\sum_i z'(i) z(i)) * rscale^{(power-1)/power - 1}
         = power * (\sum_i z'(i) z(i)) * rscale^{-1/power}

  [note: the following formula is of the form "direct term" + "indirect term"]
    y'(i) = z'(i) * target-rms * rscale + 2/I y(i) var'

  Now, the above is inconvenient because it contains y(i), which is an
  intermediate quantity.  We reformulate in terms of z(i), using
  y(i) = z(i) / (target-rms * rscale), so:

  defining
    var_deriv_mod = 2/I * var' / (target-rms * rscale)
                  = 2/I * power/target-rms * (\sum_i z'(i) z(i)) * rscale^{-(1+power)/power}
  we have:
    y'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod

  Now,
    mean' = \sum_i y'(i)
          = (target-rms * rscale * \sum_i z'(i)) + (var_deriv_mod \sum_i z(i))
    [... and the 2nd term above is zero, because \sum_i z(i) is zero ...]
          = target-rms * rscale * \sum_i z'(i)
  and:
    x'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I mean'
          = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I * target-rms * rscale * \sum_i z'(i)
          = target-rms * rscale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod

  It will simplify the code if we define:

    scale = target-rms * rscale.  This way, we can write as follows:

  BACKWARD PASS (recap):

    var_deriv_mod = 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power}
      .. which for power = -0.5, simplifies to:
    var_deriv_mod = -1.0 / (target-rms^2) * (1/I \sum_i z'(i) z(i)) * scale

    x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod

*/
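
// A self-contained finite-difference check of the BACKWARD PASS recap above,
// for power = -0.5 and target-rms = 1 (illustrative only, not called
// anywhere; assumes <cmath> and <cstdio>):
static void CheckBatchNormMathSketch() {
  const int I = 4;
  const double epsilon = 1.0e-03, delta = 1.0e-6;
  double x[I] = {0.1, -0.4, 1.3, 0.2}, zp[I] = {0.7, -0.2, 0.5, 1.1};
  // forward pass: z = rscale * (x - mean), rscale = (var + epsilon)^{-0.5};
  // returns rscale, which equals 'scale' since target-rms == 1.
  auto forward = [&](const double *in, double *z) -> double {
    double mean = 0.0, var = 0.0;
    for (int i = 0; i < I; i++) mean += in[i] / I;
    for (int i = 0; i < I; i++) var += (in[i] - mean) * (in[i] - mean) / I;
    double rscale = 1.0 / std::sqrt(var + epsilon);
    for (int i = 0; i < I; i++) z[i] = rscale * (in[i] - mean);
    return rscale;
  };
  double z[I];
  double scale = forward(x, z);
  // backward pass (recap), with target-rms = 1:
  //   var_deriv_mod = -(1/I \sum_i z'(i) z(i)) * scale
  //   x'(i) = scale * (z'(i) - 1/I \sum_i z'(i)) + z(i) * var_deriv_mod.
  double zpz = 0.0, zpsum = 0.0;
  for (int i = 0; i < I; i++) { zpz += zp[i] * z[i] / I; zpsum += zp[i] / I; }
  double var_deriv_mod = -zpz * scale;
  double analytic = scale * (zp[0] - zpsum) + z[0] * var_deriv_mod;
  // finite difference of F = \sum_i z'(i) z(i) w.r.t. x[0]:
  double xp[I] = {x[0] + delta, x[1], x[2], x[3]},
      xm[I] = {x[0] - delta, x[1], x[2], x[3]}, zt[I];
  double Fp = 0.0, Fm = 0.0;
  forward(xp, zt); for (int i = 0; i < I; i++) Fp += zp[i] * zt[i];
  forward(xm, zt); for (int i = 0; i < I; i++) Fm += zp[i] * zt[i];
  std::printf("analytic %g vs numeric %g\n", analytic, (Fp - Fm) / (2 * delta));
}
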
void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                                    const CuMatrixBase<BaseFloat> &in,
                                    CuMatrixBase<BaseFloat> *out) const {
  KALDI_ASSERT(SameDim(in, *out) &&
               (in.NumCols() == dim_ || in.NumCols() == block_dim_));
  if (in.NumCols() != block_dim_) {
    // if block_dim_ != dim_, we recurse; this helps keep the main code
    // simple.
    KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols());
    int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(),
        orig_cols = in.NumCols(), new_rows = orig_rows * ratio,
        new_cols = orig_cols / ratio;
    CuSubMatrix<BaseFloat> in_reshaped(in.Data(), new_rows, new_cols, new_cols),
        out_reshaped(out->Data(), new_rows, new_cols, new_cols);
    return Propagate(indexes, in_reshaped, &out_reshaped);
  }

  // From this point, we can assume that the num-cols of 'in' and 'out'
  // equal block_dim_.

  if (!test_mode_) {
    // search in the comment above for FORWARD PASS to see what is being
    // implemented here.
    // if this takes too much time due to multiple different CUDA calls,
    // we'll consider making a single kernel for some of it.
    Memo *memo = new Memo;
    int32 num_frames = in.NumRows(), dim = block_dim_;
    memo->num_frames = num_frames;
    memo->mean_uvar_scale.Resize(5, dim);
    CuSubVector<BaseFloat> mean(memo->mean_uvar_scale, 0),
        uvar(memo->mean_uvar_scale, 1),
        scale(memo->mean_uvar_scale, 2);
    mean.AddRowSumMat(1.0 / num_frames, in, 0.0);
    uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0);
    scale.CopyFromVec(uvar);

    // by applying this scale at this point, we save a multiply later on.
    BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_);
    scale.AddVecVec(-var_scale, mean, mean, var_scale);
    // at this point, 'scale' contains just the variance (times target-rms^{-2}).
    scale.ApplyFloor(0.0);
    scale.Add(var_scale * epsilon_);
    // Now 'scale' contains the variance floored to zero and then with epsilon
    // added [both times 1/target-rms^2].
    scale.ApplyPow(-0.5);
    // now 'scale' is the actual scale we'll use.

    // the next command will do no work if out == in, for in-place propagation.
    out->CopyFromMat(in);
    out->AddVecToRows(-1.0, mean, 1.0);
    out->MulColsVec(scale);
    return static_cast<void*>(memo);
  } else {
    if (offset_.Dim() != block_dim_) {
      if (count_ == 0)
        KALDI_ERR << "Test mode set in BatchNormComponent, but no stats.";
      else  // why was ComputeDerived() not called?
        KALDI_ERR << "Code error in BatchNormComponent";
    }
    out->CopyFromMat(in);
    out->MulColsVec(scale_);
    out->AddVecToRows(1.0, offset_, 1.0);
    return NULL;
  }
}

void BatchNormComponent::Backprop(
    const std::string &debug_info,
    const ComponentPrecomputedIndexes *indexes,
    const CuMatrixBase<BaseFloat> &in_value,  // unused
    const CuMatrixBase<BaseFloat> &out_value,
    const CuMatrixBase<BaseFloat> &out_deriv,
    void *memo_in,
    Component *to_update,  // unused
    CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("BatchNormComponent::Backprop");

  KALDI_ASSERT(SameDim(out_value, out_deriv) &&
               SameDim(out_value, *in_deriv) &&
               (out_value.NumCols() == dim_ ||
                out_value.NumCols() == block_dim_));
  if (out_value.NumCols() != block_dim_) {
    // if block_dim_ != dim_, we recurse; this helps keep the main code
    // simple.
    KALDI_ASSERT(out_value.Stride() == out_value.NumCols() &&
                 out_deriv.Stride() == out_deriv.NumCols() &&
                 in_deriv->Stride() == in_deriv->NumCols());
    int32 ratio = dim_ / block_dim_,
        orig_rows = out_value.NumRows(),
        orig_cols = out_value.NumCols(),
        new_rows = orig_rows * ratio, new_cols = orig_cols / ratio;
    CuSubMatrix<BaseFloat> out_value_reshaped(out_value.Data(), new_rows,
                                              new_cols, new_cols),
        out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols),
        in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols);
    // we'll never use in_value, so pass it in unchanged.
    Backprop(debug_info, indexes, in_value,
             out_value_reshaped, out_deriv_reshaped,
             memo_in, to_update, &in_deriv_reshaped);
    return;
  }

  Memo *memo = static_cast<Memo*>(memo_in);

  if (!test_mode_) {
    // search above for BACKWARD PASS for a comment describing the math.
    KALDI_ASSERT(memo != NULL && "memo not passed into backprop");
    int32 num_frames = memo->num_frames;
    KALDI_ASSERT(out_value.NumRows() == num_frames);
    CuSubVector<BaseFloat>
        scale(memo->mean_uvar_scale, 2),
        var_deriv_mod(memo->mean_uvar_scale, 3),
        temp(memo->mean_uvar_scale, 4);

    // var_deriv_mod is going to contain:
    //   2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power}
    // which for power = -0.5 simplifies to:
    //   -1.0 / (target-rms^2) * (1/I \sum_i z'(i) z(i)) * scale.
    // 'coeff' below supplies everything except the factor of 'scale', which
    // we multiply in afterward.
    BaseFloat coeff = -1.0 / (target_rms_ * target_rms_ * num_frames);

    var_deriv_mod.AddDiagMatMat(coeff, out_value, kTrans,
                                out_deriv, kNoTrans, 0.0);
    var_deriv_mod.MulElements(scale);

    temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0);
    // the following statement does no work if in_deriv and out_deriv are the
    // same matrix.
    in_deriv->CopyFromMat(out_deriv);
    in_deriv->AddVecToRows(1.0, temp);
    // At this point, *in_deriv contains
    //   (z'(i) - 1/I * \sum_i z'(i)).
    in_deriv->MulColsVec(scale);
    // At this point, *in_deriv contains
    //   scale * (z'(i) - 1/I * \sum_i z'(i)).

    in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans,
                            var_deriv_mod, 1.0);

    // At this point, *in_deriv contains what we described in the comment
    // starting BATCHNORM_MATH as:
    //   x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod.
  } else {
    KALDI_ASSERT(offset_.Dim() == block_dim_);
    // the next call does no work if they point to the same memory.
    in_deriv->CopyFromMat(out_deriv);
    in_deriv->MulColsVec(scale_);
  }
}

void BatchNormComponent::StoreStats(
    const CuMatrixBase<BaseFloat> &in_value,
    const CuMatrixBase<BaseFloat> &out_value,
    void *memo_in) {
  // in test mode this component does not store stats; it doesn't provide the
  // kStoresStats flag.
  KALDI_ASSERT(!test_mode_);
  KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_);
  if (out_value.NumCols() != block_dim_) {
    // if block_dim_ != dim_, we recurse; this helps keep the main code
    // simple.
    KALDI_ASSERT(out_value.Stride() == out_value.NumCols());
    int32 ratio = dim_ / block_dim_,
        orig_rows = out_value.NumRows(),
        orig_cols = out_value.NumCols(),
        new_rows = orig_rows * ratio, new_cols = orig_cols / ratio;
    CuSubMatrix<BaseFloat> out_value_reshaped(out_value.Data(), new_rows,
                                              new_cols, new_cols);
    // we'll never use in_value, so just pass it in unchanged.
    StoreStats(in_value, out_value_reshaped, memo_in);
    return;
  }

  Memo *memo = static_cast<Memo*>(memo_in);
  KALDI_ASSERT(out_value.NumRows() == memo->num_frames);

  CuSubVector<BaseFloat> mean(memo->mean_uvar_scale, 0),
      uvar(memo->mean_uvar_scale, 1);
  KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0);
  BaseFloat num_frames = memo->num_frames;
  if (stats_sum_.Dim() != block_dim_) {
    stats_sum_.Resize(block_dim_);
    stats_sumsq_.Resize(block_dim_);
    KALDI_ASSERT(count_ == 0);
  }
  count_ += num_frames;
  stats_sum_.AddVec(num_frames, mean, 1.0);
  stats_sumsq_.AddVec(num_frames, uvar, 1.0);
}
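
// A note on the accumulation above: 'mean' and 'uvar' in the memo hold the
// per-minibatch mean and uncentered second moment (computed with weight
// 1.0 / num_frames in Propagate()), so scaling by num_frames before adding
// makes stats_sum_ and stats_sumsq_ plain sums over all frames seen;
// dividing by count_ (as in Info() and ComputeDerived()) then recovers the
// global mean and mean-square.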

void BatchNormComponent::Read(std::istream &is, bool binary) {
  ExpectOneOrTwoTokens(is, binary, "<BatchNormComponent>", "<Dim>");
  ReadBasicType(is, binary, &dim_);
  ExpectToken(is, binary, "<BlockDim>");
  ReadBasicType(is, binary, &block_dim_);
  ExpectToken(is, binary, "<Epsilon>");
  ReadBasicType(is, binary, &epsilon_);
  ExpectToken(is, binary, "<TargetRms>");
  ReadBasicType(is, binary, &target_rms_);
  ExpectToken(is, binary, "<TestMode>");
  ReadBasicType(is, binary, &test_mode_);
  ExpectToken(is, binary, "<Count>");
  ReadBasicType(is, binary, &count_);
  ExpectToken(is, binary, "<StatsMean>");
  stats_sum_.Read(is, binary);
  ExpectToken(is, binary, "<StatsVar>");
  stats_sumsq_.Read(is, binary);
  // On disk we store the mean and the (centered) variance; convert them back
  // to the sum and uncentered sum-of-squares that we keep in memory.
  stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0);
  stats_sum_.Scale(count_);
  stats_sumsq_.Scale(count_);
  ExpectToken(is, binary, "</BatchNormComponent>");
  ComputeDerived();
  Check();
}

void BatchNormComponent::Write(std::ostream &os, bool binary) const {
  Check();
  WriteToken(os, binary, "<BatchNormComponent>");
  WriteToken(os, binary, "<Dim>");
  WriteBasicType(os, binary, dim_);
  WriteToken(os, binary, "<BlockDim>");
  WriteBasicType(os, binary, block_dim_);
  WriteToken(os, binary, "<Epsilon>");
  WriteBasicType(os, binary, epsilon_);
  WriteToken(os, binary, "<TargetRms>");
  WriteBasicType(os, binary, target_rms_);
  WriteToken(os, binary, "<TestMode>");
  WriteBasicType(os, binary, test_mode_);
  WriteToken(os, binary, "<Count>");
  WriteBasicType(os, binary, count_);
  CuVector<BaseFloat> mean(stats_sum_), var(stats_sumsq_);
  if (count_ != 0) {
    mean.Scale(1.0 / count_);
    var.Scale(1.0 / count_);
    var.AddVecVec(-1.0, mean, mean, 1.0);
  }
  WriteToken(os, binary, "<StatsMean>");
  mean.Write(os, binary);
  WriteToken(os, binary, "<StatsVar>");
  var.Write(os, binary);
  WriteToken(os, binary, "</BatchNormComponent>");
}

void BatchNormComponent::Scale(BaseFloat scale) {
  if (scale == 0) {
    count_ = 0.0;
    stats_sum_.SetZero();
    stats_sumsq_.SetZero();
  } else {
    count_ *= scale;
    stats_sum_.Scale(scale);
    stats_sumsq_.Scale(scale);
  }
}


void BatchNormComponent::Add(BaseFloat alpha, const Component &other_in) {
  const BatchNormComponent *other =
      dynamic_cast<const BatchNormComponent*>(&other_in);
  count_ += alpha * other->count_;
  stats_sum_.AddVec(alpha, other->stats_sum_);
  stats_sumsq_.AddVec(alpha, other->stats_sumsq_);
  // this operation might change offset_ and scale_, so we recompute them
  // in this instance (but not in Scale()).
  ComputeDerived();
}

void BatchNormComponent::ZeroStats() {
  // We only zero the stats if we're not in test mode.  In test mode this
  // would be dangerous, as the stats are the source for the transform, and
  // zeroing them and then calling ComputeDerived() again would remove the
  // transform parameters (offset_ and scale_).
  if (!test_mode_) {
    count_ = 0.0;
    stats_sum_.SetZero();
    stats_sumsq_.SetZero();
  }
}


}  // namespace nnet3
}  // namespace kaldi