1 // nnet3/nnet-simple-component.cc
2 
3 // Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey)
4 // 2015 Xiaohui Zhang
5 // 2015 Guoguo Chen
6 // 2015 Daniel Galvez
7 // 2016 Yiming Wang
8 
9 // See ../../COPYING for clarification regarding multiple authors
10 //
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 //
15 // http://www.apache.org/licenses/LICENSE-2.0
16 //
17 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
19 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
20 // MERCHANTABILITY OR NON-INFRINGEMENT.
21 // See the Apache 2 License for the specific language governing permissions and
22 // limitations under the License.
23 
24 #include <iterator>
25 #include <sstream>
26 #include <algorithm>
28 #include "nnet3/nnet-simple-component.h"
29 #include "nnet3/nnet-parse.h"
29 #include "nnet3/nnet-parse.h"
30 #include "cudamatrix/cu-math.h"
31 
32 namespace kaldi {
33 namespace nnet3 {
34 
35 void PnormComponent::Init(int32 input_dim, int32 output_dim) {
36  input_dim_ = input_dim;
37  output_dim_ = output_dim;
38  KALDI_ASSERT(input_dim_ > 0 && output_dim_ > 0 &&
39  input_dim_ % output_dim_ == 0);
40 }
41 
42 void PnormComponent::InitFromConfig(ConfigLine *cfl) {
43  int32 input_dim = 0;
44  int32 output_dim = 0;
45  bool ok = cfl->GetValue("output-dim", &output_dim) &&
46  cfl->GetValue("input-dim", &input_dim);
47  if (!ok || cfl->HasUnusedValues() || output_dim <= 0)
48  KALDI_ERR << "Invalid initializer for layer of type "
49  << Type() << ": \"" << cfl->WholeLine() << "\"";
50  Init(input_dim, output_dim);
51 }
52 
53 
54 void* PnormComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
55  const CuMatrixBase<BaseFloat> &in,
56  CuMatrixBase<BaseFloat> *out) const {
57  BaseFloat p = 2.0;
58  out->GroupPnorm(in, p);
59  return NULL;
60 }
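// For reference: with p = 2, GroupPnorm computes, for each row and each of the
// output_dim_ groups of (input_dim_ / output_dim_) consecutive inputs,
// y_j = sqrt(sum_i x_i^2) over that group; Backprop below applies the matching
// derivative via DiffGroupPnorm.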
61 
62 void PnormComponent::Backprop(const std::string &debug_info,
63  const ComponentPrecomputedIndexes *indexes,
64  const CuMatrixBase<BaseFloat> &in_value,
65  const CuMatrixBase<BaseFloat> &out_value,
66  const CuMatrixBase<BaseFloat> &out_deriv,
67  void *memo,
68  Component *to_update,
69  CuMatrixBase<BaseFloat> *in_deriv) const {
70  NVTX_RANGE("PnormComponent::Backprop");
71  if (!in_deriv)
72  return;
73  BaseFloat p = 2.0;
74  in_deriv->DiffGroupPnorm(in_value, out_value, out_deriv, p);
75 }
76 
77 void PnormComponent::Read(std::istream &is, bool binary) {
78  ExpectOneOrTwoTokens(is, binary, "<PnormComponent>", "<InputDim>");
79  ReadBasicType(is, binary, &input_dim_);
80  ExpectToken(is, binary, "<OutputDim>");
81  ReadBasicType(is, binary, &output_dim_);
82  ExpectToken(is, binary, "</PnormComponent>");
83 }
84 
85 void PnormComponent::Write(std::ostream &os, bool binary) const {
86  WriteToken(os, binary, "<PnormComponent>");
87  WriteToken(os, binary, "<InputDim>");
88  WriteBasicType(os, binary, input_dim_);
89  WriteToken(os, binary, "<OutputDim>");
90  WriteBasicType(os, binary, output_dim_);
91  WriteToken(os, binary, "</PnormComponent>");
92 }
93 
94 DropoutComponent::DropoutComponent(const DropoutComponent &other):
95  RandomComponent(other),
96  dim_(other.dim_),
97  dropout_proportion_(other.dropout_proportion_),
98  dropout_per_frame_(other.dropout_per_frame_) { }
99 
100 Component* DropoutComponent::Copy() const {
101  DropoutComponent *ans = new DropoutComponent(*this);
102  return ans;
103 }
104 
105 void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion,
106  bool dropout_per_frame) {
107  dropout_proportion_ = dropout_proportion;
108  dropout_per_frame_ = dropout_per_frame;
109  dim_ = dim;
110 }
111 
112 void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
113  int32 dim = 0;
114  BaseFloat dropout_proportion = 0.0;
115  bool dropout_per_frame = false;
116  test_mode_ = false;
117  bool ok = cfl->GetValue("dim", &dim) &&
118  cfl->GetValue("dropout-proportion", &dropout_proportion);
119  cfl->GetValue("dropout-per-frame", &dropout_per_frame);
120  // It only makes sense to set test-mode in the config for testing purposes.
121  cfl->GetValue("test-mode", &test_mode_);
122  // At this stage, dropout is hard-coded to run in normal (training)
123  // mode unless test-mode is declared in the config.
124  if (!ok || cfl->HasUnusedValues() || dim <= 0 ||
125  dropout_proportion < 0.0 || dropout_proportion > 1.0)
126  KALDI_ERR << "Invalid initializer for layer of type "
127  << Type() << ": \"" << cfl->WholeLine() << "\"";
128  Init(dim, dropout_proportion, dropout_per_frame);
129 }
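// An illustrative config line that InitFromConfig above would accept (the
// values here are made up): "dim=1024 dropout-proportion=0.5 dropout-per-frame=false".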
130 
131 std::string DropoutComponent::Info() const {
132  std::ostringstream stream;
133  stream << Type() << ", dim=" << dim_
134  << ", dropout-proportion=" << dropout_proportion_
135  << ", dropout-per-frame=" << (dropout_per_frame_ ? "true" : "false");
136  return stream.str();
137 }
138 
139 void* DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
140  const CuMatrixBase<BaseFloat> &in,
141  CuMatrixBase<BaseFloat> *out) const {
142  KALDI_ASSERT(out->NumRows() == in.NumRows() && out->NumCols() == in.NumCols()
143  && in.NumCols() == dim_);
144 
145  BaseFloat dropout = dropout_proportion_;
146  KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0);
147  if (test_mode_) {
148  out->CopyFromMat(in);
149  out->Scale(1.0 - dropout);
150  return NULL;
151  }
152  if (!dropout_per_frame_) {
153  // This const_cast is only safe assuming you don't attempt
154  // to use multi-threaded code with the GPU.
155  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
156 
157  out->Add(-dropout); // now, a proportion "dropout" will be <0.0
158  // apply the function (x>0?1:0). Now, a proportion
159  // "dropout" will be zero and (1 - dropout) will be 1.0.
160  out->ApplyHeaviside();
161 
162  out->MulElements(in);
163  } else {
164  // randomize the dropout matrix by row,
165  // i.e. [[1,1,1,1],[0,0,0,0],[0,0,0,0],[1,1,1,1],[0,0,0,0]]
166  CuMatrix<BaseFloat> tmp(1, out->NumRows(), kUndefined);
167  // This const_cast is only safe assuming you don't attempt
168  // to use multi-threaded code with the GPU.
169  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&tmp);
170  tmp.Add(-dropout);
171  tmp.ApplyHeaviside();
172  out->CopyColsFromVec(tmp.Row(0));
173  out->MulElements(in);
174  }
175  return NULL;
176 }
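// Note on scaling: in training mode the surviving elements are not rescaled;
// instead, test mode (handled above) multiplies the input by (1 - dropout),
// which matches the expected value of the random mask, whose entries are 1
// with probability (1 - dropout) and 0 otherwise.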
177 
178 
179 void DropoutComponent::Backprop(const std::string &debug_info,
180  const ComponentPrecomputedIndexes *indexes,
181  const CuMatrixBase<BaseFloat> &in_value,
182  const CuMatrixBase<BaseFloat> &out_value,
183  const CuMatrixBase<BaseFloat> &out_deriv,
184  void *memo,
185  Component *to_update,
186  CuMatrixBase<BaseFloat> *in_deriv) const {
187  NVTX_RANGE("DropoutComponent::Backprop");
188  KALDI_ASSERT(in_value.NumRows() == out_value.NumRows() &&
189  in_value.NumCols() == out_value.NumCols());
190 
191  KALDI_ASSERT(in_value.NumRows() == out_deriv.NumRows() &&
192  in_value.NumCols() == out_deriv.NumCols());
193  in_deriv->SetMatMatDivMat(out_deriv, out_value, in_value);
194 }
195 
196 
197 
198 void DropoutComponent::Read(std::istream &is, bool binary) {
199  std::string token;
200  ReadToken(is, binary, &token);
201  if (token == "<DropoutComponent>") {
202  ReadToken(is, binary, &token);
203  }
204  KALDI_ASSERT(token == "<Dim>");
205  ReadBasicType(is, binary, &dim_); // read dimension.
206  ReadToken(is, binary, &token);
207  KALDI_ASSERT(token == "<DropoutProportion>");
208  ReadBasicType(is, binary, &dropout_proportion_); // read dropout rate
209  ReadToken(is, binary, &token);
210  if (token == "<DropoutPerFrame>") {
211  ReadBasicType(is, binary, &dropout_per_frame_); // read dropout mode
212  ReadToken(is, binary, &token);
213  } else {
214  dropout_per_frame_ = false;
215  }
216  if (token == "<TestMode>") {
217  ReadBasicType(is, binary, &test_mode_); // read test mode
218  ExpectToken(is, binary, "</DropoutComponent>");
219  } else {
220  test_mode_ = false;
221  KALDI_ASSERT(token == "</DropoutComponent>");
222  }
223 }
224 
225 void DropoutComponent::Write(std::ostream &os, bool binary) const {
226  WriteToken(os, binary, "<DropoutComponent>");
227  WriteToken(os, binary, "<Dim>");
228  WriteBasicType(os, binary, dim_);
229  WriteToken(os, binary, "<DropoutProportion>");
230  WriteBasicType(os, binary, dropout_proportion_);
231  WriteToken(os, binary, "<DropoutPerFrame>");
232  WriteBasicType(os, binary, dropout_per_frame_);
233  WriteToken(os, binary, "<TestMode>");
234  WriteBasicType(os, binary, test_mode_);
235  WriteToken(os, binary, "</DropoutComponent>");
236 }
237 
238 void ElementwiseProductComponent::Init(int32 input_dim, int32 output_dim) {
239  input_dim_ = input_dim;
240  output_dim_ = output_dim;
241  KALDI_ASSERT(input_dim_ > 0 && output_dim_ >= 0);
242  KALDI_ASSERT(input_dim_ > output_dim_);
243  KALDI_ASSERT(input_dim_ % output_dim_ == 0);
244 }
245 
246 void ElementwiseProductComponent::InitFromConfig(ConfigLine *cfl) {
247  int32 input_dim = 0;
248  int32 output_dim = 0;
249  bool ok = cfl->GetValue("output-dim", &output_dim) &&
250  cfl->GetValue("input-dim", &input_dim);
251  if (!ok || cfl->HasUnusedValues() || output_dim <= 0)
252  KALDI_ERR << "Invalid initializer for layer of type "
253  << Type() << ": \"" << cfl->WholeLine() << "\"";
254  Init(input_dim, output_dim);
255 }
256 
257 void* ElementwiseProductComponent::Propagate(
258  const ComponentPrecomputedIndexes *indexes,
259  const CuMatrixBase<BaseFloat> &in,
260  CuMatrixBase<BaseFloat> *out) const {
261  KALDI_ASSERT(in.NumCols() == input_dim_);
262  int32 num_inputs = input_dim_ / output_dim_;
263  for (int32 i = 0; i < num_inputs; i++) {
264  CuSubMatrix<BaseFloat> current_in(in, 0, in.NumRows(),
265  i * output_dim_, output_dim_);
266  if (i == 0) {
267  out->CopyFromMat(current_in);
268  } else {
269  out->MulElements(current_in);
270  }
271  }
272  return NULL;
273 }
274 
275 void ElementwiseProductComponent::Backprop(const std::string &debug_info,
276  const ComponentPrecomputedIndexes *indexes,
277  const CuMatrixBase<BaseFloat> &in_value,
278  const CuMatrixBase<BaseFloat> &out_value,
279  const CuMatrixBase<BaseFloat> &out_deriv,
280  void *memo,
281  Component *to_update,
282  CuMatrixBase<BaseFloat> *in_deriv) const {
283  NVTX_RANGE("ElementwiseProductComponent::Backprop");
284  if (!in_deriv) return;
285  int32 num_inputs = input_dim_ / output_dim_;
286  for (int32 i = 0; i < num_inputs; i++) {
287  CuSubMatrix<BaseFloat> current_in_deriv(*in_deriv, 0, in_deriv->NumRows(),
288  i * output_dim_,
289  output_dim_);
290  current_in_deriv.CopyFromMat(out_deriv);
291  for (int32 j = 0; j < num_inputs; j++) {
292  if (i == j)
293  continue;
294  CuSubMatrix<BaseFloat> in_value_partition(in_value, 0,
295  in_value.NumRows(),
296  j * output_dim_,
297  output_dim_);
298  current_in_deriv.MulElements(in_value_partition);
299  }
300  }
301 }
302 
303 void ElementwiseProductComponent::Read(std::istream &is, bool binary) {
304  ExpectOneOrTwoTokens(is, binary, "<ElementwiseProductComponent>",
305  "<InputDim>");
306  ReadBasicType(is, binary, &input_dim_);
307  ExpectToken(is, binary, "<OutputDim>");
308  ReadBasicType(is, binary, &output_dim_);
309  ExpectToken(is, binary, "</ElementwiseProductComponent>");
310 }
311 
312 void ElementwiseProductComponent::Write(std::ostream &os, bool binary) const {
313  WriteToken(os, binary, "<ElementwiseProductComponent>");
314  WriteToken(os, binary, "<InputDim>");
315  WriteBasicType(os, binary, input_dim_);
316  WriteToken(os, binary, "<OutputDim>");
317  WriteBasicType(os, binary, output_dim_);
318  WriteToken(os, binary, "</ElementwiseProductComponent>");
319 }
320 
321 void* SigmoidComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
322  const CuMatrixBase<BaseFloat> &in,
323  CuMatrixBase<BaseFloat> *out) const {
324  out->Sigmoid(in);
325  return NULL;
326 }
327 
328 void SigmoidComponent::Backprop(const std::string &debug_info,
329  const ComponentPrecomputedIndexes *indexes,
330  const CuMatrixBase<BaseFloat> &,
331  const CuMatrixBase<BaseFloat> &out_value,
332  const CuMatrixBase<BaseFloat> &out_deriv,
333  void *memo,
334  Component *to_update_in,
335  CuMatrixBase<BaseFloat> *in_deriv) const {
336  NVTX_RANGE("SigmoidComponent::Backprop");
337  if (in_deriv != NULL) {
338  in_deriv->DiffSigmoid(out_value, out_deriv);
339  SigmoidComponent *to_update = dynamic_cast<SigmoidComponent*>(to_update_in);
340  if (to_update != NULL) {
341  RepairGradients(out_value, in_deriv, to_update);
342  to_update->StoreBackpropStats(out_deriv);
343  }
344  }
345 }
346 
347 void SigmoidComponent::RepairGradients(
348  const CuMatrixBase<BaseFloat> &out_value,
349  CuMatrixBase<BaseFloat> *in_deriv,
350  SigmoidComponent *to_update) const {
351  KALDI_ASSERT(to_update != NULL);
352  // maximum possible derivative of SigmoidComponent is 0.25.
353  // the default lower-threshold on the derivative, below which we
354  // add a term to the derivative to encourage the inputs to the sigmoid
355  // to be closer to zero, is 0.05, which means the derivative is on average
356  // 5 times smaller than its maximum possible value.
357  BaseFloat default_lower_threshold = 0.05;
358 
359  // we use this 'repair_probability' (hardcoded for now) to limit
360  // this code to running on about half of the minibatches.
361  BaseFloat repair_probability = 0.5;
362 
363  to_update->num_dims_processed_ += dim_;
364 
365  if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ ||
366  RandUniform() > repair_probability)
367  return;
368 
369  // check that the self-repair scale is in a reasonable range.
370  KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
371  BaseFloat unset = kUnsetThreshold; // -1000.0
372  BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ?
373  default_lower_threshold :
374  self_repair_lower_threshold_) *
375  count_;
376  if (self_repair_upper_threshold_ != unset) {
377  KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid "
378  << "components, it does nothing.";
379  }
380 
381  // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside
382  // function isn't defined for vectors).
383  CuMatrix<BaseFloat> thresholds(1, dim_);
384  CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
385  thresholds_vec.AddVec(-1.0, deriv_sum_);
386  thresholds_vec.Add(lower_threshold);
387  thresholds.ApplyHeaviside();
388  to_update->num_dims_self_repaired_ += thresholds_vec.Sum();
389 
390  // At this point, 'thresholds_vec' contains a 1 for each dimension of
391  // the output that is 'problematic', i.e. for which the avg-deriv
392  // is less than the self-repair lower threshold, and a 0 for
393  // each dimension that is not problematic.
394 
395  // what we want to do is to add
396  // -self_repair_scale_ / repair_probability times (2 * output-value - 1.0)
397  // to the input derivative for each problematic dimension.
398 
399  // Here, 2 * output - 1.0 is a version of the sigmoid that goes from -1.0 to
400  // 1.0, like a tanh. the negative sign is so that for inputs <0, we push them
401  // up towards 0, and for inputs >0, we push them down towards 0.
402  // Our use of this sigmoid-type function here is just a convenience since
403  // we have it available. We could use just about any function that is positive
404  // for inputs < 0 and negative for inputs > 0.
405 
406  // We can rearrange the above as: for only the problematic columns,
407  // input-deriv -= 2 * self-repair-scale / repair-probability * output
408  // input-deriv += self-repair-scale / repair-probability
409  // which we can write as:
410  // input-deriv -= 2 * self-repair-scale / repair-probability * output * thresholds-vec
411  // input-deriv += self-repair-scale / repair-probability * thresholds-vec
412 
413  in_deriv->AddMatDiagVec(-2.0 * self_repair_scale_ / repair_probability,
414  out_value, kNoTrans, thresholds_vec);
415  in_deriv->AddVecToRows(self_repair_scale_ / repair_probability,
416  thresholds_vec);
417 }
418 
419 
420 
421 void SigmoidComponent::StoreStats(const CuMatrixBase<BaseFloat> &in_value,
422  const CuMatrixBase<BaseFloat> &out_value,
423  void *memo) {
424  // Only store stats about every other minibatch (but on the first minibatch,
425  // always store it, which is necessary for the ConsolidateMemory() operation
426  // to work correctly.)
427  if (RandInt(0, 1) == 0 && count_ != 0)
428  return;
429  // derivative of the nonlinearity is out_value * (1.0 - out_value);
430  CuMatrix<BaseFloat> temp_deriv(out_value.NumRows(), out_value.NumCols(),
431  kUndefined);
432  temp_deriv.Set(1.0);
433  temp_deriv.AddMat(-1.0, out_value);
434  temp_deriv.MulElements(out_value);
435  StoreStatsInternal(out_value, &temp_deriv);
436 }
437 
438 
439 
440 void* NoOpComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
441  const CuMatrixBase<BaseFloat> &in,
442  CuMatrixBase<BaseFloat> *out) const {
443  out->CopyFromMat(in);
444  return NULL;
445 }
446 
447 void NoOpComponent::Backprop(const std::string &debug_info,
448  const ComponentPrecomputedIndexes *indexes,
449  const CuMatrixBase<BaseFloat> &,
450  const CuMatrixBase<BaseFloat> &,
451  const CuMatrixBase<BaseFloat> &out_deriv,
452  void *memo,
453  Component *to_update, // may be NULL; may be identical
454  // to "this" or different.
455  CuMatrixBase<BaseFloat> *in_deriv) const {
456  NVTX_RANGE("NoOpComponent::Backprop");
457  in_deriv->CopyFromMat(out_deriv);
458  if (backprop_scale_ != 1.0)
459  in_deriv->Scale(backprop_scale_);
460 }
461 
462 void NoOpComponent::InitFromConfig(ConfigLine *cfl) {
463  backprop_scale_ = 1.0;
464  cfl->GetValue("backprop-scale", &backprop_scale_);
465  if (!cfl->GetValue("dim", &dim_) ||
466  dim_ <= 0 || cfl->HasUnusedValues()) {
467  KALDI_ERR << "Invalid initializer for layer of type "
468  << Type() << ": \"" << cfl->WholeLine() << "\"";
469  }
470 }
471 
472 std::string NoOpComponent::Info() const {
473  std::ostringstream stream;
474  stream << Type() << ", dim=" << dim_;
475  if (backprop_scale_ != 1.0)
476  stream << ", backprop-scale=" << backprop_scale_;
477  return stream.str();
478 }
479 
480 void NoOpComponent::Write(std::ostream &os, bool binary) const {
481  WriteToken(os, binary, "<NoOpComponent>");
482  WriteToken(os, binary, "<Dim>");
483  WriteBasicType(os, binary, dim_);
484  WriteToken(os, binary, "<BackpropScale>");
485  WriteBasicType(os, binary, backprop_scale_);
486  WriteToken(os, binary, "</NoOpComponent>");
487 }
488 
489 void NoOpComponent::Read(std::istream &is, bool binary) {
490  ExpectOneOrTwoTokens(is, binary, "<NoOpComponent>", "<Dim>");
491  ReadBasicType(is, binary, &dim_);
492 
493  if (PeekToken(is, binary) == 'V') {
494  // This is the old format, from when NoOpComponent inherited from
495  // NonlinearComponent.
496  backprop_scale_ = 1.0;
497  ExpectToken(is, binary, "<ValueAvg>");
498  CuVector<BaseFloat> temp_vec;
499  temp_vec.Read(is, binary);
500  ExpectToken(is, binary, "<DerivAvg>");
501  temp_vec.Read(is, binary);
502  ExpectToken(is, binary, "<Count>");
503  BaseFloat temp_float;
504  ReadBasicType(is, binary, &temp_float);
505  if (PeekToken(is, binary) == 'O') {
506  ExpectToken(is, binary, "<OderivRms>");
507  temp_vec.Read(is, binary);
508  ExpectToken(is, binary, "<OderivCount>");
509  ReadBasicType(is, binary, &temp_float);
510  }
511  std::string token;
512  ReadToken(is, binary, &token);
513  if (token[0] != '<') {
514  // this should happen only rarely, in case we couldn't push back the
515  // '<' to the stream in PeekToken().
516  token = '<' + token;
517  }
518  if (token == "<NumDimsSelfRepaired>") {
519  ReadBasicType(is, binary, &temp_float);
520  ReadToken(is, binary, &token);
521  }
522  if (token == "<NumDimsProcessed>") {
523  ReadBasicType(is, binary, &temp_float);
524  ReadToken(is, binary, &token);
525  }
526  KALDI_ASSERT(token == "</NoOpComponent>");
527  return;
528  } else {
529  ExpectToken(is, binary, "<BackpropScale>");
530  ReadBasicType(is, binary, &backprop_scale_);
531  ExpectToken(is, binary, "</NoOpComponent>");
532  }
533 }
534 
535 
536 void ClipGradientComponent::Read(std::istream &is, bool binary) {
537  // might not see the "<NaturalGradientAffineComponent>" part because
538  // of how ReadNew() works.
539  ExpectOneOrTwoTokens(is, binary, "<ClipGradientComponent>",
540  "<Dim>");
541  ReadBasicType(is, binary, &dim_);
542  ExpectToken(is, binary, "<ClippingThreshold>");
543  ReadBasicType(is, binary, &clipping_threshold_);
544  ExpectToken(is, binary, "<NormBasedClipping>");
545  ReadBasicType(is, binary, &norm_based_clipping_);
546  std::string token;
547  ReadToken(is, binary, &token);
548  if (token == "<SelfRepairClippedProportionThreshold>") {
549  ReadBasicType(is, binary, &self_repair_clipped_proportion_threshold_);
550  ExpectToken(is, binary, "<SelfRepairTarget>");
551  ReadBasicType(is, binary, &self_repair_target_);
552  ExpectToken(is, binary, "<SelfRepairScale>");
553  ReadBasicType(is, binary, &self_repair_scale_);
554  ExpectToken(is, binary, "<NumElementsClipped>");
555  } else {
556  self_repair_clipped_proportion_threshold_ = 1.0;
557  self_repair_target_ = 0.0;
558  self_repair_scale_ = 0.0;
559  KALDI_ASSERT(token == "<NumElementsClipped>");
560  }
561  ReadBasicType(is, binary, &num_clipped_);
562  ExpectToken(is, binary, "<NumElementsProcessed>");
563  ReadBasicType(is, binary, &count_);
564  ReadToken(is, binary, &token);
565  if (token == "<NumSelfRepaired>") {
566  ReadBasicType(is, binary, &num_self_repaired_);
567  ExpectToken(is, binary, "<NumBackpropped>");
568  ReadBasicType(is, binary, &num_backpropped_);
569  ExpectToken(is, binary, "</ClipGradientComponent>");
570  } else {
571  num_self_repaired_ = 0;
572  num_backpropped_ = 0;
573  KALDI_ASSERT(token == "</ClipGradientComponent>");
574  }
575 }
576 
577 void ClipGradientComponent::Write(std::ostream &os, bool binary) const {
578  WriteToken(os, binary, "<ClipGradientComponent>");
579  WriteToken(os, binary, "<Dim>");
580  WriteBasicType(os, binary, dim_);
581  WriteToken(os, binary, "<ClippingThreshold>");
582  WriteBasicType(os, binary, clipping_threshold_);
583  WriteToken(os, binary, "<NormBasedClipping>");
584  WriteBasicType(os, binary, norm_based_clipping_);
585  WriteToken(os, binary, "<SelfRepairClippedProportionThreshold>");
586  WriteBasicType(os, binary, self_repair_clipped_proportion_threshold_);
587  WriteToken(os, binary, "<SelfRepairTarget>");
588  WriteBasicType(os, binary, self_repair_target_);
589  WriteToken(os, binary, "<SelfRepairScale>");
590  WriteBasicType(os, binary, self_repair_scale_);
591  WriteToken(os, binary, "<NumElementsClipped>");
592  WriteBasicType(os, binary, num_clipped_);
593  WriteToken(os, binary, "<NumElementsProcessed>");
594  WriteBasicType(os, binary, count_);
595  WriteToken(os, binary, "<NumSelfRepaired>");
596  WriteBasicType(os, binary, num_self_repaired_);
597  WriteToken(os, binary, "<NumBackpropped>");
598  WriteBasicType(os, binary, num_backpropped_);
599  WriteToken(os, binary, "</ClipGradientComponent>");
600 }
601 
602 std::string ClipGradientComponent::Info() const {
603  std::ostringstream stream;
604  stream << Type() << ", dim=" << dim_
605  << ", norm-based-clipping="
606  << (norm_based_clipping_ ? "true" : "false")
607  << ", clipping-threshold=" << clipping_threshold_
608  << ", clipped-proportion="
609  << (count_ > 0 ? static_cast<BaseFloat>(num_clipped_)/count_ : 0);
610  if (self_repair_scale_ != 0.0)
611  stream << ", self-repair-clipped-proportion-threshold="
612  << self_repair_clipped_proportion_threshold_
613  << ", self-repair-target=" << self_repair_target_
614  << ", self-repair-scale=" << self_repair_scale_;
615  return stream.str();
616 }
617 
618 void ClipGradientComponent::Init(int32 dim,
619  BaseFloat clipping_threshold,
620  bool norm_based_clipping,
621  BaseFloat self_repair_clipped_proportion_threshold,
622  BaseFloat self_repair_target,
623  BaseFloat self_repair_scale,
624  int32 num_clipped,
625  int32 count,
626  int32 num_self_repaired,
627  int32 num_backpropped) {
628  KALDI_ASSERT(clipping_threshold >= 0 && dim > 0 &&
629  self_repair_clipped_proportion_threshold >= 0.0 &&
630  self_repair_target >= 0.0 && self_repair_scale >= 0.0);
631  dim_ = dim;
632  norm_based_clipping_ = norm_based_clipping;
633  clipping_threshold_ = clipping_threshold;
634  self_repair_clipped_proportion_threshold_ =
635  self_repair_clipped_proportion_threshold;
636  self_repair_target_ = self_repair_target;
637  self_repair_scale_ = self_repair_scale;
638  num_clipped_ = num_clipped;
639  count_ = count;
640  num_self_repaired_ = num_self_repaired;
641  num_backpropped_ = num_backpropped;
642 }
643 
644 void ClipGradientComponent::InitFromConfig(ConfigLine *cfl) {
645  int32 dim = 0;
646  bool ok = cfl->GetValue("dim", &dim);
647  bool norm_based_clipping = false;
648  BaseFloat clipping_threshold = 15.0;
649  BaseFloat self_repair_clipped_proportion_threshold = 0.01;
650  BaseFloat self_repair_target = 0.0;
651  BaseFloat self_repair_scale = 1.0;
652  cfl->GetValue("clipping-threshold", &clipping_threshold);
653  cfl->GetValue("norm-based-clipping", &norm_based_clipping);
654  cfl->GetValue("self-repair-clipped-proportion-threshold",
655  &self_repair_clipped_proportion_threshold);
656  cfl->GetValue("self-repair-target",
657  &self_repair_target);
658  cfl->GetValue("self-repair-scale", &self_repair_scale);
659  if (!ok || cfl->HasUnusedValues() ||
660  clipping_threshold < 0 || dim <= 0 ||
661  self_repair_clipped_proportion_threshold < 0.0 ||
662  self_repair_target < 0.0 || self_repair_scale < 0.0)
663  KALDI_ERR << "Invalid initializer for layer of type "
664  << Type() << ": \"" << cfl->WholeLine() << "\"";
665  Init(dim, clipping_threshold, norm_based_clipping,
666  self_repair_clipped_proportion_threshold,
667  self_repair_target,
668  self_repair_scale, 0, 0, 0, 0);
669 }
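// An illustrative config line for this component (values are made up):
// "dim=512 clipping-threshold=15.0 norm-based-clipping=true
//  self-repair-clipped-proportion-threshold=0.01 self-repair-scale=1.0".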
670 
671 void* ClipGradientComponent::Propagate(
672  const ComponentPrecomputedIndexes *indexes,
673  const CuMatrixBase<BaseFloat> &in,
674  CuMatrixBase<BaseFloat> *out) const {
675  out->CopyFromMat(in);
676  return NULL;
677 }
678 
679 
680 void ClipGradientComponent::Backprop(const std::string &debug_info,
681  const ComponentPrecomputedIndexes *indexes,
682  const CuMatrixBase<BaseFloat> &in_value,
683  const CuMatrixBase<BaseFloat> &,
684  const CuMatrixBase<BaseFloat> &out_deriv,
685  void *memo,
686  Component *to_update_in, // may be NULL; may be identical
687  // to "this" or different.
688  CuMatrixBase<BaseFloat> *in_deriv) const {
689  NVTX_RANGE("ClipGradientComponent::Backprop");
690  // the following statement will do nothing if in_deriv and out_deriv have the same
691  // memory.
692  in_deriv->CopyFromMat(out_deriv);
693 
694  ClipGradientComponent *to_update =
695  dynamic_cast<ClipGradientComponent*>(to_update_in);
696 
697  if (clipping_threshold_ > 0) {
698  if (norm_based_clipping_) {
699  // each row in the derivative matrix, which corresponds to one sample in
700  // the mini-batch, is scaled to have a max-norm of clipping_threshold_
701  CuVector<BaseFloat> clipping_scales(in_deriv->NumRows());
702  clipping_scales.AddDiagMat2(pow(clipping_threshold_, -2), *in_deriv,
703  kNoTrans, 0.0);
704  // now clipping_scales contains the squared (norm of each row divided by
705  // clipping_threshold)
706  int32 num_not_scaled;
707  clipping_scales.ApplyFloor(1.0, &num_not_scaled);
708  // now clipping_scales contains min(1,
709  // squared-(norm/clipping_threshold))
710  if (num_not_scaled != clipping_scales.Dim()) {
711  clipping_scales.ApplyPow(-0.5);
712  // now clipping_scales contains max(1,
713  // clipping_threshold/vector_norm)
714  in_deriv->MulRowsVec(clipping_scales);
715  if (to_update != NULL)
716  to_update->num_clipped_ += (clipping_scales.Dim() - num_not_scaled);
717  }
718  if (to_update != NULL)
719  to_update->count_ += clipping_scales.Dim();
720  } else {
721  // each element of the derivative matrix is clipped to be below the
722  // clipping_threshold_
723  in_deriv->ApplyCeiling(clipping_threshold_);
724  in_deriv->ApplyFloor(-1 * clipping_threshold_);
725  }
726 
727  if (to_update != NULL) {
728  to_update->num_backpropped_ += 1;
729  RepairGradients(debug_info, in_value, in_deriv, to_update);
730  }
731  } else if (clipping_threshold_ == 0.0) {
732  in_deriv->SetZero();
733  }
734 }
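// Summary of the norm-based branch above: each row d of in_deriv ends up scaled
// by min(1, clipping_threshold_ / ||d||_2), so rows whose 2-norm exceeds the
// threshold are shrunk onto it and the remaining rows are left unchanged.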
735 
736 // This function will add a self-repair term to in-deriv, attempting to shrink
737 // the magnitude of the input towards self_repair_target_.
738 // This term is proportional to [-(input vector - self_repair_target_)].
739  // The average magnitude of this term is equal to
740 // [self_repair_scale_ * clipped_proportion * average norm of input derivative].
741 // We use norm of input derivative when computing the magnitude so that it is
742 // comparable to the magnitude of input derivative, especially when the gradient
743 // explosion is actually happening.
744 void ClipGradientComponent::RepairGradients(
745  const std::string &debug_info,
746  const CuMatrixBase<BaseFloat> &in_value,
747  CuMatrixBase<BaseFloat> *in_deriv, ClipGradientComponent *to_update) const {
748  KALDI_ASSERT(to_update != NULL);
749 
750  // we use this 'repair_probability' (hardcoded for now) to limit
751  // this code to running on about half of the minibatches.
752  BaseFloat repair_probability = 0.5;
753  if (self_repair_clipped_proportion_threshold_ >= 1.0 ||
754  self_repair_scale_ == 0.0 || count_ == 0 ||
755  RandUniform() > repair_probability)
756  return;
757 
758  KALDI_ASSERT(self_repair_target_ >= 0.0 && self_repair_scale_ > 0.0);
759 
760  BaseFloat clipped_proportion =
761  (count_ > 0 ? static_cast<BaseFloat>(num_clipped_) / count_ : 0);
762  // in-deriv would be modified only when clipped_proportion exceeds the
763  // threshold
764  if (clipped_proportion <= self_repair_clipped_proportion_threshold_)
765  return;
766 
767  to_update->num_self_repaired_ += 1;
768  if (to_update->debug_info_ == "") // get the component-node name
769  to_update->debug_info_ = debug_info;
770  if (to_update->num_self_repaired_ == 1)
771  KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info
772  << ")'s self-repair was activated as the first time at the "
773  << to_update->num_backpropped_
774  << "-th call of Backprop() in this training job.";
775 
776  // sign_mat = sign(in_value), i.e.,
777  // An element in sign_mat is 1 if its corresponding element in in_value > 0,
778  // or -1 otherwise
779  CuMatrix<BaseFloat> sign_mat(in_value);
780  sign_mat.ApplyHeaviside();
781  sign_mat.Scale(2.0);
782  sign_mat.Add(-1.0);
783 
784  // repair_mat =
785  // floor(abs(in_value) - self_repair_target_, 0) .* sign(in_value)
786  CuMatrix<BaseFloat> repair_mat(in_value);
787  repair_mat.ApplyPowAbs(1.0);
788  repair_mat.Add(-self_repair_target_);
789  repair_mat.ApplyFloor(0.0);
790  repair_mat.MulElements(sign_mat);
791 
792  // magnitude =
793  // self_repair_scale_ * clipped_proportion * average norm of in-deriv
794  CuVector<BaseFloat> in_deriv_norm_vec(in_deriv->NumRows());
795  in_deriv_norm_vec.AddDiagMat2(1.0, *in_deriv, kNoTrans, 0.0);
796  in_deriv_norm_vec.ApplyPow(0.5);
797  double in_deriv_norm_sum = in_deriv_norm_vec.Sum();
798  BaseFloat magnitude = self_repair_scale_ * clipped_proportion *
799  (in_deriv_norm_sum / in_deriv_norm_vec.Dim());
800 
801  CuVector<BaseFloat> repair_mat_norm_vec(repair_mat.NumRows());
802  repair_mat_norm_vec.AddDiagMat2(1.0, repair_mat, kNoTrans, 0.0);
803  repair_mat_norm_vec.ApplyPow(0.5);
804  double repair_mat_norm_sum = repair_mat_norm_vec.Sum();
805  double scale = 0.0;
806  if (repair_mat_norm_sum != 0.0)
807  scale = magnitude / (repair_mat_norm_sum / repair_mat_norm_vec.Dim());
808  // repair_mat is scaled so that on average the rows have the norm
809  // (magnitude / repair_probability). This will give higher magnitude of
810  // self-repair to input vectors that have larger absolute value, which tend to
811  // be those that are diverging.
812  in_deriv->AddMat(-scale / repair_probability, repair_mat);
813  CuVector<BaseFloat> in_deriv_repaired_norm_vec(in_deriv->NumRows());
814  in_deriv_repaired_norm_vec.AddDiagMat2(1.0, *in_deriv, kNoTrans, 0.0);
815  in_deriv_repaired_norm_vec.ApplyPow(0.5);
816  // scale in_deriv to have the same norm as that before adding the self-repair
817  // term, in order to avoid increase of the norm caused by self-repair,
818  // which may incur more clip of gradient and thus more self-repair
819  double in_deriv_repaired_norm_sum = in_deriv_repaired_norm_vec.Sum();
820  if (in_deriv_repaired_norm_sum != 0.0)
821  in_deriv->Scale(in_deriv_norm_sum / in_deriv_repaired_norm_sum);
822 }
823 
824 void ClipGradientComponent::ZeroStats() {
825  count_ = 0.0;
826  num_clipped_ = 0.0;
827  num_self_repaired_ = 0;
828  num_backpropped_ = 0;
829 }
830 
831 void ClipGradientComponent::Scale(BaseFloat scale) {
832  count_ *= scale;
833  num_clipped_ *= scale;
834 }
835 
836 void ClipGradientComponent::Add(BaseFloat alpha, const Component &other_in) {
837  const ClipGradientComponent *other =
838  dynamic_cast<const ClipGradientComponent*>(&other_in);
839  KALDI_ASSERT(other != NULL);
840  count_ += alpha * other->count_;
841  num_clipped_ += alpha * other->num_clipped_;
842 }
843 
844 void* TanhComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
845  const CuMatrixBase<BaseFloat> &in,
846  CuMatrixBase<BaseFloat> *out) const {
847  // Apply tanh function to each element of the output...
848  // the tanh function may be written as -1 + ( 2 / (1 + e^{-2 x})),
849  // which is a scaled and shifted sigmoid.
850  out->Tanh(in);
851  return NULL;
852 }
853 
854 
855 void TanhComponent::RepairGradients(
856  const CuMatrixBase<BaseFloat> &out_value,
857  CuMatrixBase<BaseFloat> *in_deriv,
858  TanhComponent *to_update) const {
859  KALDI_ASSERT(to_update != NULL);
860  // maximum possible derivative of TanhComponent is 1.0
861  // the default lower-threshold on the derivative, below which we
862  // add a term to the derivative to encourage the inputs to the tanh
863  // to be closer to zero, is 0.2, which means the derivative is on average
864  // 5 times smaller than its maximum possible value.
865  BaseFloat default_lower_threshold = 0.2;
866 
867  // we use this 'repair_probability' (hardcoded for now) to limit
868  // this code to running on about half of the minibatches.
869  BaseFloat repair_probability = 0.5;
870 
871  to_update->num_dims_processed_ += dim_;
872 
873  if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ ||
874  RandUniform() > repair_probability)
875  return;
876 
877  // check that the self-repair scale is in a reasonable range.
878  KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
879  BaseFloat unset = kUnsetThreshold; // -1000.0
880  BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ?
881  default_lower_threshold :
882  self_repair_lower_threshold_) *
883  count_;
884  if (self_repair_upper_threshold_ != unset) {
885  KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid "
886  << "components, it does nothing.";
887  }
888 
889  // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside
890  // function isn't defined for vectors).
891  CuMatrix<BaseFloat> thresholds(1, dim_);
892  CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
893  thresholds_vec.AddVec(-1.0, deriv_sum_);
894  thresholds_vec.Add(lower_threshold);
895  thresholds.ApplyHeaviside();
896  to_update->num_dims_self_repaired_ += thresholds_vec.Sum();
897 
898  // At this point, 'thresholds_vec' contains a 1 for each dimension of
899  // the output that is 'problematic', i.e. for which the avg-deriv
900  // is less than the self-repair lower threshold, and a 0 for
901  // each dimension that is not problematic.
902 
903  // what we want to do is to add -self_repair_scale_ / repair_probability times
904  // (the output-value) to the input derivative for each problematic dimension.
905  // note that for the tanh, the output-value goes from -1.0 when the input is
906  // -inf to +1.0 when the input is +inf. The negative sign is so that for
907  // inputs <0, we push them up towards 0, and for inputs >0, we push them down
908  // towards 0. Our use of the tanh here is just a convenience since we have it
909  // available. We could use just about any function that is positive for
910  // inputs < 0 and negative for inputs > 0.
911 
912  // We can rearrange the above as: for only the problematic columns,
913  // input-deriv -= self-repair-scale / repair-probability * output
914  // which we can write as:
915  // input-deriv -= self-repair-scale / repair-probability * output * thresholds-vec
916 
917  in_deriv->AddMatDiagVec(-self_repair_scale_ / repair_probability,
918  out_value, kNoTrans, thresholds_vec);
919 }
920 
921 void TanhComponent::Backprop(const std::string &debug_info,
922  const ComponentPrecomputedIndexes *indexes,
923  const CuMatrixBase<BaseFloat> &,
924  const CuMatrixBase<BaseFloat> &out_value,
925  const CuMatrixBase<BaseFloat> &out_deriv,
926  void *memo,
927  Component *to_update_in, // may be NULL; may be identical
928  // to "this" or different.
929  CuMatrixBase<BaseFloat> *in_deriv) const {
930  NVTX_RANGE("TanhComponent::Backprop");
931  if (in_deriv != NULL) {
932  in_deriv->DiffTanh(out_value, out_deriv);
933  TanhComponent *to_update = dynamic_cast<TanhComponent*>(to_update_in);
934  if (to_update != NULL) {
935  RepairGradients(out_value, in_deriv, to_update);
936  to_update->StoreBackpropStats(out_deriv);
937  }
938  }
939 }
940 
941 /*
942  Note on the derivative of the tanh function:
943  tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)
944 
945  The element by element equation of what we're doing would be:
946  in_deriv = out_deriv * (1.0 - out_value^2).
947  We can accomplish this via calls to the matrix library. */
948 void TanhComponent::StoreStats(const CuMatrixBase<BaseFloat> &in_value,
949  const CuMatrixBase<BaseFloat> &out_value,
950  void *memo) {
951  // Only store stats about every other minibatch (but on the first minibatch,
952  // always store it, which is necessary for the ConsolidateMemory() operation
953  // to work correctly.)
954  if (RandInt(0, 1) == 0 && count_ != 0)
955  return;
956  // derivative of the nonlinearity is (1.0 - out_value^2);
957  CuMatrix<BaseFloat> temp_deriv(out_value);
958  temp_deriv.ApplyPow(2.0);
959  temp_deriv.Scale(-1.0);
960  temp_deriv.Add(1.0);
961  StoreStatsInternal(out_value, &temp_deriv);
962 }
963 
964 void* RectifiedLinearComponent::Propagate(
965  const ComponentPrecomputedIndexes *indexes,
966  const CuMatrixBase<BaseFloat> &in,
967  CuMatrixBase<BaseFloat> *out) const {
968  // Apply rectified linear function (x >= 0 ? x : 0.0)
969  out->CopyFromMat(in);
970  out->ApplyFloor(0.0);
971  return NULL;
972 }
973 
974 void RectifiedLinearComponent::Backprop(
975  const std::string &debug_info,
976  const ComponentPrecomputedIndexes *indexes,
977  const CuMatrixBase<BaseFloat> &, //in_value
978  const CuMatrixBase<BaseFloat> &out_value,
979  const CuMatrixBase<BaseFloat> &out_deriv,
980  void *memo,
981  Component *to_update_in,
982  CuMatrixBase<BaseFloat> *in_deriv) const {
983  NVTX_RANGE("RectifiedLinearComponent::Backprop");
984  if (in_deriv != NULL) {
985  in_deriv->Heaviside(out_value);
986  in_deriv->MulElements(out_deriv);
987  RectifiedLinearComponent *to_update =
988  dynamic_cast<RectifiedLinearComponent*>(to_update_in);
989  if (to_update != NULL) {
990  RepairGradients(in_deriv, to_update);
991  to_update->StoreBackpropStats(out_deriv);
992  }
993  }
994 }
995 
996 
997 void RectifiedLinearComponent::RepairGradients(
998  CuMatrixBase<BaseFloat> *in_deriv,
999  RectifiedLinearComponent *to_update) const {
1000  KALDI_ASSERT(to_update != NULL);
1001  int32 dim = dim_, block_dim = block_dim_;
1002  BaseFloat default_lower_threshold = 0.05,
1003  default_upper_threshold = 0.95;
1004  // we use this 'repair_probability' (hardcoded for now) to limit
1005  // this code to running on about half of the minibatches.
1006  BaseFloat repair_probability = 0.5;
1007  KALDI_ASSERT(in_deriv->NumCols() == dim || in_deriv->NumCols() == block_dim);
1008  if (self_repair_scale_ == 0.0 || count_ == 0.0 ||
1009  deriv_sum_.Dim() != dim)
1010  return;
1011 
1012  if (in_deriv->NumCols() != block_dim) {
1013  KALDI_ASSERT(in_deriv->NumCols() == in_deriv->Stride());
1014  int32 dim_multiple = dim / block_dim;
1015  CuSubMatrix<BaseFloat> in_deriv_reshaped(in_deriv->Data(),
1016  in_deriv->NumRows() * dim_multiple,
1017  block_dim, block_dim);
1018  RepairGradients(&in_deriv_reshaped, to_update);
1019  return;
1020  }
1021 
1022  // By now we know that in_deriv->NumCols() == block_dim.
1023 
1024  if (RandUniform() > repair_probability)
1025  return;
1026 
1027  to_update->num_dims_processed_ += block_dim;
1028 
1029  // check that the self-repair scale is in a reasonable range.
1030  KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
1031  BaseFloat unset = kUnsetThreshold; // -1000.0
1032  BaseFloat count = count_,
1033  lower_threshold = (self_repair_lower_threshold_ == unset ?
1034  default_lower_threshold :
1035  self_repair_lower_threshold_) * count,
1036  upper_threshold = (self_repair_upper_threshold_ == unset ?
1037  default_upper_threshold :
1038  self_repair_upper_threshold_) * count;
1039 
1040  CuMatrix<BaseFloat> storage(2, block_dim + 2, kUndefined);
1041  CuSubVector<BaseFloat> thresholds_vec(storage.RowData(0) + block_dim, 2);
1042  CuSubMatrix<BaseFloat> stats_mat(storage, 0, 2, 0, block_dim);
1043  thresholds_vec(0) = -lower_threshold;
1044  thresholds_vec(1) = -upper_threshold;
1045  CuSubVector<BaseFloat> row0(stats_mat, 0);
1046  CuSubVector<BaseFloat> row1(stats_mat, 1);
1047 
1048  if (block_dim == dim) {
1049  row0.CopyFromVec(deriv_sum_);
1050  } else {
1051  CuSubMatrix<double> deriv_sum_mat(deriv_sum_.Data(),
1052  dim / block_dim,
1053  block_dim, block_dim);
1054  CuVector<double> deriv_sum_dbl(block_dim);
1055  // get the average of the deriv-sums over the blocks.
1056  deriv_sum_dbl.AddRowSumMat(block_dim * 1.0 / dim, deriv_sum_mat);
1057  row0.CopyFromVec(deriv_sum_dbl);
1058  }
1059  row1.CopyFromVec(row0);
1060  stats_mat.AddVecToCols(1.0, thresholds_vec, 1.0);
1061  // now row0 equals stats - lower_threshold, and
1062  // row1 equals stats - upper_threshold.
1063  stats_mat.ApplyHeaviside();
1064  // now row0 equals (stats > lower_threshold ? 1 : 0), and
1065  // row1 equals (stats > upper_threshold ? 1 : 0).
1066  // what we want is:
1067  // self_repair_scale * ((stats <= lower_threshold ? 1 : 0) +
1068  // (stats > upper_threshold ? -1 : 0)).
1069  //
1070  // we can get these in stats_mat.Row(0) by computing:
1071  // -self_repair_scale * (stats_mat.Row(1) + stats_mat.Row(0) - 1).
1072  row0.AddVec(1.0, row1, 1.0);
1073  row0.Add(-1.0);
1074  CuVector<BaseFloat> temp(row0);
1075  temp.ApplyPow(2.0);
1076  to_update->num_dims_self_repaired_ += temp.Sum();
1077  // [actually we need to divide by repair_probability also, to
1078  // correct for the fact that we only do this on some frames.]
1079  row0.Scale(-self_repair_scale_ / repair_probability);
1080  in_deriv->AddVecToRows(1.0, row0, 1.0);
1081 }
1082 
1083 
1084 void RectifiedLinearComponent::StoreStats(
1085  const CuMatrixBase<BaseFloat> &in_value,
1086  const CuMatrixBase<BaseFloat> &out_value,
1087  void *memo) {
1088  // Only store stats about every other minibatch (but on the first minibatch,
1089  // always store it, which is necessary for the ConsolidateMemory() operation
1090  // to work correctly.)
1091  if (RandInt(0, 1) == 0 && count_ != 0)
1092  return;
1093  CuMatrix<BaseFloat> temp_deriv(out_value.NumRows(),
1094  out_value.NumCols(),
1095  kUndefined);
1096  temp_deriv.Heaviside(out_value);
1097  StoreStatsInternal(out_value, &temp_deriv);
1098 }
1099 
1100 void AffineComponent::Scale(BaseFloat scale) {
1101  if (scale == 0.0) {
1102  // If scale == 0.0 we call SetZero() which will get rid of NaN's and inf's.
1103  linear_params_.SetZero();
1104  bias_params_.SetZero();
1105  } else {
1106  linear_params_.Scale(scale);
1107  bias_params_.Scale(scale);
1108  }
1109 }
1110 
1111 void AffineComponent::Resize(int32 input_dim, int32 output_dim) {
1112  KALDI_ASSERT(input_dim > 0 && output_dim > 0);
1113  bias_params_.Resize(output_dim);
1114  linear_params_.Resize(output_dim, input_dim);
1115 }
1116 
1117 void AffineComponent::Add(BaseFloat alpha, const Component &other_in) {
1118  const AffineComponent *other =
1119  dynamic_cast<const AffineComponent*>(&other_in);
1120  KALDI_ASSERT(other != NULL);
1121  linear_params_.AddMat(alpha, other->linear_params_);
1122  bias_params_.AddVec(alpha, other->bias_params_);
1123 }
1124 
1125 AffineComponent::AffineComponent(const AffineComponent &component):
1126  UpdatableComponent(component),
1127  linear_params_(component.linear_params_),
1128  bias_params_(component.bias_params_),
1129  orthonormal_constraint_(component.orthonormal_constraint_) { }
1130 
1131 AffineComponent::AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
1132  const CuVectorBase<BaseFloat> &bias_params,
1133  BaseFloat learning_rate):
1134  linear_params_(linear_params),
1135  bias_params_(bias_params),
1136  orthonormal_constraint_(0.0) {
1137  SetUnderlyingLearningRate(learning_rate);
1138  KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim()&&
1139  bias_params.Dim() != 0);
1140 }
1141 
1142 void AffineComponent::SetParams(const CuVectorBase<BaseFloat> &bias,
1143  const CuMatrixBase<BaseFloat> &linear) {
1144  bias_params_ = bias;
1145  linear_params_ = linear;
1146  KALDI_ASSERT(bias_params_.Dim() == linear_params_.NumRows());
1147 }
1148 
1149 void AffineComponent::PerturbParams(BaseFloat stddev) {
1150  CuMatrix<BaseFloat> temp_linear_params(linear_params_);
1151  temp_linear_params.SetRandn();
1152  linear_params_.AddMat(stddev, temp_linear_params);
1153 
1154  CuVector<BaseFloat> temp_bias_params(bias_params_);
1155  temp_bias_params.SetRandn();
1156  bias_params_.AddVec(stddev, temp_bias_params);
1157 }
1158 
1159 std::string AffineComponent::Info() const {
1160  std::ostringstream stream;
1161  stream << UpdatableComponent::Info();
1162  if (orthonormal_constraint_ != 0.0)
1163  stream << ", orthonormal-constraint=" << orthonormal_constraint_;
1164  PrintParameterStats(stream, "linear-params", linear_params_,
1165  false, // include_mean
1166  true, // include_row_norms
1167  true, // include_column_norms
1168  GetVerboseLevel() >= 2); // include_singular_values
1169  PrintParameterStats(stream, "bias", bias_params_, true);
1170  return stream.str();
1171 }
1172 
1173 Component* AffineComponent::Copy() const {
1174  AffineComponent *ans = new AffineComponent(*this);
1175  return ans;
1176 }
1177 
1178 BaseFloat AffineComponent::DotProduct(const UpdatableComponent &other_in) const {
1179  const AffineComponent *other =
1180  dynamic_cast<const AffineComponent*>(&other_in);
1181  return TraceMatMat(linear_params_, other->linear_params_, kTrans)
1182  + VecVec(bias_params_, other->bias_params_);
1183 }
1184 
1185 void AffineComponent::Init(int32 input_dim, int32 output_dim,
1186  BaseFloat param_stddev, BaseFloat bias_stddev) {
1187  linear_params_.Resize(output_dim, input_dim);
1188  bias_params_.Resize(output_dim);
1189  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
1190  linear_params_.SetRandn(); // sets to random normally distributed noise.
1191  linear_params_.Scale(param_stddev);
1192  bias_params_.SetRandn();
1193  bias_params_.Scale(bias_stddev);
1194 }
1195 
1196 void AffineComponent::Init(std::string matrix_filename) {
1197  CuMatrix<BaseFloat> mat;
1198  ReadKaldiObject(matrix_filename, &mat); // will abort on failure.
1199  KALDI_ASSERT(mat.NumCols() >= 2);
1200  int32 input_dim = mat.NumCols() - 1, output_dim = mat.NumRows();
1201  linear_params_.Resize(output_dim, input_dim);
1202  bias_params_.Resize(output_dim);
1203  linear_params_.CopyFromMat(mat.Range(0, output_dim, 0, input_dim));
1204  bias_params_.CopyColFromMat(mat, input_dim);
1205 }
1206 
1207 void AffineComponent::InitFromConfig(ConfigLine *cfl) {
1208  bool ok = true;
1209  std::string matrix_filename;
1210  int32 input_dim = -1, output_dim = -1;
1211  InitLearningRatesFromConfig(cfl);
1212  if (cfl->GetValue("matrix", &matrix_filename)) {
1213  Init(matrix_filename);
1214  if (cfl->GetValue("input-dim", &input_dim))
1215  KALDI_ASSERT(input_dim == InputDim() &&
1216  "input-dim mismatch vs. matrix.");
1217  if (cfl->GetValue("output-dim", &output_dim))
1218  KALDI_ASSERT(output_dim == OutputDim() &&
1219  "output-dim mismatch vs. matrix.");
1220  } else {
1221  ok = ok && cfl->GetValue("input-dim", &input_dim);
1222  ok = ok && cfl->GetValue("output-dim", &output_dim);
1223  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
1224  bias_stddev = 1.0;
1225  cfl->GetValue("param-stddev", &param_stddev);
1226  cfl->GetValue("bias-stddev", &bias_stddev);
1227  Init(input_dim, output_dim,
1228  param_stddev, bias_stddev);
1229  }
1230  cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_);
1231 
1232  if (cfl->HasUnusedValues())
1233  KALDI_ERR << "Could not process these elements in initializer: "
1234  << cfl->UnusedValues();
1235  if (!ok)
1236  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
1237 }
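// An illustrative config line (values are made up): "input-dim=512 output-dim=512
// param-stddev=0.044 bias-stddev=1.0"; alternatively, "matrix=<filename>" can be
// given to initialize the parameters from a matrix on disk, as handled above.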
1238 
1239 
1240 
1241 
1242 void* AffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
1243  const CuMatrixBase<BaseFloat> &in,
1244  CuMatrixBase<BaseFloat> *out) const {
1245 
1246  // No need for asserts as they'll happen within the matrix operations.
1247  out->CopyRowsFromVec(bias_params_); // copies bias_params_ to each row
1248  // of *out.
1249  out->AddMatMat(1.0, in, kNoTrans, linear_params_, kTrans, 1.0);
1250  return NULL;
1251 }
1252 
1253 void AffineComponent::UpdateSimple(const CuMatrixBase<BaseFloat> &in_value,
1254  const CuMatrixBase<BaseFloat> &out_deriv) {
1255  bias_params_.AddRowSumMat(learning_rate_, out_deriv, 1.0);
1256  linear_params_.AddMatMat(learning_rate_, out_deriv, kTrans,
1257  in_value, kNoTrans, 1.0);
1258 }
1259 
1260 void AffineComponent::Backprop(const std::string &debug_info,
1261  const ComponentPrecomputedIndexes *indexes,
1262  const CuMatrixBase<BaseFloat> &in_value,
1263  const CuMatrixBase<BaseFloat> &, // out_value
1264  const CuMatrixBase<BaseFloat> &out_deriv,
1265  void *memo,
1266  Component *to_update_in,
1267  CuMatrixBase<BaseFloat> *in_deriv) const {
1268  NVTX_RANGE("AffineComponent::Backprop");
1269  AffineComponent *to_update = dynamic_cast<AffineComponent*>(to_update_in);
1270 
1271  // Propagate the derivative back to the input.
1272  // add with coefficient 1.0 since property kBackpropAdds is true.
1273  // If we wanted to add with coefficient 0.0 we'd need to zero the
1274  // in_deriv, in case of infinities.
1275  if (in_deriv)
1276  in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, linear_params_, kNoTrans,
1277  1.0);
1278 
1279  if (to_update != NULL) {
1280  // Next update the model (must do this 2nd so the derivatives we propagate
1281  // are accurate, in case this == to_update_in.)
1282  if (to_update->is_gradient_)
1283  to_update->UpdateSimple(in_value, out_deriv);
1284  else // the call below is to a virtual function that may be re-implemented
1285  to_update->Update(debug_info, in_value, out_deriv); // by child classes.
1286  }
1287 }
1288 
1289 void AffineComponent::Read(std::istream &is, bool binary) {
1290  ReadUpdatableCommon(is, binary); // read opening tag and learning rate.
1291  ExpectToken(is, binary, "<LinearParams>");
1292  linear_params_.Read(is, binary);
1293  ExpectToken(is, binary, "<BiasParams>");
1294  bias_params_.Read(is, binary);
1295  if (PeekToken(is, binary) == 'I') {
1296  // for back compatibility; we don't write this here any
1297  // more as it's written and read in Write/ReadUpdatableCommon
1298  ExpectToken(is, binary, "<IsGradient>");
1299  ReadBasicType(is, binary, &is_gradient_);
1300  }
1301  if (PeekToken(is, binary) == 'O') {
1302  ExpectToken(is, binary, "<OrthonormalConstraint>");
1303  ReadBasicType(is, binary, &orthonormal_constraint_);
1304  } else {
1305  orthonormal_constraint_ = 0.0;
1306  }
1307  ExpectToken(is, binary, "</AffineComponent>");
1308 }
1309 
1310 void AffineComponent::Write(std::ostream &os, bool binary) const {
1311  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate
1312  WriteToken(os, binary, "<LinearParams>");
1313  linear_params_.Write(os, binary);
1314  WriteToken(os, binary, "<BiasParams>");
1315  bias_params_.Write(os, binary);
1316  if (orthonormal_constraint_ != 0.0) {
1317  WriteToken(os, binary, "<OrthonormalConstraint>");
1318  WriteBasicType(os, binary, orthonormal_constraint_);
1319  }
1320  WriteToken(os, binary, "</AffineComponent>");
1321 }
1322 
1323 int32 AffineComponent::NumParameters() const {
1324  return (InputDim() + 1) * OutputDim();
1325 }
1326 void AffineComponent::Vectorize(VectorBase<BaseFloat> *params) const {
1327  KALDI_ASSERT(params->Dim() == this->NumParameters());
1328  params->Range(0, InputDim() * OutputDim()).CopyRowsFromMat(linear_params_);
1329  params->Range(InputDim() * OutputDim(),
1330  OutputDim()).CopyFromVec(bias_params_);
1331 }
1332 void AffineComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
1333  KALDI_ASSERT(params.Dim() == this->NumParameters());
1334  linear_params_.CopyRowsFromVec(params.Range(0, InputDim() * OutputDim()));
1335  bias_params_.CopyFromVec(params.Range(InputDim() * OutputDim(),
1336  OutputDim()));
1337 }
1338 
1339 RepeatedAffineComponent::RepeatedAffineComponent(const RepeatedAffineComponent &component):
1340  UpdatableComponent(component),
1341  linear_params_(component.linear_params_),
1342  bias_params_(component.bias_params_),
1343  num_repeats_(component.num_repeats_) {}
1344 
1345 
1346 void RepeatedAffineComponent::Scale(BaseFloat scale) {
1347  if (scale == 0.0) {
1348  linear_params_.SetZero();
1349  bias_params_.SetZero();
1350  } else {
1351  linear_params_.Scale(scale);
1352  bias_params_.Scale(scale);
1353  }
1354 }
1355 
1356 void RepeatedAffineComponent::Add(BaseFloat alpha, const Component &other_in) {
1357  const RepeatedAffineComponent *other =
1358  dynamic_cast<const RepeatedAffineComponent *>(&other_in);
1359  KALDI_ASSERT(other != NULL);
1360  linear_params_.AddMat(alpha, other->linear_params_);
1361  bias_params_.AddVec(alpha, other->bias_params_);
1362 }
1363 
1364 void RepeatedAffineComponent::PerturbParams(BaseFloat stddev) {
1365  CuMatrix<BaseFloat> temp_linear_params(linear_params_);
1366  temp_linear_params.SetRandn();
1367  linear_params_.AddMat(stddev, temp_linear_params);
1368  CuVector<BaseFloat> temp_bias_params(bias_params_);
1369  temp_bias_params.SetRandn();
1370  bias_params_.AddVec(stddev, temp_bias_params);
1371 }
1372 
1373 std::string RepeatedAffineComponent::Info() const {
1374  std::ostringstream stream;
1375  stream << UpdatableComponent::Info()
1376  << ", num-repeats=" << num_repeats_;
1377  PrintParameterStats(stream, "linear-params", linear_params_);
1378  PrintParameterStats(stream, "bias", bias_params_, true);
1379  return stream.str();
1380 }
1381 
1382 Component* RepeatedAffineComponent::Copy() const {
1383  RepeatedAffineComponent *ans = new RepeatedAffineComponent(*this);
1384  return ans;
1385 }
1386 
1387 BaseFloat RepeatedAffineComponent::DotProduct(const UpdatableComponent &other_in) const {
1388  const RepeatedAffineComponent *other =
1389  dynamic_cast<const RepeatedAffineComponent*>(&other_in);
1390  return TraceMatMat(linear_params_, other->linear_params_, kTrans)
1391  + VecVec(bias_params_, other->bias_params_);
1392 }
1393 
1394 void RepeatedAffineComponent::Init(int32 input_dim, int32 output_dim, int32 num_repeats,
1395  BaseFloat param_stddev, BaseFloat bias_mean,
1396  BaseFloat bias_stddev) {
1397  KALDI_ASSERT(input_dim % num_repeats == 0 && output_dim % num_repeats == 0);
1398  linear_params_.Resize(output_dim / num_repeats, input_dim / num_repeats);
1399  bias_params_.Resize(output_dim / num_repeats);
1400  num_repeats_ = num_repeats;
1401  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
1402  linear_params_.SetRandn(); // sets to random normally distributed noise.
1403  linear_params_.Scale(param_stddev);
1404  bias_params_.SetRandn();
1405  bias_params_.Scale(bias_stddev);
1406  bias_params_.Add(bias_mean);
1407  SetNaturalGradientConfigs();
1408 }
1409 
1410 
1411 void RepeatedAffineComponent::InitFromConfig(ConfigLine *cfl) {
1412  bool ok = true;
1413  int32 num_repeats = num_repeats_;
1414  int32 input_dim = -1, output_dim = -1;
1415  InitLearningRatesFromConfig(cfl);
1416  ok = cfl->GetValue("num-repeats", &num_repeats) && ok;
1417  ok = cfl->GetValue("input-dim", &input_dim) && ok;
1418  ok = cfl->GetValue("output-dim", &output_dim) && ok;
1419  KALDI_ASSERT(input_dim % num_repeats == 0 &&
1420  "num-repeats must divide input-dim");
1421  KALDI_ASSERT(output_dim % num_repeats == 0 &&
1422  "num-repeats must divide output-dim");
1423  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_repeats),
1424  bias_mean = 0.0, bias_stddev = 0.0;
1425  cfl->GetValue("param-stddev", &param_stddev);
1426  cfl->GetValue("bias-mean", &bias_mean);
1427  cfl->GetValue("bias-stddev", &bias_stddev);
1428  Init(input_dim, output_dim,
1429  num_repeats, param_stddev, bias_mean, bias_stddev);
1430  if (cfl->HasUnusedValues())
1431  KALDI_ERR << "Could not process these elements in initializer: "
1432  << cfl->UnusedValues();
1433  if (!ok)
1434  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
1435 }
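// An illustrative config line (values are made up): "input-dim=200 output-dim=100
// num-repeats=20 param-stddev=0.3 bias-mean=0.0 bias-stddev=0.0"; num-repeats
// must divide both input-dim and output-dim, as asserted above.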
1436 
1437 void* RepeatedAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
1438  const CuMatrixBase<BaseFloat> &in,
1439  CuMatrixBase<BaseFloat> *out) const {
1440  // we gave the kInputContiguous and kOutputContiguous flags-- check that they
1441  // are honored.
1442  KALDI_ASSERT(in.NumCols() == in.Stride() &&
1443  out->NumCols() == out->Stride() &&
1444  out->NumRows() == in.NumRows());
1445 
1446  int32 num_repeats = num_repeats_,
1447  num_rows = in.NumRows(),
1448  block_dim_out = linear_params_.NumRows(),
1449  block_dim_in = linear_params_.NumCols();
1450 
1451  CuSubMatrix<BaseFloat> in_reshaped(in.Data(), num_rows * num_repeats,
1452  block_dim_in, block_dim_in),
1453  out_reshaped(out->Data(), num_rows * num_repeats,
1454  block_dim_out, block_dim_out);
1455 
1456  out_reshaped.CopyRowsFromVec(bias_params_);
1457 
1458  out_reshaped.AddMatMat(1.0, in_reshaped, kNoTrans,
1459  linear_params_, kTrans, 1.0);
1460  return NULL;
1461 }
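// The reshaping above relies on the rows being contiguous in memory (stride ==
// num-cols, which is asserted at the top of Propagate): a matrix of shape
// [num_rows, num_repeats * block_dim] is viewed as
// [num_rows * num_repeats, block_dim], so a single AddMatMat applies the shared
// block of parameters to every repeat.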
1462 
1463 void RepeatedAffineComponent::Backprop(const std::string &debug_info,
1464  const ComponentPrecomputedIndexes *indexes,
1465  const CuMatrixBase<BaseFloat> &in_value,
1466  const CuMatrixBase<BaseFloat> &, // out_value
1467  const CuMatrixBase<BaseFloat> &out_deriv,
1468  void *memo,
1469  Component *to_update_in,
1470  CuMatrixBase<BaseFloat> *in_deriv) const {
1471  NVTX_RANGE("RepeatedAffineComponent::Backprop");
1472  KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() &&
1473  (in_value.NumCols() == 0 || in_value.NumCols() == in_value.Stride()) &&
1474  (!in_deriv || in_deriv->NumCols() == in_deriv->Stride()));
1475 
1476  RepeatedAffineComponent *to_update = dynamic_cast<RepeatedAffineComponent*>(
1477  to_update_in);
1478 
1479  // Propagate the derivative back to the input.
1480  // add with coefficient 1.0 since property kBackpropAdds is true.
1481  // If we wanted to add with coefficient 0.0 we'd need to zero the
1482  // in_deriv, in case of infinities.
1483  if (in_deriv) {
1484  int32 num_repeats = num_repeats_,
1485  num_rows = out_deriv.NumRows(),
1486  block_dim_out = linear_params_.NumRows(),
1487  block_dim_in = linear_params_.NumCols();
1488 
1489  CuSubMatrix<BaseFloat> in_deriv_reshaped(in_deriv->Data(),
1490  num_rows * num_repeats,
1491  block_dim_in, block_dim_in),
1492  out_deriv_reshaped(out_deriv.Data(),
1493  num_rows * num_repeats,
1494  block_dim_out, block_dim_out);
1495  in_deriv_reshaped.AddMatMat(1.0, out_deriv_reshaped, kNoTrans,
1496  linear_params_, kNoTrans, 1.0);
1497  }
1498 
1499  // Next update the model (must do this 2nd so the derivatives we propagate are
1500  // accurate, in case this == to_update_in.)
1501  if (to_update != NULL)
1502  to_update->Update(in_value, out_deriv);
1503 }
1504 
1505 void RepeatedAffineComponent::Update(const CuMatrixBase<BaseFloat> &in_value,
1506  const CuMatrixBase<BaseFloat> &out_deriv) {
1507  KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() &&
1508  in_value.NumCols() == in_value.Stride() &&
1509  in_value.NumRows() == out_deriv.NumRows());
1510 
1511 
1512  int32 num_repeats = num_repeats_,
1513  num_rows = in_value.NumRows(),
1514  block_dim_out = linear_params_.NumRows(),
1515  block_dim_in = linear_params_.NumCols();
1516 
1517  CuSubMatrix<BaseFloat> in_value_reshaped(in_value.Data(),
1518  num_rows * num_repeats,
1519  block_dim_in, block_dim_in),
1520  out_deriv_reshaped(out_deriv.Data(),
1521  num_rows * num_repeats,
1522  block_dim_out, block_dim_out);
1523 
1524 
1525  linear_params_.AddMatMat(learning_rate_, out_deriv_reshaped, kTrans,
1526  in_value_reshaped, kNoTrans, 1.0);
1527  bias_params_.AddRowSumMat(learning_rate_,
1528  out_deriv_reshaped);
1529 }
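// [Illustrative sketch, not part of the original source.]  Because every
// repeat shares the same parameters, the parameter gradient is the sum of the
// per-block outer products over all rows and all repeats; the single AddMatMat
// and AddRowSumMat on the reshaped matrices above compute exactly this.  A
// scalar reference with hypothetical names:
static void RepeatedAffineUpdateReference(const float *in_reshaped,        // (num_rows*num_repeats) x block_dim_in
                                          const float *out_deriv_reshaped, // (num_rows*num_repeats) x block_dim_out
                                          int num_reshaped_rows, int block_dim_in,
                                          int block_dim_out, float learning_rate,
                                          float *linear_params,  // block_dim_out x block_dim_in, added to
                                          float *bias_params) {  // block_dim_out, added to
  for (int t = 0; t < num_reshaped_rows; t++) {
    for (int i = 0; i < block_dim_out; i++) {
      float d = out_deriv_reshaped[t * block_dim_out + i];
      bias_params[i] += learning_rate * d;
      for (int j = 0; j < block_dim_in; j++)
        linear_params[i * block_dim_in + j] +=
            learning_rate * d * in_reshaped[t * block_dim_in + j];
    }
  }
}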
1530 
1531 void RepeatedAffineComponent::Read(std::istream &is, bool binary) {
1532  // This Read function also works for NaturalGradientRepeatedAffineComponent.
1533  ReadUpdatableCommon(is, binary); // read opening tag and learning rate.
1534  ExpectToken(is, binary, "<NumRepeats>");
1535  ReadBasicType(is, binary, &num_repeats_);
1536  ExpectToken(is, binary, "<LinearParams>");
1537  linear_params_.Read(is, binary);
1538  ExpectToken(is, binary, "<BiasParams>");
1539  bias_params_.Read(is, binary);
1540  if (PeekToken(is, binary) == 'I') {
1541  // for back compatibility; we don't write this here any
1542  // more as it's written and read in Write/ReadUpdatableCommon
1543  ExpectToken(is, binary, "<IsGradient>");
1544  ReadBasicType(is, binary, &is_gradient_);
1545  }
1546  ExpectToken(is, binary, std::string("</") + Type() + std::string(">"));
1548 }
1549 
1550 void RepeatedAffineComponent::Write(std::ostream &os, bool binary) const {
1551  // This Write function also works for NaturalGradientRepeatedAffineComponent.
1552  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate
1553  WriteToken(os, binary, "<NumRepeats>");
1554  WriteBasicType(os, binary, num_repeats_);
1555  WriteToken(os, binary, "<LinearParams>");
1556  linear_params_.Write(os, binary);
1557  WriteToken(os, binary, "<BiasParams>");
1558  bias_params_.Write(os, binary);
1559  // write closing token.
1560  WriteToken(os, binary, std::string("</") + Type() + std::string(">"));
1561 }
1562 
1563 int32 RepeatedAffineComponent::NumParameters() const {
1564  // Note: unlike AffineComponent, InputDim() & OutputDim() are not used here and below,
1565  // because they are multiplied by num_repeats_.
1566  return linear_params_.NumCols() * linear_params_.NumRows() + bias_params_.Dim();
1567 }
1568 
1570  KALDI_ASSERT(params->Dim() == this->NumParameters());
1571  params->Range(0, linear_params_.NumCols() * linear_params_.NumRows()).CopyRowsFromMat(linear_params_);
1572  params->Range(linear_params_.NumCols() * linear_params_.NumRows(),
1573  bias_params_.Dim()).CopyFromVec(bias_params_);
1574 }
1575 
1577  KALDI_ASSERT(params.Dim() == this->NumParameters());
1578  linear_params_.CopyRowsFromVec(params.Range(0, linear_params_.NumCols() * linear_params_.NumRows()));
1579  bias_params_.CopyFromVec(params.Range(linear_params_.NumCols() * linear_params_.NumRows(),
1580  bias_params_.Dim()));
1581 }
1582 
1583 void NaturalGradientRepeatedAffineComponent::SetNaturalGradientConfigs() {
1584  int32 rank_in = 40;
1585  int32 input_dim = linear_params_.NumCols();
1586  if (rank_in > input_dim / 2)
1587  rank_in = input_dim / 2;
1588  if (rank_in < 1)
1589  rank_in = 1;
1590  preconditioner_in_.SetRank(rank_in);
1591  preconditioner_in_.SetUpdatePeriod(4);
1592 }
1593 
1594 NaturalGradientRepeatedAffineComponent::NaturalGradientRepeatedAffineComponent(
1595  const NaturalGradientRepeatedAffineComponent &other):
1596  RepeatedAffineComponent(other),
1597  preconditioner_in_(other.preconditioner_in_) { }
1598 
1599 // virtual
1601  return new NaturalGradientRepeatedAffineComponent(*this);
1602 }
1603 
1604 void NaturalGradientRepeatedAffineComponent::Update(
1605  const CuMatrixBase<BaseFloat> &in_value,
1606  const CuMatrixBase<BaseFloat> &out_deriv) {
1607  KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() &&
1608  in_value.NumCols() == in_value.Stride() &&
1609  in_value.NumRows() == out_deriv.NumRows());
1610 
1611  int32 num_repeats = num_repeats_,
1612  num_rows = in_value.NumRows(),
1613  block_dim_out = linear_params_.NumRows(),
1614  block_dim_in = linear_params_.NumCols();
1615 
1616  CuSubMatrix<BaseFloat> in_value_reshaped(in_value.Data(),
1617  num_rows * num_repeats,
1618  block_dim_in, block_dim_in),
1619  out_deriv_reshaped(out_deriv.Data(),
1620  num_rows * num_repeats,
1621  block_dim_out, block_dim_out);
1622 
1623  CuVector<BaseFloat> bias_deriv(block_dim_out);
1624  bias_deriv.AddRowSumMat(1.0, out_deriv_reshaped);
1625 
1626  CuMatrix<BaseFloat> deriv(block_dim_out,
1627  block_dim_in + 1);
1628  deriv.ColRange(0, block_dim_in).AddMatMat(
1629  1.0, out_deriv_reshaped, kTrans,
1630  in_value_reshaped, kNoTrans, 1.0);
1631  deriv.CopyColFromVec(bias_deriv, block_dim_in);
1632 
1633  BaseFloat scale = 1.0;
1634  if (!is_gradient_) {
1635  try {
1636  // Only apply the preconditioning/natural-gradient if we're not computing
1637  // the exact gradient.
1638  preconditioner_in_.PreconditionDirections(&deriv, &scale);
1639  } catch (...) {
1640  int32 num_bad_rows = 0;
1641  for (int32 i = 0; i < out_deriv.NumRows(); i++) {
1642  BaseFloat f = out_deriv.Row(i).Sum();
1643  if (!(f - f == 0)) num_bad_rows++;
1644  }
1645  KALDI_ERR << "Preconditioning failed, in_value sum is "
1646  << in_value.Sum() << ", out_deriv sum is " << out_deriv.Sum()
1647  << ", out_deriv has " << num_bad_rows << " bad rows.";
1648  }
1649  }
1650  linear_params_.AddMat(learning_rate_ * scale,
1651  deriv.ColRange(0, block_dim_in));
1652  bias_deriv.CopyColFromMat(deriv, block_dim_in);
1653  bias_params_.AddVec(learning_rate_ * scale, bias_deriv);
1654 }
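// [Illustrative sketch, not part of the original source.]  The update above
// packs the linear-parameter derivative and the bias derivative side by side
// into one block_dim_out x (block_dim_in + 1) matrix, so a single
// natural-gradient preconditioner rescales them jointly; the returned 'scale'
// is then folded into the learning rate instead of rescaling the matrix.  The
// packing in scalar form, with hypothetical names:
static void PackDerivWithBias(const float *linear_deriv,  // block_dim_out x block_dim_in
                              const float *bias_deriv,    // block_dim_out
                              int block_dim_out, int block_dim_in,
                              float *packed) {            // block_dim_out x (block_dim_in + 1)
  int packed_cols = block_dim_in + 1;
  for (int i = 0; i < block_dim_out; i++) {
    for (int j = 0; j < block_dim_in; j++)
      packed[i * packed_cols + j] = linear_deriv[i * block_dim_in + j];
    packed[i * packed_cols + block_dim_in] = bias_deriv[i];  // last column holds the bias derivative.
  }
}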
1655 
1656 void NaturalGradientRepeatedAffineComponent::ConsolidateMemory() {
1657  OnlineNaturalGradient temp(preconditioner_in_);
1658  preconditioner_in_.Swap(&temp);
1659 }
1660 
1661 
1663  UpdatableComponent(other),
1665  bias_params_(other.bias_params_),
1666  num_blocks_(other.num_blocks_) {}
1667 
1669  UpdatableComponent(rac),
1670  linear_params_(rac.num_repeats_ * rac.linear_params_.NumRows(),
1671  rac.linear_params_.NumCols(), kUndefined),
1672  bias_params_(rac.num_repeats_ * rac.linear_params_.NumRows(), kUndefined),
1673  num_blocks_(rac.num_repeats_) {
1674  // copy rac's linear_params_ and bias_params_ to this.
1675  int32 num_rows_in_block = rac.linear_params_.NumRows();
1676  for(int32 block_counter = 0; block_counter < num_blocks_; block_counter++) {
1677  int32 row_offset = block_counter * num_rows_in_block;
1678  CuSubMatrix<BaseFloat> block = this->linear_params_.RowRange(row_offset,
1679  num_rows_in_block);
1680  block.CopyFromMat(rac.linear_params_);
1681  CuSubVector<BaseFloat> block_bias = this->bias_params_.Range(row_offset,
1682  num_rows_in_block);
1683  block_bias.CopyFromVec(rac.bias_params_);
1684  }
1685 }
1686 
1688  BlockAffineComponent *ans = new BlockAffineComponent(*this);
1689  return ans;
1690 }
1691 
1692 std::string BlockAffineComponent::Info() const {
1693  std::ostringstream stream;
1694  stream << UpdatableComponent::Info()
1695  << ", num-blocks=" << num_blocks_;
1696  PrintParameterStats(stream, "linear-params", linear_params_);
1697  PrintParameterStats(stream, "bias", bias_params_, true);
1698  return stream.str();
1699 }
1700 
1702  int32 output_dim, int32 num_blocks,
1703  BaseFloat param_stddev, BaseFloat bias_mean,
1704  BaseFloat bias_stddev) {
1705  KALDI_ASSERT(input_dim > 0 && output_dim > 0 && num_blocks >= 1);
1706  KALDI_ASSERT(output_dim % num_blocks == 0 && input_dim % num_blocks == 0);
1707  const int32 num_columns_per_block = input_dim / num_blocks;
1708  linear_params_.Resize(output_dim, num_columns_per_block);
1709  bias_params_.Resize(output_dim);
1710  KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0);
1711  linear_params_.SetRandn();
1712  linear_params_.Scale(param_stddev);
1713  bias_params_.SetRandn();
1714  bias_params_.Scale(bias_stddev);
1715  bias_params_.Add(bias_mean);
1716  num_blocks_ = num_blocks;
1717 }
1718 
1720  int32 input_dim = -1, output_dim = -1, num_blocks = -1;
1721  if(!cfl->GetValue("input-dim", &input_dim) ||
1722  !cfl->GetValue("output-dim", &output_dim) ||
1723  !cfl->GetValue("num-blocks", &num_blocks))
1724  KALDI_ERR << "Invalid initializer for layer of type "
1725  << Type() << ": \"" << cfl->WholeLine() << "\"";
1727  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_blocks),
1728  bias_mean = 0.0, bias_stddev = 1.0;
1729  cfl->GetValue("param-stddev", &param_stddev);
1730  cfl->GetValue("bias-stddev", &bias_stddev);
1731  cfl->GetValue("bias-mean", &bias_mean);
1732 
1733  if (cfl->HasUnusedValues())
1734  KALDI_ERR << "Invalid initializer for layer of type "
1735  << Type() << ": \"" << cfl->WholeLine() << "\"";
1736 
1737  Init(input_dim, output_dim, num_blocks,
1738  param_stddev, bias_mean, bias_stddev);
1739 }
1740 
1741 void* BlockAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
1742  const CuMatrixBase<BaseFloat> &in,
1743  CuMatrixBase<BaseFloat> *out) const {
1744  out->CopyRowsFromVec(bias_params_);
1745  // block_dimension is both the number of columns, and the number of rows,
1746  // of a block.
1747  int32 num_rows_in_block = linear_params_.NumRows() / num_blocks_;
1748  int32 num_cols_in_block = linear_params_.NumCols();
1749  std::vector<CuSubMatrix<BaseFloat> *> in_batch, out_batch,
1750  linear_params_batch;
1751  for(int block_counter = 0; block_counter < num_blocks_; block_counter++) {
1752  CuSubMatrix<BaseFloat> *in_block =
1753  new CuSubMatrix<BaseFloat>(in.ColRange(block_counter * num_cols_in_block,
1754  num_cols_in_block));
1755  in_batch.push_back(in_block);
1756 
1757  CuSubMatrix<BaseFloat> *out_block =
1758  new CuSubMatrix<BaseFloat>(out->ColRange(block_counter * num_rows_in_block,
1759  num_rows_in_block));
1760  out_batch.push_back(out_block);
1761 
1762  CuSubMatrix<BaseFloat> *linear_params_block =
1763  new CuSubMatrix<BaseFloat>(linear_params_.RowRange(block_counter * num_rows_in_block,
1764  num_rows_in_block));
1765  linear_params_batch.push_back(linear_params_block);
1766  }
1767  AddMatMatBatched<BaseFloat>(1.0, out_batch, in_batch, kNoTrans,
1768  linear_params_batch, kTrans, 1.0);
1769 
1770  DeletePointers(&in_batch);
1771  DeletePointers(&out_batch);
1772  DeletePointers(&linear_params_batch);
1773  return NULL;
1774 }
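// [Illustrative sketch, not part of the original source.]  Unlike
// RepeatedAffineComponent, each block here owns its own rows of linear_params_
// and its own slice of bias_params_, so the whole transform is block-diagonal;
// the batched GEMM above runs all the per-block products in a single call.
// One output row in scalar form, with hypothetical names:
static void BlockAffineRowReference(const float *in_row,  // num_blocks * cols_per_block
                                    const float *linear,  // (num_blocks * rows_per_block) x cols_per_block
                                    const float *bias,    // num_blocks * rows_per_block
                                    int num_blocks, int rows_per_block,
                                    int cols_per_block,
                                    float *out_row) {     // num_blocks * rows_per_block
  for (int b = 0; b < num_blocks; b++) {
    for (int i = 0; i < rows_per_block; i++) {
      int out_index = b * rows_per_block + i;
      float sum = bias[out_index];  // each block has its own bias entries.
      for (int j = 0; j < cols_per_block; j++)
        sum += linear[out_index * cols_per_block + j] * in_row[b * cols_per_block + j];
      out_row[out_index] = sum;
    }
  }
}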
1775 
1776 void BlockAffineComponent::Backprop(const std::string &debug_info,
1777  const ComponentPrecomputedIndexes *indexes,
1778  const CuMatrixBase<BaseFloat> &in_value,
1779  const CuMatrixBase<BaseFloat> &, // out_value
1780  const CuMatrixBase<BaseFloat> &out_deriv,
1781  void *memo,
1782  Component *to_update_in,
1783  CuMatrixBase<BaseFloat> *in_deriv) const {
1784  NVTX_RANGE("BlockAffineComponent::Backprop");
1785  BlockAffineComponent *to_update = dynamic_cast<BlockAffineComponent*>(to_update_in);
1786 
1787  const int32 num_rows_in_block = linear_params_.NumRows() / num_blocks_;
1788  const int32 num_cols_in_block = linear_params_.NumCols();
1789 
1790  // Propagate the derivative back to the input.
1791  // add with coefficient 1.0 since property kBackpropAdds is true.
1792  // If we wanted to add with coefficient 0.0 we'd need to zero the
1793  // in_deriv, in case of infinities.
1794  if (in_deriv) {
1795  std::vector<CuSubMatrix<BaseFloat> *> in_deriv_batch, out_deriv_batch, linear_params_batch;
1796 
1797  for(int block_counter = 0; block_counter < num_blocks_; block_counter++) {
1798  CuSubMatrix<BaseFloat> *in_deriv_block =
1799  new CuSubMatrix<BaseFloat>(in_deriv->ColRange(block_counter * num_cols_in_block,
1800  num_cols_in_block));
1801  in_deriv_batch.push_back(in_deriv_block);
1802 
1803  CuSubMatrix<BaseFloat> *out_deriv_block =
1804  new CuSubMatrix<BaseFloat>(out_deriv.ColRange(block_counter * num_rows_in_block,
1805  num_rows_in_block));
1806  out_deriv_batch.push_back(out_deriv_block);
1807 
1808  CuSubMatrix<BaseFloat> *linear_params_block =
1809  new CuSubMatrix<BaseFloat>(linear_params_.RowRange(block_counter * num_rows_in_block,
1810  num_rows_in_block));
1811  linear_params_batch.push_back(linear_params_block);
1812  }
1813 
1814  AddMatMatBatched<BaseFloat>(1.0, in_deriv_batch, out_deriv_batch, kNoTrans,
1815  linear_params_batch, kNoTrans, 1.0);
1816 
1817  DeletePointers(&in_deriv_batch);
1818  DeletePointers(&out_deriv_batch);
1819  DeletePointers(&linear_params_batch);
1820  }
1821 
1822  if (to_update != NULL) {
1823 
1824  { // linear params update
1825 
1826  std::vector<CuSubMatrix<BaseFloat> *> in_value_batch,
1827  out_deriv_batch, linear_params_batch;
1828 
1829  for (int block_counter = 0; block_counter < num_blocks_; block_counter++) {
1830  CuSubMatrix<BaseFloat> *in_value_block =
1831  new CuSubMatrix<BaseFloat>(in_value.ColRange(block_counter * num_cols_in_block,
1832  num_cols_in_block));
1833  in_value_batch.push_back(in_value_block);
1834 
1835  CuSubMatrix<BaseFloat> *out_deriv_block =
1836  new CuSubMatrix<BaseFloat>(out_deriv.ColRange(block_counter * num_rows_in_block,
1837  num_rows_in_block));
1838  out_deriv_batch.push_back(out_deriv_block);
1839 
1840  CuSubMatrix<BaseFloat> *linear_params_block =
1841  new CuSubMatrix<BaseFloat>(to_update->linear_params_.RowRange(block_counter * num_rows_in_block,
1842  num_rows_in_block));
1843  linear_params_batch.push_back(linear_params_block);
1844  }
1845 
1846  AddMatMatBatched<BaseFloat>(to_update->learning_rate_,
1847  linear_params_batch,
1848  out_deriv_batch, kTrans,
1849  in_value_batch, kNoTrans, 1.0);
1850 
1851  DeletePointers(&in_value_batch);
1852  DeletePointers(&out_deriv_batch);
1853  DeletePointers(&linear_params_batch);
1854  } // end linear params update
1855 
1856  { // bias update
1857  to_update->bias_params_.AddRowSumMat(to_update->learning_rate_,
1858  out_deriv, 1.0);
1859  } // end bias update
1860  }
1861 }
1862 
1864  if (scale == 0.0) {
1865  linear_params_.SetZero();
1866  bias_params_.SetZero();
1867  } else {
1868  linear_params_.Scale(scale);
1869  bias_params_.Scale(scale);
1870  }
1871 }
1872 
1873 void BlockAffineComponent::Add(BaseFloat alpha, const Component &other_in) {
1874  const BlockAffineComponent *other =
1875  dynamic_cast<const BlockAffineComponent *>(&other_in);
1876  KALDI_ASSERT(other != NULL);
1877  linear_params_.AddMat(alpha, other->linear_params_);
1878  bias_params_.AddVec(alpha, other->bias_params_);
1879 }
1880 
1882  CuMatrix<BaseFloat> temp_linear_params(linear_params_);
1883  temp_linear_params.SetRandn();
1884  linear_params_.AddMat(stddev, temp_linear_params);
1885 
1886  CuVector<BaseFloat> temp_bias_params(bias_params_);
1887  temp_bias_params.SetRandn();
1888  bias_params_.AddVec(stddev, temp_bias_params);
1889 }
1890 
1892  const BlockAffineComponent *other =
1893  dynamic_cast<const BlockAffineComponent*>(&other_in);
1894  return TraceMatMat(linear_params_, other->linear_params_, kTrans) +
1895  VecVec(bias_params_, other->bias_params_);
1896 }
1897 
1898 void BlockAffineComponent::Read(std::istream &is, bool binary) {
1899  ReadUpdatableCommon(is, binary); // read opening tag and learning rate.
1900  ExpectToken(is, binary, "<NumBlocks>");
1901  ReadBasicType(is, binary, &num_blocks_);
1902  ExpectToken(is, binary, "<LinearParams>");
1903  linear_params_.Read(is, binary);
1904  ExpectToken(is, binary, "<BiasParams>");
1905  bias_params_.Read(is, binary);
1906  if (PeekToken(is, binary) == 'I') {
1907  // for back compatibility; we don't write this here any
1908  // more as it's written and read in Write/ReadUpdatableCommon
1909  ExpectToken(is, binary, "<IsGradient>");
1910  ReadBasicType(is, binary, &is_gradient_);
1911  }
1912  ExpectToken(is, binary, "</BlockAffineComponent>");
1913 }
1914 
1915 void BlockAffineComponent::Write(std::ostream &os, bool binary) const {
1916  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate
1917  WriteToken(os, binary, "<NumBlocks>");
1918  WriteBasicType(os, binary, num_blocks_);
1919  WriteToken(os, binary, "<LinearParams>");
1920  linear_params_.Write(os, binary);
1921  WriteToken(os, binary, "<BiasParams>");
1922  bias_params_.Write(os, binary);
1923  WriteToken(os, binary, "</BlockAffineComponent>");
1924 }
1925 
1927  return linear_params_.NumCols() * linear_params_.NumRows() + bias_params_.Dim();
1928 }
1929 
1931  KALDI_ASSERT(params->Dim() == this->NumParameters());
1932  int32 num_linear_params = linear_params_.NumCols() * linear_params_.NumRows();
1933  int32 num_bias_params = bias_params_.Dim();
1934  params->Range(0, num_linear_params).CopyRowsFromMat(linear_params_);
1935  params->Range(num_linear_params, num_bias_params).CopyFromVec(bias_params_);
1936 }
1937 
1939  KALDI_ASSERT(params.Dim() == this->NumParameters());
1940  int32 num_linear_params = linear_params_.NumCols() * linear_params_.NumRows();
1941  int32 num_bias_params = bias_params_.Dim();
1942  linear_params_.CopyRowsFromVec(params.Range(0, num_linear_params));
1943  bias_params_.CopyFromVec(params.Range(num_linear_params, num_bias_params));
1944 }
1945 
1947  if (scale == 0.0) {
1948  scales_.SetZero();
1949  } else {
1950  scales_.Scale(scale);
1951  }
1952 }
1953 
1955  const Component &other_in) {
1956  const PerElementScaleComponent *other =
1957  dynamic_cast<const PerElementScaleComponent*>(&other_in);
1958  KALDI_ASSERT(other != NULL);
1959  scales_.AddVec(alpha, other->scales_);
1960 }
1961 
1963  const PerElementScaleComponent &component):
1964  UpdatableComponent(component),
1965  scales_(component.scales_) { }
1966 
1968  CuVector<BaseFloat> temp_scales(scales_.Dim(), kUndefined);
1969  temp_scales.SetRandn();
1970  scales_.AddVec(stddev, temp_scales);
1971 }
1972 
1973 std::string PerElementScaleComponent::Info() const {
1974  std::ostringstream stream;
1975  stream << UpdatableComponent::Info()
1976  << ", scales-min=" << scales_.Min()
1977  << ", scales-max=" << scales_.Max();
1978  PrintParameterStats(stream, "scales", scales_, true);
1979  return stream.str();
1980 }
1981 
1983  return new PerElementScaleComponent(*this);
1984 }
1985 
1987  const UpdatableComponent &other_in) const {
1988  const PerElementScaleComponent *other =
1989  dynamic_cast<const PerElementScaleComponent*>(&other_in);
1990  return VecVec(scales_, other->scales_);
1991 }
1992 
1994  BaseFloat param_mean,
1995  BaseFloat param_stddev) {
1996  KALDI_ASSERT(dim > 0 && param_stddev >= 0.0);
1997  scales_.Resize(dim);
1998  scales_.SetRandn();
1999  scales_.Scale(param_stddev);
2000  scales_.Add(param_mean);
2001 }
2002 
2003 void PerElementScaleComponent::Init(std::string vector_filename) {
2004  CuVector<BaseFloat> vec;
2005  ReadKaldiObject(vector_filename, &vec); // will abort on failure.
2006  scales_.Resize(vec.Dim());
2007  scales_.CopyFromVec(vec);
2008 }
2009 
2011  std::string vector_filename;
2012  int32 dim = -1;
2014  if (cfl->GetValue("vector", &vector_filename)) {
2015  Init(vector_filename);
2016  if (cfl->GetValue("dim", &dim))
2017  KALDI_ASSERT(dim == InputDim() &&
2018  "input-dim mismatch vs. vector.");
2019  } else {
2020  if(!cfl->GetValue("dim", &dim))
2021  KALDI_ERR << "'dim' not provided in the config line.";
2022  BaseFloat param_mean = 1.0, param_stddev = 0.0;
2023  cfl->GetValue("param-mean", &param_mean);
2024  cfl->GetValue("param-stddev", &param_stddev);
2025  Init(dim, param_mean, param_stddev);
2026  }
2027  if (cfl->HasUnusedValues())
2028  KALDI_ERR << "Could not process these elements in initializer: "
2029  << cfl->UnusedValues();
2030 }
2031 
2033  const ComponentPrecomputedIndexes *indexes,
2034  const CuMatrixBase<BaseFloat> &in,
2035  CuMatrixBase<BaseFloat> *out) const {
2036  out->CopyFromMat(in);
2037  out->MulColsVec(scales_);
2038  return NULL;
2039 }
2040 
2042  const CuMatrixBase<BaseFloat> &in_value,
2043  const CuMatrixBase<BaseFloat> &out_deriv) {
2044  scales_.AddDiagMatMat(learning_rate_, out_deriv, kTrans,
2045  in_value, kNoTrans, 1.0);
2046 }
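// [Illustrative sketch, not part of the original source.]  For y(t,j) =
// scales(j) * x(t,j), the derivative w.r.t. scales(j) is the sum over frames t
// of x(t,j) * dy(t,j), i.e. the diagonal of out_deriv^T * in_value, which is
// what AddDiagMatMat accumulates above.  A scalar reference with hypothetical
// names:
static void PerElementScaleUpdateReference(const float *in_value,   // num_rows x dim
                                           const float *out_deriv,  // num_rows x dim
                                           int num_rows, int dim,
                                           float learning_rate,
                                           float *scales) {         // dim, added to
  for (int j = 0; j < dim; j++) {
    float grad = 0.0f;
    for (int t = 0; t < num_rows; t++)
      grad += in_value[t * dim + j] * out_deriv[t * dim + j];
    scales[j] += learning_rate * grad;
  }
}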
2047 
2049  const std::string &debug_info,
2050  const ComponentPrecomputedIndexes *indexes,
2051  const CuMatrixBase<BaseFloat> &in_value,
2052  const CuMatrixBase<BaseFloat> &, // out_value
2053  const CuMatrixBase<BaseFloat> &out_deriv,
2054  void *memo,
2055  Component *to_update_in,
2056  CuMatrixBase<BaseFloat> *in_deriv) const {
2057  NVTX_RANGE("PerElementScaleComponent::Backprop");
2058  PerElementScaleComponent *to_update =
2059  dynamic_cast<PerElementScaleComponent*>(to_update_in);
2060 
2061  if (to_update != NULL) {
2062  // Next update the model (must do this 2nd so the derivatives we propagate
2063  // are accurate, in case this == to_update_in.)
2064  if (to_update->is_gradient_)
2065  to_update->UpdateSimple(in_value, out_deriv);
2066  else // the call below is to a virtual function that may be re-implemented
2067  to_update->Update(debug_info, in_value, out_deriv); // by child classes.
2068  }
2069 
2070  if (in_deriv) {
2071  // Propagate the derivative back to the input.
2072  if (in_deriv->Data() != out_deriv.Data())
2073  in_deriv->CopyFromMat(out_deriv);
2074  in_deriv->MulColsVec(scales_);
2075  }
2076 }
2077 
2078 void PerElementScaleComponent::Read(std::istream &is, bool binary) {
2079  ReadUpdatableCommon(is, binary); // Read opening tag and learning rate.
2080  ExpectToken(is, binary, "<Params>");
2081  scales_.Read(is, binary);
2082  if (PeekToken(is, binary) == 'I') {
2083  // for back compatibility; we don't write this here any
2084  // more as it's written and read in Write/ReadUpdatableCommon
2085  ExpectToken(is, binary, "<IsGradient>");
2086  ReadBasicType(is, binary, &is_gradient_);
2087  }
2088  ExpectToken(is, binary, "</PerElementScaleComponent>");
2089 }
2090 
2091 void PerElementScaleComponent::Write(std::ostream &os, bool binary) const {
2092  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate.
2093  WriteToken(os, binary, "<Params>");
2094  scales_.Write(os, binary);
2095  WriteToken(os, binary, "</PerElementScaleComponent>");
2096 }
2097 
2099  return InputDim();
2100 }
2101 
2103  params->CopyFromVec(scales_);
2104 }
2105 
2107  const VectorBase<BaseFloat> &params) {
2108  scales_.CopyFromVec(params);
2109 }
2110 
2112  if (scale == 0.0) {
2113  offsets_.SetZero();
2114  } else {
2115  offsets_.Scale(scale);
2116  }
2117 }
2118 
2119 
2121  const Component &other_in) {
2122  const PerElementOffsetComponent *other =
2123  dynamic_cast<const PerElementOffsetComponent*>(&other_in);
2124  KALDI_ASSERT(other != NULL);
2125  offsets_.AddVec(alpha, other->offsets_);
2126 }
2127 
2129  const PerElementOffsetComponent &component):
2130  UpdatableComponent(component),
2131  offsets_(component.offsets_),
2132  dim_(component.dim_),
2133  use_natural_gradient_(component.use_natural_gradient_),
2134  preconditioner_(component.preconditioner_) { }
2135 
2137  CuVector<BaseFloat> temp_offsets(offsets_.Dim(), kUndefined);
2138  temp_offsets.SetRandn();
2139  offsets_.AddVec(stddev, temp_offsets);
2140 }
2141 
2142 std::string PerElementOffsetComponent::Info() const {
2143  std::ostringstream stream;
2144  stream << UpdatableComponent::Info()
2145  << ", offsets-min=" << offsets_.Min()
2146  << ", offsets-max=" << offsets_.Max()
2147  << ", block-dim=" << offsets_.Dim()
2148  << ", use-natural-gradient="
2149  << (use_natural_gradient_ ? "true" : "false");
2150  PrintParameterStats(stream, "offsets", offsets_, true);
2151  return stream.str();
2152 }
2153 
2155  return new PerElementOffsetComponent(*this);
2156 }
2157 
2159  const UpdatableComponent &other_in) const {
2160  const PerElementOffsetComponent *other =
2161  dynamic_cast<const PerElementOffsetComponent*>(&other_in);
2162  return VecVec(offsets_, other->offsets_);
2163 }
2164 
2165 
2167  std::string vector_filename;
2169  if (cfl->GetValue("vector", &vector_filename)) {
2170  ReadKaldiObject(vector_filename, &offsets_);
2171  dim_ = offsets_.Dim(); // if dim is not supplied, it defaults to this.
2172  cfl->GetValue("dim", &dim_);
2173  if (dim_ <= 0 || offsets_.Dim() % dim_ != 0)
2174  KALDI_ERR << "Invalid dimension dim=" << dim_;
2175  } else {
2176  if(!cfl->GetValue("dim", &dim_))
2177  KALDI_ERR << "'dim' not provided in the config line.";
2178  if (dim_ <= 0)
2179  KALDI_ERR << "Invalid dimension dim=" << dim_;
2180  BaseFloat param_mean = 0.0, param_stddev = 0.0;
2181  cfl->GetValue("param-mean", &param_mean);
2182  cfl->GetValue("param-stddev", &param_stddev);
2183  int32 block_dim = dim_;
2184  cfl->GetValue("block-dim", &block_dim);
2185  if (block_dim <= 0 || dim_ % block_dim != 0)
2186  KALDI_ERR << "Invalid value block-dim=" << block_dim;
2187  offsets_.Resize(block_dim);
2188  offsets_.SetRandn();
2189  offsets_.Scale(param_stddev);
2190  offsets_.Add(param_mean);
2191  }
2192  use_natural_gradient_ = true;
2193  cfl->GetValue("use-natural-gradient", &use_natural_gradient_);
2194  if (cfl->HasUnusedValues())
2195  KALDI_ERR << "Could not process these elements in initializer: "
2196  << cfl->UnusedValues();
2197  // For now you can't modify these defaults of the natural gradient.
2198  // This code must be kept in sync with the code in Read().
2201 }
2202 
2204  const ComponentPrecomputedIndexes *indexes,
2205  const CuMatrixBase<BaseFloat> &in,
2206  CuMatrixBase<BaseFloat> *out) const {
2207  if (in.Data() != out->Data())
2208  out->CopyFromMat(in);
2209  if (dim_ == offsets_.Dim()) {
2210  out->AddVecToRows(1.0, offsets_);
2211  } else {
2212  KALDI_ASSERT(out->Stride() == out->NumCols());
2213  int32 block_dim = offsets_.Dim(), multiple = dim_ / block_dim,
2214  num_rows = out->NumRows() * multiple;
2215  CuSubMatrix<BaseFloat> out_rearranged(out->Data(), num_rows,
2216  block_dim, block_dim);
2217  out_rearranged.AddVecToRows(1.0, offsets_);
2218  }
2219  return NULL;
2220 }
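// [Illustrative sketch, not part of the original source.]  When block-dim is
// smaller than dim, the offsets vector is tiled across each row; viewing the
// output as (num_rows * multiple) rows of width block_dim lets a single
// AddVecToRows do the tiling without an explicit loop.  The per-element effect
// in scalar form, with hypothetical names:
static void PerElementOffsetRowReference(const float *in_row, const float *offsets,
                                         int dim, int block_dim, float *out_row) {
  for (int j = 0; j < dim; j++)
    out_row[j] = in_row[j] + offsets[j % block_dim];  // offsets repeat every block_dim elements.
}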
2221 
2223  const std::string &debug_info,
2224  const ComponentPrecomputedIndexes *indexes,
2225  const CuMatrixBase<BaseFloat> &, // in_value
2226  const CuMatrixBase<BaseFloat> &, // out_value
2227  const CuMatrixBase<BaseFloat> &out_deriv,
2228  void *memo,
2229  Component *to_update_in,
2230  CuMatrixBase<BaseFloat> *in_deriv) const {
2231  NVTX_RANGE("PerElementOffsetComponent::Backprop");
2232  PerElementOffsetComponent *to_update =
2233  dynamic_cast<PerElementOffsetComponent*>(to_update_in);
2234 
2235  if (in_deriv && in_deriv->Data() != out_deriv.Data()) {
2236  // Propagate the derivative back to the input.
2237  in_deriv->CopyFromMat(out_deriv);
2238  }
2239 
2240  if (to_update != NULL) {
2241  // we may have to reshape out_deriv, if "block-dim" was set
2242  // in the config file when initializing the object, leading
2243  // to dim_ being a multiple >1 of offset_.Dim().
2244  // To avoid having separate code paths we create a sub-matrix
2245  // in any case, but this may just be a copy of out_deriv.
2246  int32 block_dim = offsets_.Dim(), multiple = dim_ / block_dim,
2247  block_stride = (multiple == 1 ? out_deriv.Stride() : block_dim),
2248  num_rows = out_deriv.NumRows() * multiple;
2249  KALDI_ASSERT(multiple == 1 || out_deriv.Stride() == out_deriv.NumCols());
2250  CuSubMatrix<BaseFloat> out_deriv_reshaped(out_deriv.Data(), num_rows,
2251  block_dim, block_stride);
2252  if (!to_update->use_natural_gradient_ || to_update->is_gradient_) {
2253  KALDI_LOG << "Using non-NG update, lr = " << to_update->learning_rate_;
2254  to_update->offsets_.AddRowSumMat(to_update->learning_rate_,
2255  out_deriv_reshaped);
2256  } else {
2257  KALDI_LOG << "Using NG update, lr = " << to_update->learning_rate_;
2258  // make a copy as we don't want to modify the data of 'out_deriv', which
2259  // was const (even though CuSubMatrix does not respect const-ness in
2260  // this scenario)
2261  CuMatrix<BaseFloat> out_deriv_copy(out_deriv_reshaped);
2262  BaseFloat scale = 1.0;
2263  to_update->preconditioner_.PreconditionDirections(&out_deriv_copy,
2264  &scale);
2265  to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_,
2266  out_deriv_copy);
2267  }
2268  }
2269 }
2270 
2271 void PerElementOffsetComponent::Read(std::istream &is, bool binary) {
2272  ReadUpdatableCommon(is, binary); // Read opening tag and learning rate
2273  ExpectToken(is, binary, "<Offsets>");
2274  offsets_.Read(is, binary);
2275  if (PeekToken(is, binary) == 'I') {
2276  // for back compatibility; we don't write this here any
2277  // more as it's written and read in Write/ReadUpdatableCommon
2278  ExpectToken(is, binary, "<IsGradient>");
2279  ReadBasicType(is, binary, &is_gradient_);
2280  }
2281  if (PeekToken(is, binary) != '/') {
2282  ExpectToken(is, binary, "<Dim>");
2283  ReadBasicType(is, binary, &dim_);
2284  ExpectToken(is, binary, "<UseNaturalGradient>");
2285  ReadBasicType(is, binary, &use_natural_gradient_);
2286  } else {
2287  dim_ = offsets_.Dim();
2288  use_natural_gradient_ = true;
2289  }
2290  // For now you can't modify these defaults of the natural gradient.
2291  // This code must be kept in sync with the code in InitFromConfig().
2294  ExpectToken(is, binary, "</PerElementOffsetComponent>");
2295 }
2296 
2297 void PerElementOffsetComponent::Write(std::ostream &os, bool binary) const {
2298  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate
2299  WriteToken(os, binary, "<Offsets>");
2300  offsets_.Write(os, binary);
2301  WriteToken(os, binary, "<Dim>");
2302  WriteBasicType(os, binary, dim_);
2303  WriteToken(os, binary, "<UseNaturalGradient>");
2304  WriteBasicType(os, binary, use_natural_gradient_);
2305  WriteToken(os, binary, "</PerElementOffsetComponent>");
2306 }
2307 
2309  return offsets_.Dim();
2310 }
2311 
2313  params->CopyFromVec(offsets_);
2314 }
2315 
2317  const VectorBase<BaseFloat> &params) {
2318  offsets_.CopyFromVec(params);
2319 }
2320 
2321 std::string ScaleAndOffsetComponent::Info() const {
2322  std::ostringstream stream;
2323  stream << UpdatableComponent::Info()
2324  << ", rank=" << scale_preconditioner_.GetRank();
2325  if (dim_ != scales_.Dim())
2326  stream << ", block-size=" << scales_.Dim();
2327  PrintParameterStats(stream, "scales", scales_, true);
2328  PrintParameterStats(stream, "offsets", offsets_, true);
2329  return stream.str();
2330 }
2331 
2333 
2335  if (!cfl->GetValue("dim", &dim_) || dim_ <= 0) {
2336  KALDI_ERR << "Dimension 'dim' must be specified and >0: "
2337  << cfl->WholeLine();
2338  }
2339  use_natural_gradient_ = true;
2340  cfl->GetValue("use-natural-gradient", &use_natural_gradient_);
2341  int32 block_dim = dim_,
2342  rank = 20;
2343  cfl->GetValue("block-dim", &block_dim);
2344  if (block_dim <= 0 || dim_ % block_dim != 0) {
2345  KALDI_ERR << "Invalid block-dim: " << cfl->WholeLine();
2346  }
2347  cfl->GetValue("rank", &rank);
2348  scales_.Resize(block_dim);
2349  scales_.Set(1.0);
2350  offsets_.Resize(block_dim);
2351  // offsets are all zero when initialized.
2352  if (cfl->HasUnusedValues())
2353  KALDI_ERR << "Could not process these elements in initializer: "
2354  << cfl->UnusedValues();
2355  offset_preconditioner_.SetRank(rank);
2356  scale_preconditioner_.SetRank(rank);
2357  // the update period can't be configured for now; we'll add an option if we
2358  // want to.
2359  offset_preconditioner_.SetUpdatePeriod(4);
2360  scale_preconditioner_.SetUpdatePeriod(4);
2361 }
2362 
2363 void ScaleAndOffsetComponent::Read(std::istream &is, bool binary) {
2364  ReadUpdatableCommon(is, binary); // Read opening tag and learning rate
2365  ExpectToken(is, binary, "<Dim>");
2366  ReadBasicType(is, binary, &dim_);
2367  ExpectToken(is, binary, "<Scales>");
2368  scales_.Read(is, binary);
2369  ExpectToken(is, binary, "<Offsets>");
2370  offsets_.Read(is, binary);
2371  ExpectToken(is, binary, "<UseNaturalGradient>");
2372  ReadBasicType(is, binary, &use_natural_gradient_);
2373  int32 rank;
2374  ExpectToken(is, binary, "<Rank>");
2375  ReadBasicType(is, binary, &rank);
2376  scale_preconditioner_.SetRank(rank);
2377  offset_preconditioner_.SetRank(rank);
2378  ExpectToken(is, binary, "</ScaleAndOffsetComponent>");
2379 }
2380 
2381 void ScaleAndOffsetComponent::Write(std::ostream &os, bool binary) const {
2382  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate
2383  WriteToken(os, binary, "<Dim>");
2384  WriteBasicType(os, binary, dim_);
2385  WriteToken(os, binary, "<Scales>");
2386  scales_.Write(os, binary);
2387  WriteToken(os, binary, "<Offsets>");
2388  offsets_.Write(os, binary);
2389  WriteToken(os, binary, "<UseNaturalGradient>");
2390  WriteBasicType(os, binary, use_natural_gradient_);
2391  WriteToken(os, binary, "<Rank>");
2392  WriteBasicType(os, binary, scale_preconditioner_.GetRank());
2393  WriteToken(os, binary, "</ScaleAndOffsetComponent>");
2394 }
2395 
2397  if (scale == 0.0) {
2398  scales_.SetZero();
2399  offsets_.SetZero();
2400  } else {
2401  scales_.Scale(scale);
2402  offsets_.Scale(scale);
2403  }
2404 }
2405 
2407  const Component &other_in) {
2408  const ScaleAndOffsetComponent *other =
2409  dynamic_cast<const ScaleAndOffsetComponent*>(&other_in);
2410  KALDI_ASSERT(other != NULL);
2411  scales_.AddVec(alpha, other->scales_);
2412  offsets_.AddVec(alpha, other->offsets_);
2413 }
2414 
2416  const ScaleAndOffsetComponent &component):
2417  UpdatableComponent(component),
2418  dim_(component.dim_),
2419  scales_(component.scales_),
2420  offsets_(component.offsets_),
2422  scale_preconditioner_(component.scale_preconditioner_),
2423  offset_preconditioner_(component.offset_preconditioner_) { }
2424 
2426  CuVector<BaseFloat> temp(scales_.Dim(), kUndefined);
2427  temp.SetRandn();
2428  scales_.AddVec(stddev, temp);
2429  temp.SetRandn();
2430  offsets_.AddVec(stddev, temp);
2431 }
2432 
2434  const UpdatableComponent &other_in) const {
2435  const ScaleAndOffsetComponent *other =
2436  dynamic_cast<const ScaleAndOffsetComponent*>(&other_in);
2437  return VecVec(other->scales_, scales_) + VecVec(other->offsets_, offsets_);
2438 }
2439 
2441  int32 dim = scales_.Dim();
2442  params->Range(0, dim).CopyFromVec(scales_);
2443  params->Range(dim, dim).CopyFromVec(offsets_);
2444 }
2445 
2447  const VectorBase<BaseFloat> &params) {
2448  int32 dim = scales_.Dim();
2449  scales_.CopyFromVec(params.Range(0, dim));
2450  offsets_.CopyFromVec(params.Range(dim, dim));
2451 }
2452 
2454  const ComponentPrecomputedIndexes *indexes,
2455  const CuMatrixBase<BaseFloat> &in,
2456  CuMatrixBase<BaseFloat> *out) const {
2457  if (dim_ == scales_.Dim()) {
2458  PropagateInternal(in, out);
2459  } else {
2460  int32 multiple = dim_ / scales_.Dim(),
2461  num_rows = in.NumRows(), block_dim = scales_.Dim();
2462  KALDI_ASSERT(in.NumCols() == in.Stride() &&
2463  SameDimAndStride(in, *out));
2464  // Reinterpret the data as matrices with more rows but fewer columns.
2465  CuSubMatrix<BaseFloat> in_rearranged(in.Data(), num_rows * multiple,
2466  block_dim, block_dim),
2467  out_rearranged(out->Data(), num_rows * multiple,
2468  block_dim, block_dim);
2469  PropagateInternal(in_rearranged, &out_rearranged);
2470  }
2471  return NULL;
2472 }
2473 
2475  const CuMatrixBase<BaseFloat> &in,
2476  CuMatrixBase<BaseFloat> *out) const {
2477  if (out->Data() != in.Data())
2478  out->CopyFromMat(in);
2479  BaseFloat epsilon = Epsilon();
2480  int32 dim = scales_.Dim();
2481  CuVector<BaseFloat> scales_nonzero(dim, kUndefined);
2482  cu::EnsureNonzero(scales_, epsilon, &scales_nonzero);
2483  out->MulColsVec(scales_nonzero);
2484  out->AddVecToRows(1.0, offsets_);
2485 }
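// [Illustrative sketch, not part of the original source.]  The forward pass is
// y(t,j) = s'(j) * x(t,j) + offsets(j), where s' is scales_ with tiny entries
// pushed away from zero (assuming cu::EnsureNonzero replaces entries whose
// magnitude is below epsilon by +/- epsilon), so that the inverse needed in
// BackpropInternal is well defined.  A scalar reference under that assumption,
// with hypothetical names:
static void ScaleAndOffsetRowReference(const float *in_row, const float *scales,
                                       const float *offsets, int dim,
                                       float epsilon, float *out_row) {
  for (int j = 0; j < dim; j++) {
    float s = scales[j];
    if (s > -epsilon && s < epsilon)         // assumed EnsureNonzero behavior:
      s = (s >= 0.0f ? epsilon : -epsilon);  // clamp near-zero scales to +/- epsilon.
    out_row[j] = s * in_row[j] + offsets[j];
  }
}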
2486 
2488  const std::string &debug_info,
2489  const ComponentPrecomputedIndexes *indexes,
2490  const CuMatrixBase<BaseFloat> &, // in_value
2491  const CuMatrixBase<BaseFloat> &out_value,
2492  const CuMatrixBase<BaseFloat> &out_deriv,
2493  void *memo,
2494  Component *to_update_in,
2495  CuMatrixBase<BaseFloat> *in_deriv) const {
2496  NVTX_RANGE("ScaleAndOffsetComponent::Backprop");
2497  ScaleAndOffsetComponent *to_update =
2498  dynamic_cast<ScaleAndOffsetComponent*>(to_update_in);
2499 
2500  KALDI_ASSERT(SameDim(out_value, out_deriv));
2501 
2502  if (dim_ == scales_.Dim()) {
2503  BackpropInternal(debug_info, out_value, out_deriv,
2504  to_update, in_deriv);
2505  } else {
2506  KALDI_ASSERT(out_value.NumCols() == out_value.Stride() &&
2507  SameDimAndStride(out_value, out_deriv) &&
2508  (!in_deriv || SameDimAndStride(out_value, *in_deriv)));
2509  int32 multiple = dim_ / scales_.Dim(),
2510  num_rows = out_value.NumRows(),
2511  block_dim = scales_.Dim();
2512  CuSubMatrix<BaseFloat> out_value_rearranged(out_value.Data(),
2513  num_rows * multiple,
2514  block_dim, block_dim),
2515  out_deriv_rearranged(out_deriv.Data(), num_rows * multiple,
2516  block_dim, block_dim);
2517  if (in_deriv) {
2518  CuSubMatrix<BaseFloat> in_deriv_rearranged(in_deriv->Data(),
2519  num_rows * multiple,
2520  block_dim, block_dim);
2521  BackpropInternal(debug_info, out_value_rearranged,
2522  out_deriv_rearranged, to_update,
2523  &in_deriv_rearranged);
2524  } else {
2525  BackpropInternal(debug_info, out_value_rearranged,
2526  out_deriv_rearranged, to_update,
2527  NULL);
2528  }
2529  }
2530 }
2531 
2532 
2533  // Internal version of backprop, where the num-cols of the
2534  // argument matrices are equal to scales_.Dim().
2535 void ScaleAndOffsetComponent::BackpropInternal(
2536  const std::string &debug_info,
2537  const CuMatrixBase<BaseFloat> &out_value,
2538  const CuMatrixBase<BaseFloat> &out_deriv,
2539  ScaleAndOffsetComponent *to_update,
2540  CuMatrixBase<BaseFloat> *in_deriv) const {
2541  if (to_update) {
2542  if (!to_update->use_natural_gradient_ || to_update->is_gradient_) {
2543  to_update->offsets_.AddRowSumMat(to_update->learning_rate_,
2544  out_deriv);
2545  } else {
2546  BaseFloat scale = 1.0;
2547  CuMatrix<BaseFloat> out_deriv_copy(out_deriv);
2548  to_update->offset_preconditioner_.PreconditionDirections(
2549  &out_deriv_copy, &scale);
2550  to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_,
2551  out_deriv_copy);
2552  }
2553  // The backprop actually needs the input to the component, not the output;
2554  // but we make the output available because in the common topologies that
2555  // will already be required for backprop-- it's for memory efficiency.
2556  CuMatrix<BaseFloat> in_value_reconstructed(out_value);
2557  int32 dim = scales_.Dim();
2558  CuVector<BaseFloat> scales_nonzero(dim, kUndefined);
2559  BaseFloat epsilon = Epsilon();
2560  cu::EnsureNonzero(scales_, epsilon, &scales_nonzero);
2561  scales_nonzero.InvertElements();
2562  in_value_reconstructed.AddVecToRows(-1.0, offsets_);
2563  // Actually scales_nonzero are now the inverses of the scales.
2564  in_value_reconstructed.MulColsVec(scales_nonzero);
2565  // OK, at this point in_value_reconstructed is the input to the component.
2566  // Multiply its elements by 'out_deriv' to get the derivatives
2567  // (for each frame) w.r.t. the scales.
2568  in_value_reconstructed.MulElements(out_deriv);
2569  BaseFloat scale = 1.0;
2570  if (to_update->use_natural_gradient_ && !to_update->is_gradient_) {
2571  to_update->scale_preconditioner_.PreconditionDirections(
2572  &in_value_reconstructed, &scale);
2573  }
2574  to_update->scales_.AddRowSumMat(scale * to_update->learning_rate_,
2575  in_value_reconstructed);
2576  }
2577  if (in_deriv) {
2578  if (in_deriv->Data() != out_deriv.Data())
2579  in_deriv->CopyFromMat(out_deriv);
2580  in_deriv->MulColsVec(scales_);
2581  }
2582 }
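// [Illustrative sketch, not part of the original source.]  BackpropInternal is
// not given the input, so it reconstructs it from the output as
// x = (y - offsets) / s' with s' the epsilon-clamped scales; the scale
// gradient is then the per-column sum over frames of x(t,j) * dy(t,j), and the
// offset gradient the per-column sum of dy(t,j).  A scalar reference for the
// scale gradient, with hypothetical names:
static void ScaleGradFromOutputReference(const float *out_value,      // num_rows x dim
                                         const float *out_deriv,      // num_rows x dim
                                         const float *scales_nonzero, // dim, already clamped away from zero
                                         const float *offsets,
                                         int num_rows, int dim,
                                         float *scale_grad) {         // dim, added to
  for (int j = 0; j < dim; j++) {
    for (int t = 0; t < num_rows; t++) {
      float x = (out_value[t * dim + j] - offsets[j]) / scales_nonzero[j];
      scale_grad[j] += x * out_deriv[t * dim + j];
    }
  }
}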
2583 
2584 void ScaleAndOffsetComponent::ConsolidateMemory() {
2585  OnlineNaturalGradient temp_scale(scale_preconditioner_);
2586  scale_preconditioner_.Swap(&temp_scale);
2587  OnlineNaturalGradient temp_offset(offset_preconditioner_);
2588  offset_preconditioner_.Swap(&temp_offset);
2589 }
2590 
2591 
2592 std::string ConstantFunctionComponent::Info() const {
2593  std::ostringstream stream;
2594  stream << UpdatableComponent::Info()
2595  << ", " << Type() << ", input-dim=" << InputDim()
2596  << ", output-dim=" << OutputDim()
2597  << ", is-updatable=" << std::boolalpha << is_updatable_
2598  << ", use-natural-gradient=" << std::boolalpha
2599  << use_natural_gradient_;
2600  PrintParameterStats(stream, "output", output_, true);
2601  return stream.str();
2602 }
2603 
2605  UpdatableComponent(), input_dim_(-1), is_updatable_(true),
2606  use_natural_gradient_(true) { }
2607 
2609  const ConstantFunctionComponent &other):
2610  UpdatableComponent(other), input_dim_(other.input_dim_),
2611  output_(other.output_), is_updatable_(other.is_updatable_),
2613  preconditioner_(other.preconditioner_) { }
2614 
2616  const ComponentPrecomputedIndexes *indexes,
2617  const CuMatrixBase<BaseFloat> &in,
2618  CuMatrixBase<BaseFloat> *out) const {
2619  out->CopyRowsFromVec(output_);
2620  return NULL;
2621 }
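// [Illustrative sketch, not part of the original source.]  This component
// ignores its input and broadcasts the learned vector output_ to every row;
// consequently the input derivative is identically zero (Backprop below leaves
// in_deriv untouched, relying on kBackpropAdds) and the parameter gradient is
// just the row-sum of out_deriv.  The forward pass in scalar form, with
// hypothetical names:
static void ConstantForwardReference(const float *output_vec, int dim,
                                     int num_rows,
                                     float *out) {  // num_rows x dim
  for (int t = 0; t < num_rows; t++)
    for (int j = 0; j < dim; j++)
      out[t * dim + j] = output_vec[j];  // the same row is repeated for every frame.
}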
2622 
2624  const std::string &debug_info,
2625  const ComponentPrecomputedIndexes *indexes,
2626  const CuMatrixBase<BaseFloat> &, // in_value
2627  const CuMatrixBase<BaseFloat> &, // out_value
2628  const CuMatrixBase<BaseFloat> &out_deriv,
2629  void *memo,
2630  Component *to_update_in,
2631  CuMatrixBase<BaseFloat> *in_deriv) const {
2632  NVTX_RANGE("ConstantFunctionComponent::Backprop");
2633  // we don't update in_deriv, since we set the flag
2634  // kBackpropAdds, and the output doesn't depend on the
2635  // input, so the input-derivative is zero.
2636  if (to_update_in) {
2637  ConstantFunctionComponent *to_update =
2638  dynamic_cast<ConstantFunctionComponent*>(to_update_in);
2639  if (to_update->is_updatable_) {
2640  // only do the update if the is_updatable_ flag is set.
2641  KALDI_ASSERT(to_update && to_update->is_updatable_);
2642  if (to_update->use_natural_gradient_ && !to_update->is_gradient_) {
2643  CuMatrix<BaseFloat> out_deriv_copy(out_deriv);
2644  BaseFloat scale = 1.0;
2645  to_update->preconditioner_.PreconditionDirections(&out_deriv_copy,
2646  &scale);
2647  to_update->output_.AddRowSumMat(scale * to_update->learning_rate_,
2648  out_deriv_copy);
2649  } else {
2650  to_update->output_.AddRowSumMat(to_update->learning_rate_,
2651  out_deriv);
2652  }
2653  }
2654  }
2655 }
2656 
2657 void ConstantFunctionComponent::Read(std::istream &is, bool binary) {
2658  std::string token;
2659  ReadToken(is, binary, &token);
2660  if (token == "<ConstantFunctionComponent>") {
2661  ReadToken(is, binary, &token);
2662  }
2663  if (token == "<LearningRateFactor>") {
2664  ReadBasicType(is, binary, &learning_rate_factor_);
2665  ReadToken(is, binary, &token);
2666  } else {
2667  learning_rate_factor_ = 1.0;
2668  }
2669  if (token == "<IsGradient>") {
2670  ReadBasicType(is, binary, &is_gradient_);
2671  ReadToken(is, binary, &token);
2672  } else {
2673  is_gradient_ = false;
2674  }
2675  if (token == "<LearningRate>") {
2676  ReadBasicType(is, binary, &learning_rate_);
2677  ReadToken(is, binary, &token);
2678  } else {
2679  learning_rate_ = 0.001;
2680  }
2681  if (token == "<InputDim>") {
2682  ReadBasicType(is, binary, &input_dim_);
2683  } else {
2684  KALDI_ERR << "Expected token <InputDim>, got "
2685  << token;
2686  }
2687  ExpectToken(is, binary, "<Output>");
2688  output_.Read(is, binary);
2689  ExpectToken(is, binary, "<IsUpdatable>");
2690  ReadBasicType(is, binary, &is_updatable_);
2691  ExpectToken(is, binary, "<UseNaturalGradient>");
2692  ReadBasicType(is, binary, &use_natural_gradient_);
2693  ExpectToken(is, binary, "</ConstantFunctionComponent>");
2694 }
2695 
2696 void ConstantFunctionComponent::Write(std::ostream &os, bool binary) const {
2697  WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate
2698  WriteToken(os, binary, "<InputDim>");
2699  WriteBasicType(os, binary, input_dim_);
2700  WriteToken(os, binary, "<Output>");
2701  output_.Write(os, binary);
2702  WriteToken(os, binary, "<IsUpdatable>");
2703  WriteBasicType(os, binary, is_updatable_);
2704  WriteToken(os, binary, "<UseNaturalGradient>");
2705  WriteBasicType(os, binary, use_natural_gradient_);
2706  WriteToken(os, binary, "</ConstantFunctionComponent>");
2707 }
2708 
2710  return new ConstantFunctionComponent(*this);
2711 }
2712 
2714  if (is_updatable_) {
2715  if (scale == 0.0) {
2716  output_.SetZero();
2717  } else {
2718  output_.Scale(scale);
2719  }
2720  }
2721 }
2722 
2724  if (is_updatable_) {
2725  const ConstantFunctionComponent *other =
2726  dynamic_cast<const ConstantFunctionComponent*>(&other_in);
2727  KALDI_ASSERT(other != NULL);
2728  output_.AddVec(alpha, other->output_);
2729  }
2730 }
2731 
2733  CuVector<BaseFloat> temp_output(output_.Dim(), kUndefined);
2734  temp_output.SetRandn();
2735  output_.AddVec(stddev, temp_output);
2736 }
2737 
2739  const UpdatableComponent &other_in) const {
2741  const ConstantFunctionComponent *other =
2742  dynamic_cast<const ConstantFunctionComponent*>(&other_in);
2743  KALDI_ASSERT(other != NULL);
2744  return VecVec(output_, other->output_);
2745 }
2746 
2748  int32 output_dim = 0;
2750  bool ok = cfl->GetValue("output-dim", &output_dim) &&
2751  cfl->GetValue("input-dim", &input_dim_);
2752  cfl->GetValue("is-updatable", &is_updatable_);
2753  cfl->GetValue("use-natural-gradient", &use_natural_gradient_);
2754  BaseFloat output_mean = 0.0, output_stddev = 0.0;
2755  cfl->GetValue("output-mean", &output_mean);
2756  cfl->GetValue("output-stddev", &output_stddev);
2757  if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 ||
2758  output_dim <= 0) {
2759  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
2760  }
2761  Vector<BaseFloat> output(output_dim);
2762  output.SetRandn();
2763  output.Scale(output_stddev);
2764  output.Add(output_mean);
2765  output_ = output;
2766 }
2767 
2770  return output_.Dim();
2771 }
2772 
2774  params->CopyFromVec(output_);
2775 }
2776 
2778  output_.CopyFromVec(params);
2779 }
2780 
2781 void ConstantFunctionComponent::ConsolidateMemory() {
2782  OnlineNaturalGradient temp(preconditioner_);
2783  preconditioner_.Swap(&temp);
2784 }
2785 
2786 void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) {
2787  ReadUpdatableCommon(is, binary); // Read the opening tag and learning rate
2788  ExpectToken(is, binary, "<LinearParams>");
2789  linear_params_.Read(is, binary);
2790  ExpectToken(is, binary, "<BiasParams>");
2791  bias_params_.Read(is, binary);
2792 
2793  BaseFloat num_samples_history, alpha;
2794  int32 rank_in, rank_out, update_period;
2795 
2796  ExpectToken(is, binary, "<RankIn>");
2797  ReadBasicType(is, binary, &rank_in);
2798  ExpectToken(is, binary, "<RankOut>");
2799  ReadBasicType(is, binary, &rank_out);
2800  if (PeekToken(is, binary) == 'O') {
2801  ExpectToken(is, binary, "<OrthonormalConstraint>");
2802  ReadBasicType(is, binary, &orthonormal_constraint_);
2803  } else {
2804  orthonormal_constraint_ = 0.0;
2805  }
2806  ExpectToken(is, binary, "<UpdatePeriod>");
2807  ReadBasicType(is, binary, &update_period);
2808  ExpectToken(is, binary, "<NumSamplesHistory>");
2809  ReadBasicType(is, binary, &num_samples_history);
2810  ExpectToken(is, binary, "<Alpha>");
2811  ReadBasicType(is, binary, &alpha);
2812 
2813  preconditioner_in_.SetNumSamplesHistory(num_samples_history);
2814  preconditioner_out_.SetNumSamplesHistory(num_samples_history);
2815  preconditioner_in_.SetAlpha(alpha);
2816  preconditioner_out_.SetAlpha(alpha);
2817  preconditioner_in_.SetRank(rank_in);
2818  preconditioner_out_.SetRank(rank_out);
2819  preconditioner_in_.SetUpdatePeriod(update_period);
2820  preconditioner_out_.SetUpdatePeriod(update_period);
2821 
2822  if (PeekToken(is, binary) == 'M') {
2823  // MaxChangePerSample, long ago removed; back compatibility.
2824  ExpectToken(is, binary, "<MaxChangePerSample>");
2825  BaseFloat temp;
2826  ReadBasicType(is, binary, &temp);
2827  }
2828  if (PeekToken(is, binary) == 'I') {
2829  // for back compatibility; we don't write this here any
2830  // more as it's written and read in Write/ReadUpdatableCommon
2831  ExpectToken(is, binary, "<IsGradient>");
2832  ReadBasicType(is, binary, &is_gradient_);
2833  }
2834  if (PeekToken(is, binary) == 'U') {
2835  ExpectToken(is, binary, "<UpdateCount>");
2836  // back-compatibility branch (these configs were added and then removed).
2837  double temp;
2838  ReadBasicType(is, binary, &temp);
2839  ExpectToken(is, binary, "<ActiveScalingCount>");
2840  ReadBasicType(is, binary, &temp);
2841  ExpectToken(is, binary, "<MaxChangeScaleStats>");
2842  ReadBasicType(is, binary, &temp);
2843  }
2844  std::string token;
2845  ReadToken(is, binary, &token);
2846  // the following has to handle a couple of variants of the closing token.
2847  if (token.find("NaturalGradientAffineComponent>") == std::string::npos)
2848  KALDI_ERR << "Expected <NaturalGradientAffineComponent> or "
2849  << "</NaturalGradientAffineComponent>, got " << token;
2850 }
2851 
2852 
2854  const CuMatrixBase<BaseFloat> &linear_params,
2855  const CuVectorBase<BaseFloat> &bias_params):
2856  AffineComponent(linear_params, bias_params, 0.001) {
2857  KALDI_ASSERT(bias_params.Dim() == linear_params.NumRows() &&
2858  bias_params.Dim() != 0);
2859 
2860  // set some default natural gradient configs.
2865 }
2866 
2868  bool ok = true;
2869  std::string matrix_filename;
2870 
2871  is_gradient_ = false; // not configurable; there's no reason you'd want this
2872 
2874 
2875  if (cfl->GetValue("matrix", &matrix_filename)) {
2876  CuMatrix<BaseFloat> mat;
2877  ReadKaldiObject(matrix_filename, &mat); // will abort on failure.
2878  KALDI_ASSERT(mat.NumCols() >= 2);
2879  int32 input_dim = mat.NumCols() - 1, output_dim = mat.NumRows();
2880  linear_params_.Resize(output_dim, input_dim);
2881  bias_params_.Resize(output_dim);
2882  linear_params_.CopyFromMat(mat.Range(0, output_dim, 0, input_dim));
2883  bias_params_.CopyColFromMat(mat, input_dim);
2884  if (cfl->GetValue("input-dim", &input_dim))
2885  KALDI_ASSERT(input_dim == InputDim() &&
2886  "input-dim mismatch vs. matrix.");
2887  if (cfl->GetValue("output-dim", &output_dim))
2888  KALDI_ASSERT(output_dim == OutputDim() &&
2889  "output-dim mismatch vs. matrix.");
2890  } else {
2891  int32 input_dim = -1, output_dim = -1;
2892 
2893  ok = ok && cfl->GetValue("input-dim", &input_dim);
2894  ok = ok && cfl->GetValue("output-dim", &output_dim);
2895  if (!ok)
2896  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
2897  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
2898  bias_stddev = 1.0, bias_mean = 0.0;
2899  cfl->GetValue("param-stddev", &param_stddev);
2900  cfl->GetValue("bias-stddev", &bias_stddev);
2901  cfl->GetValue("bias-mean", &bias_mean);
2902  linear_params_.Resize(output_dim, input_dim);
2903  bias_params_.Resize(output_dim);
2904  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0 &&
2905  bias_stddev >= 0.0);
2906  linear_params_.SetRandn(); // sets to random normally distributed noise.
2907  linear_params_.Scale(param_stddev);
2908  bias_params_.SetRandn();
2909  bias_params_.Scale(bias_stddev);
2910  bias_params_.Add(bias_mean);
2911  }
2912 
2914  cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_);
2915 
2916  // Set natural-gradient configs.
2917  BaseFloat num_samples_history = 2000.0,
2918  alpha = 4.0;
2919  int32 rank_in = -1, rank_out = -1,
2920  update_period = 4;
2921  cfl->GetValue("num-samples-history", &num_samples_history);
2922  cfl->GetValue("alpha", &alpha);
2923  cfl->GetValue("rank-in", &rank_in);
2924  cfl->GetValue("rank-out", &rank_out);
2925  cfl->GetValue("update-period", &update_period);
2926 
2927  if (rank_in < 0)
2928  rank_in = std::min<int32>(20, (InputDim() + 1) / 2);
2929  if (rank_out < 0)
2930  rank_out = std::min<int32>(80, (OutputDim() + 1) / 2);
2931 
2932  preconditioner_in_.SetNumSamplesHistory(num_samples_history);
2933  preconditioner_out_.SetNumSamplesHistory(num_samples_history);
2934  preconditioner_in_.SetAlpha(alpha);
2935  preconditioner_out_.SetAlpha(alpha);
2936  preconditioner_in_.SetRank(rank_in);
2937  preconditioner_out_.SetRank(rank_out);
2938  preconditioner_in_.SetUpdatePeriod(update_period);
2939  preconditioner_out_.SetUpdatePeriod(update_period);
2940 
2941  if (cfl->HasUnusedValues())
2942  KALDI_ERR << "Could not process these elements in initializer: "
2943  << cfl->UnusedValues();
2944  if (!ok)
2945  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
2946 }
2947 
2949  bool binary) const {
2950  WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate
2951  WriteToken(os, binary, "<LinearParams>");
2952  linear_params_.Write(os, binary);
2953  WriteToken(os, binary, "<BiasParams>");
2954  bias_params_.Write(os, binary);
2955  WriteToken(os, binary, "<RankIn>");
2956  WriteBasicType(os, binary, preconditioner_in_.GetRank());
2957  WriteToken(os, binary, "<RankOut>");
2958  WriteBasicType(os, binary, preconditioner_out_.GetRank());
2959  if (orthonormal_constraint_ != 0.0) {
2960  WriteToken(os, binary, "<OrthonormalConstraint>");
2961  WriteBasicType(os, binary, orthonormal_constraint_);
2962  }
2963  WriteToken(os, binary, "<UpdatePeriod>");
2964  WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod());
2965  WriteToken(os, binary, "<NumSamplesHistory>");
2966  WriteBasicType(os, binary, preconditioner_in_.GetNumSamplesHistory());
2967  WriteToken(os, binary, "<Alpha>");
2968  WriteBasicType(os, binary, preconditioner_in_.GetAlpha());
2969  WriteToken(os, binary, "</NaturalGradientAffineComponent>");
2970 }
2971 
2973  std::ostringstream stream;
2974  stream << AffineComponent::Info();
2975  stream << ", rank-in=" << preconditioner_in_.GetRank()
2976  << ", rank-out=" << preconditioner_out_.GetRank()
2977  << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory()
2978  << ", update-period=" << preconditioner_in_.GetUpdatePeriod()
2979  << ", alpha=" << preconditioner_in_.GetAlpha();
2980  return stream.str();
2981 }
2982 
2984  return new NaturalGradientAffineComponent(*this);
2985 }
2986 
2988  const NaturalGradientAffineComponent &other):
2989  AffineComponent(other),
2990  preconditioner_in_(other.preconditioner_in_),
2991  preconditioner_out_(other.preconditioner_out_) { }
2992 
2994  const std::string &debug_info,
2995  const CuMatrixBase<BaseFloat> &in_value,
2996  const CuMatrixBase<BaseFloat> &out_deriv) {
2997  CuMatrix<BaseFloat> in_value_temp;
2998 
2999  in_value_temp.Resize(in_value.NumRows(),
3000  in_value.NumCols() + 1, kUndefined);
3001  in_value_temp.Range(0, in_value.NumRows(),
3002  0, in_value.NumCols()).CopyFromMat(in_value);
3003 
3004  // Add the 1.0 at the end of each row "in_value_temp"
3005  in_value_temp.Range(0, in_value.NumRows(),
3006  in_value.NumCols(), 1).Set(1.0);
3007 
3008  CuMatrix<BaseFloat> out_deriv_temp(out_deriv);
3009 
3010  // These "scale" values will get multiplied into the learning rate (faster
3011  // than having the matrices scaled inside the preconditioning code).
3012  BaseFloat in_scale, out_scale;
3013 
3014  preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale);
3015  preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale);
3016 
3017  // "scale" is a scaling factor coming from the PreconditionDirections calls
3018  // (it's faster to have them output a scaling factor than to have them scale
3019  // their outputs).
3020  BaseFloat scale = in_scale * out_scale;
3021 
3022  CuSubMatrix<BaseFloat> in_value_precon_part(in_value_temp,
3023  0, in_value_temp.NumRows(),
3024  0, in_value_temp.NumCols() - 1);
3025  // this "precon_ones" is what happens to the vector of 1's representing
3026  // offsets, after multiplication by the preconditioner.
3027  CuVector<BaseFloat> precon_ones(in_value_temp.NumRows());
3028 
3029  precon_ones.CopyColFromMat(in_value_temp, in_value_temp.NumCols() - 1);
3030 
3031  BaseFloat local_lrate = scale * learning_rate_;
3032 
3033  bias_params_.AddMatVec(local_lrate, out_deriv_temp, kTrans,
3034  precon_ones, 1.0);
3035  linear_params_.AddMatMat(local_lrate, out_deriv_temp, kTrans,
3036  in_value_precon_part, kNoTrans, 1.0);
3037 }
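// [Illustrative sketch, not part of the original source.]  Appending a column
// of 1.0 to in_value treats the bias as one extra input dimension, so the
// input-side preconditioner acts on weights and bias jointly; the two scales
// returned by PreconditionDirections are simply multiplied into the learning
// rate.  With the preconditioning ignored, the update reduces to the plain
// affine gradient step below; all names are hypothetical:
static void PlainAffineUpdateReference(const float *in_value,   // num_rows x input_dim
                                       const float *out_deriv,  // num_rows x output_dim
                                       int num_rows, int input_dim, int output_dim,
                                       float local_lrate,
                                       float *linear_params,    // output_dim x input_dim, added to
                                       float *bias_params) {    // output_dim, added to
  for (int t = 0; t < num_rows; t++) {
    for (int i = 0; i < output_dim; i++) {
      float d = out_deriv[t * output_dim + i];
      bias_params[i] += local_lrate * d;  // gradient w.r.t. the appended column of 1.0.
      for (int j = 0; j < input_dim; j++)
        linear_params[i * input_dim + j] +=
            local_lrate * d * in_value[t * input_dim + j];
    }
  }
}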
3038 
3040  if (scale == 0.0) {
3041  linear_params_.SetZero();
3042  bias_params_.SetZero();
3043  } else {
3044  linear_params_.Scale(scale);
3045  bias_params_.Scale(scale);
3046  }
3047 }
3048 
3050  const NaturalGradientAffineComponent *other =
3051  dynamic_cast<const NaturalGradientAffineComponent*>(&other_in);
3052  KALDI_ASSERT(other != NULL);
3053  linear_params_.AddMat(alpha, other->linear_params_);
3054  bias_params_.AddVec(alpha, other->bias_params_);
3055 }
3056 
3058  preconditioner_in_.Freeze(freeze);
3059  preconditioner_out_.Freeze(freeze);
3060 }
3061 
3062 void NaturalGradientAffineComponent::ConsolidateMemory() {
3063  OnlineNaturalGradient temp_in(preconditioner_in_);
3064  preconditioner_in_.Swap(&temp_in);
3065  OnlineNaturalGradient temp_out(preconditioner_out_);
3066  preconditioner_out_.Swap(&temp_out);
3067 }
3068 
3069 void LinearComponent::Read(std::istream &is, bool binary) {
3070  std::string token = ReadUpdatableCommon(is, binary);
3071  KALDI_ASSERT(token == "");
3072  ExpectToken(is, binary, "<Params>");
3073  params_.Read(is, binary);
3074  if (PeekToken(is, binary) == 'O') {
3075  ExpectToken(is, binary, "<OrthonormalConstraint>");
3076  ReadBasicType(is, binary, &orthonormal_constraint_);
3077  } else {
3078  orthonormal_constraint_ = 0.0;
3079  }
3080  ExpectToken(is, binary, "<UseNaturalGradient>");
3081  ReadBasicType(is, binary, &use_natural_gradient_);
3082 
3083  // Read various natural-gradient-related configs.
3084  int32 rank_in, rank_out, update_period;
3085  BaseFloat alpha, num_samples_history;
3086  ExpectToken(is, binary, "<RankInOut>");
3087  ReadBasicType(is, binary, &rank_in);
3088  ReadBasicType(is, binary, &rank_out);
3089  ExpectToken(is, binary, "<Alpha>");
3090  ReadBasicType(is, binary, &alpha);
3091  ExpectToken(is, binary, "<NumSamplesHistory>");
3092  ReadBasicType(is, binary, &num_samples_history);
3093  ExpectToken(is, binary, "<UpdatePeriod>");
3094  ReadBasicType(is, binary, &update_period);
3095 
3096  preconditioner_in_.SetAlpha(alpha);
3097  preconditioner_out_.SetAlpha(alpha);
3098  preconditioner_in_.SetRank(rank_in);
3099  preconditioner_out_.SetRank(rank_out);
3100  preconditioner_in_.SetNumSamplesHistory(num_samples_history);
3101  preconditioner_out_.SetNumSamplesHistory(num_samples_history);
3102  preconditioner_in_.SetUpdatePeriod(update_period);
3103  preconditioner_out_.SetUpdatePeriod(update_period);
3104 
3105  ExpectToken(is, binary, "</LinearComponent>");
3106 }
3107 
3109  bool ok = true;
3110  std::string matrix_filename;
3111  is_gradient_ = false; // not configurable; there's no reason you'd want this
3112 
3113  InitLearningRatesFromConfig(cfl);
3114 
3115  int32 input_dim = -1, output_dim = -1;
3116  if (cfl->GetValue("matrix", &matrix_filename)) {
3117  ReadKaldiObject(matrix_filename, &params_); // will abort on failure.
3118  KALDI_ASSERT(params_.NumRows() != 0);
3119  if (cfl->GetValue("input-dim", &input_dim))
3120  KALDI_ASSERT(input_dim == InputDim() &&
3121  "input-dim mismatch vs. matrix.");
3122  if (cfl->GetValue("output-dim", &output_dim))
3123  KALDI_ASSERT(output_dim == OutputDim() &&
3124  "output-dim mismatch vs. matrix.");
3125  } else {
3126  ok = ok && cfl->GetValue("input-dim", &input_dim);
3127  ok = ok && cfl->GetValue("output-dim", &output_dim);
3128  if (!ok)
3129  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
3130  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim);
3131  cfl->GetValue("param-stddev", &param_stddev);
3132  params_.Resize(output_dim, input_dim);
3133  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
3134  params_.SetRandn(); // sets to random normally distributed noise.
3135  params_.Scale(param_stddev);
3136  }
3137  // Read various natural-gradient-related configs.
3138  int32 rank_in = -1, rank_out = -1, update_period = 4;
3139  BaseFloat alpha = 4.0,
3140  num_samples_history = 2000.0;
3141 
3142  use_natural_gradient_ = true;
3143 
3144  cfl->GetValue("num-samples-history", &num_samples_history);
3145  cfl->GetValue("alpha", &alpha);
3146  cfl->GetValue("rank-in", &rank_in);
3147  cfl->GetValue("rank-out", &rank_out);
3148  cfl->GetValue("update-period", &update_period);
3149  cfl->GetValue("use-natural-gradient", &use_natural_gradient_);
3150 
3151  if (rank_in < 0)
3152  rank_in = std::min<int32>(20, (InputDim() + 1) / 2);
3153  if (rank_out < 0)
3154  rank_out = std::min<int32>(80, (OutputDim() + 1) / 2);
3155 
3156  preconditioner_in_.SetAlpha(alpha);
3157  preconditioner_out_.SetAlpha(alpha);
3158  preconditioner_in_.SetRank(rank_in);
3159  preconditioner_out_.SetRank(rank_out);
3160  preconditioner_in_.SetNumSamplesHistory(num_samples_history);
3161  preconditioner_out_.SetNumSamplesHistory(num_samples_history);
3162  preconditioner_in_.SetUpdatePeriod(update_period);
3163  preconditioner_out_.SetUpdatePeriod(update_period);
3164 
3165  orthonormal_constraint_ = 0.0;
3166  cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_);
3167 
3168  if (cfl->HasUnusedValues())
3169  KALDI_ERR << "Could not process these elements in initializer: "
3170  << cfl->UnusedValues();
3171 }
3172 
3173 
3174 void LinearComponent::Write(std::ostream &os,
3175  bool binary) const {
3176  WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate
3177  WriteToken(os, binary, "<Params>");
3178  params_.Write(os, binary);
3179  if (orthonormal_constraint_ != 0.0) {
3180  WriteToken(os, binary, "<OrthonormalConstraint>");
3181  WriteBasicType(os, binary, orthonormal_constraint_);
3182  }
3183  WriteToken(os, binary, "<UseNaturalGradient>");
3184  WriteBasicType(os, binary, use_natural_gradient_);
3185 
3186  int32 rank_in = preconditioner_in_.GetRank(),
3187  rank_out = preconditioner_out_.GetRank(),
3188  update_period = preconditioner_in_.GetUpdatePeriod();
3189  BaseFloat alpha = preconditioner_in_.GetAlpha(),
3190  num_samples_history = preconditioner_in_.GetNumSamplesHistory();
3191  WriteToken(os, binary, "<RankInOut>");
3192  WriteBasicType(os, binary, rank_in);
3193  WriteBasicType(os, binary, rank_out);
3194  WriteToken(os, binary, "<Alpha>");
3195  WriteBasicType(os, binary, alpha);
3196  WriteToken(os, binary, "<NumSamplesHistory>");
3197  WriteBasicType(os, binary, num_samples_history);
3198  WriteToken(os, binary, "<UpdatePeriod>");
3199  WriteBasicType(os, binary, update_period);
3200  WriteToken(os, binary, "</LinearComponent>");
3201 }
3202 
3203 std::string LinearComponent::Info() const {
3204  std::ostringstream stream;
3205  stream << UpdatableComponent::Info();
3206  PrintParameterStats(stream, "params", params_,
3207  false, // include_mean
3208  true, // include_row_norms
3209  true, // include_column_norms
3210  GetVerboseLevel() >= 2); // include_singular_values
3211  if (orthonormal_constraint_ != 0.0)
3212  stream << ", orthonormal-constraint=" << orthonormal_constraint_;
3213  stream << ", use-natural-gradient="
3214  << (use_natural_gradient_ ? "true" : "false")
3215  << ", rank-in=" << preconditioner_in_.GetRank()
3216  << ", rank-out=" << preconditioner_out_.GetRank()
3217  << ", num-samples-history="
3218  << preconditioner_in_.GetNumSamplesHistory()
3219  << ", update-period=" << preconditioner_in_.GetUpdatePeriod()
3220  << ", alpha=" << preconditioner_in_.GetAlpha();
3221  return stream.str();
3222 }
3223 
3224 void* LinearComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3225  const CuMatrixBase<BaseFloat> &in,
3226  CuMatrixBase<BaseFloat> *out) const {
3227  out->AddMatMat(1.0, in, kNoTrans, params_, kTrans, 1.0);
3228  return NULL;
3229 }
3230 
3231 void LinearComponent::Backprop(const std::string &debug_info,
3232  const ComponentPrecomputedIndexes *indexes,
3233  const CuMatrixBase<BaseFloat> &in_value,
3234  const CuMatrixBase<BaseFloat> &, // out_value
3235  const CuMatrixBase<BaseFloat> &out_deriv,
3236  void *memo,
3237  Component *to_update_in,
3238  CuMatrixBase<BaseFloat> *in_deriv) const {
3239  NVTX_RANGE("LinearComponent::Backprop");
3240  LinearComponent *to_update = dynamic_cast<LinearComponent*>(to_update_in);
3241 
3242  // Propagate the derivative back to the input. add with coefficient 1.0 since
3243  // property kBackpropAdds is true. If we wanted to add with coefficient 0.0
3244  // we'd need to zero the in_deriv, in case of infinities.
3245  if (in_deriv)
3246  in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, params_, kNoTrans, 1.0);
3247 
3248  if (to_update != NULL) {
3249  if (!to_update->is_gradient_) {
3250  CuMatrix<BaseFloat> in_value_temp(in_value), out_deriv_temp(out_deriv);
3251  // These "scale" values will get multiplied into the learning rate (faster
3252  // than having the matrices scaled inside the preconditioning code).
3253  BaseFloat in_scale, out_scale;
3254  to_update->preconditioner_in_.PreconditionDirections(&in_value_temp,
3255  &in_scale);
3256  to_update->preconditioner_out_.PreconditionDirections(&out_deriv_temp,
3257  &out_scale);
3258  BaseFloat local_lrate = in_scale * out_scale * to_update->learning_rate_;
3259 
3260  to_update->params_.AddMatMat(local_lrate, out_deriv_temp, kTrans,
3261  in_value_temp, kNoTrans, 1.0);
3262  } else {
3263  to_update->params_.AddMatMat(to_update->learning_rate_,
3264  out_deriv, kTrans,
3265  in_value, kNoTrans, 1.0);
3266  }
3267  }
3268 }
3269 
3270 
3271 Component* LinearComponent::Copy() const {
3272  return new LinearComponent(*this);
3273 }
3274 
3275 LinearComponent::LinearComponent(
3276  const LinearComponent &other):
3277  UpdatableComponent(other),
3278  params_(other.params_),
3279  orthonormal_constraint_(other.orthonormal_constraint_),
3280  use_natural_gradient_(other.use_natural_gradient_),
3281  preconditioner_in_(other.preconditioner_in_),
3282  preconditioner_out_(other.preconditioner_out_) { }
3283 
3284 LinearComponent::LinearComponent(const CuMatrixBase<BaseFloat> &params):
3285  params_(params),
3286  orthonormal_constraint_(0.0),
3287  use_natural_gradient_(true) {
3288  // Set defaults for natural gradient.
3289  int32 rank_in = std::min<int32>(20, (params.NumCols() + 1) / 2),
3290  rank_out = std::min<int32>(80, (params.NumRows() + 1) / 2);
3291  preconditioner_in_.SetRank(rank_in);
3292  preconditioner_out_.SetRank(rank_out);
3293  // the component-level defaults of alpha and num_samples_history, at 4.0 and
3294  // 2000.0, are the same as in the NaturalGradientOnline code, so there is no
3295  // need to set those here.
3296 }
3297 
3298 void LinearComponent::Scale(BaseFloat scale) {
3299  if (scale == 0.0) params_.SetZero();
3300  else params_.Scale(scale);
3301 }
3302 
3303 void LinearComponent::Add(BaseFloat alpha, const Component &other_in) {
3304  const LinearComponent *other =
3305  dynamic_cast<const LinearComponent*>(&other_in);
3306  KALDI_ASSERT(other != NULL);
3307  params_.AddMat(alpha, other->params_);
3308 }
3309 
3310 void LinearComponent::PerturbParams(BaseFloat stddev) {
3311  CuMatrix<BaseFloat> temp_params(params_);
3312  temp_params.SetRandn();
3313  params_.AddMat(stddev, temp_params);
3314 }
3315 int32 LinearComponent::NumParameters() const {
3316  return params_.NumRows() * params_.NumCols();
3317 }
3318 void LinearComponent::Vectorize(VectorBase<BaseFloat> *params) const {
3319  KALDI_ASSERT(params->Dim() == this->NumParameters());
3320  params->CopyRowsFromMat(params_);
3321 }
3322 void LinearComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
3323  KALDI_ASSERT(params.Dim() == this->NumParameters());
3324  params_.CopyRowsFromVec(params);
3325 }
3326 BaseFloat LinearComponent::DotProduct(const UpdatableComponent &other_in) const {
3327  const LinearComponent *other =
3328  dynamic_cast<const LinearComponent*>(&other_in);
3329  return TraceMatMat(params_, other->params_, kTrans);
3330 }
3331 
3332 void LinearComponent::FreezeNaturalGradient(bool freeze) {
3333  preconditioner_in_.Freeze(freeze);
3334  preconditioner_out_.Freeze(freeze);
3335 }
3336 
3337 void LinearComponent::ConsolidateMemory() {
3338  OnlineNaturalGradient temp_in(preconditioner_in_);
3339  preconditioner_in_.Swap(&temp_in);
3340  OnlineNaturalGradient temp_out(preconditioner_out_);
3341  preconditioner_out_.Swap(&temp_out);
3342 }
3343 
3344 std::string FixedAffineComponent::Info() const {
3345  std::ostringstream stream;
3346  stream << Component::Info();
3347  PrintParameterStats(stream, "linear-params", linear_params_);
3348  PrintParameterStats(stream, "bias", bias_params_, true);
3349  return stream.str();
3350 }
3351 
3352 void FixedAffineComponent::Init(const CuMatrixBase<BaseFloat> &mat) {
3353  KALDI_ASSERT(mat.NumCols() > 1);
3354  linear_params_ = mat.Range(0, mat.NumRows(), 0, mat.NumCols() - 1);
3355  bias_params_.Resize(mat.NumRows());
3356  bias_params_.CopyColFromMat(mat, mat.NumCols() - 1);
3357 }
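// Illustration (standalone sketch, not Kaldi code): the layout assumed by
// Init() above -- the last column of 'mat' is the bias and the rest is the
// linear transform, so that mat * [x; 1] == A x + b.  The function name is
// hypothetical; <vector> is assumed and each row must have at least 2 columns.
static void ExampleSplitAffineMatrix(const std::vector<std::vector<float> > &mat,
                                     std::vector<std::vector<float> > *A,
                                     std::vector<float> *b) {
  for (size_t r = 0; r < mat.size(); r++) {
    A->push_back(std::vector<float>(mat[r].begin(), mat[r].end() - 1));  // all but last column
    b->push_back(mat[r].back());                                         // last column is the offset
  }
}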
3358 
3359 void FixedAffineComponent::InitFromConfig(ConfigLine *cfl) {
3360  std::string filename;
3361  // Two forms allowed: "matrix=<rxfilename>", or "input-dim=x output-dim=y"
3362  // (for testing purposes only).
3363  if (cfl->GetValue("matrix", &filename)) {
3364  if (cfl->HasUnusedValues())
3365  KALDI_ERR << "Invalid initializer for layer of type "
3366  << Type() << ": \"" << cfl->WholeLine() << "\"";
3367 
3368  bool binary;
3369  Input ki(filename, &binary);
3370  CuMatrix<BaseFloat> mat;
3371  mat.Read(ki.Stream(), binary);
3372  KALDI_ASSERT(mat.NumRows() != 0);
3373  Init(mat);
3374  } else {
3375  int32 input_dim = -1, output_dim = -1;
3376  if (!cfl->GetValue("input-dim", &input_dim) ||
3377  !cfl->GetValue("output-dim", &output_dim) || cfl->HasUnusedValues()) {
3378  KALDI_ERR << "Invalid initializer for layer of type "
3379  << Type() << ": \"" << cfl->WholeLine() << "\"";
3380  }
3381  CuMatrix<BaseFloat> mat(output_dim, input_dim + 1);
3382  mat.SetRandn();
3383  Init(mat);
3384  }
3385 }
3386 
3387 
3388 FixedAffineComponent::FixedAffineComponent(const AffineComponent &c):
3389  linear_params_(c.LinearParams()),
3390  bias_params_(c.BiasParams()) { }
3391 
3392 void* FixedAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3393  const CuMatrixBase<BaseFloat> &in,
3394  CuMatrixBase<BaseFloat> *out) const {
3395  out->CopyRowsFromVec(bias_params_); // Adds the bias term first.
3396  out->AddMatMat(1.0, in, kNoTrans, linear_params_, kTrans, 1.0);
3397  return NULL;
3398 }
3399 
3400 void FixedAffineComponent::Backprop(const std::string &debug_info,
3401  const ComponentPrecomputedIndexes *indexes,
3402  const CuMatrixBase<BaseFloat> &, //in_value
3403  const CuMatrixBase<BaseFloat> &, //out_value
3404  const CuMatrixBase<BaseFloat> &out_deriv,
3405  void *memo,
3406  Component *, //to_update
3407  CuMatrixBase<BaseFloat> *in_deriv) const {
3408  NVTX_RANGE("FixedAffineComponent::Backprop");
3409  // kBackpropAdds is true. It's the user's responsibility to zero out
3410  // <in_deriv> if they need it to be so.
3411  if (in_deriv)
3412  in_deriv->AddMatMat(1.0, out_deriv, kNoTrans,
3413  linear_params_, kNoTrans, 1.0);
3414 }
3415 
3416 Component* FixedAffineComponent::Copy() const {
3417  FixedAffineComponent *ans = new FixedAffineComponent();
3418  ans->linear_params_ = linear_params_;
3419  ans->bias_params_ = bias_params_;
3420  return ans;
3421 }
3422 
3423 void FixedAffineComponent::Write(std::ostream &os, bool binary) const {
3424  WriteToken(os, binary, "<FixedAffineComponent>");
3425  WriteToken(os, binary, "<LinearParams>");
3426  linear_params_.Write(os, binary);
3427  WriteToken(os, binary, "<BiasParams>");
3428  bias_params_.Write(os, binary);
3429  WriteToken(os, binary, "</FixedAffineComponent>");
3430 }
3431 
3432 void FixedAffineComponent::Read(std::istream &is, bool binary) {
3433  ExpectOneOrTwoTokens(is, binary, "<FixedAffineComponent>", "<LinearParams>");
3434  linear_params_.Read(is, binary);
3435  ExpectToken(is, binary, "<BiasParams>");
3436  bias_params_.Read(is, binary);
3437  ExpectToken(is, binary, "</FixedAffineComponent>");
3438 }
3439 
3440 void SumGroupComponent::Init(const std::vector<int32> &sizes) {
3441  KALDI_ASSERT(!sizes.empty());
3442  std::vector<Int32Pair> cpu_vec(sizes.size());
3443  std::vector<int32> reverse_cpu_vec;
3444  int32 cur_index = 0;
3445  for (size_t i = 0; i < sizes.size(); i++) {
3446  KALDI_ASSERT(sizes[i] > 0);
3447  cpu_vec[i].first = cur_index;
3448  cpu_vec[i].second = cur_index + sizes[i];
3449  cur_index += sizes[i];
3450  for (int32 j = cpu_vec[i].first; j < cpu_vec[i].second; j++)
3451  reverse_cpu_vec.push_back(i);
3452  }
3453  this->indexes_ = cpu_vec;
3454  this->reverse_indexes_ = reverse_cpu_vec;
3455  this->input_dim_ = cur_index;
3456  this->output_dim_ = sizes.size();
3457 }
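// Illustration (standalone sketch, not Kaldi code): how "sizes" turns into the
// per-group (start, end) column ranges and the reverse map built above.  For
// sizes = {2, 3, 1} the ranges are {(0,2), (2,5), (5,6)} and the reverse map
// is {0, 0, 1, 1, 1, 2}.  Names are hypothetical; <vector> and <utility> assumed.
static void ExampleSumGroupMaps(const std::vector<int> &sizes,
                                std::vector<std::pair<int, int> > *ranges,
                                std::vector<int> *reverse) {
  int cur = 0;
  for (size_t i = 0; i < sizes.size(); i++) {
    ranges->push_back(std::make_pair(cur, cur + sizes[i]));
    for (int j = 0; j < sizes[i]; j++)
      reverse->push_back(static_cast<int>(i));  // each input column maps back to its group
    cur += sizes[i];
  }
}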
3458 
3459 void SumGroupComponent::Init(int32 input_dim, int32 output_dim) {
3460  const int32 num_groups = output_dim;
3461  KALDI_ASSERT(input_dim % num_groups == 0);
3462  const int32 group_size = input_dim / num_groups;
3463 
3464  std::vector<Int32Pair> cpu_vec(num_groups);
3465  std::vector<int32> reverse_cpu_vec;
3466  int32 cur_index = 0;
3467  for (size_t i = 0; i < num_groups; i++) {
3468  cpu_vec[i].first = cur_index;
3469  cpu_vec[i].second = cur_index + group_size;
3470  cur_index += group_size;
3471  for (int32 j = cpu_vec[i].first; j < cpu_vec[i].second; j++)
3472  reverse_cpu_vec.push_back(i);
3473  }
3474  this->indexes_ = cpu_vec;
3475  this->reverse_indexes_ = reverse_cpu_vec;
3476  this->input_dim_ = input_dim;
3477  this->output_dim_ = num_groups;
3478 }
3479 
3480 void SumGroupComponent::InitFromConfig(ConfigLine *cfl) {
3481  std::vector<int32> sizes;
3482  bool has_sizes = cfl->GetValue("sizes", &sizes);
3483  if (has_sizes) {
3484  if (cfl->HasUnusedValues() || sizes.empty())
3485  KALDI_ERR << "Invalid initializer for layer of type "
3486  << Type() << ": \"" << cfl->WholeLine() << "\"";
3487  this->Init(sizes);
3488  } else { // each group has the same size
3489  int32 input_dim = -1, output_dim = -1;
3490  if (!cfl->GetValue("input-dim", &input_dim) ||
3491  !cfl->GetValue("output-dim", &output_dim) || cfl->HasUnusedValues()) {
3492  KALDI_ERR << "Invalid initializer for layer of type "
3493  << Type() << ": \"" << cfl->WholeLine() << "\"";
3494  }
3495  Init(input_dim, output_dim);
3496  }
3497 }
3498 
3499 Component* SumGroupComponent::Copy() const {
3500  SumGroupComponent *ans = new SumGroupComponent();
3501  ans->indexes_ = indexes_;
3502  ans->reverse_indexes_ = reverse_indexes_;
3503  ans->input_dim_ = input_dim_;
3504  ans->output_dim_ = output_dim_;
3505  return ans;
3506 }
3507 
3508 void SumGroupComponent::Read(std::istream &is, bool binary) {
3509  ExpectOneOrTwoTokens(is, binary, "<SumGroupComponent>", "<Sizes>");
3510  std::vector<int32> sizes;
3511  ReadIntegerVector(is, binary, &sizes);
3512 
3513  std::string token;
3514  ReadToken(is, binary, &token);
3515  if (!(token == "<SumGroupComponent>" ||
3516  token == "</SumGroupComponent>")) {
3517  KALDI_ERR << "Expected </SumGroupComponent>, got " << token;
3518  }
3519  this->Init(sizes);
3520 }
3521 
3522 void SumGroupComponent::GetSizes(std::vector<int32> *sizes) const {
3523  std::vector<Int32Pair> indexes;
3524  indexes_.CopyToVec(&indexes);
3525  sizes->resize(indexes.size());
3526  for (size_t i = 0; i < indexes.size(); i++) {
3527  (*sizes)[i] = indexes[i].second - indexes[i].first;
3528  if (i == 0) { KALDI_ASSERT(indexes[i].first == 0); }
3529  else { KALDI_ASSERT(indexes[i].first == indexes[i-1].second); }
3530  KALDI_ASSERT(indexes[i].second > indexes[i].first);
3531  (*sizes)[i] = indexes[i].second - indexes[i].first;
3532  }
3533 }
3534 
3535 void SumGroupComponent::Write(std::ostream &os, bool binary) const {
3536  WriteToken(os, binary, "<SumGroupComponent>");
3537  WriteToken(os, binary, "<Sizes>");
3538  std::vector<int32> sizes;
3539  this->GetSizes(&sizes);
3540  WriteIntegerVector(os, binary, sizes);
3541  WriteToken(os, binary, "</SumGroupComponent>");
3542 }
3543 
3544 void* SumGroupComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3545  const CuMatrixBase<BaseFloat> &in,
3546  CuMatrixBase<BaseFloat> *out) const {
3547  out->SumColumnRanges(in, indexes_);
3548  return NULL;
3549 }
3550 
3551 void SumGroupComponent::Backprop(const std::string &debug_info,
3552  const ComponentPrecomputedIndexes *indexes,
3553  const CuMatrixBase<BaseFloat> &, // in_value,
3554  const CuMatrixBase<BaseFloat> &, // out_value
3555  const CuMatrixBase<BaseFloat> &out_deriv,
3556  void *memo,
3557  Component *to_update_in,
3558  CuMatrixBase<BaseFloat> *in_deriv) const {
3559  NVTX_RANGE("SumGroupComponent::Backprop");
3560  in_deriv->CopyCols(out_deriv, reverse_indexes_);
3561 }
3562 
3563 void* SoftmaxComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3564  const CuMatrixBase<BaseFloat> &in,
3565  CuMatrixBase<BaseFloat> *out) const {
3566  // Apply softmax function to each row of the output...
3567  // for that row, we do
3568  // x_i = exp(x_i) / sum_j exp(x_j).
3569  out->SoftMaxPerRow(in);
3570 
3571  // This floor on the output helps us deal with
3572  // almost-zeros in a way that doesn't lead to overflow.
3573  out->ApplyFloor(1.0e-20);
3574 
3575  return NULL;
3576 }
3577 
3578 void SoftmaxComponent::Backprop(const std::string &debug_info,
3579  const ComponentPrecomputedIndexes *indexes,
3580  const CuMatrixBase<BaseFloat> &, // in_value,
3581  const CuMatrixBase<BaseFloat> &out_value,
3582  const CuMatrixBase<BaseFloat> &out_deriv,
3583  void *memo,
3584  Component *to_update_in,
3585  CuMatrixBase<BaseFloat> *in_deriv) const {
3586  NVTX_RANGE("SoftmaxComponent::Backprop");
3587 
3588  if (to_update_in) {
3589  SoftmaxComponent *to_update =
3590  dynamic_cast<SoftmaxComponent*>(to_update_in);
3591  to_update->StoreBackpropStats(out_deriv);
3592  }
3593 
3594  if (in_deriv == NULL)
3595  return;
3596  /*
3597  Note on the derivative of the softmax function: let it be
3598  p_i = exp(x_i) / sum_j exp(x_j)
3599  The [matrix-valued] Jacobian of this function is
3600  diag(p) - p p^T
3601  Let the derivative vector at the output be e, and at the input be
3602  d. We have
3603  d = diag(p) e - p (p^T e).
3604  d_i = p_i e_i - p_i (p^T e).
3605  */
3606  in_deriv->DiffSoftmaxPerRow(out_value, out_deriv);
3607 }
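// Illustration (standalone sketch, not Kaldi code): the formula in the comment
// above, d_i = p_i * e_i - p_i * (p . e), computed for one row on the CPU.
// 'p' is the softmax output and 'e' is the derivative at the output; the
// function name is hypothetical and <vector> is assumed.
static std::vector<float> ExampleSoftmaxBackpropRow(const std::vector<float> &p,
                                                    const std::vector<float> &e) {
  float p_dot_e = 0.0f;
  for (size_t i = 0; i < p.size(); i++)
    p_dot_e += p[i] * e[i];                // p^T e
  std::vector<float> d(p.size());
  for (size_t i = 0; i < p.size(); i++)
    d[i] = p[i] * e[i] - p[i] * p_dot_e;   // diag(p) e - p (p^T e)
  return d;
}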
3608 
3609 void SoftmaxComponent::StoreStats(const CuMatrixBase<BaseFloat> &in_value,
3610  const CuMatrixBase<BaseFloat> &out_value,
3611  void *memo) {
3612  // We don't store derivative stats for this component type, just activation
3613  // stats.
3614  StoreStatsInternal(out_value, NULL);
3615 }
3616 
3617 
3618 void* LogSoftmaxComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3619  const CuMatrixBase<BaseFloat> &in,
3620  CuMatrixBase<BaseFloat> *out) const {
3621  // Applies log softmax function to each row of the output. For each row, we do
3622  // x_i = x_i - log(sum_j exp(x_j))
3623  out->LogSoftMaxPerRow(in);
3624  return NULL;
3625 }
3626 
3627 void LogSoftmaxComponent::Backprop(const std::string &debug_info,
3628  const ComponentPrecomputedIndexes *indexes,
3629  const CuMatrixBase<BaseFloat> &, // in_value
3630  const CuMatrixBase<BaseFloat> &out_value,
3631  const CuMatrixBase<BaseFloat> &out_deriv,
3632  void *memo,
3633  Component *to_update_in,
3634  CuMatrixBase<BaseFloat> *in_deriv) const {
3635  NVTX_RANGE("LogSoftmaxComponent::Backprop");
3636  if (to_update_in) {
3637  LogSoftmaxComponent *to_update =
3638  dynamic_cast<LogSoftmaxComponent*>(to_update_in);
3639  to_update->StoreBackpropStats(out_deriv);
3640  }
3641  if (in_deriv == NULL)
3642  return;
3643  in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv);
3644 }
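// Illustration (standalone sketch, not Kaldi code): for y_i = x_i -
// log(sum_j exp(x_j)) the input derivative is d_i = e_i - exp(y_i) * sum_j e_j,
// which is the per-row computation done by DiffLogSoftmaxPerRow above.
// The function name is hypothetical; <vector> and <cmath> are assumed.
static std::vector<float> ExampleLogSoftmaxBackpropRow(const std::vector<float> &y,   // log-softmax output
                                                       const std::vector<float> &e) { // derivative at output
  float e_sum = 0.0f;
  for (size_t i = 0; i < e.size(); i++)
    e_sum += e[i];
  std::vector<float> d(y.size());
  for (size_t i = 0; i < y.size(); i++)
    d[i] = e[i] - std::exp(y[i]) * e_sum;  // exp(y_i) is the softmax output p_i
  return d;
}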
3645 
3646 
3647 void FixedScaleComponent::Init(const CuVectorBase<BaseFloat> &scales) {
3648  KALDI_ASSERT(scales.Dim() != 0);
3649  scales_ = scales;
3650 }
3651 
3652 
3653 void FixedScaleComponent::InitFromConfig(ConfigLine *cfl) {
3654  std::string filename;
3655  // Accepts "scales" config (for filename) or "dim" -> random init, for testing.
3656  if (cfl->GetValue("scales", &filename)) {
3657  if (cfl->HasUnusedValues())
3658  KALDI_ERR << "Invalid initializer for layer of type "
3659  << Type() << ": \"" << cfl->WholeLine() << "\"";
3660  CuVector<BaseFloat> vec;
3661  ReadKaldiObject(filename, &vec);
3662  Init(vec);
3663  } else {
3664  int32 dim;
3665  BaseFloat scale = 1.0;
3666  bool scale_is_set = cfl->GetValue("scale", &scale);
3667  if (!cfl->GetValue("dim", &dim) || cfl->HasUnusedValues())
3668  KALDI_ERR << "Invalid initializer for layer of type "
3669  << Type() << ": \"" << cfl->WholeLine() << "\"";
3670  KALDI_ASSERT(dim > 0);
3671  CuVector<BaseFloat> vec(dim);
3672  if (scale_is_set) {
3673  vec.Set(scale);
3674  } else {
3675  vec.SetRandn();
3676  }
3677  Init(vec);
3678  }
3679 }
3680 
3681 
3682 std::string FixedScaleComponent::Info() const {
3683  std::ostringstream stream;
3684  stream << Component::Info();
3685  PrintParameterStats(stream, "scales", scales_, true);
3686  return stream.str();
3687 }
3688 
3689 void* FixedScaleComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3690  const CuMatrixBase<BaseFloat> &in,
3691  CuMatrixBase<BaseFloat> *out) const {
3692  out->CopyFromMat(in); // does nothing if same matrix.
3693  out->MulColsVec(scales_);
3694  return NULL;
3695 }
3696 
3697 void FixedScaleComponent::Backprop(const std::string &debug_info,
3698  const ComponentPrecomputedIndexes *indexes,
3699  const CuMatrixBase<BaseFloat> &, // in_value
3700  const CuMatrixBase<BaseFloat> &, // out_value
3701  const CuMatrixBase<BaseFloat> &out_deriv,
3702  void *memo,
3703  Component *, // to_update
3704  CuMatrixBase<BaseFloat> *in_deriv) const {
3705  NVTX_RANGE("FixedScaleComponent::Backprop");
3706  in_deriv->CopyFromMat(out_deriv); // does nothing if same memory.
3707  in_deriv->MulColsVec(scales_);
3708 }
3709 
3710 Component* FixedScaleComponent::Copy() const {
3711  FixedScaleComponent *ans = new FixedScaleComponent();
3712  ans->scales_ = scales_;
3713  return ans;
3714 }
3715 
3716 
3717 void FixedScaleComponent::Write(std::ostream &os, bool binary) const {
3718  WriteToken(os, binary, "<FixedScaleComponent>");
3719  WriteToken(os, binary, "<Scales>");
3720  scales_.Write(os, binary);
3721  WriteToken(os, binary, "</FixedScaleComponent>");
3722 }
3723 
3724 void FixedScaleComponent::Read(std::istream &is, bool binary) {
3725  ExpectOneOrTwoTokens(is, binary, "<FixedScaleComponent>", "<Scales>");
3726  scales_.Read(is, binary);
3727  ExpectToken(is, binary, "</FixedScaleComponent>");
3728 }
3729 
3730 void FixedBiasComponent::Init(const CuVectorBase<BaseFloat> &bias) {
3731  KALDI_ASSERT(bias.Dim() != 0);
3732  bias_ = bias;
3733 }
3734 
3735 void FixedBiasComponent::InitFromConfig(ConfigLine *cfl) {
3736  std::string filename;
3737  // Accepts "bias" config (for filename) or "dim" -> random init, for testing.
3738  if (cfl->GetValue("bias", &filename)) {
3739  if (cfl->HasUnusedValues())
3740  KALDI_ERR << "Invalid initializer for layer of type "
3741  << Type() << ": \"" << cfl->WholeLine() << "\"";
3742  CuVector<BaseFloat> vec;
3743  ReadKaldiObject(filename, &vec);
3744  Init(vec);
3745  } else {
3746  int32 dim;
3747  if (!cfl->GetValue("dim", &dim) || cfl->HasUnusedValues())
3748  KALDI_ERR << "Invalid initializer for layer of type "
3749  << Type() << ": \"" << cfl->WholeLine() << "\"";
3750  KALDI_ASSERT(dim > 0);
3751  CuVector<BaseFloat> vec(dim);
3752  vec.SetRandn();
3753  Init(vec);
3754  }
3755 }
3756 
3757 std::string FixedBiasComponent::Info() const {
3758  std::ostringstream stream;
3759  stream << Component::Info();
3760  PrintParameterStats(stream, "bias", bias_, true);
3761  return stream.str();
3762 }
3763 
3764 void* FixedBiasComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3765  const CuMatrixBase<BaseFloat> &in,
3766  CuMatrixBase<BaseFloat> *out) const {
3767  out->CopyFromMat(in); // will do nothing if in and out have same memory.
3768  out->AddVecToRows(1.0, bias_, 1.0);
3769  return NULL;
3770 }
3771 
3772 void FixedBiasComponent::Backprop(const std::string &debug_info,
3773  const ComponentPrecomputedIndexes *indexes,
3774  const CuMatrixBase<BaseFloat> &, // in_value
3775  const CuMatrixBase<BaseFloat> &, // out_value
3776  const CuMatrixBase<BaseFloat> &out_deriv,
3777  void *memo,
3778  Component *, // to_update
3779  CuMatrixBase<BaseFloat> *in_deriv) const {
3780  NVTX_RANGE("FixedBiasComponent::Backprop");
3781  // the following statement will do nothing if in_deriv and out_deriv have same
3782  // memory.
3783  in_deriv->CopyFromMat(out_deriv);
3784 }
3785 
3786 Component* FixedBiasComponent::Copy() const {
3787  FixedBiasComponent *ans = new FixedBiasComponent();
3788  ans->bias_ = bias_;
3789  return ans;
3790 }
3791 
3792 
3793 void FixedBiasComponent::Write(std::ostream &os, bool binary) const {
3794  WriteToken(os, binary, "<FixedBiasComponent>");
3795  WriteToken(os, binary, "<Bias>");
3796  bias_.Write(os, binary);
3797  WriteToken(os, binary, "</FixedBiasComponent>");
3798 }
3799 
3800 void FixedBiasComponent::Read(std::istream &is, bool binary) {
3801  ExpectOneOrTwoTokens(is, binary, "<FixedBiasComponent>", "<Bias>");
3802  bias_.Read(is, binary);
3803  ExpectToken(is, binary, "</FixedBiasComponent>");
3804 }
3805 
3806 
3807 void NaturalGradientPerElementScaleComponent::Read(
3808  std::istream &is, bool binary) {
3809  ReadUpdatableCommon(is, binary); // Read the opening tag and learning rate
3810  ExpectToken(is, binary, "<Params>");
3811  scales_.Read(is, binary);
3812  ExpectToken(is, binary, "<IsGradient>");
3813  ReadBasicType(is, binary, &is_gradient_);
3814  int32 rank, update_period;
3815  ExpectToken(is, binary, "<Rank>");
3816  ReadBasicType(is, binary, &rank);
3817  preconditioner_.SetRank(rank);
3818  ExpectToken(is, binary, "<UpdatePeriod>");
3819  ReadBasicType(is, binary, &update_period);
3820  preconditioner_.SetUpdatePeriod(update_period);
3821  BaseFloat num_samples_history, alpha;
3822  ExpectToken(is, binary, "<NumSamplesHistory>");
3823  ReadBasicType(is, binary, &num_samples_history);
3824  preconditioner_.SetNumSamplesHistory(num_samples_history);
3825  ExpectToken(is, binary, "<Alpha>");
3826  ReadBasicType(is, binary, &alpha);
3827  preconditioner_.SetAlpha(alpha);
3828  std::string token;
3829  ReadToken(is, binary, &token);
3830  if (token == "<MaxChangePerMinibatch>") {
3831  // back compatibility; this was removed, it's now handled by the
3832  // 'max-change' config variable.
3833  BaseFloat temp;
3834  ReadBasicType(is, binary, &temp);
3835  ReadToken(is, binary, &token);
3836  }
3837  KALDI_ASSERT(token == "</NaturalGradientPerElementScaleComponent>");
3838 }
3839 
3840 void NaturalGradientPerElementScaleComponent::Write(std::ostream &os,
3841  bool binary) const {
3842  WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate
3843  WriteToken(os, binary, "<Params>");
3844  scales_.Write(os, binary);
3845  WriteToken(os, binary, "<IsGradient>");
3846  WriteBasicType(os, binary, is_gradient_);
3847  WriteToken(os, binary, "<Rank>");
3848  WriteBasicType(os, binary, preconditioner_.GetRank());
3849  WriteToken(os, binary, "<UpdatePeriod>");
3850  WriteBasicType(os, binary, preconditioner_.GetUpdatePeriod());
3851  WriteToken(os, binary, "<NumSamplesHistory>");
3852  WriteBasicType(os, binary, preconditioner_.GetNumSamplesHistory());
3853  WriteToken(os, binary, "<Alpha>");
3854  WriteBasicType(os, binary, preconditioner_.GetAlpha());
3855  WriteToken(os, binary, "</NaturalGradientPerElementScaleComponent>");
3856 }
3857 
3858 std::string NaturalGradientPerElementScaleComponent::Info() const {
3859  std::ostringstream stream;
3860  stream << PerElementScaleComponent::Info()
3861  << ", rank=" << preconditioner_.GetRank()
3862  << ", update-period=" << preconditioner_.GetUpdatePeriod()
3863  << ", num-samples-history=" << preconditioner_.GetNumSamplesHistory()
3864  << ", alpha=" << preconditioner_.GetAlpha();
3865  return stream.str();
3866 }
3867 
3868 void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) {
3869  // First set various configuration values that have defaults.
3870  int32 rank = 8, // Use a small rank because in this case the amount of memory
3871  // for the preconditioner actually exceeds the memory for the
3872  // parameters (by "rank").
3873  update_period = 10;
3874  BaseFloat num_samples_history = 2000.0, alpha = 4.0;
3875  cfl->GetValue("rank", &rank);
3876  cfl->GetValue("update-period", &update_period);
3877  cfl->GetValue("num-samples-history", &num_samples_history);
3878  cfl->GetValue("alpha", &alpha);
3879  InitLearningRatesFromConfig(cfl);
3880  std::string filename;
3881  // Accepts "scales" config (for filename) or "dim" -> random init, for testing.
3882  if (cfl->GetValue("scales", &filename)) {
3883  if (cfl->HasUnusedValues())
3884  KALDI_ERR << "Invalid initializer for layer of type "
3885  << Type() << ": \"" << cfl->WholeLine() << "\"";
3886  Init(filename, rank, update_period, num_samples_history, alpha);
3887 
3888  } else {
3889  BaseFloat param_mean = 1.0, param_stddev = 0.0;
3890  cfl->GetValue("param-mean", &param_mean);
3891  cfl->GetValue("param-stddev", &param_stddev);
3892 
3893  int32 dim;
3894  if (!cfl->GetValue("dim", &dim) || cfl->HasUnusedValues())
3895  KALDI_ERR << "Invalid initializer for layer of type "
3896  << Type() << ": \"" << cfl->WholeLine() << "\"";
3897  KALDI_ASSERT(dim > 0);
3898 
3899  Init(dim, param_mean, param_stddev, rank, update_period,
3900  num_samples_history, alpha);
3901  }
3902 }
3903 
3904 void NaturalGradientPerElementScaleComponent::Init(
3905  int32 dim, BaseFloat param_mean,
3906  BaseFloat param_stddev, int32 rank, int32 update_period,
3907  BaseFloat num_samples_history, BaseFloat alpha) {
3908  PerElementScaleComponent::Init(dim, param_mean,
3909  param_stddev);
3910  preconditioner_.SetRank(rank);
3911  preconditioner_.SetUpdatePeriod(update_period);
3912  preconditioner_.SetNumSamplesHistory(num_samples_history);
3913  preconditioner_.SetAlpha(alpha);
3914 }
3915 
3916 void NaturalGradientPerElementScaleComponent::Init(
3917  std::string vector_filename,
3918  int32 rank, int32 update_period, BaseFloat num_samples_history,
3919  BaseFloat alpha) {
3920  PerElementScaleComponent::Init(vector_filename);
3921  preconditioner_.SetRank(rank);
3922  preconditioner_.SetUpdatePeriod(update_period);
3923  preconditioner_.SetNumSamplesHistory(num_samples_history);
3924  preconditioner_.SetAlpha(alpha);
3925 }
3926 
3927 
3928 NaturalGradientPerElementScaleComponent::NaturalGradientPerElementScaleComponent(
3929  const NaturalGradientPerElementScaleComponent &other):
3930  PerElementScaleComponent(other),
3931  preconditioner_(other.preconditioner_) { }
3932 
3933 
3934 
3935 
3936 Component* NaturalGradientPerElementScaleComponent::Copy() const {
3937  return new NaturalGradientPerElementScaleComponent(*this);
3938 }
3939 
3940 void NaturalGradientPerElementScaleComponent::Update(
3941  const std::string &debug_info,
3942  const CuMatrixBase<BaseFloat> &in_value,
3943  const CuMatrixBase<BaseFloat> &out_deriv) {
3944 
3945  CuMatrix<BaseFloat> derivs_per_frame(in_value);
3946  derivs_per_frame.MulElements(out_deriv);
3947  // the non-natural-gradient update would just do
3948  // scales_.AddRowSumMat(learning_rate_, derivs_per_frame).
3949 
3950  BaseFloat scale;
3951  preconditioner_.PreconditionDirections(&derivs_per_frame, &scale);
3952 
3953  CuVector<BaseFloat> delta_scales(scales_.Dim());
3954  delta_scales.AddRowSumMat(scale * learning_rate_, derivs_per_frame);
3955  scales_.AddVec(1.0, delta_scales);
3956 }
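// Illustration (standalone sketch, not Kaldi code): without the natural
// gradient, the update above is just scales += lrate * (sum over rows of
// in_value .* out_deriv), since the component computes out = in .* scales.
// Names are hypothetical; <vector> is assumed.
static void ExamplePlainPerElementScaleUpdate(
    const std::vector<std::vector<float> > &in_value,
    const std::vector<std::vector<float> > &out_deriv,
    float lrate, std::vector<float> *scales) {
  for (size_t t = 0; t < in_value.size(); t++)
    for (size_t j = 0; j < scales->size(); j++)
      (*scales)[j] += lrate * in_value[t][j] * out_deriv[t][j];
}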
3957 
3958 void NaturalGradientPerElementScaleComponent::FreezeNaturalGradient(bool freeze) {
3959  preconditioner_.Freeze(freeze);
3960 }
3961 
3962 void NaturalGradientPerElementScaleComponent::ConsolidateMemory() {
3963  OnlineNaturalGradient temp(preconditioner_);
3964  preconditioner_.Swap(&temp);
3965 }
3966 
3967 void PermuteComponent::ComputeReverseColumnMap() {
3968  int32 dim = column_map_.Dim();
3969  KALDI_ASSERT(dim > 0);
3970  std::vector<int32> reverse_column_map_cpu(dim, -1),
3971  column_map_cpu(dim);
3972  column_map_.CopyToVec(&column_map_cpu);
3973  for (int32 i = 0; i < dim; i++) {
3974  int32 &dest = reverse_column_map_cpu[column_map_cpu[i]];
3975  if (dest != -1)
3976  KALDI_ERR << "Column map does not represent a permutation.";
3977  dest = i;
3978  }
3979  reverse_column_map_.Resize(dim);
3980  reverse_column_map_.CopyFromVec(reverse_column_map_cpu);
3981 }
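// Illustration (standalone sketch, not Kaldi code): the same reverse-map
// construction on the CPU, including the check that column_map really is a
// permutation, which is what the error above enforces.  The function name is
// hypothetical; <vector> and <cassert> are assumed.
static std::vector<int> ExampleReverseColumnMap(const std::vector<int> &column_map) {
  std::vector<int> reverse(column_map.size(), -1);
  for (size_t i = 0; i < column_map.size(); i++) {
    assert(reverse[column_map[i]] == -1);          // each destination used exactly once
    reverse[column_map[i]] = static_cast<int>(i);  // invert: dest -> source
  }
  return reverse;
}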
3982 
3983 Component* PermuteComponent::Copy() const {
3984  PermuteComponent *ans = new PermuteComponent();
3985  ans->column_map_ = column_map_;
3986  ans->reverse_column_map_ = reverse_column_map_;
3987  return ans;
3988 }
3989 
3990 void* PermuteComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3991  const CuMatrixBase<BaseFloat> &in,
3992  CuMatrixBase<BaseFloat> *out) const {
3993  out->CopyCols(in, column_map_);
3994  return NULL;
3995 }
3996 void PermuteComponent::Backprop(const std::string &debug_info,
3997  const ComponentPrecomputedIndexes *indexes,
3998  const CuMatrixBase<BaseFloat> &, //in_value
3999  const CuMatrixBase<BaseFloat> &, // out_value,
4000  const CuMatrixBase<BaseFloat> &out_deriv,
4001  void *memo,
4002  Component *to_update,
4003  CuMatrixBase<BaseFloat> *in_deriv) const {
4004  NVTX_RANGE("PermuteComponent::Backprop");
4005  in_deriv->CopyCols(out_deriv, reverse_column_map_);
4006 }
4007 
4008 void PermuteComponent::InitFromConfig(ConfigLine *cfl) {
4009  bool ok = true;
4010  std::string column_map_str;
4011  ok = ok && cfl->GetValue("column-map", &column_map_str);
4012  std::vector<int32> column_map;
4013  if (!SplitStringToIntegers(column_map_str, ",", true, &column_map))
4014  KALDI_ERR << "Bad initializer in PermuteComponent: column-map="
4015  << column_map_str;
4016  if (cfl->HasUnusedValues())
4017  KALDI_ERR << "Could not process these elements in initializer: "
4018  << cfl->UnusedValues();
4019  if (!ok)
4020  KALDI_ERR << "Invalid initializer for layer of type "
4021  << Type() << ": \"" << cfl->WholeLine() << "\"";
4022  Init(column_map);
4023 }
4024 
4025 void PermuteComponent::Init(const std::vector<int32> &column_map) {
4026  KALDI_ASSERT(column_map.size() > 0);
4027  column_map_.CopyFromVec(column_map);
4028  ComputeReverseColumnMap();
4029 }
4030 
4031 void PermuteComponent::Read(std::istream &is, bool binary) {
4032  ExpectOneOrTwoTokens(is, binary, "<PermuteComponent>", "<ColumnMap>");
4033  std::vector<int32> column_map;
4034  if (binary && is.peek() == 'F') {
4035  // back-compatibility code [temporary]
4036  Vector<BaseFloat> float_map;
4037  float_map.Read(is, binary);
4038  column_map.resize(float_map.Dim());
4039  for (int32 i = 0; i < float_map.Dim(); i++) {
4040  // note: casting truncates toward zero: add 0.5 to approximate rounding.
4041  column_map[i] = static_cast<int32>(float_map(i) + 0.5);
4042  }
4043  // the next line is a workaround for a bug in the old
4044  // writing code, which now causes an assert failure. it's only
4045  // valid for the permutations we're currently using. anyway all this
4046  // code is only temporary.
4047  column_map.back() = float_map.Dim() - 1;
4048  } else {
4049  ReadIntegerVector(is, binary, &column_map);
4050  }
4051  column_map_.CopyFromVec(column_map);
4052  ExpectToken(is, binary, "</PermuteComponent>");
4053  ComputeReverseColumnMap();
4054 }
4055 
4056 void PermuteComponent::Write(std::ostream &os, bool binary) const {
4057  WriteToken(os, binary, "<PermuteComponent>");
4058  WriteToken(os, binary, "<ColumnMap>");
4059  std::ostringstream buffer;
4060  std::vector<int32> column_map;
4061  column_map_.CopyToVec(&column_map);
4062  WriteIntegerVector(os, binary, column_map);
4063  WriteToken(os, binary, "</PermuteComponent>");
4064 }
4065 
4066 std::string PermuteComponent::Info() const {
4067  std::ostringstream stream;
4068  stream << Type() << ", dim=" << column_map_.Dim();
4069  stream << " , column-map=[ ";
4070  std::vector<int32> column_map(column_map_.Dim());
4071  column_map_.CopyToVec(&column_map);
4072  int32 max_size = 5;
4073  for (size_t i = 0; i < column_map.size() && i < max_size; i++)
4074  stream << column_map[i] << ' ';
4075  if (static_cast<int32>(column_map.size()) > max_size)
4076  stream << "... ";
4077  stream << "]";
4078  return stream.str();
4079 }
4080 
4081 
4082 bool CompositeComponent::IsUpdatable() const {
4083  for (std::vector<Component*>::const_iterator iter = components_.begin(),
4084  end = components_.end(); iter != end; ++iter)
4085  if (((*iter)->Properties() & kUpdatableComponent) != 0)
4086  return true;
4087  return false;
4088 }
4089 
4090 // virtual
4091 int32 CompositeComponent::InputDim() const {
4092  KALDI_ASSERT(!components_.empty());
4093  return components_.front()->InputDim();
4094 }
4095 
4096 // virtual
4097 int32 CompositeComponent::OutputDim() const {
4098  KALDI_ASSERT(!components_.empty());
4099  return components_.back()->OutputDim();
4100 }
4101 
4102 // virtual
4103 int32 CompositeComponent::Properties() const {
4104  KALDI_ASSERT(!components_.empty());
4105  int32 last_component_properties = components_.back()->Properties(),
4106  first_component_properties = components_.front()->Properties();
4107  // We always assume backprop needs the input, as this would be necessary to
4108  // get the activations at intermediate layers, if these were not needed in
4109  // backprop, there would be no reason to use a CompositeComponent.
4110  int32 ans = kSimpleComponent | kBackpropNeedsInput |
4111  (last_component_properties &
4112  (kPropagateAdds|kBackpropNeedsOutput|kOutputContiguous)) |
4113  (first_component_properties &
4114  (kBackpropAdds|kInputContiguous)) |
4115  (IsUpdatable() ? kUpdatableComponent : 0);
4116  // note, we don't return the kStoresStats property because that function is
4117  // not implemented; instead, for efficiency, we call StoreStats() on any
4118  // sub-components as part of the backprop phase.
4119  if (last_component_properties & kStoresStats)
4120  ans |= kBackpropNeedsOutput;
4121  return ans;
4122 }
4123 
4124 
4125 MatrixStrideType CompositeComponent::GetStrideType(int32 i) const {
4126  int32 num_components = components_.size();
4127  if ((components_[i]->Properties() & kOutputContiguous) ||
4128  (i + 1 < num_components &&
4129  (components_[i + 1]->Properties() & kInputContiguous)))
4130  return kStrideEqualNumCols;
4131  else
4132  return kDefaultStride;
4133 }
4134 
4135 
4136 // virtual
4137 void* CompositeComponent::Propagate(
4138  const ComponentPrecomputedIndexes *, // indexes
4139  const CuMatrixBase<BaseFloat> &in,
4140  CuMatrixBase<BaseFloat> *out) const {
4141  KALDI_ASSERT(in.NumRows() == out->NumRows() && in.NumCols() == InputDim() &&
4142  out->NumCols() == OutputDim());
4143  int32 num_rows = in.NumRows(),
4144  num_components = components_.size();
4145  if (max_rows_process_ > 0 && num_rows > max_rows_process_) {
4146  // recurse and process smaller parts of the data, to save memory.
4147  for (int32 row_offset = 0; row_offset < num_rows;
4148  row_offset += max_rows_process_) {
4149  int32 this_num_rows = std::min<int32>(max_rows_process_,
4150  num_rows - row_offset);
4151  const CuSubMatrix<BaseFloat> in_part(in, row_offset, this_num_rows,
4152  0, in.NumCols());
4153  CuSubMatrix<BaseFloat> out_part(*out, row_offset, this_num_rows,
4154  0, out->NumCols());
4155  this->Propagate(NULL, in_part, &out_part);
4156  }
4157  return NULL;
4158  }
4159  std::vector<CuMatrix<BaseFloat> > intermediate_outputs(num_components - 1);
4160  for (int32 i = 0; i < num_components; i++) {
4161  if (i + 1 < num_components) {
4162  MatrixResizeType resize_type =
4163  ((components_[i]->Properties() & kPropagateAdds) ?
4164  kSetZero : kUndefined);
4165  intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(),
4166  resize_type, GetStrideType(i));
4167  }
4168  const CuMatrixBase<BaseFloat> &this_in = (i == 0 ? in :
4169  intermediate_outputs[i-1]);
4170  CuMatrixBase<BaseFloat> *this_out = (i + 1 == num_components ?
4171  out : &(intermediate_outputs[i]));
4172  void *memo = components_[i]->Propagate(NULL, this_in, this_out);
4173  // we'll re-do the forward propagation in the backprop, and we can
4174  // regenerate any memos there, so no need to keep them.
4175  if (memo != NULL)
4176  components_[i]->DeleteMemo(memo);
4177  if (i > 0)
4178  intermediate_outputs[i-1].Resize(0, 0);
4179  }
4180  return NULL;
4181 }
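// Illustration (standalone sketch, not Kaldi code): the row-chunking pattern
// used above to bound memory.  Each chunk of at most max_rows_process rows is
// processed independently, which is valid because all sub-components are
// row-wise (kSimpleComponent).  The function name is hypothetical; <algorithm>
// is assumed for std::min.
static void ExampleProcessInRowChunks(int num_rows, int max_rows_process) {
  for (int row_offset = 0; row_offset < num_rows; row_offset += max_rows_process) {
    int this_num_rows = std::min(max_rows_process, num_rows - row_offset);
    // ... process rows [row_offset, row_offset + this_num_rows) here ...
    (void) this_num_rows;
  }
}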
4182 
4183 
4184 void CompositeComponent::Init(const std::vector<Component*> &components,
4185  int32 max_rows_process) {
4186  DeletePointers(&components_); // clean up.
4187  components_ = components;
4188  KALDI_ASSERT(!components.empty());
4189  max_rows_process_ = max_rows_process;
4190 
4191  for (size_t i = 0; i < components_.size(); i++) {
4192  // make sure all constituent components are simple.
4193  KALDI_ASSERT(components_[i]->Properties() & kSimpleComponent);
4194  if (i > 0) {
4195  // make sure all the internal dimensions match up.
4196  KALDI_ASSERT(components_[i]->InputDim() ==
4197  components_[i-1]->OutputDim());
4198  }
4199  }
4200 }
4201 
4202 // virtual
4203 void CompositeComponent::Read(std::istream &is, bool binary) {
4204  // Because we didn't previously write out the learning rate,
4205  // we need some temporary code.
4206  int32 max_rows_process;
4207  if (false) {
4208  ReadUpdatableCommon(is, binary);
4209  ExpectToken(is, binary, "<MaxRowsProcess>");
4210  ReadBasicType(is, binary, &max_rows_process);
4211  } else { // temporary code.
4212  std::string token;
4213  ReadToken(is, binary, &token);
4214  if (token == "<CompositeComponent>") {
4215  // if the first token is the opening tag, then
4216  // ignore it and get the next tag.
4217  ReadToken(is, binary, &token);
4218  }
4219  if (token == "<LearningRateFactor>") {
4220  ReadBasicType(is, binary, &learning_rate_factor_);
4221  ReadToken(is, binary, &token);
4222  } else {
4223  learning_rate_factor_ = 1.0;
4224  }
4225  if (token == "<IsGradient>") {
4226  ReadBasicType(is, binary, &is_gradient_);
4227  ReadToken(is, binary, &token);
4228  } else {
4229  is_gradient_ = false;
4230  }
4231  if (token == "<LearningRate>") {
4232  ReadBasicType(is, binary, &learning_rate_);
4233  ReadToken(is, binary, &token);
4234  }
4235  if (token != "<MaxRowsProcess>") {
4236  KALDI_ERR << "Expected token <MaxRowsProcess>, got "
4237  << token;
4238  }
4239  ReadBasicType(is, binary, &max_rows_process);
4240  }
4241  ExpectToken(is, binary, "<NumComponents>");
4242  int32 num_components;
4243  ReadBasicType(is, binary, &num_components); // Read dimension.
4244  if (num_components < 0 || num_components > 100000)
4245  KALDI_ERR << "Bad num-components";
4246  std::vector<Component*> components(num_components);
4247  for (int32 i = 0; i < num_components; i++)
4248  components[i] = ReadNew(is, binary);
4249  Init(components, max_rows_process);
4250  ExpectToken(is, binary, "</CompositeComponent>");
4251 }
4252 
4253 // virtual
4254 void CompositeComponent::ZeroStats() {
4255  // we call ZeroStats() on all components without checking their flags; this
4256  // will do nothing if the component doesn't store stats. (components like
4257  // ReLU and sigmoid and tanh store stats on activations).
4258  for (size_t i = 0; i < components_.size(); i++)
4259  components_[i]->ZeroStats();
4260 }
4261 
4262 // virtual
4263 void CompositeComponent::Write(std::ostream &os, bool binary) const {
4264  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate.
4265  WriteToken(os, binary, "<MaxRowsProcess>");
4266  WriteBasicType(os, binary, max_rows_process_);
4267  WriteToken(os, binary, "<NumComponents>");
4268  int32 num_components = components_.size();
4269  WriteBasicType(os, binary, num_components);
4270  for (int32 i = 0; i < num_components; i++)
4271  components_[i]->Write(os, binary);
4272  WriteToken(os, binary, "</CompositeComponent>");
4273 }
4274 
4275 
4276 // virtual
4277 void CompositeComponent::Backprop(const std::string &debug_info,
4278  const ComponentPrecomputedIndexes *indexes,
4279  const CuMatrixBase<BaseFloat> &in_value,
4280  const CuMatrixBase<BaseFloat> &out_value,
4281  const CuMatrixBase<BaseFloat> &out_deriv,
4282  void *memo,
4283  Component *to_update,
4284  CuMatrixBase<BaseFloat> *in_deriv) const {
4285  NVTX_RANGE("CompositeComponent::Backprop");
4286  KALDI_ASSERT(in_value.NumRows() == out_deriv.NumRows() &&
4287  in_value.NumCols() == InputDim() &&
4288  out_deriv.NumCols() == OutputDim());
4289  int32 num_rows = in_value.NumRows(),
4290  num_components = components_.size();
4291  if (max_rows_process_ > 0 && num_rows > max_rows_process_) {
4292  KALDI_ASSERT(max_rows_process_ > 0);
4293  // recurse and process smaller parts of the data, to save memory.
4294  for (int32 row_offset = 0; row_offset < num_rows;
4295  row_offset += max_rows_process_) {
4296  bool have_output_value = (out_value.NumRows() != 0);
4297  int32 this_num_rows = std::min<int32>(max_rows_process_,
4298  num_rows - row_offset);
4299  // out_value_part will only be used if out_value is nonempty; otherwise we
4300  // make it a submatrix of 'out_deriv' to avoid errors in the constructor.
4301  const CuSubMatrix<BaseFloat> out_value_part(have_output_value ? out_value : out_deriv,
4302  row_offset, this_num_rows,
4303  0, out_deriv.NumCols());
4304  // in_deriv_value_part will only be used if in_deriv != NULL; otherwise we
4305  // make it a submatrix of 'in_value' to avoid errors in the constructor.
4306  CuSubMatrix<BaseFloat> in_deriv_part(in_deriv != NULL ? *in_deriv : in_value,
4307  row_offset, this_num_rows,
4308  0, in_value.NumCols());
4309  CuSubMatrix<BaseFloat> in_value_part(in_value, row_offset, this_num_rows,
4310  0, in_value.NumCols());
4311  const CuSubMatrix<BaseFloat> out_deriv_part(out_deriv,
4312  row_offset, this_num_rows,
4313  0, out_deriv.NumCols());
4314  CuMatrix<BaseFloat> empty_mat;
4315  this->Backprop(debug_info, NULL, in_value_part,
4316  (have_output_value ? static_cast<const CuMatrixBase<BaseFloat>&>(out_value_part) :
4317  static_cast<const CuMatrixBase<BaseFloat>&>(empty_mat)),
4318  out_deriv_part, NULL, to_update,
4319  in_deriv != NULL ? &in_deriv_part : NULL);
4320  }
4321  return;
4322  }
4323  // For now, assume all intermediate values and derivatives need to be
4324  // computed. in_value and out_deriv will always be supplied.
4325 
4326  // intermediate_outputs[i] contains the output of component i.
4327  std::vector<CuMatrix<BaseFloat> > intermediate_outputs(num_components);
4328  // intermediate_derivs[i] contains the derivative at the output of component i.
4329  std::vector<CuMatrix<BaseFloat> > intermediate_derivs(num_components - 1);
4330 
4331  KALDI_ASSERT(memo == NULL);
4332  // note: only a very few components use memos, but we need to support them.
4333  std::vector<void*> memos(num_components, NULL);
4334 
4335  int32 num_components_to_propagate = num_components;
4336  if (!(components_[num_components - 1]->Properties() & kUsesMemo)) {
4337  // we only need to propagate the very last component if it uses a memo.
4338  num_components_to_propagate--;
4339  if (num_components > 1) {
4340  // skip the last-but-one component's propagate if the last component's
4341  // backprop doesn't need the input and the last-but-one component's
4342  // backprop doesn't need the output. This is the lowest hanging fruit for
4343  // optimization; other propagates might also be skippable.
4344  int32 properties = components_[num_components - 2]->Properties(),
4345  next_properties = components_[num_components - 1]->Properties();
4346  if (!(properties & (kBackpropNeedsOutput | kUsesMemo)) &&
4347  !(next_properties & kBackpropNeedsInput)) {
4348  num_components_to_propagate--;
4349  }
4350  }
4351  }
4352 
4353 
4354  // Do the propagation again.
4355  for (int32 i = 0; i < num_components_to_propagate; i++) {
4356  MatrixResizeType resize_type =
4357  ((components_[i]->Properties() & kPropagateAdds) ?
4358  kSetZero : kUndefined);
4359  intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(),
4360  resize_type, GetStrideType(i));
4361  memos[i] =
4362  components_[i]->Propagate(NULL,
4363  (i == 0 ? in_value : intermediate_outputs[i-1]),
4364  &(intermediate_outputs[i]));
4365  }
4366 
4367  for (int32 i = num_components - 1; i >= 0; i--) {
4368  const CuMatrixBase<BaseFloat> &this_in_value =
4369  (i == 0 ? in_value : intermediate_outputs[i-1]),
4370  &this_out_value =
4371  (i == num_components - 1 ? out_value : intermediate_outputs[i]);
4372 
4373  Component *component_to_update =
4374  (to_update == NULL ? NULL :
4375  dynamic_cast<CompositeComponent*>(to_update)->components_[i]);
4376 
4377  if (component_to_update != NULL &&
4378  components_[i]->Properties() & kStoresStats)
4379  component_to_update->StoreStats(this_in_value, this_out_value, memos[i]);
4380 
4381  if (i > 0) {
4382  MatrixResizeType resize_type =
4383  ((components_[i]->Properties() & kBackpropAdds) ?
4384  kSetZero : kUndefined);
4385  intermediate_derivs[i-1].Resize(num_rows, components_[i]->InputDim(),
4386  resize_type, GetStrideType(i - 1));
4387  }
4388  // skip the first component's backprop if it's not updatable and in_deriv is
4389  // not requested. Again, this is the lowest-hanging fruit to optimize.
4390  if (!(i == 0 && !(components_[0]->Properties() & kUpdatableComponent) &&
4391  in_deriv == NULL)) {
4392  components_[i]->Backprop(debug_info, NULL,
4393  this_in_value, this_out_value,
4394  (i + 1 == num_components ? out_deriv : intermediate_derivs[i]),
4395  memos[i], component_to_update,
4396  (i == 0 ? in_deriv : &(intermediate_derivs[i-1])));
4397  }
4398  if (memos[i] != NULL)
4399  components_[i]->DeleteMemo(memos[i]);
4400  }
4401 }
4402 
4403 
4404 // virtual
4405 std::string CompositeComponent::Info() const {
4406  std::ostringstream stream;
4407  stream << Type() << " ";
4408  for (size_t i = 0; i < components_.size(); i++) {
4409  if (i > 0) stream << ", ";
4410  stream << "sub-component" << (i+1) << " = { "
4411  << components_[i]->Info() << " }";
4412  }
4413  return stream.str();
4414 }
4415 
4416 // virtual
4417 void CompositeComponent::Scale(BaseFloat scale) {
4418  for (size_t i = 0; i < components_.size(); i++)
4419  components_[i]->Scale(scale);
4420 }
4421 
4422 // virtual
4423 void CompositeComponent::Add(BaseFloat alpha, const Component &other_in) {
4424  const CompositeComponent *other = dynamic_cast<const CompositeComponent*>(
4425  &other_in);
4426  KALDI_ASSERT(other != NULL && other->components_.size() ==
4427  components_.size() && "Mismatching nnet topologies");
4428  for (size_t i = 0; i < components_.size(); i++)
4429  components_[i]->Add(alpha, *(other->components_[i]));
4430 }
4431 
4432 // virtual
4433 void CompositeComponent::PerturbParams(BaseFloat stddev) {
4434  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4435  for (size_t i = 0; i < components_.size(); i++) {
4436  if (components_[i]->Properties() & kUpdatableComponent) {
4437  UpdatableComponent *uc =
4438  dynamic_cast<UpdatableComponent*>(components_[i]);
4439  uc->PerturbParams(stddev);
4440  }
4441  }
4442 }
4443 
4444 void CompositeComponent::SetUnderlyingLearningRate(BaseFloat lrate) {
4445  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4446  UpdatableComponent::SetUnderlyingLearningRate(lrate);
4447 
4448  // apply any learning-rate-factor that's set at this level (ill-advised, but
4449  // we'll do it.)
4450  BaseFloat effective_lrate = LearningRate();
4451  for (size_t i = 0; i < components_.size(); i++) {
4452  if (components_[i]->Properties() & kUpdatableComponent) {
4453  UpdatableComponent *uc =
4454  dynamic_cast<UpdatableComponent*>(components_[i]);
4455  uc->SetUnderlyingLearningRate(effective_lrate);
4456  }
4457  }
4458 }
4459 
4460 void CompositeComponent::SetActualLearningRate(BaseFloat lrate) {
4461  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4462  UpdatableComponent::SetActualLearningRate(lrate);
4463  for (size_t i = 0; i < components_.size(); i++) {
4464  if (components_[i]->Properties() & kUpdatableComponent) {
4465  UpdatableComponent *uc =
4466  dynamic_cast<UpdatableComponent*>(components_[i]);
4467  uc->SetActualLearningRate(lrate);
4468  }
4469  }
4470 }
4471 
4472 // virtual
4473 void CompositeComponent::SetAsGradient() {
4474  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4475  UpdatableComponent::SetAsGradient();
4476  for (size_t i = 0; i < components_.size(); i++) {
4477  if (components_[i]->Properties() & kUpdatableComponent) {
4478  UpdatableComponent *uc =
4479  dynamic_cast<UpdatableComponent*>(components_[i]);
4480  uc->SetAsGradient();
4481  }
4482  }
4483 }
4484 
4485 // virtual
4486 int32 CompositeComponent::NumParameters() const {
4487  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4488  int32 ans = 0;
4489  for (size_t i = 0; i < components_.size(); i++) {
4490  if (components_[i]->Properties() & kUpdatableComponent) {
4491  UpdatableComponent *uc =
4492  dynamic_cast<UpdatableComponent*>(components_[i]);
4493  ans += uc->NumParameters();
4494  }
4495  }
4496  return ans;
4497 }
4498 
4499 // virtual
4500 void CompositeComponent::Vectorize(VectorBase<BaseFloat> *params) const {
4501  int32 cur_offset = 0;
4502  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4503  for (size_t i = 0; i < components_.size(); i++) {
4504  if (components_[i]->Properties() & kUpdatableComponent) {
4505  UpdatableComponent *uc =
4506  dynamic_cast<UpdatableComponent*>(components_[i]);
4507  int32 this_size = uc->NumParameters();
4508  SubVector<BaseFloat> params_range(*params, cur_offset, this_size);
4509  uc->Vectorize(&params_range);
4510  cur_offset += this_size;
4511  }
4512  }
4513  KALDI_ASSERT(cur_offset == params->Dim());
4514 }
4515 
4516 // virtual
4517 void CompositeComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
4518  int32 cur_offset = 0;
4519  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4520  for (size_t i = 0; i < components_.size(); i++) {
4521  if (components_[i]->Properties() & kUpdatableComponent) {
4522  UpdatableComponent *uc =
4523  dynamic_cast<UpdatableComponent*>(components_[i]);
4524  int32 this_size = uc->NumParameters();
4525  SubVector<BaseFloat> params_range(params, cur_offset, this_size);
4526  uc->UnVectorize(params_range);
4527  cur_offset += this_size;
4528  }
4529  }
4530  KALDI_ASSERT(cur_offset == params.Dim());
4531 }
4532 
4533 // virtual
4534 BaseFloat CompositeComponent::DotProduct(
4535  const UpdatableComponent &other_in) const {
4536  const CompositeComponent *other = dynamic_cast<const CompositeComponent*>(
4537  &other_in);
4538  KALDI_ASSERT(other != NULL && other->components_.size() ==
4539  components_.size() && "Mismatching nnet topologies");
4540  BaseFloat ans = 0.0;
4541  for (size_t i = 0; i < components_.size(); i++) {
4542  if (components_[i]->Properties() & kUpdatableComponent) {
4543  UpdatableComponent *uc =
4544  dynamic_cast<UpdatableComponent*>(components_[i]);
4545  const UpdatableComponent *uc_other =
4546  dynamic_cast<UpdatableComponent*>(other->components_[i]);
4547  KALDI_ASSERT(uc != NULL && uc_other != NULL);
4548  ans += uc->DotProduct(*uc_other);
4549  }
4550  }
4551  return ans;
4552 }
4553 
4554 // virtual
4555 void CompositeComponent::FreezeNaturalGradient(bool freeze) {
4556  for (size_t i = 0; i < components_.size(); i++) {
4557  if (components_[i]->Properties() & kUpdatableComponent) {
4558  UpdatableComponent *uc =
4559  dynamic_cast<UpdatableComponent*>(components_[i]);
4560  KALDI_ASSERT(uc != NULL);
4561  uc->FreezeNaturalGradient(freeze);
4562  }
4563  }
4564 }
4565 
4566 // virtual
4567 Component* CompositeComponent::Copy() const {
4568  std::vector<Component*> components(components_.size());
4569  for (size_t i = 0; i < components_.size(); i++)
4570  components[i] = components_[i]->Copy();
4571  CompositeComponent *ans = new CompositeComponent();
4572  ans->Init(components, max_rows_process_);
4573  return ans;
4574 }
4575 
4576 
4577 // virtual
4578 void CompositeComponent::InitFromConfig(ConfigLine *cfl) {
4579  int32 max_rows_process = 4096, num_components = -1;
4580  cfl->GetValue("max-rows-process", &max_rows_process);
4581  if (!cfl->GetValue("num-components", &num_components) ||
4582  num_components < 1)
4583  KALDI_ERR << "Expected num-components to be defined in "
4584  << "CompositeComponent config line '" << cfl->WholeLine() << "'";
4585  std::vector<Component*> components;
4586  for (int32 i = 1; i <= num_components; i++) {
4587  std::ostringstream name_stream;
4588  name_stream << "component" << i;
4589  std::string component_config;
4590  if (!cfl->GetValue(name_stream.str(), &component_config)) {
4591  DeletePointers(&components);
4592  KALDI_ERR << "Expected '" << name_stream.str() << "' to be defined in "
4593  << "CompositeComponent config line '" << cfl->WholeLine() << "'";
4594  }
4595  ConfigLine nested_line;
4596  // note: the nested line may not contain comments.
4597  std::string component_type;
4598  Component *this_component = NULL;
4599  if (!nested_line.ParseLine(component_config) ||
4600  !nested_line.GetValue("type", &component_type) ||
4601  !(this_component = NewComponentOfType(component_type)) ||
4602  nested_line.FirstToken() != "") {
4603  DeletePointers(&components);
4604  KALDI_ERR << "Could not parse config line for '" << name_stream.str()
4605  << "' (or undefined or bad component type [type=xxx]), in "
4606  << "CompositeComponent config line '" << cfl->WholeLine() << "'";
4607  }
4608  if(this_component->Type() == "CompositeComponent") {
4609  DeletePointers(&components);
4610  delete this_component;
4611  // This is not allowed. If memory is too much with just one
4612  // CompositeComponent, try decreasing max-rows-process instead.
4613  KALDI_ERR << "Found CompositeComponent nested within CompositeComponent. "
4614  << "Nested line: '" << nested_line.WholeLine() << "'\n"
4615  << "Toplevel CompositeComponent line '" << cfl->WholeLine()
4616  << "'";
4617  }
4618  this_component->InitFromConfig(&nested_line);
4619  int32 props = this_component->Properties();
4620  if ((props & kRandomComponent) != 0 ||
4621  (props & kSimpleComponent) == 0) {
4622  KALDI_ERR << "CompositeComponent contains disallowed component type: "
4623  << nested_line.WholeLine();
4624  }
4625  components.push_back(this_component);
4626  }
4627  if (cfl->HasUnusedValues())
4628  KALDI_ERR << "Could not process these elements in initializer: "
4629  << cfl->UnusedValues();
4630  this->Init(components, max_rows_process);
4631 }
4632 
4633 const Component* CompositeComponent::GetComponent(int32 i) const {
4634  KALDI_ASSERT(static_cast<size_t>(i) < components_.size());
4635  return components_[i];
4636 }
4637 
4638 void CompositeComponent::SetComponent(int32 i, Component *component) {
4639  KALDI_ASSERT(static_cast<size_t>(i) < components_.size());
4640  delete components_[i];
4641  components_[i] = component;
4642 }
4643 
4644 
4645 SumBlockComponent::SumBlockComponent(const SumBlockComponent &other):
4646  input_dim_(other.input_dim_), output_dim_(other.output_dim_),
4647  scale_(other.scale_) { }
4648 
4649 void SumBlockComponent::InitFromConfig(ConfigLine *cfl) {
4650  scale_ = 1.0;
4651  bool ok = cfl->GetValue("input-dim", &input_dim_) &&
4652  cfl->GetValue("output-dim", &output_dim_);
4653  if (!ok)
4654  KALDI_ERR << "input-dim and output-dim must both be provided.";
4655  if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0)
4656  KALDI_ERR << "Invalid values input-dim=" << input_dim_
4657  << " output-dim=" << output_dim_;
4658  cfl->GetValue("scale", &scale_);
4659  if (cfl->HasUnusedValues())
4660  KALDI_ERR << "Could not process these elements in initializer: "
4661  << cfl->UnusedValues();
4662 }
4663 
4664 void SumBlockComponent::Read(std::istream &is, bool binary) {
4665  ExpectOneOrTwoTokens(is, binary, "<SumBlockComponent>", "<InputDim>");
4666  ReadBasicType(is, binary, &input_dim_);
4667  ExpectToken(is, binary, "<OutputDim>");
4668  ReadBasicType(is, binary, &output_dim_);
4669  ExpectToken(is, binary, "<Scale>");
4670  ReadBasicType(is, binary, &scale_);
4671  ExpectToken(is, binary, "</SumBlockComponent>");
4672 }
4673 
4674 void SumBlockComponent::Write(std::ostream &os, bool binary) const {
4675  WriteToken(os, binary, "<SumBlockComponent>");
4676  WriteToken(os, binary, "<InputDim>");
4677  WriteBasicType(os, binary, input_dim_);
4678  WriteToken(os, binary, "<OutputDim>");
4679  WriteBasicType(os, binary, output_dim_);
4680  WriteToken(os, binary, "<Scale>");
4681  WriteBasicType(os, binary, scale_);
4682  WriteToken(os, binary, "</SumBlockComponent>");
4683 }
4684 
4685 std::string SumBlockComponent::Info() const {
4686  std::ostringstream stream;
4687  stream << Type() << ", input-dim=" << input_dim_
4688  << ", output-dim=" << output_dim_
4689  << ", scale=" << scale_;
4690  return stream.str();
4691 }
4692 
4693 void* SumBlockComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
4694  const CuMatrixBase<BaseFloat> &in,
4695  CuMatrixBase<BaseFloat> *out) const {
4696  KALDI_ASSERT(out->NumRows() == in.NumRows() &&
4697  out->NumCols() == output_dim_ &&
4698  in.NumCols() == input_dim_);
4699  out->AddMatBlocks(scale_, in, kNoTrans);
4700  return NULL;
4701 }
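// Illustration (standalone sketch, not Kaldi code): what AddMatBlocks does in
// the Propagate above when input-dim is a multiple of output-dim -- each output
// column is the scaled sum of the corresponding column in every input block,
// shown here for a single row.  Names are hypothetical; <vector> is assumed.
static void ExampleSumBlocksRow(const std::vector<float> &in_row, int output_dim,
                                float scale, std::vector<float> *out_row) {
  int num_blocks = static_cast<int>(in_row.size()) / output_dim;
  for (int c = 0; c < output_dim; c++) {
    float sum = 0.0f;
    for (int b = 0; b < num_blocks; b++)
      sum += in_row[b * output_dim + c];  // same column position in each block
    (*out_row)[c] += scale * sum;
  }
}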
4702 
4703 void SumBlockComponent::Backprop(
4704  const std::string &debug_info,
4705  const ComponentPrecomputedIndexes *indexes,
4706  const CuMatrixBase<BaseFloat> &, //in_value
4707  const CuMatrixBase<BaseFloat> &, // out_value,
4708  const CuMatrixBase<BaseFloat> &out_deriv,
4709  void *memo,
4710  Component *to_update,
4711  CuMatrixBase<BaseFloat> *in_deriv) const {
4712  NVTX_RANGE("SumBlockComponent::Backprop");
4713  if (in_deriv) {
4714  in_deriv->AddMatBlocks(scale_, out_deriv,