// nnet3/nnet-simple-component.cc

// Copyright 2015-2017  Johns Hopkins University (author: Daniel Povey)
//                2015  Xiaohui Zhang
//                2015  Guoguo Chen
//                2015  Daniel Galvez
//                2016  Yiming Wang

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <iterator>
#include <sstream>
#include <algorithm>
#include <iomanip>
#include "nnet3/nnet-simple-component.h"
#include "nnet3/nnet-parse.h"
#include "cudamatrix/cu-math.h"

namespace kaldi {
namespace nnet3 {
34 
35 void PnormComponent::Init(int32 input_dim, int32 output_dim) {
36  input_dim_ = input_dim;
37  output_dim_ = output_dim;
38  KALDI_ASSERT(input_dim_ > 0 && output_dim_ > 0 &&
39  input_dim_ % output_dim_ == 0);
40 }
41 
43  int32 input_dim = 0;
44  int32 output_dim = 0;
45  bool ok = cfl->GetValue("output-dim", &output_dim) &&
46  cfl->GetValue("input-dim", &input_dim);
47  if (!ok || cfl->HasUnusedValues() || output_dim <= 0)
48  KALDI_ERR << "Invalid initializer for layer of type "
49  << Type() << ": \"" << cfl->WholeLine() << "\"";
50  Init(input_dim, output_dim);
51 }
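
// Illustrative only, assuming the usual nnet3 config-line syntax (this example
// is not part of the original source): a component of this type would be set
// up by a line of roughly the form
//   component name=pnorm1 type=PnormComponent input-dim=2000 output-dim=250
// where input-dim must be a multiple of output-dim, as asserted in Init().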


void* PnormComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                                const CuMatrixBase<BaseFloat> &in,
                                CuMatrixBase<BaseFloat> *out) const {
  BaseFloat p = 2.0;
  out->GroupPnorm(in, p);
  return NULL;
}
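
// A minimal worked example of the group p-norm computed above, with the
// hardcoded p = 2: for input-dim=4 and output-dim=2 each row is split into
// 2 groups of 2 consecutive elements, and each output element is the 2-norm
// of its group; e.g. for an input row (3, 4, 0, 1):
//   out[0] = (3^2 + 4^2)^(1/2) = 5,  out[1] = (0^2 + 1^2)^(1/2) = 1.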

void PnormComponent::Backprop(const std::string &debug_info,
                              const ComponentPrecomputedIndexes *indexes,
                              const CuMatrixBase<BaseFloat> &in_value,
                              const CuMatrixBase<BaseFloat> &out_value,
                              const CuMatrixBase<BaseFloat> &out_deriv,
                              void *memo,
                              Component *to_update,
                              CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("PnormComponent::Backprop");
  if (!in_deriv)
    return;
  BaseFloat p = 2.0;
  in_deriv->DiffGroupPnorm(in_value, out_value, out_deriv, p);
}

void PnormComponent::Read(std::istream &is, bool binary) {
  ExpectOneOrTwoTokens(is, binary, "<PnormComponent>", "<InputDim>");
  ReadBasicType(is, binary, &input_dim_);
  ExpectToken(is, binary, "<OutputDim>");
  ReadBasicType(is, binary, &output_dim_);
  ExpectToken(is, binary, "</PnormComponent>");
}

void PnormComponent::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<PnormComponent>");
  WriteToken(os, binary, "<InputDim>");
  WriteBasicType(os, binary, input_dim_);
  WriteToken(os, binary, "<OutputDim>");
  WriteBasicType(os, binary, output_dim_);
  WriteToken(os, binary, "</PnormComponent>");
}

DropoutComponent::DropoutComponent(const DropoutComponent &other):
    RandomComponent(other),
    dim_(other.dim_),
    dropout_proportion_(other.dropout_proportion_),
    dropout_per_frame_(other.dropout_per_frame_) { }

Component* DropoutComponent::Copy() const {
  DropoutComponent *ans = new DropoutComponent(*this);
  return ans;
}

void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion,
                            bool dropout_per_frame) {
  dropout_proportion_ = dropout_proportion;
  dropout_per_frame_ = dropout_per_frame;
  dim_ = dim;
}

void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
  int32 dim = 0;
  BaseFloat dropout_proportion = 0.0;
  bool dropout_per_frame = false;
  test_mode_ = false;
  bool ok = cfl->GetValue("dim", &dim) &&
      cfl->GetValue("dropout-proportion", &dropout_proportion);
  cfl->GetValue("dropout-per-frame", &dropout_per_frame);
  // It only makes sense to set test-mode in the config for testing purposes.
  cfl->GetValue("test-mode", &test_mode_);
  // At this stage, dropout stays in normal (non-test) mode
  // unless declared otherwise in the config.
  if (!ok || cfl->HasUnusedValues() || dim <= 0 ||
      dropout_proportion < 0.0 || dropout_proportion > 1.0)
    KALDI_ERR << "Invalid initializer for layer of type "
              << Type() << ": \"" << cfl->WholeLine() << "\"";
  Init(dim, dropout_proportion, dropout_per_frame);
}

std::string DropoutComponent::Info() const {
  std::ostringstream stream;
  stream << Type() << ", dim=" << dim_
         << ", dropout-proportion=" << dropout_proportion_
         << ", dropout-per-frame=" << (dropout_per_frame_ ? "true" : "false");
  return stream.str();
}

void* DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                                  const CuMatrixBase<BaseFloat> &in,
                                  CuMatrixBase<BaseFloat> *out) const {
  KALDI_ASSERT(out->NumRows() == in.NumRows() && out->NumCols() == in.NumCols()
               && in.NumCols() == dim_);

  BaseFloat dropout = dropout_proportion_;
  KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0);
  if (test_mode_) {
    out->CopyFromMat(in);
    out->Scale(1.0 - dropout);
    return NULL;
  }
  if (!dropout_per_frame_) {
    // This const_cast is only safe assuming you don't attempt
    // to use multi-threaded code with the GPU.
    const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);

    out->Add(-dropout);  // now, a proportion "dropout" will be <0.0
    // apply the function (x>0?1:0).  Now, a proportion "dropout" will
    // be zero and (1 - dropout) will be 1.0.
    out->ApplyHeaviside();

    out->MulElements(in);
  } else {
    // randomize the dropout matrix by row,
    // i.e. [[1,1,1,1],[0,0,0,0],[0,0,0,0],[1,1,1,1],[0,0,0,0]]
    CuMatrix<BaseFloat> tmp(1, out->NumRows(), kUndefined);
    // This const_cast is only safe assuming you don't attempt
    // to use multi-threaded code with the GPU.
    const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&tmp);
    tmp.Add(-dropout);
    tmp.ApplyHeaviside();
    out->CopyColsFromVec(tmp.Row(0));
    out->MulElements(in);
  }
  return NULL;
}
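
// A small numeric sketch of the masking trick above (illustrative, not part
// of the original source): with dropout = 0.3, a uniform draw r in [0, 1)
// becomes r - 0.3, and ApplyHeaviside() turns that into
// (r - 0.3 > 0 ? 1.0 : 0.0); so each mask entry is 0 with probability 0.3
// and 1 with probability 0.7, and MulElements(in) then zeroes the dropped
// entries of the input.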


void DropoutComponent::Backprop(const std::string &debug_info,
                                const ComponentPrecomputedIndexes *indexes,
                                const CuMatrixBase<BaseFloat> &in_value,
                                const CuMatrixBase<BaseFloat> &out_value,
                                const CuMatrixBase<BaseFloat> &out_deriv,
                                void *memo,
                                Component *to_update,
                                CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("DropoutComponent::Backprop");
  KALDI_ASSERT(in_value.NumRows() == out_value.NumRows() &&
               in_value.NumCols() == out_value.NumCols());

  KALDI_ASSERT(in_value.NumRows() == out_deriv.NumRows() &&
               in_value.NumCols() == out_deriv.NumCols());
  in_deriv->SetMatMatDivMat(out_deriv, out_value, in_value);
}
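
// Why the elementwise division above recovers the dropout mask (a sketch): in
// the forward pass out_value = mask .* in_value, so elementwise
//   in_deriv = out_deriv .* out_value ./ in_value = out_deriv .* mask,
// i.e. exactly the derivative of multiplying by a fixed 0/1 mask, without
// having to store the mask between the forward and backward passes.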



void DropoutComponent::Read(std::istream &is, bool binary) {
  std::string token;
  ReadToken(is, binary, &token);
  if (token == "<DropoutComponent>") {
    ReadToken(is, binary, &token);
  }
  KALDI_ASSERT(token == "<Dim>");
  ReadBasicType(is, binary, &dim_);  // read dimension.
  ReadToken(is, binary, &token);
  KALDI_ASSERT(token == "<DropoutProportion>");
  ReadBasicType(is, binary, &dropout_proportion_);  // read dropout rate
  ReadToken(is, binary, &token);
  if (token == "<DropoutPerFrame>") {
    ReadBasicType(is, binary, &dropout_per_frame_);  // read dropout mode
    ReadToken(is, binary, &token);
  } else {
    dropout_per_frame_ = false;
  }
  if (token == "<TestMode>") {
    ReadBasicType(is, binary, &test_mode_);  // read test mode
    ExpectToken(is, binary, "</DropoutComponent>");
  } else {
    test_mode_ = false;
    KALDI_ASSERT(token == "</DropoutComponent>");
  }
}

void DropoutComponent::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<DropoutComponent>");
  WriteToken(os, binary, "<Dim>");
  WriteBasicType(os, binary, dim_);
  WriteToken(os, binary, "<DropoutProportion>");
  WriteBasicType(os, binary, dropout_proportion_);
  WriteToken(os, binary, "<DropoutPerFrame>");
  WriteBasicType(os, binary, dropout_per_frame_);
  WriteToken(os, binary, "<TestMode>");
  WriteBasicType(os, binary, test_mode_);
  WriteToken(os, binary, "</DropoutComponent>");
}

void ElementwiseProductComponent::Init(int32 input_dim, int32 output_dim) {
  input_dim_ = input_dim;
  output_dim_ = output_dim;
  KALDI_ASSERT(input_dim_ > 0 && output_dim_ >= 0);
  KALDI_ASSERT(input_dim_ > output_dim_);
  KALDI_ASSERT(input_dim_ % output_dim_ == 0);
}

void ElementwiseProductComponent::InitFromConfig(ConfigLine *cfl) {
  int32 input_dim = 0;
  int32 output_dim = 0;
  bool ok = cfl->GetValue("output-dim", &output_dim) &&
      cfl->GetValue("input-dim", &input_dim);
  if (!ok || cfl->HasUnusedValues() || output_dim <= 0)
    KALDI_ERR << "Invalid initializer for layer of type "
              << Type() << ": \"" << cfl->WholeLine() << "\"";
  Init(input_dim, output_dim);
}

void* ElementwiseProductComponent::Propagate(
    const ComponentPrecomputedIndexes *indexes,
    const CuMatrixBase<BaseFloat> &in,
    CuMatrixBase<BaseFloat> *out) const {
  KALDI_ASSERT(in.NumCols() == input_dim_);
  int32 num_inputs = input_dim_ / output_dim_;
  for (int32 i = 0; i < num_inputs; i++) {
    CuSubMatrix<BaseFloat> current_in(in, 0, in.NumRows(),
                                      i * output_dim_, output_dim_);
    if (i == 0) {
      out->CopyFromMat(current_in);
    } else {
      out->MulElements(current_in);
    }
  }
  return NULL;
}

void ElementwiseProductComponent::Backprop(const std::string &debug_info,
                              const ComponentPrecomputedIndexes *indexes,
                              const CuMatrixBase<BaseFloat> &in_value,
                              const CuMatrixBase<BaseFloat> &out_value,
                              const CuMatrixBase<BaseFloat> &out_deriv,
                              void *memo,
                              Component *to_update,
                              CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("ElementwiseProductComponent::Backprop");
  if (!in_deriv)  return;
  int32 num_inputs = input_dim_ / output_dim_;
  for (int32 i = 0; i < num_inputs; i++) {
    CuSubMatrix<BaseFloat> current_in_deriv(*in_deriv, 0, in_deriv->NumRows(),
                                            i * output_dim_,
                                            output_dim_);
    current_in_deriv.CopyFromMat(out_deriv);
    for (int32 j = 0; j < num_inputs; j++) {
      if (i == j)
        continue;
      CuSubMatrix<BaseFloat> in_value_partition(in_value, 0,
                                                in_value.NumRows(),
                                                j * output_dim_,
                                                output_dim_);
      current_in_deriv.MulElements(in_value_partition);
    }
  }
}
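
// The double loop above is just the product rule (a sketch): if the output is
// the elementwise product y = x_1 .* x_2 .* ... .* x_n of the n input blocks,
// then dy/dx_i = prod_{j != i} x_j, so each input block's derivative is
// out_deriv times the elementwise product of all the *other* input blocks.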

void ElementwiseProductComponent::Read(std::istream &is, bool binary) {
  ExpectOneOrTwoTokens(is, binary, "<ElementwiseProductComponent>",
                       "<InputDim>");
  ReadBasicType(is, binary, &input_dim_);
  ExpectToken(is, binary, "<OutputDim>");
  ReadBasicType(is, binary, &output_dim_);
  ExpectToken(is, binary, "</ElementwiseProductComponent>");
}

void ElementwiseProductComponent::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<ElementwiseProductComponent>");
  WriteToken(os, binary, "<InputDim>");
  WriteBasicType(os, binary, input_dim_);
  WriteToken(os, binary, "<OutputDim>");
  WriteBasicType(os, binary, output_dim_);
  WriteToken(os, binary, "</ElementwiseProductComponent>");
}

void* SigmoidComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                                  const CuMatrixBase<BaseFloat> &in,
                                  CuMatrixBase<BaseFloat> *out) const {
  out->Sigmoid(in);
  return NULL;
}

void SigmoidComponent::Backprop(const std::string &debug_info,
                                const ComponentPrecomputedIndexes *indexes,
                                const CuMatrixBase<BaseFloat> &,
                                const CuMatrixBase<BaseFloat> &out_value,
                                const CuMatrixBase<BaseFloat> &out_deriv,
                                void *memo,
                                Component *to_update_in,
                                CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("SigmoidComponent::Backprop");
  if (in_deriv != NULL) {
    in_deriv->DiffSigmoid(out_value, out_deriv);
    SigmoidComponent *to_update = dynamic_cast<SigmoidComponent*>(to_update_in);
    if (to_update != NULL) {
      RepairGradients(out_value, in_deriv, to_update);
      to_update->StoreBackpropStats(out_deriv);
    }
  }
}

void SigmoidComponent::RepairGradients(
    const CuMatrixBase<BaseFloat> &out_value,
    CuMatrixBase<BaseFloat> *in_deriv,
    SigmoidComponent *to_update) const {
  KALDI_ASSERT(to_update != NULL);
  // maximum possible derivative of SigmoidComponent is 0.25.
  // the default lower-threshold on the derivative, below which we
  // add a term to the derivative to encourage the inputs to the sigmoid
  // to be closer to zero, is 0.05, which means the derivative is on average
  // 5 times smaller than its maximum possible value.
  BaseFloat default_lower_threshold = 0.05;

  // we use this 'repair_probability' (hardcoded for now) to limit
  // this code to running on about half of the minibatches.
  BaseFloat repair_probability = 0.5;

  to_update->num_dims_processed_ += dim_;

  if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ ||
      RandUniform() > repair_probability)
    return;

  // check that the self-repair scale is in a reasonable range.
  KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
  BaseFloat unset = kUnsetThreshold;  // -1000.0
  BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ?
                               default_lower_threshold :
                               self_repair_lower_threshold_) *
      count_;
  if (self_repair_upper_threshold_ != unset) {
    KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid "
              << "components, it does nothing.";
  }

  // thresholds_vec is actually a 1-row matrix.  (the ApplyHeaviside
  // function isn't defined for vectors).
  CuMatrix<BaseFloat> thresholds(1, dim_);
  CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
  thresholds_vec.AddVec(-1.0, deriv_sum_);
  thresholds_vec.Add(lower_threshold);
  thresholds.ApplyHeaviside();
  to_update->num_dims_self_repaired_ += thresholds_vec.Sum();

  // At this point, 'thresholds_vec' contains a 1 for each dimension of
  // the output that is 'problematic', i.e. for which the avg-deriv
  // is less than the self-repair lower threshold, and a 0 for
  // each dimension that is not problematic.

  // what we want to do is to add
  // -self_repair_scale_ / repair_probability times (2 * output-value - 1.0)
  // to the input derivative for each problematic dimension.

  // Here, 2 * output - 1.0 is a version of the sigmoid that goes from -1.0 to
  // 1.0, like a tanh.  the negative sign is so that for inputs <0, we push
  // them up towards 0, and for inputs >0, we push them down towards 0.
  // Our use of this sigmoid-type function here is just a convenience since we
  // have it available.  We could use just about any function that is positive
  // for inputs < 0 and negative for inputs > 0.

  // We can rearrange the above as: for only the problematic columns,
  //   input-deriv -= 2 * self-repair-scale / repair-probability * output
  //   input-deriv +=  self-repair-scale / repair-probability
  // which we can write as:
  //   input-deriv -= 2 * self-repair-scale / repair-probability * output * thresholds-vec
  //   input-deriv += self-repair-scale / repair-probability * thresholds-vec

  in_deriv->AddMatDiagVec(-2.0 * self_repair_scale_ / repair_probability,
                          out_value, kNoTrans, thresholds_vec);
  in_deriv->AddVecToRows(self_repair_scale_ / repair_probability,
                         thresholds_vec);
}



void SigmoidComponent::StoreStats(const CuMatrixBase<BaseFloat> &in_value,
                                  const CuMatrixBase<BaseFloat> &out_value,
                                  void *memo) {
  // Only store stats about every other minibatch (but on the first minibatch,
  // always store it, which is necessary for the ConsolidateMemory() operation
  // to work correctly).
  if (RandInt(0, 1) == 0 && count_ != 0)
    return;
  // derivative of the nonlinearity is out_value * (1.0 - out_value);
  CuMatrix<BaseFloat> temp_deriv(out_value.NumRows(), out_value.NumCols(),
                                 kUndefined);
  temp_deriv.Set(1.0);
  temp_deriv.AddMat(-1.0, out_value);
  temp_deriv.MulElements(out_value);
  StoreStatsInternal(out_value, &temp_deriv);
}



void* NoOpComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                               const CuMatrixBase<BaseFloat> &in,
                               CuMatrixBase<BaseFloat> *out) const {
  out->CopyFromMat(in);
  return NULL;
}

void NoOpComponent::Backprop(const std::string &debug_info,
                             const ComponentPrecomputedIndexes *indexes,
                             const CuMatrixBase<BaseFloat> &,
                             const CuMatrixBase<BaseFloat> &,
                             const CuMatrixBase<BaseFloat> &out_deriv,
                             void *memo,
                             Component *to_update,  // may be NULL; may be
                                                    // identical to "this" or
                                                    // different.
                             CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("NoOpComponent::Backprop");
  in_deriv->CopyFromMat(out_deriv);
  if (backprop_scale_ != 1.0)
    in_deriv->Scale(backprop_scale_);
}

void NoOpComponent::InitFromConfig(ConfigLine *cfl) {
  backprop_scale_ = 1.0;
  cfl->GetValue("backprop-scale", &backprop_scale_);
  if (!cfl->GetValue("dim", &dim_) ||
      dim_ <= 0 || cfl->HasUnusedValues()) {
    KALDI_ERR << "Invalid initializer for layer of type "
              << Type() << ": \"" << cfl->WholeLine() << "\"";
  }
}

std::string NoOpComponent::Info() const {
  std::ostringstream stream;
  stream << Type() << ", dim=" << dim_;
  if (backprop_scale_ != 1.0)
    stream << ", backprop-scale=" << backprop_scale_;
  return stream.str();
}

void NoOpComponent::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<NoOpComponent>");
  WriteToken(os, binary, "<Dim>");
  WriteBasicType(os, binary, dim_);
  WriteToken(os, binary, "<BackpropScale>");
  WriteBasicType(os, binary, backprop_scale_);
  WriteToken(os, binary, "</NoOpComponent>");
}

void NoOpComponent::Read(std::istream &is, bool binary) {
  ExpectOneOrTwoTokens(is, binary, "<NoOpComponent>", "<Dim>");
  ReadBasicType(is, binary, &dim_);

  if (PeekToken(is, binary) == 'V') {
    // This is the old format, from when NoOpComponent inherited from
    // NonlinearComponent.
    backprop_scale_ = 1.0;
    ExpectToken(is, binary, "<ValueAvg>");
    CuVector<BaseFloat> temp_vec;
    temp_vec.Read(is, binary);
    ExpectToken(is, binary, "<DerivAvg>");
    temp_vec.Read(is, binary);
    ExpectToken(is, binary, "<Count>");
    BaseFloat temp_float;
    ReadBasicType(is, binary, &temp_float);
    if (PeekToken(is, binary) == 'O') {
      ExpectToken(is, binary, "<OderivRms>");
      temp_vec.Read(is, binary);
      ExpectToken(is, binary, "<OderivCount>");
      ReadBasicType(is, binary, &temp_float);
    }
    std::string token;
    ReadToken(is, binary, &token);
    if (token[0] != '<') {
      // this should happen only rarely, in case we couldn't push back the
      // '<' to the stream in PeekToken().
      token = '<' + token;
    }
    if (token == "<NumDimsSelfRepaired>") {
      ReadBasicType(is, binary, &temp_float);
      ReadToken(is, binary, &token);
    }
    if (token == "<NumDimsProcessed>") {
      ReadBasicType(is, binary, &temp_float);
      ReadToken(is, binary, &token);
    }
    KALDI_ASSERT(token == "</NoOpComponent>");
    return;
  } else {
    ExpectToken(is, binary, "<BackpropScale>");
    ReadBasicType(is, binary, &backprop_scale_);
    ExpectToken(is, binary, "</NoOpComponent>");
  }
}


void ClipGradientComponent::Read(std::istream &is, bool binary) {
  // might not see the "<ClipGradientComponent>" part because
  // of how ReadNew() works.
  ExpectOneOrTwoTokens(is, binary, "<ClipGradientComponent>",
                       "<Dim>");
  ReadBasicType(is, binary, &dim_);
  ExpectToken(is, binary, "<ClippingThreshold>");
  ReadBasicType(is, binary, &clipping_threshold_);
  ExpectToken(is, binary, "<NormBasedClipping>");
  ReadBasicType(is, binary, &norm_based_clipping_);
  std::string token;
  ReadToken(is, binary, &token);
  if (token == "<SelfRepairClippedProportionThreshold>") {
    ReadBasicType(is, binary, &self_repair_clipped_proportion_threshold_);
    ExpectToken(is, binary, "<SelfRepairTarget>");
    ReadBasicType(is, binary, &self_repair_target_);
    ExpectToken(is, binary, "<SelfRepairScale>");
    ReadBasicType(is, binary, &self_repair_scale_);
    ExpectToken(is, binary, "<NumElementsClipped>");
  } else {
    self_repair_clipped_proportion_threshold_ = 1.0;
    self_repair_target_ = 0.0;
    self_repair_scale_ = 0.0;
    KALDI_ASSERT(token == "<NumElementsClipped>");
  }
  ReadBasicType(is, binary, &num_clipped_);
  ExpectToken(is, binary, "<NumElementsProcessed>");
  ReadBasicType(is, binary, &count_);
  ReadToken(is, binary, &token);
  if (token == "<NumSelfRepaired>") {
    ReadBasicType(is, binary, &num_self_repaired_);
    ExpectToken(is, binary, "<NumBackpropped>");
    ReadBasicType(is, binary, &num_backpropped_);
    ExpectToken(is, binary, "</ClipGradientComponent>");
  } else {
    num_self_repaired_ = 0;
    num_backpropped_ = 0;
    KALDI_ASSERT(token == "</ClipGradientComponent>");
  }
}

void ClipGradientComponent::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<ClipGradientComponent>");
  WriteToken(os, binary, "<Dim>");
  WriteBasicType(os, binary, dim_);
  WriteToken(os, binary, "<ClippingThreshold>");
  WriteBasicType(os, binary, clipping_threshold_);
  WriteToken(os, binary, "<NormBasedClipping>");
  WriteBasicType(os, binary, norm_based_clipping_);
  WriteToken(os, binary, "<SelfRepairClippedProportionThreshold>");
  WriteBasicType(os, binary, self_repair_clipped_proportion_threshold_);
  WriteToken(os, binary, "<SelfRepairTarget>");
  WriteBasicType(os, binary, self_repair_target_);
  WriteToken(os, binary, "<SelfRepairScale>");
  WriteBasicType(os, binary, self_repair_scale_);
  WriteToken(os, binary, "<NumElementsClipped>");
  WriteBasicType(os, binary, num_clipped_);
  WriteToken(os, binary, "<NumElementsProcessed>");
  WriteBasicType(os, binary, count_);
  WriteToken(os, binary, "<NumSelfRepaired>");
  WriteBasicType(os, binary, num_self_repaired_);
  WriteToken(os, binary, "<NumBackpropped>");
  WriteBasicType(os, binary, num_backpropped_);
  WriteToken(os, binary, "</ClipGradientComponent>");
}

std::string ClipGradientComponent::Info() const {
  std::ostringstream stream;
  stream << Type() << ", dim=" << dim_
         << ", norm-based-clipping="
         << (norm_based_clipping_ ? "true" : "false")
         << ", clipping-threshold=" << clipping_threshold_
         << ", clipped-proportion="
         << (count_ > 0 ? static_cast<BaseFloat>(num_clipped_)/count_ : 0);
  if (self_repair_scale_ != 0.0)
    stream << ", self-repair-clipped-proportion-threshold="
           << self_repair_clipped_proportion_threshold_
           << ", self-repair-target=" << self_repair_target_
           << ", self-repair-scale=" << self_repair_scale_;
  return stream.str();
}

void ClipGradientComponent::Init(int32 dim,
                                 BaseFloat clipping_threshold,
                                 bool norm_based_clipping,
                                 BaseFloat self_repair_clipped_proportion_threshold,
                                 BaseFloat self_repair_target,
                                 BaseFloat self_repair_scale,
                                 int32 num_clipped,
                                 int32 count,
                                 int32 num_self_repaired,
                                 int32 num_backpropped) {
  KALDI_ASSERT(clipping_threshold >= 0 && dim > 0 &&
               self_repair_clipped_proportion_threshold >= 0.0 &&
               self_repair_target >= 0.0 && self_repair_scale >= 0.0);
  dim_ = dim;
  norm_based_clipping_ = norm_based_clipping;
  clipping_threshold_ = clipping_threshold;
  self_repair_clipped_proportion_threshold_ =
      self_repair_clipped_proportion_threshold;
  self_repair_target_ = self_repair_target;
  self_repair_scale_ = self_repair_scale;
  num_clipped_ = num_clipped;
  count_ = count;
  num_self_repaired_ = num_self_repaired;
  num_backpropped_ = num_backpropped;
}

void ClipGradientComponent::InitFromConfig(ConfigLine *cfl) {
  int32 dim = 0;
  bool ok = cfl->GetValue("dim", &dim);
  bool norm_based_clipping = false;
  BaseFloat clipping_threshold = 15.0;
  BaseFloat self_repair_clipped_proportion_threshold = 0.01;
  BaseFloat self_repair_target = 0.0;
  BaseFloat self_repair_scale = 1.0;
  cfl->GetValue("clipping-threshold", &clipping_threshold);
  cfl->GetValue("norm-based-clipping", &norm_based_clipping);
  cfl->GetValue("self-repair-clipped-proportion-threshold",
                &self_repair_clipped_proportion_threshold);
  cfl->GetValue("self-repair-target",
                &self_repair_target);
  cfl->GetValue("self-repair-scale", &self_repair_scale);
  if (!ok || cfl->HasUnusedValues() ||
      clipping_threshold < 0 || dim <= 0 ||
      self_repair_clipped_proportion_threshold < 0.0 ||
      self_repair_target < 0.0 || self_repair_scale < 0.0)
    KALDI_ERR << "Invalid initializer for layer of type "
              << Type() << ": \"" << cfl->WholeLine() << "\"";
  Init(dim, clipping_threshold, norm_based_clipping,
       self_repair_clipped_proportion_threshold,
       self_repair_target,
       self_repair_scale, 0, 0, 0, 0);
}

void* ClipGradientComponent::Propagate(
    const ComponentPrecomputedIndexes *indexes,
    const CuMatrixBase<BaseFloat> &in,
    CuMatrixBase<BaseFloat> *out) const {
  out->CopyFromMat(in);
  return NULL;
}


void ClipGradientComponent::Backprop(const std::string &debug_info,
                             const ComponentPrecomputedIndexes *indexes,
                             const CuMatrixBase<BaseFloat> &in_value,
                             const CuMatrixBase<BaseFloat> &,
                             const CuMatrixBase<BaseFloat> &out_deriv,
                             void *memo,
                             Component *to_update_in,  // may be NULL; may be
                                                       // identical to "this"
                                                       // or different.
                             CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("ClipGradientComponent::Backprop");
  // the following statement will do nothing if in_deriv and out_deriv have
  // the same memory.
  in_deriv->CopyFromMat(out_deriv);

  ClipGradientComponent *to_update =
      dynamic_cast<ClipGradientComponent*>(to_update_in);

  if (clipping_threshold_ > 0) {
    if (norm_based_clipping_) {
      // each row in the derivative matrix, which corresponds to one sample in
      // the mini-batch, is scaled to have a max-norm of clipping_threshold_
      CuVector<BaseFloat> clipping_scales(in_deriv->NumRows());
      clipping_scales.AddDiagMat2(pow(clipping_threshold_, -2), *in_deriv,
                                  kNoTrans, 0.0);
      // now clipping_scales contains the squared (norm of each row divided by
      //  clipping_threshold)
      int32 num_not_scaled;
      clipping_scales.ApplyFloor(1.0, &num_not_scaled);
      // now clipping_scales contains min(1,
      //    squared-(norm/clipping_threshold))
      if (num_not_scaled != clipping_scales.Dim()) {
        clipping_scales.ApplyPow(-0.5);
        // now clipping_scales contains max(1,
        //       clipping_threshold/vector_norm)
        in_deriv->MulRowsVec(clipping_scales);
        if (to_update != NULL)
          to_update->num_clipped_ += (clipping_scales.Dim() - num_not_scaled);
      }
      if (to_update != NULL)
        to_update->count_ += clipping_scales.Dim();
    } else {
      // each element of the derivative matrix is clipped to be below the
      // clipping_threshold_
      in_deriv->ApplyCeiling(clipping_threshold_);
      in_deriv->ApplyFloor(-1 * clipping_threshold_);
    }

    if (to_update != NULL) {
      to_update->num_backpropped_ += 1;
      RepairGradients(debug_info, in_value, in_deriv, to_update);
    }
  } else if (clipping_threshold_ == 0.0) {
    in_deriv->SetZero();
  }
}
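
// Net effect of the norm-based branch above (a sketch of the algebra): each
// row d of the derivative matrix ends up scaled by
//   min(1, clipping_threshold_ / ||d||_2),
// so rows whose 2-norm exceeds the threshold are shrunk back onto the
// threshold sphere while other rows pass through unchanged; the element-wise
// branch instead clamps each entry to the range
// [-clipping_threshold_, +clipping_threshold_].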

// This function will add a self-repair term to in-deriv, attempting to shrink
// the magnitude of the input towards self_repair_target_.
// This term is proportional to [-(input vector - self_repair_target_)].
// The average magnitude of this term is equal to
// [self_repair_scale_ * clipped_proportion * average norm of input derivative].
// We use the norm of the input derivative when computing the magnitude so that
// it is comparable to the magnitude of the input derivative, especially when
// gradient explosion is actually happening.
void ClipGradientComponent::RepairGradients(
    const std::string &debug_info,
    const CuMatrixBase<BaseFloat> &in_value,
    CuMatrixBase<BaseFloat> *in_deriv, ClipGradientComponent *to_update) const {
  KALDI_ASSERT(to_update != NULL);

  // we use this 'repair_probability' (hardcoded for now) to limit
  // this code to running on about half of the minibatches.
  BaseFloat repair_probability = 0.5;
  if (self_repair_clipped_proportion_threshold_ >= 1.0 ||
      self_repair_scale_ == 0.0 || count_ == 0 ||
      RandUniform() > repair_probability)
    return;

  KALDI_ASSERT(self_repair_target_ >= 0.0 && self_repair_scale_ > 0.0);

  BaseFloat clipped_proportion =
      (count_ > 0 ? static_cast<BaseFloat>(num_clipped_) / count_ : 0);
  // in-deriv would be modified only when clipped_proportion exceeds the
  // threshold
  if (clipped_proportion <= self_repair_clipped_proportion_threshold_)
    return;

  to_update->num_self_repaired_ += 1;
  if (to_update->debug_info_ == "")  // get the component-node name
    to_update->debug_info_ = debug_info;
  if (to_update->num_self_repaired_ == 1)
    KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info
              << ")'s self-repair was activated for the first time at the "
              << to_update->num_backpropped_
              << "-th call of Backprop() in this training job.";

  // sign_mat = sign(in_value), i.e.,
  // An element in sign_mat is 1 if its corresponding element in in_value > 0,
  // or -1 otherwise
  CuMatrix<BaseFloat> sign_mat(in_value);
  sign_mat.ApplyHeaviside();
  sign_mat.Scale(2.0);
  sign_mat.Add(-1.0);

  // repair_mat =
  // floor(abs(in_value) - self_repair_target_, 0) .* sign(in_value)
  CuMatrix<BaseFloat> repair_mat(in_value);
  repair_mat.ApplyPowAbs(1.0);
  repair_mat.Add(-self_repair_target_);
  repair_mat.ApplyFloor(0.0);
  repair_mat.MulElements(sign_mat);

  // magnitude =
  // self_repair_scale_ * clipped_proportion * average norm of in-deriv
  CuVector<BaseFloat> in_deriv_norm_vec(in_deriv->NumRows());
  in_deriv_norm_vec.AddDiagMat2(1.0, *in_deriv, kNoTrans, 0.0);
  in_deriv_norm_vec.ApplyPow(0.5);
  double in_deriv_norm_sum = in_deriv_norm_vec.Sum();
  BaseFloat magnitude = self_repair_scale_ * clipped_proportion *
                        (in_deriv_norm_sum / in_deriv_norm_vec.Dim());

  CuVector<BaseFloat> repair_mat_norm_vec(repair_mat.NumRows());
  repair_mat_norm_vec.AddDiagMat2(1.0, repair_mat, kNoTrans, 0.0);
  repair_mat_norm_vec.ApplyPow(0.5);
  double repair_mat_norm_sum = repair_mat_norm_vec.Sum();
  double scale = 0.0;
  if (repair_mat_norm_sum != 0.0)
    scale = magnitude / (repair_mat_norm_sum / repair_mat_norm_vec.Dim());
  // repair_mat is scaled so that on average the rows have the norm
  // (magnitude / repair_probability).  This will give a higher magnitude of
  // self-repair to input vectors that have larger absolute values, which tend
  // to be those that are diverging.
  in_deriv->AddMat(-scale / repair_probability, repair_mat);
  CuVector<BaseFloat> in_deriv_repaired_norm_vec(in_deriv->NumRows());
  in_deriv_repaired_norm_vec.AddDiagMat2(1.0, *in_deriv, kNoTrans, 0.0);
  in_deriv_repaired_norm_vec.ApplyPow(0.5);
  // scale in_deriv to have the same norm as that before adding the self-repair
  // term, in order to avoid an increase of the norm caused by self-repair,
  // which may incur more clipping of the gradient and thus more self-repair
  double in_deriv_repaired_norm_sum = in_deriv_repaired_norm_vec.Sum();
  if (in_deriv_repaired_norm_sum != 0.0)
    in_deriv->Scale(in_deriv_norm_sum / in_deriv_repaired_norm_sum);
}

void ClipGradientComponent::ZeroStats()  {
  count_ = 0.0;
  num_clipped_ = 0.0;
  num_self_repaired_ = 0;
  num_backpropped_ = 0;
}

void ClipGradientComponent::Scale(BaseFloat scale) {
  count_ *= scale;
  num_clipped_ *= scale;
}

void ClipGradientComponent::Add(BaseFloat alpha, const Component &other_in) {
  const ClipGradientComponent *other =
      dynamic_cast<const ClipGradientComponent*>(&other_in);
  KALDI_ASSERT(other != NULL);
  count_ += alpha * other->count_;
  num_clipped_ += alpha * other->num_clipped_;
}

void* TanhComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                               const CuMatrixBase<BaseFloat> &in,
                               CuMatrixBase<BaseFloat> *out) const {
  // Apply tanh function to each element of the output...
  // the tanh function may be written as -1 + ( 2 / (1 + e^{-2 x})),
  // which is a scaled and shifted sigmoid.
  out->Tanh(in);
  return NULL;
}


void TanhComponent::RepairGradients(
    const CuMatrixBase<BaseFloat> &out_value,
    CuMatrixBase<BaseFloat> *in_deriv,
    TanhComponent *to_update) const {
  KALDI_ASSERT(to_update != NULL);
  // maximum possible derivative of TanhComponent is 1.0.
  // the default lower-threshold on the derivative, below which we
  // add a term to the derivative to encourage the inputs to the tanh
  // to be closer to zero, is 0.2, which means the derivative is on average
  // 5 times smaller than its maximum possible value.
  BaseFloat default_lower_threshold = 0.2;

  // we use this 'repair_probability' (hardcoded for now) to limit
  // this code to running on about half of the minibatches.
  BaseFloat repair_probability = 0.5;

  to_update->num_dims_processed_ += dim_;

  if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ ||
      RandUniform() > repair_probability)
    return;

  // check that the self-repair scale is in a reasonable range.
  KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
  BaseFloat unset = kUnsetThreshold;  // -1000.0
  BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ?
                               default_lower_threshold :
                               self_repair_lower_threshold_) *
      count_;
  if (self_repair_upper_threshold_ != unset) {
    KALDI_ERR << "Do not set the self-repair-upper-threshold for tanh "
              << "components, it does nothing.";
  }

  // thresholds_vec is actually a 1-row matrix.  (the ApplyHeaviside
  // function isn't defined for vectors).
  CuMatrix<BaseFloat> thresholds(1, dim_);
  CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
  thresholds_vec.AddVec(-1.0, deriv_sum_);
  thresholds_vec.Add(lower_threshold);
  thresholds.ApplyHeaviside();
  to_update->num_dims_self_repaired_ += thresholds_vec.Sum();

  // At this point, 'thresholds_vec' contains a 1 for each dimension of
  // the output that is 'problematic', i.e. for which the avg-deriv
  // is less than the self-repair lower threshold, and a 0 for
  // each dimension that is not problematic.

  // what we want to do is to add -self_repair_scale_ / repair_probability
  // times (the output-value) to the input derivative for each problematic
  // dimension.
  // note that for the tanh, the output-value goes from -1.0 when the input is
  // -inf to +1.0 when the input is +inf.  The negative sign is so that for
  // inputs <0, we push them up towards 0, and for inputs >0, we push them down
  // towards 0.  Our use of the tanh here is just a convenience since we have it
  // available.  We could use just about any function that is positive for
  // inputs < 0 and negative for inputs > 0.

  // We can rearrange the above as: for only the problematic columns,
  //   input-deriv -= self-repair-scale / repair-probability * output
  // which we can write as:
  //   input-deriv -= self-repair-scale / repair-probability * output * thresholds-vec

  in_deriv->AddMatDiagVec(-self_repair_scale_ / repair_probability,
                          out_value, kNoTrans, thresholds_vec);
}

void TanhComponent::Backprop(const std::string &debug_info,
                             const ComponentPrecomputedIndexes *indexes,
                             const CuMatrixBase<BaseFloat> &,
                             const CuMatrixBase<BaseFloat> &out_value,
                             const CuMatrixBase<BaseFloat> &out_deriv,
                             void *memo,
                             Component *to_update_in,  // may be NULL; may be
                                                       // identical to "this"
                                                       // or different.
                             CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("TanhComponent::Backprop");
  if (in_deriv != NULL) {
    in_deriv->DiffTanh(out_value, out_deriv);
    TanhComponent *to_update = dynamic_cast<TanhComponent*>(to_update_in);
    if (to_update != NULL) {
      RepairGradients(out_value, in_deriv, to_update);
      to_update->StoreBackpropStats(out_deriv);
    }
  }
}

/*
  Note on the derivative of the tanh function:
  tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)

  The element by element equation of what we're doing would be:
  in_deriv = out_deriv * (1.0 - out_value^2).
  We can accomplish this via calls to the matrix library. */
void TanhComponent::StoreStats(const CuMatrixBase<BaseFloat> &in_value,
                               const CuMatrixBase<BaseFloat> &out_value,
                               void *memo) {
  // Only store stats about every other minibatch (but on the first minibatch,
  // always store it, which is necessary for the ConsolidateMemory() operation
  // to work correctly).
  if (RandInt(0, 1) == 0 && count_ != 0)
    return;
  // derivative of the nonlinearity is 1.0 - out_value^2;
  CuMatrix<BaseFloat> temp_deriv(out_value);
  temp_deriv.ApplyPow(2.0);
  temp_deriv.Scale(-1.0);
  temp_deriv.Add(1.0);
  StoreStatsInternal(out_value, &temp_deriv);
}

void* RectifiedLinearComponent::Propagate(
    const ComponentPrecomputedIndexes *indexes,
    const CuMatrixBase<BaseFloat> &in,
    CuMatrixBase<BaseFloat> *out) const {
  // Apply rectified linear function (x >= 0 ? x : 0.0)
  out->CopyFromMat(in);
  out->ApplyFloor(0.0);
  return NULL;
}

void RectifiedLinearComponent::Backprop(
    const std::string &debug_info,
    const ComponentPrecomputedIndexes *indexes,
    const CuMatrixBase<BaseFloat> &,  // in_value
    const CuMatrixBase<BaseFloat> &out_value,
    const CuMatrixBase<BaseFloat> &out_deriv,
    void *memo,
    Component *to_update_in,
    CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("RectifiedLinearComponent::Backprop");
  if (in_deriv != NULL) {
    in_deriv->Heaviside(out_value);
    in_deriv->MulElements(out_deriv);
    RectifiedLinearComponent *to_update =
        dynamic_cast<RectifiedLinearComponent*>(to_update_in);
    if (to_update != NULL) {
      RepairGradients(in_deriv, to_update);
      to_update->StoreBackpropStats(out_deriv);
    }
  }
}


void RectifiedLinearComponent::RepairGradients(
    CuMatrixBase<BaseFloat> *in_deriv,
    RectifiedLinearComponent *to_update) const {
  KALDI_ASSERT(to_update != NULL);
  int32 dim = dim_, block_dim = block_dim_;
  BaseFloat default_lower_threshold = 0.05,
      default_upper_threshold = 0.95;
  // we use this 'repair_probability' (hardcoded for now) to limit
  // this code to running on about half of the minibatches.
  BaseFloat repair_probability = 0.5;
  KALDI_ASSERT(in_deriv->NumCols() == dim || in_deriv->NumCols() == block_dim);
  if (self_repair_scale_ == 0.0 || count_ == 0.0 ||
      deriv_sum_.Dim() != dim)
    return;

  if (in_deriv->NumCols() != block_dim) {
    KALDI_ASSERT(in_deriv->NumCols() == in_deriv->Stride());
    int32 dim_multiple = dim / block_dim;
    CuSubMatrix<BaseFloat> in_deriv_reshaped(in_deriv->Data(),
                                             in_deriv->NumRows() * dim_multiple,
                                             block_dim, block_dim);
    RepairGradients(&in_deriv_reshaped, to_update);
    return;
  }

  // By now we know that in_deriv->NumCols() == block_dim.

  if (RandUniform() > repair_probability)
    return;

  to_update->num_dims_processed_ += block_dim;

  // check that the self-repair scale is in a reasonable range.
  KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1);
  BaseFloat unset = kUnsetThreshold;  // -1000.0
  BaseFloat count = count_,
      lower_threshold = (self_repair_lower_threshold_ == unset ?
                         default_lower_threshold :
                         self_repair_lower_threshold_) * count,
      upper_threshold = (self_repair_upper_threshold_ == unset ?
                         default_upper_threshold :
                         self_repair_upper_threshold_) * count;

  CuMatrix<BaseFloat> storage(2, block_dim + 2, kUndefined);
  CuSubVector<BaseFloat> thresholds_vec(storage.RowData(0) + block_dim, 2);
  CuSubMatrix<BaseFloat> stats_mat(storage, 0, 2, 0, block_dim);
  thresholds_vec(0) = -lower_threshold;
  thresholds_vec(1) = -upper_threshold;
  CuSubVector<BaseFloat> row0(stats_mat, 0);
  CuSubVector<BaseFloat> row1(stats_mat, 1);

  if (block_dim == dim) {
    row0.CopyFromVec(deriv_sum_);
  } else {
    CuSubMatrix<double> deriv_sum_mat(deriv_sum_.Data(),
                                      dim / block_dim,
                                      block_dim, block_dim);
    CuVector<double> deriv_sum_dbl(block_dim);
    // get the average of the deriv-sums over the blocks.
    deriv_sum_dbl.AddRowSumMat(block_dim * 1.0 / dim, deriv_sum_mat);
    row0.CopyFromVec(deriv_sum_dbl);
  }
  row1.CopyFromVec(row0);
  stats_mat.AddVecToCols(1.0, thresholds_vec, 1.0);
  // now row0 equals stats - lower_threshold, and
  //     row1 equals stats - upper_threshold.
  stats_mat.ApplyHeaviside();
  // now row0 equals (stats > lower_threshold ? 1 : 0), and
  //     row1 equals (stats > upper_threshold ? 1 : 0).
  // what we want is:
  // self_repair_scale * ((stats <= lower_threshold ? 1 : 0) +
  //                      (stats > upper_threshold ? -1 : 0)).
  //
  // we can get these in stats_mat.Row(0) by computing:
  // -self_repair_scale * (stats_mat.Row(1) + stats_mat.Row(0) - 1).
  row0.AddVec(1.0, row1, 1.0);
  row0.Add(-1.0);
  CuVector<BaseFloat> temp(row0);
  temp.ApplyPow(2.0);
  to_update->num_dims_self_repaired_ += temp.Sum();
  // [actually we need to divide by repair_probability also, to
  //  correct for the fact that we only do this on some frames.]
  row0.Scale(-self_repair_scale_ / repair_probability);
  in_deriv->AddVecToRows(1.0, row0, 1.0);
}
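
// Summary of the logic above (a sketch): for ReLUs the stored deriv-stats are
// averages of Heaviside(output), so deriv_sum_ / count_ is the proportion of
// frames on which each unit is active.  Units active less often than the
// lower threshold (default 0.05) get a small positive constant added to
// their input derivative, and units active more often than the upper
// threshold (default 0.95) get a small negative one, nudging near-dead and
// always-on units back into a useful operating range.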


void RectifiedLinearComponent::StoreStats(
    const CuMatrixBase<BaseFloat> &in_value,
    const CuMatrixBase<BaseFloat> &out_value,
    void *memo) {
  // Only store stats about every other minibatch (but on the first minibatch,
  // always store it, which is necessary for the ConsolidateMemory() operation
  // to work correctly).
  if (RandInt(0, 1) == 0 && count_ != 0)
    return;
  CuMatrix<BaseFloat> temp_deriv(out_value.NumRows(),
                                 out_value.NumCols(),
                                 kUndefined);
  temp_deriv.Heaviside(out_value);
  StoreStatsInternal(out_value, &temp_deriv);
}

void AffineComponent::Scale(BaseFloat scale) {
  if (scale == 0.0) {
    // If scale == 0.0 we call SetZero() which will get rid of NaN's and inf's.
    linear_params_.SetZero();
    bias_params_.SetZero();
  } else {
    linear_params_.Scale(scale);
    bias_params_.Scale(scale);
  }
}

void AffineComponent::Resize(int32 input_dim, int32 output_dim) {
  KALDI_ASSERT(input_dim > 0 && output_dim > 0);
  bias_params_.Resize(output_dim);
  linear_params_.Resize(output_dim, input_dim);
}

void AffineComponent::Add(BaseFloat alpha, const Component &other_in) {
  const AffineComponent *other =
      dynamic_cast<const AffineComponent*>(&other_in);
  KALDI_ASSERT(other != NULL);
  linear_params_.AddMat(alpha, other->linear_params_);
  bias_params_.AddVec(alpha, other->bias_params_);
}

AffineComponent::AffineComponent(const AffineComponent &component):
    UpdatableComponent(component),
    linear_params_(component.linear_params_),
    bias_params_(component.bias_params_),
    orthonormal_constraint_(component.orthonormal_constraint_) { }

AffineComponent::AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
                                 const CuVectorBase<BaseFloat> &bias_params,
                                 BaseFloat learning_rate):
    linear_params_(linear_params),
    bias_params_(bias_params),
    orthonormal_constraint_(0.0) {
  SetUnderlyingLearningRate(learning_rate);
  KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim() &&
               bias_params.Dim() != 0);
}

void AffineComponent::SetParams(const CuVectorBase<BaseFloat> &bias,
                                const CuMatrixBase<BaseFloat> &linear) {
  bias_params_ = bias;
  linear_params_ = linear;
  KALDI_ASSERT(bias_params_.Dim() == linear_params_.NumRows());
}

void AffineComponent::PerturbParams(BaseFloat stddev) {
  CuMatrix<BaseFloat> temp_linear_params(linear_params_);
  temp_linear_params.SetRandn();
  linear_params_.AddMat(stddev, temp_linear_params);

  CuVector<BaseFloat> temp_bias_params(bias_params_);
  temp_bias_params.SetRandn();
  bias_params_.AddVec(stddev, temp_bias_params);
}

std::string AffineComponent::Info() const {
  std::ostringstream stream;
  stream << UpdatableComponent::Info();
  if (orthonormal_constraint_ != 0.0)
    stream << ", orthonormal-constraint=" << orthonormal_constraint_;
  PrintParameterStats(stream, "linear-params", linear_params_,
                      false,  // include_mean
                      true,   // include_row_norms
                      true,   // include_column_norms
                      GetVerboseLevel() >= 2);  // include_singular_values
  PrintParameterStats(stream, "bias", bias_params_, true);
  return stream.str();
}

Component* AffineComponent::Copy() const {
  AffineComponent *ans = new AffineComponent(*this);
  return ans;
}

BaseFloat AffineComponent::DotProduct(const Component &other_in) const {
  const AffineComponent *other =
      dynamic_cast<const AffineComponent*>(&other_in);
  return TraceMatMat(linear_params_, other->linear_params_, kTrans)
      + VecVec(bias_params_, other->bias_params_);
}

void AffineComponent::Init(int32 input_dim, int32 output_dim,
                           BaseFloat param_stddev, BaseFloat bias_stddev) {
  linear_params_.Resize(output_dim, input_dim);
  bias_params_.Resize(output_dim);
  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
  linear_params_.SetRandn();  // sets to random normally distributed noise.
  linear_params_.Scale(param_stddev);
  bias_params_.SetRandn();
  bias_params_.Scale(bias_stddev);
}

void AffineComponent::Init(std::string matrix_filename) {
  CuMatrix<BaseFloat> mat;
  ReadKaldiObject(matrix_filename, &mat);  // will abort on failure.
  KALDI_ASSERT(mat.NumCols() >= 2);
  int32 input_dim = mat.NumCols() - 1, output_dim = mat.NumRows();
  linear_params_.Resize(output_dim, input_dim);
  bias_params_.Resize(output_dim);
  linear_params_.CopyFromMat(mat.Range(0, output_dim, 0, input_dim));
  bias_params_.CopyColFromMat(mat, input_dim);
}

void AffineComponent::InitFromConfig(ConfigLine *cfl) {
  bool ok = true;
  std::string matrix_filename;
  int32 input_dim = -1, output_dim = -1;
  InitLearningRatesFromConfig(cfl);
  if (cfl->GetValue("matrix", &matrix_filename)) {
    Init(matrix_filename);
    if (cfl->GetValue("input-dim", &input_dim))
      KALDI_ASSERT(input_dim == InputDim() &&
                   "input-dim mismatch vs. matrix.");
    if (cfl->GetValue("output-dim", &output_dim))
      KALDI_ASSERT(output_dim == OutputDim() &&
                   "output-dim mismatch vs. matrix.");
  } else {
    ok = ok && cfl->GetValue("input-dim", &input_dim);
    ok = ok && cfl->GetValue("output-dim", &output_dim);
    BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
        bias_stddev = 1.0;
    cfl->GetValue("param-stddev", &param_stddev);
    cfl->GetValue("bias-stddev", &bias_stddev);
    Init(input_dim, output_dim,
         param_stddev, bias_stddev);
  }
  cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_);

  if (cfl->HasUnusedValues())
    KALDI_ERR << "Could not process these elements in initializer: "
              << cfl->UnusedValues();
  if (!ok)
    KALDI_ERR << "Bad initializer " << cfl->WholeLine();
}
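
// Illustrative only, assuming the usual nnet3 config-line syntax (this example
// is not part of the original source): an AffineComponent might be declared as
//   component name=affine1 type=AffineComponent input-dim=512 output-dim=512
// optionally with param-stddev, bias-stddev or orthonormal-constraint, or with
// matrix=<filename> supplying the parameters directly as the matrix
// [ linear_params bias_params ] handled by Init(matrix_filename) above.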




void* AffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
                                 const CuMatrixBase<BaseFloat> &in,
                                 CuMatrixBase<BaseFloat> *out) const {

  // No need for asserts as they'll happen within the matrix operations.
  out->CopyRowsFromVec(bias_params_);  // copies bias_params_ to each row
                                       // of *out.
  out->AddMatMat(1.0, in, kNoTrans, linear_params_, kTrans, 1.0);
  return NULL;
}

void AffineComponent::UpdateSimple(const CuMatrixBase<BaseFloat> &in_value,
                                   const CuMatrixBase<BaseFloat> &out_deriv) {
  bias_params_.AddRowSumMat(learning_rate_, out_deriv, 1.0);
  linear_params_.AddMatMat(learning_rate_, out_deriv, kTrans,
                           in_value, kNoTrans, 1.0);
}

void AffineComponent::Backprop(const std::string &debug_info,
                               const ComponentPrecomputedIndexes *indexes,
                               const CuMatrixBase<BaseFloat> &in_value,
                               const CuMatrixBase<BaseFloat> &,  // out_value
                               const CuMatrixBase<BaseFloat> &out_deriv,
                               void *memo,
                               Component *to_update_in,
                               CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("AffineComponent::Backprop");
  AffineComponent *to_update = dynamic_cast<AffineComponent*>(to_update_in);

  // Propagate the derivative back to the input.
  // add with coefficient 1.0 since property kBackpropAdds is true.
  // If we wanted to add with coefficient 0.0 we'd need to zero the
  // in_deriv, in case of infinities.
  if (in_deriv)
    in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, linear_params_, kNoTrans,
                        1.0);

  if (to_update != NULL) {
    // Next update the model (must do this 2nd so the derivatives we propagate
    // are accurate, in case this == to_update_in.)
    if (to_update->is_gradient_)
      to_update->UpdateSimple(in_value, out_deriv);
    else  // the call below is to a virtual function that may be re-implemented
      to_update->Update(debug_info, in_value, out_deriv);  // by child classes.
  }
}
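
// The input derivative above, together with UpdateSimple(), implements the
// standard affine-layer gradients (a sketch): with y = W x + b and
// objective L,
//   dL/dW = out_deriv^T * in_value   (accumulated over the minibatch),
//   dL/db = sum over rows of out_deriv,
//   dL/dx = out_deriv * W.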

void AffineComponent::Read(std::istream &is, bool binary) {
  ReadUpdatableCommon(is, binary);  // read opening tag and learning rate.
  ExpectToken(is, binary, "<LinearParams>");
  linear_params_.Read(is, binary);
  ExpectToken(is, binary, "<BiasParams>");
  bias_params_.Read(is, binary);
  if (PeekToken(is, binary) == 'I') {
    // for back compatibility; we don't write this here any
    // more as it's written and read in Write/ReadUpdatableCommon
    ExpectToken(is, binary, "<IsGradient>");
    ReadBasicType(is, binary, &is_gradient_);
  }
  if (PeekToken(is, binary) == 'O') {
    ExpectToken(is, binary, "<OrthonormalConstraint>");
    ReadBasicType(is, binary, &orthonormal_constraint_);
  } else {
    orthonormal_constraint_ = 0.0;
  }
  ExpectToken(is, binary, "</AffineComponent>");
}

void AffineComponent::Write(std::ostream &os, bool binary) const {
  WriteUpdatableCommon(os, binary);  // Write opening tag and learning rate
  WriteToken(os, binary, "<LinearParams>");
  linear_params_.Write(os, binary);
  WriteToken(os, binary, "<BiasParams>");
  bias_params_.Write(os, binary);
  if (orthonormal_constraint_ != 0.0) {
    WriteToken(os, binary, "<OrthonormalConstraint>");
    WriteBasicType(os, binary, orthonormal_constraint_);
  }
  WriteToken(os, binary, "</AffineComponent>");
}

int32 AffineComponent::NumParameters() const {
  return (InputDim() + 1) * OutputDim();
}
void AffineComponent::Vectorize(VectorBase<BaseFloat> *params) const {
  KALDI_ASSERT(params->Dim() == this->NumParameters());
  params->Range(0, InputDim() * OutputDim()).CopyRowsFromMat(linear_params_);
  params->Range(InputDim() * OutputDim(),
                OutputDim()).CopyFromVec(bias_params_);
}
void AffineComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
  KALDI_ASSERT(params.Dim() == this->NumParameters());
  linear_params_.CopyRowsFromVec(params.Range(0, InputDim() * OutputDim()));
  bias_params_.CopyFromVec(params.Range(InputDim() * OutputDim(),
                                        OutputDim()));
}

RepeatedAffineComponent::RepeatedAffineComponent(
    const RepeatedAffineComponent &component):
    UpdatableComponent(component),
    linear_params_(component.linear_params_),
    bias_params_(component.bias_params_),
    num_repeats_(component.num_repeats_) {}


void RepeatedAffineComponent::Scale(BaseFloat scale) {
  if (scale == 0.0) {
    linear_params_.SetZero();
    bias_params_.SetZero();
  } else {
    linear_params_.Scale(scale);
    bias_params_.Scale(scale);
  }
}

void RepeatedAffineComponent::Add(BaseFloat alpha, const Component &other_in) {
  const RepeatedAffineComponent *other =
      dynamic_cast<const RepeatedAffineComponent *>(&other_in);
  KALDI_ASSERT(other != NULL);
  linear_params_.AddMat(alpha, other->linear_params_);
  bias_params_.AddVec(alpha, other->bias_params_);
}

void RepeatedAffineComponent::PerturbParams(BaseFloat stddev) {
  CuMatrix<BaseFloat> temp_linear_params(linear_params_);
  temp_linear_params.SetRandn();
  linear_params_.AddMat(stddev, temp_linear_params);
  CuVector<BaseFloat> temp_bias_params(bias_params_);
  temp_bias_params.SetRandn();
  bias_params_.AddVec(stddev, temp_bias_params);
}

std::string RepeatedAffineComponent::Info() const {
  std::ostringstream stream;
  stream << UpdatableComponent::Info()
         << ", num-repeats=" << num_repeats_;
  PrintParameterStats(stream, "linear-params", linear_params_);
  PrintParameterStats(stream, "bias", bias_params_, true);
  return stream.str();
}

Component* RepeatedAffineComponent::Copy() const {
  RepeatedAffineComponent *ans = new RepeatedAffineComponent(*this);
  return ans;
}

BaseFloat RepeatedAffineComponent::DotProduct(const Component &other_in) const {
  const RepeatedAffineComponent *other =
      dynamic_cast<const RepeatedAffineComponent*>(&other_in);
  return TraceMatMat(linear_params_, other->linear_params_, kTrans)
      + VecVec(bias_params_, other->bias_params_);
}

void RepeatedAffineComponent::Init(int32 input_dim, int32 output_dim, int32 num_repeats,
                                   BaseFloat param_stddev, BaseFloat bias_mean,
                                   BaseFloat bias_stddev) {
  KALDI_ASSERT(input_dim % num_repeats == 0 && output_dim % num_repeats == 0);
  linear_params_.Resize(output_dim / num_repeats, input_dim / num_repeats);
  bias_params_.Resize(output_dim / num_repeats);
  num_repeats_ = num_repeats;
  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
  linear_params_.SetRandn();  // sets to random normally distributed noise.
  linear_params_.Scale(param_stddev);
  bias_params_.SetRandn();
  bias_params_.Scale(bias_stddev);
  bias_params_.Add(bias_mean);
  SetNaturalGradientConfigs();
}


void RepeatedAffineComponent::InitFromConfig(ConfigLine *cfl) {
  bool ok = true;
  int32 num_repeats = num_repeats_;
  int32 input_dim = -1, output_dim = -1;
  InitLearningRatesFromConfig(cfl);
  ok = cfl->GetValue("num-repeats", &num_repeats) && ok;
  ok = cfl->GetValue("input-dim", &input_dim) && ok;
  ok = cfl->GetValue("output-dim", &output_dim) && ok;
  KALDI_ASSERT(input_dim % num_repeats == 0 &&
               "num-repeats must divide input-dim");
  KALDI_ASSERT(output_dim % num_repeats == 0 &&
               "num-repeats must divide output-dim");
  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_repeats),
      bias_mean = 0.0, bias_stddev = 0.0;
  cfl->GetValue("param-stddev", &param_stddev);
  cfl->GetValue("bias-mean", &bias_mean);
  cfl->GetValue("bias-stddev", &bias_stddev);
  Init(input_dim, output_dim,
       num_repeats, param_stddev, bias_mean, bias_stddev);
  if (cfl->HasUnusedValues())
    KALDI_ERR << "Could not process these elements in initializer: "
              << cfl->UnusedValues();
  if (!ok)
    KALDI_ERR << "Bad initializer " << cfl->WholeLine();
}

void* RepeatedAffineComponent::Propagate(
    const ComponentPrecomputedIndexes *indexes,
    const CuMatrixBase<BaseFloat> &in,
    CuMatrixBase<BaseFloat> *out) const {
  // we gave the kInputContiguous and kOutputContiguous flags-- check that they
  // are honored.
  KALDI_ASSERT(in.NumCols() == in.Stride() &&
               out->NumCols() == out->Stride() &&
               out->NumRows() == in.NumRows());

  int32 num_repeats = num_repeats_,
      num_rows = in.NumRows(),
      block_dim_out = linear_params_.NumRows(),
      block_dim_in = linear_params_.NumCols();

  CuSubMatrix<BaseFloat> in_reshaped(in.Data(), num_rows * num_repeats,
                                     block_dim_in, block_dim_in),
      out_reshaped(out->Data(), num_rows * num_repeats,
                   block_dim_out, block_dim_out);

  out_reshaped.CopyRowsFromVec(bias_params_);

  out_reshaped.AddMatMat(1.0, in_reshaped, kNoTrans,
                         linear_params_, kTrans, 1.0);
  return NULL;
}
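
// A dimension sketch of the reshape trick above (relying on the contiguity
// asserted at the top of Propagate): with num-repeats=2, input-dim=20 and
// output-dim=10, linear_params_ is 5 x 10; each input row of 20 is
// reinterpreted as 2 rows of block_dim_in=10, so a single matrix multiply
// against linear_params_ applies the same small affine transform to every
// repeated block, instead of materializing a big block-diagonal parameter
// matrix.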

void RepeatedAffineComponent::Backprop(const std::string &debug_info,
                                       const ComponentPrecomputedIndexes *indexes,
                                       const CuMatrixBase<BaseFloat> &in_value,
                                       const CuMatrixBase<BaseFloat> &,  // out_value
                                       const CuMatrixBase<BaseFloat> &out_deriv,
                                       void *memo,
                                       Component *to_update_in,
                                       CuMatrixBase<BaseFloat> *in_deriv) const {
  NVTX_RANGE("RepeatedAffineComponent::Backprop");
  KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() &&
               (in_value.NumCols() == 0 || in_value.NumCols() == in_value.Stride()) &&
               (!in_deriv || in_deriv->NumCols() == in_deriv->Stride()));

  RepeatedAffineComponent *to_update = dynamic_cast<RepeatedAffineComponent*>(
      to_update_in);

  // Propagate the derivative back to the input.
  // add with coefficient 1.0 since property kBackpropAdds is true.
  // If we wanted to add with coefficient 0.0 we'd need to zero the
  // in_deriv, in case of infinities.
  if (in_deriv) {
    int32 num_repeats = num_repeats_,
        num_rows = out_deriv.NumRows(),
        block_dim_out = linear_params_.NumRows(),
        block_dim_in = linear_params_.NumCols();

    CuSubMatrix<BaseFloat> in_deriv_reshaped(in_deriv->Data(),
                                             num_rows * num_repeats,
                                             block_dim_in, block_dim_in),
        out_deriv_reshaped(out_deriv.Data(),
                           num_rows * num_repeats,
                           block_dim_out, block_dim_out);
    in_deriv_reshaped.AddMatMat(1.0, out_deriv_reshaped, kNoTrans,
                                linear_params_, kNoTrans, 1.0);
  }

  // Next update the model (must do this 2nd so the derivatives we propagate are
  // accurate, in case this == to_update_in.)
  if (to_update != NULL)
    to_update->Update(in_value, out_deriv);
}
1504 
1505 void RepeatedAffineComponent::Update(const CuMatrixBase<BaseFloat> &in_value,
1506  const CuMatrixBase<BaseFloat> &out_deriv) {
1507  KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() &&
1508  in_value.NumCols() == in_value.Stride() &&
1509  in_value.NumRows() == out_deriv.NumRows());
1510 
1511 
1512  int32 num_repeats = num_repeats_,
1513  num_rows = in_value.NumRows(),
1514  block_dim_out = linear_params_.NumRows(),
1515  block_dim_in = linear_params_.NumCols();
1516 
1517  CuSubMatrix<BaseFloat> in_value_reshaped(in_value.Data(),
1518  num_rows * num_repeats,
1519  block_dim_in, block_dim_in),
1520  out_deriv_reshaped(out_deriv.Data(),
1521  num_rows * num_repeats,
1522  block_dim_out, block_dim_out);
1523 
1524 
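  // This is the plain stochastic-gradient update: the gradient w.r.t. the
  // shared parameters is summed over all rows of the reshaped matrices,
  // i.e. over every frame and every repeat.  The natural-gradient subclass
  // overrides this function.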
1525  linear_params_.AddMatMat(learning_rate_, out_deriv_reshaped, kTrans,
1526  in_value_reshaped, kNoTrans, 1.0);
1527  bias_params_.AddRowSumMat(learning_rate_,
1528  out_deriv_reshaped);
1529 }
1530 
1531 void RepeatedAffineComponent::Read(std::istream &is, bool binary) {
1532  // This Read function also works for NaturalGradientRepeatedAffineComponent.
1533  ReadUpdatableCommon(is, binary); // read opening tag and learning rate.
1534  ExpectToken(is, binary, "<NumRepeats>");
1535  ReadBasicType(is, binary, &num_repeats_);
1536  ExpectToken(is, binary, "<LinearParams>");
1537  linear_params_.Read(is, binary);
1538  ExpectToken(is, binary, "<BiasParams>");
1539  bias_params_.Read(is, binary);
1540  if (PeekToken(is, binary) == 'I') {
1541  // for back compatibility; we don't write this here any
1542  // more as it's written and read in Write/ReadUpdatableCommon
1543  ExpectToken(is, binary, "<IsGradient>");
1544  ReadBasicType(is, binary, &is_gradient_);
1545  }
1546  ExpectToken(is, binary, std::string("</") + Type() + std::string(">"));
1547  SetNaturalGradientConfigs();
1548 }
1549 
1550 void RepeatedAffineComponent::Write(std::ostream &os, bool binary) const {
1551  // This Write function also works for NaturalGradientRepeatedAffineComponent.
1552  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate
1553  WriteToken(os, binary, "<NumRepeats>");
1554  WriteBasicType(os, binary, num_repeats_);
1555  WriteToken(os, binary, "<LinearParams>");
1556  linear_params_.Write(os, binary);
1557  WriteToken(os, binary, "<BiasParams>");
1558  bias_params_.Write(os, binary);
1559  // write closing token.
1560  WriteToken(os, binary, std::string("</") + Type() + std::string(">"));
1561 }
1562 
1563 int32 RepeatedAffineComponent::NumParameters() const {
1564  // Note: unlike AffineComponent, InputDim() and OutputDim() are not used
1565  // here and below, since they include the factor of num_repeats_.
1566  return linear_params_.NumCols() * linear_params_.NumRows() + bias_params_.Dim();
1567 }
1568 
1569 void RepeatedAffineComponent::Vectorize(VectorBase<BaseFloat> *params) const {
1570  KALDI_ASSERT(params->Dim() == this->NumParameters());
1571  params->Range(0, linear_params_.NumCols() * linear_params_.NumRows()).CopyRowsFromMat(linear_params_);
1572  params->Range(linear_params_.NumCols() * linear_params_.NumRows(),
1573  bias_params_.Dim()).CopyFromVec(bias_params_);
1574 }
1575 
1576 void RepeatedAffineComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
1577  KALDI_ASSERT(params.Dim() == this->NumParameters());
1578  linear_params_.CopyRowsFromVec(params.Range(0, linear_params_.NumCols() * linear_params_.NumRows()));
1579  bias_params_.CopyFromVec(params.Range(linear_params_.NumCols() * linear_params_.NumRows(),
1580  bias_params_.Dim()));
1581 }
1582 
1583 void NaturalGradientRepeatedAffineComponent::SetNaturalGradientConfigs() {
1584  int32 rank_in = 40;
1585  int32 input_dim = linear_params_.NumCols();
1586  if (rank_in > input_dim / 2)
1587  rank_in = input_dim / 2;
1588  if (rank_in < 1)
1589  rank_in = 1;
1590  preconditioner_in_.SetRank(rank_in);
1591  preconditioner_in_.SetUpdatePeriod(4);
1592 }
1593 
1594 NaturalGradientRepeatedAffineComponent::NaturalGradientRepeatedAffineComponent(
1595  const NaturalGradientRepeatedAffineComponent &other):
1596  RepeatedAffineComponent(other),
1597  preconditioner_in_(other.preconditioner_in_) { }
1598 
1599 // virtual
1600 Component* NaturalGradientRepeatedAffineComponent::Copy() const {
1601  return new NaturalGradientRepeatedAffineComponent(*this);
1602 }
1603 
1604 void NaturalGradientRepeatedAffineComponent::Update(
1605  const CuMatrixBase<BaseFloat> &in_value,
1606  const CuMatrixBase<BaseFloat> &out_deriv) {
1607  KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() &&
1608  in_value.NumCols() == in_value.Stride() &&
1609  in_value.NumRows() == out_deriv.NumRows());
1610 
1611  int32 num_repeats = num_repeats_,
1612  num_rows = in_value.NumRows(),
1613  block_dim_out = linear_params_.NumRows(),
1614  block_dim_in = linear_params_.NumCols();
1615 
1616  CuSubMatrix<BaseFloat> in_value_reshaped(in_value.Data(),
1617  num_rows * num_repeats,
1618  block_dim_in, block_dim_in),
1619  out_deriv_reshaped(out_deriv.Data(),
1620  num_rows * num_repeats,
1621  block_dim_out, block_dim_out);
1622 
1623  CuVector<BaseFloat> bias_deriv(block_dim_out);
1624  bias_deriv.AddRowSumMat(1.0, out_deriv_reshaped);
1625 
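  // Pack the derivatives w.r.t. the linear parameters and the bias into one
  // matrix [ linear-deriv, bias-deriv ], so that a single natural-gradient
  // preconditioner can process both together.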
1626  CuMatrix<BaseFloat> deriv(block_dim_out,
1627  block_dim_in + 1);
1628  deriv.ColRange(0, block_dim_in).AddMatMat(
1629  1.0, out_deriv_reshaped, kTrans,
1630  in_value_reshaped, kNoTrans, 1.0);
1631  deriv.CopyColFromVec(bias_deriv, block_dim_in);
1632 
1633  BaseFloat scale = 1.0;
1634  if (!is_gradient_) {
1635  try {
1636  // Only apply the preconditioning/natural-gradient if we're not computing
1637  // the exact gradient.
1638  preconditioner_in_.PreconditionDirections(&deriv, &scale);
1639  } catch (...) {
1640  int32 num_bad_rows = 0;
1641  for (int32 i = 0; i < out_deriv.NumRows(); i++) {
1642  BaseFloat f = out_deriv.Row(i).Sum();
1643  if (!(f - f == 0)) num_bad_rows++;
1644  }
1645  KALDI_ERR << "Preconditioning failed, in_value sum is "
1646  << in_value.Sum() << ", out_deriv sum is " << out_deriv.Sum()
1647  << ", out_deriv has " << num_bad_rows << " bad rows.";
1648  }
1649  }
1650  linear_params_.AddMat(learning_rate_ * scale,
1651  deriv.ColRange(0, block_dim_in));
1652  bias_deriv.CopyColFromMat(deriv, block_dim_in);
1653  bias_params_.AddVec(learning_rate_ * scale, bias_deriv);
1654 }
1655 
1656 void NaturalGradientRepeatedAffineComponent::ConsolidateMemory() {
1657  OnlineNaturalGradient temp(preconditioner_in_);
1658  preconditioner_in_.Swap(&temp);
1659 }
1660 
1661 
1662 BlockAffineComponent::BlockAffineComponent(const BlockAffineComponent &other):
1663  UpdatableComponent(other),
1664  linear_params_(other.linear_params_),
1665  bias_params_(other.bias_params_),
1666  num_blocks_(other.num_blocks_) {}
1667 
1668 BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac):
1669  UpdatableComponent(rac),
1670  linear_params_(rac.num_repeats_ * rac.linear_params_.NumRows(),
1671  rac.linear_params_.NumCols(), kUndefined),
1672  bias_params_(rac.num_repeats_ * rac.linear_params_.NumRows(), kUndefined),
1673  num_blocks_(rac.num_repeats_) {
1674  // copy rac's linear_params_ and bias_params_ to this.
1675  int32 num_rows_in_block = rac.linear_params_.NumRows();
1676  for(int32 block_counter = 0; block_counter < num_blocks_; block_counter++) {
1677  int32 row_offset = block_counter * num_rows_in_block;
1678  CuSubMatrix<BaseFloat> block = this->linear_params_.RowRange(row_offset,
1679  num_rows_in_block);
1680  block.CopyFromMat(rac.linear_params_);
1681  CuSubVector<BaseFloat> block_bias = this->bias_params_.Range(row_offset,
1682  num_rows_in_block);
1683  block_bias.CopyFromVec(rac.bias_params_);
1684  }
1685 }
1686 
1687 Component* BlockAffineComponent::Copy() const {
1688  BlockAffineComponent *ans = new BlockAffineComponent(*this);
1689  return ans;
1690 }
1691 
1692 std::string BlockAffineComponent::Info() const {
1693  std::ostringstream stream;
1694  stream << UpdatableComponent::Info()
1695  << ", num-blocks=" << num_blocks_;
1696  PrintParameterStats(stream, "linear-params", linear_params_);
1697  PrintParameterStats(stream, "bias", bias_params_, true);
1698  return stream.str();
1699 }
1700 
1701 void BlockAffineComponent::Init(int32 input_dim,
1702  int32 output_dim, int32 num_blocks,
1703  BaseFloat param_stddev, BaseFloat bias_mean,
1704  BaseFloat bias_stddev) {
1705  KALDI_ASSERT(input_dim > 0 && output_dim > 0 && num_blocks >= 1);
1706  KALDI_ASSERT(output_dim % num_blocks == 0 && input_dim % num_blocks == 0);
1707  const int32 num_columns_per_block = input_dim / num_blocks;
1708  linear_params_.Resize(output_dim, num_columns_per_block);
1709  bias_params_.Resize(output_dim);
1710  KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0);
1711  linear_params_.SetRandn();
1712  linear_params_.Scale(param_stddev);
1713  bias_params_.SetRandn();
1714  bias_params_.Scale(bias_stddev);
1715  bias_params_.Add(bias_mean);
1716  num_blocks_ = num_blocks;
1717 }
1718 
1719 void BlockAffineComponent::InitFromConfig(ConfigLine *cfl) {
1720  int32 input_dim = -1, output_dim = -1, num_blocks = -1;
1721  if(!cfl->GetValue("input-dim", &input_dim) ||
1722  !cfl->GetValue("output-dim", &output_dim) ||
1723  !cfl->GetValue("num-blocks", &num_blocks))
1724  KALDI_ERR << "Invalid initializer for layer of type "
1725  << Type() << ": \"" << cfl->WholeLine() << "\"";
1726  InitLearningRatesFromConfig(cfl);
1727  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_blocks),
1728  bias_mean = 0.0, bias_stddev = 1.0;
1729  cfl->GetValue("param-stddev", &param_stddev);
1730  cfl->GetValue("bias-stddev", &bias_stddev);
1731  cfl->GetValue("bias-mean", &bias_mean);
1732 
1733  if (cfl->HasUnusedValues())
1734  KALDI_ERR << "Invalid initializer for layer of type "
1735  << Type() << ": \"" << cfl->WholeLine() << "\"";
1736 
1737  Init(input_dim, output_dim, num_blocks,
1738  param_stddev, bias_mean, bias_stddev);
1739 }
1740 
1741 void* BlockAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
1742  const CuMatrixBase<BaseFloat> &in,
1743  CuMatrixBase<BaseFloat> *out) const {
1744  out->CopyRowsFromVec(bias_params_);
1745  // Each block of linear_params_ has num_rows_in_block rows and
1746  // num_cols_in_block columns; blocks are stacked along the row dimension.
1747  int32 num_rows_in_block = linear_params_.NumRows() / num_blocks_;
1748  int32 num_cols_in_block = linear_params_.NumCols();
1749  std::vector<CuSubMatrix<BaseFloat> *> in_batch, out_batch,
1750  linear_params_batch;
1751  for(int block_counter = 0; block_counter < num_blocks_; block_counter++) {
1752  CuSubMatrix<BaseFloat> *in_block =
1753  new CuSubMatrix<BaseFloat>(in.ColRange(block_counter * num_cols_in_block,
1754  num_cols_in_block));
1755  in_batch.push_back(in_block);
1756 
1757  CuSubMatrix<BaseFloat> *out_block =
1758  new CuSubMatrix<BaseFloat>(out->ColRange(block_counter * num_rows_in_block,
1759  num_rows_in_block));
1760  out_batch.push_back(out_block);
1761 
1762  CuSubMatrix<BaseFloat> *linear_params_block =
1763  new CuSubMatrix<BaseFloat>(linear_params_.RowRange(block_counter * num_rows_in_block,
1764  num_rows_in_block));
1765  linear_params_batch.push_back(linear_params_block);
1766  }
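  // One batched multiply computes out_block += in_block * W_block^T for all
  // blocks at once; beta == 1.0 preserves the bias copied in above.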
1767  AddMatMatBatched<BaseFloat>(1.0, out_batch, in_batch, kNoTrans,
1768  linear_params_batch, kTrans, 1.0);
1769 
1770  DeletePointers(&in_batch);
1771  DeletePointers(&out_batch);
1772  DeletePointers(&linear_params_batch);
1773  return NULL;
1774 }
1775 
1776 void BlockAffineComponent::Backprop(const std::string &debug_info,
1777  const ComponentPrecomputedIndexes *indexes,
1778  const CuMatrixBase<BaseFloat> &in_value,
1779  const CuMatrixBase<BaseFloat> &, // out_value
1780  const CuMatrixBase<BaseFloat> &out_deriv,
1781  void *memo,
1782  Component *to_update_in,
1783  CuMatrixBase<BaseFloat> *in_deriv) const {
1784  NVTX_RANGE("BlockAffineComponent::Backprop");
1785  BlockAffineComponent *to_update = dynamic_cast<BlockAffineComponent*>(to_update_in);
1786 
1787  const int32 num_rows_in_block = linear_params_.NumRows() / num_blocks_;
1788  const int32 num_cols_in_block = linear_params_.NumCols();
1789 
1790  // Propagate the derivative back to the input.
1791  // add with coefficient 1.0 since property kBackpropAdds is true.
1792  // If we wanted to add with coefficient 0.0 we'd need to zero the
1793  // in_deriv, in case of infinities.
1794  if (in_deriv) {
1795  std::vector<CuSubMatrix<BaseFloat> *> in_deriv_batch, out_deriv_batch, linear_params_batch;
1796 
1797  for(int block_counter = 0; block_counter < num_blocks_; block_counter++) {
1798  CuSubMatrix<BaseFloat> *in_deriv_block =
1799  new CuSubMatrix<BaseFloat>(in_deriv->ColRange(block_counter * num_cols_in_block,
1800  num_cols_in_block));
1801  in_deriv_batch.push_back(in_deriv_block);
1802 
1803  CuSubMatrix<BaseFloat> *out_deriv_block =
1804  new CuSubMatrix<BaseFloat>(out_deriv.ColRange(block_counter * num_rows_in_block,
1805  num_rows_in_block));
1806  out_deriv_batch.push_back(out_deriv_block);
1807 
1808  CuSubMatrix<BaseFloat> *linear_params_block =
1809  new CuSubMatrix<BaseFloat>(linear_params_.RowRange(block_counter * num_rows_in_block,
1810  num_rows_in_block));
1811  linear_params_batch.push_back(linear_params_block);
1812  }
1813 
1814  AddMatMatBatched<BaseFloat>(1.0, in_deriv_batch, out_deriv_batch, kNoTrans,
1815  linear_params_batch, kNoTrans, 1.0);
1816 
1817  DeletePointers(&in_deriv_batch);
1818  DeletePointers(&out_deriv_batch);
1819  DeletePointers(&linear_params_batch);
1820  }
1821 
1822  if (to_update != NULL) {
1823 
1824  { // linear params update
1825 
1826  std::vector<CuSubMatrix<BaseFloat> *> in_value_batch,
1827  out_deriv_batch, linear_params_batch;
1828 
1829  for (int block_counter = 0; block_counter < num_blocks_; block_counter++) {
1830  CuSubMatrix<BaseFloat> *in_value_block =
1831  new CuSubMatrix<BaseFloat>(in_value.ColRange(block_counter * num_cols_in_block,
1832  num_cols_in_block));
1833  in_value_batch.push_back(in_value_block);
1834 
1835  CuSubMatrix<BaseFloat> *out_deriv_block =
1836  new CuSubMatrix<BaseFloat>(out_deriv.ColRange(block_counter * num_rows_in_block,
1837  num_rows_in_block));
1838  out_deriv_batch.push_back(out_deriv_block);
1839 
1840  CuSubMatrix<BaseFloat> *linear_params_block =
1841  new CuSubMatrix<BaseFloat>(to_update->linear_params_.RowRange(block_counter * num_rows_in_block,
1842  num_rows_in_block));
1843  linear_params_batch.push_back(linear_params_block);
1844  }
1845 
1846  AddMatMatBatched<BaseFloat>(to_update->learning_rate_,
1847  linear_params_batch,
1848  out_deriv_batch, kTrans,
1849  in_value_batch, kNoTrans, 1.0);
1850 
1851  DeletePointers(&in_value_batch);
1852  DeletePointers(&out_deriv_batch);
1853  DeletePointers(&linear_params_batch);
1854  } // end linear params update
1855 
1856  { // bias update
1857  to_update->bias_params_.AddRowSumMat(to_update->learning_rate_,
1858  out_deriv, 1.0);
1859  } // end bias update
1860  }
1861 }
1862 
1863 void BlockAffineComponent::Scale(BaseFloat scale) {
1864  if (scale == 0.0) {
1865  linear_params_.SetZero();
1866  bias_params_.SetZero();
1867  } else {
1868  linear_params_.Scale(scale);
1869  bias_params_.Scale(scale);
1870  }
1871 }
1872 
1873 void BlockAffineComponent::Add(BaseFloat alpha, const Component &other_in) {
1874  const BlockAffineComponent *other =
1875  dynamic_cast<const BlockAffineComponent *>(&other_in);
1876  KALDI_ASSERT(other != NULL);
1877  linear_params_.AddMat(alpha, other->linear_params_);
1878  bias_params_.AddVec(alpha, other->bias_params_);
1879 }
1880 
1882  CuMatrix<BaseFloat> temp_linear_params(linear_params_);
1883  temp_linear_params.SetRandn();
1884  linear_params_.AddMat(stddev, temp_linear_params);
1885 
1886  CuVector<BaseFloat> temp_bias_params(bias_params_);
1887  temp_bias_params.SetRandn();
1888  bias_params_.AddVec(stddev, temp_bias_params);
1889 }
1890 
1891 BaseFloat BlockAffineComponent::DotProduct(const UpdatableComponent &other_in) const {
1892  const BlockAffineComponent *other =
1893  dynamic_cast<const BlockAffineComponent*>(&other_in);
1894  return TraceMatMat(linear_params_, other->linear_params_, kTrans) +
1895  VecVec(bias_params_, other->bias_params_);
1896 }
1897 
1898 void BlockAffineComponent::Read(std::istream &is, bool binary) {
1899  ReadUpdatableCommon(is, binary); // read opening tag and learning rate.
1900  ExpectToken(is, binary, "<NumBlocks>");
1901  ReadBasicType(is, binary, &num_blocks_);
1902  ExpectToken(is, binary, "<LinearParams>");
1903  linear_params_.Read(is, binary);
1904  ExpectToken(is, binary, "<BiasParams>");
1905  bias_params_.Read(is, binary);
1906  if (PeekToken(is, binary) == 'I') {
1907  // for back compatibility; we don't write this here any
1908  // more as it's written and read in Write/ReadUpdatableCommon
1909  ExpectToken(is, binary, "<IsGradient>");
1910  ReadBasicType(is, binary, &is_gradient_);
1911  }
1912  ExpectToken(is, binary, "</BlockAffineComponent>");
1913 }
1914 
1915 void BlockAffineComponent::Write(std::ostream &os, bool binary) const {
1916  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate
1917  WriteToken(os, binary, "<NumBlocks>");
1918  WriteBasicType(os, binary, num_blocks_);
1919  WriteToken(os, binary, "<LinearParams>");
1920  linear_params_.Write(os, binary);
1921  WriteToken(os, binary, "<BiasParams>");
1922  bias_params_.Write(os, binary);
1923  WriteToken(os, binary, "</BlockAffineComponent>");
1924 }
1925 
1926 int32 BlockAffineComponent::NumParameters() const {
1927  return linear_params_.NumCols() * linear_params_.NumRows() + bias_params_.Dim();
1928 }
1929 
1930 void BlockAffineComponent::Vectorize(VectorBase<BaseFloat> *params) const {
1931  KALDI_ASSERT(params->Dim() == this->NumParameters());
1932  int32 num_linear_params = linear_params_.NumCols() * linear_params_.NumRows();
1933  int32 num_bias_params = bias_params_.Dim();
1934  params->Range(0, num_linear_params).CopyRowsFromMat(linear_params_);
1935  params->Range(num_linear_params, num_bias_params).CopyFromVec(bias_params_);
1936 }
1937 
1938 void BlockAffineComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
1939  KALDI_ASSERT(params.Dim() == this->NumParameters());
1940  int32 num_linear_params = linear_params_.NumCols() * linear_params_.NumRows();
1941  int32 num_bias_params = bias_params_.Dim();
1942  linear_params_.CopyRowsFromVec(params.Range(0, num_linear_params));
1943  bias_params_.CopyFromVec(params.Range(num_linear_params, num_bias_params));
1944 }
1945 
1946 void PerElementScaleComponent::Scale(BaseFloat scale) {
1947  if (scale == 0.0) {
1948  scales_.SetZero();
1949  } else {
1950  scales_.Scale(scale);
1951  }
1952 }
1953 
1954 void PerElementScaleComponent::Add(BaseFloat alpha,
1955  const Component &other_in) {
1956  const PerElementScaleComponent *other =
1957  dynamic_cast<const PerElementScaleComponent*>(&other_in);
1958  KALDI_ASSERT(other != NULL);
1959  scales_.AddVec(alpha, other->scales_);
1960 }
1961 
1962 PerElementScaleComponent::PerElementScaleComponent(
1963  const PerElementScaleComponent &component):
1964  UpdatableComponent(component),
1965  scales_(component.scales_) { }
1966 
1967 void PerElementScaleComponent::PerturbParams(BaseFloat stddev) {
1968  CuVector<BaseFloat> temp_scales(scales_.Dim(), kUndefined);
1969  temp_scales.SetRandn();
1970  scales_.AddVec(stddev, temp_scales);
1971 }
1972 
1973 std::string PerElementScaleComponent::Info() const {
1974  std::ostringstream stream;
1975  stream << UpdatableComponent::Info()
1976  << ", scales-min=" << scales_.Min()
1977  << ", scales-max=" << scales_.Max();
1978  PrintParameterStats(stream, "scales", scales_, true);
1979  return stream.str();
1980 }
1981 
1982 Component* PerElementScaleComponent::Copy() const {
1983  return new PerElementScaleComponent(*this);
1984 }
1985 
1986 BaseFloat PerElementScaleComponent::DotProduct(
1987  const UpdatableComponent &other_in) const {
1988  const PerElementScaleComponent *other =
1989  dynamic_cast<const PerElementScaleComponent*>(&other_in);
1990  return VecVec(scales_, other->scales_);
1991 }
1992 
1993 void PerElementScaleComponent::Init(int32 dim,
1994  BaseFloat param_mean,
1995  BaseFloat param_stddev) {
1996  KALDI_ASSERT(dim > 0 && param_stddev >= 0.0);
1997  scales_.Resize(dim);
1998  scales_.SetRandn();
1999  scales_.Scale(param_stddev);
2000  scales_.Add(param_mean);
2001 }
2002 
2003 void PerElementScaleComponent::Init(std::string vector_filename) {
2004  CuVector<BaseFloat> vec;
2005  ReadKaldiObject(vector_filename, &vec); // will abort on failure.
2006  scales_.Resize(vec.Dim());
2007  scales_.CopyFromVec(vec);
2008 }
2009 
2010 void PerElementScaleComponent::InitFromConfig(ConfigLine *cfl) {
2011  std::string vector_filename;
2012  int32 dim = -1;
2013  InitLearningRatesFromConfig(cfl);
2014  if (cfl->GetValue("vector", &vector_filename)) {
2015  Init(vector_filename);
2016  if (cfl->GetValue("dim", &dim))
2017  KALDI_ASSERT(dim == InputDim() &&
2018  "input-dim mismatch vs. vector.");
2019  } else {
2020  if(!cfl->GetValue("dim", &dim))
2021  KALDI_ERR << "'dim' not provided in the config line.";
2022  BaseFloat param_mean = 1.0, param_stddev = 0.0;
2023  cfl->GetValue("param-mean", &param_mean);
2024  cfl->GetValue("param-stddev", &param_stddev);
2025  Init(dim, param_mean, param_stddev);
2026  }
2027  if (cfl->HasUnusedValues())
2028  KALDI_ERR << "Could not process these elements in initializer: "
2029  << cfl->UnusedValues();
2030 }
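// For reference, illustrative config lines: either "vector=<filename>" to
// read the scales from disk, or e.g. "dim=512 param-mean=1.0 param-stddev=0.0"
// (those values give all-ones scales).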
2031 
2032 void* PerElementScaleComponent::Propagate(
2033  const ComponentPrecomputedIndexes *indexes,
2034  const CuMatrixBase<BaseFloat> &in,
2035  CuMatrixBase<BaseFloat> *out) const {
2036  out->CopyFromMat(in);
2037  out->MulColsVec(scales_);
2038  return NULL;
2039 }
2040 
2041 void PerElementScaleComponent::UpdateSimple(
2042  const CuMatrixBase<BaseFloat> &in_value,
2043  const CuMatrixBase<BaseFloat> &out_deriv) {
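  // The derivative w.r.t. each scale is the per-dimension sum over frames of
  // in_value * out_deriv; AddDiagMatMat computes exactly the diagonal of
  // out_deriv^T * in_value.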
2044  scales_.AddDiagMatMat(learning_rate_, out_deriv, kTrans,
2045  in_value, kNoTrans, 1.0);
2046 }
2047 
2048 void PerElementScaleComponent::Backprop(
2049  const std::string &debug_info,
2050  const ComponentPrecomputedIndexes *indexes,
2051  const CuMatrixBase<BaseFloat> &in_value,
2052  const CuMatrixBase<BaseFloat> &, // out_value
2053  const CuMatrixBase<BaseFloat> &out_deriv,
2054  void *memo,
2055  Component *to_update_in,
2056  CuMatrixBase<BaseFloat> *in_deriv) const {
2057  NVTX_RANGE("PerElementScaleComponent::Backprop");
2058  PerElementScaleComponent *to_update =
2059  dynamic_cast<PerElementScaleComponent*>(to_update_in);
2060 
2061  if (to_update != NULL) {
2062  // Next update the model (must do this 2nd so the derivatives we propagate
2063  // are accurate, in case this == to_update_in.)
2064  if (to_update->is_gradient_)
2065  to_update->UpdateSimple(in_value, out_deriv);
2066  else // the call below is to a virtual function that may be re-implemented
2067  to_update->Update(debug_info, in_value, out_deriv); // by child classes.
2068  }
2069 
2070  if (in_deriv) {
2071  // Propagate the derivative back to the input.
2072  if (in_deriv->Data() != out_deriv.Data())
2073  in_deriv->CopyFromMat(out_deriv);
2074  in_deriv->MulColsVec(scales_);
2075  }
2076 }
2077 
2078 void PerElementScaleComponent::Read(std::istream &is, bool binary) {
2079  ReadUpdatableCommon(is, binary); // Read opening tag and learning rate.
2080  ExpectToken(is, binary, "<Params>");
2081  scales_.Read(is, binary);
2082  if (PeekToken(is, binary) == 'I') {
2083  // for back compatibility; we don't write this here any
2084  // more as it's written and read in Write/ReadUpdatableCommon
2085  ExpectToken(is, binary, "<IsGradient>");
2086  ReadBasicType(is, binary, &is_gradient_);
2087  }
2088  ExpectToken(is, binary, "</PerElementScaleComponent>");
2089 }
2090 
2091 void PerElementScaleComponent::Write(std::ostream &os, bool binary) const {
2092  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate.
2093  WriteToken(os, binary, "<Params>");
2094  scales_.Write(os, binary);
2095  WriteToken(os, binary, "</PerElementScaleComponent>");
2096 }
2097 
2098 int32 PerElementScaleComponent::NumParameters() const {
2099  return InputDim();
2100 }
2101 
2102 void PerElementScaleComponent::Vectorize(VectorBase<BaseFloat> *params) const {
2103  params->CopyFromVec(scales_);
2104 }
2105 
2106 void PerElementScaleComponent::UnVectorize(
2107  const VectorBase<BaseFloat> &params) {
2108  scales_.CopyFromVec(params);
2109 }
2110 
2111 void PerElementOffsetComponent::Scale(BaseFloat scale) {
2112  if (scale == 0.0) {
2113  offsets_.SetZero();
2114  } else {
2115  offsets_.Scale(scale);
2116  }
2117 }
2118 
2119 
2120 void PerElementOffsetComponent::Add(BaseFloat alpha,
2121  const Component &other_in) {
2122  const PerElementOffsetComponent *other =
2123  dynamic_cast<const PerElementOffsetComponent*>(&other_in);
2124  KALDI_ASSERT(other != NULL);
2125  offsets_.AddVec(alpha, other->offsets_);
2126 }
2127 
2128 PerElementOffsetComponent::PerElementOffsetComponent(
2129  const PerElementOffsetComponent &component):
2130  UpdatableComponent(component),
2131  offsets_(component.offsets_),
2132  dim_(component.dim_),
2133  use_natural_gradient_(component.use_natural_gradient_),
2134  preconditioner_(component.preconditioner_) { }
2135 
2136 void PerElementOffsetComponent::PerturbParams(BaseFloat stddev) {
2137  CuVector<BaseFloat> temp_offsets(offsets_.Dim(), kUndefined);
2138  temp_offsets.SetRandn();
2139  offsets_.AddVec(stddev, temp_offsets);
2140 }
2141 
2142 std::string PerElementOffsetComponent::Info() const {
2143  std::ostringstream stream;
2144  stream << UpdatableComponent::Info()
2145  << ", offsets-min=" << offsets_.Min()
2146  << ", offsets-max=" << offsets_.Max()
2147  << ", block-dim=" << offsets_.Dim()
2148  << ", use-natural-gradient="
2149  << (use_natural_gradient_ ? "true" : "false");
2150  PrintParameterStats(stream, "offsets", offsets_, true);
2151  return stream.str();
2152 }
2153 
2154 Component* PerElementOffsetComponent::Copy() const {
2155  return new PerElementOffsetComponent(*this);
2156 }
2157 
2158 BaseFloat PerElementOffsetComponent::DotProduct(
2159  const UpdatableComponent &other_in) const {
2160  const PerElementOffsetComponent *other =
2161  dynamic_cast<const PerElementOffsetComponent*>(&other_in);
2162  return VecVec(offsets_, other->offsets_);
2163 }
2164 
2165 
2166 void PerElementOffsetComponent::InitFromConfig(ConfigLine *cfl) {
2167  std::string vector_filename;
2168  InitLearningRatesFromConfig(cfl);
2169  if (cfl->GetValue("vector", &vector_filename)) {
2170  ReadKaldiObject(vector_filename, &offsets_);
2171  dim_ = offsets_.Dim(); // if dim is not supplied, it defaults to this.
2172  cfl->GetValue("dim", &dim_);
2173  if (dim_ <= 0 || offsets_.Dim() % dim_ != 0)
2174  KALDI_ERR << "Invalid dimension dim=" << dim_;
2175  } else {
2176  if(!cfl->GetValue("dim", &dim_))
2177  KALDI_ERR << "'dim' not provided in the config line.";
2178  if (dim_ <= 0)
2179  KALDI_ERR << "Invalid dimension dim=" << dim_;
2180  BaseFloat param_mean = 0.0, param_stddev = 0.0;
2181  cfl->GetValue("param-mean", &param_mean);
2182  cfl->GetValue("param-stddev", &param_stddev);
2183  int32 block_dim = dim_;
2184  cfl->GetValue("block-dim", &block_dim);
2185  if (block_dim <= 0 || dim_ % block_dim != 0)
2186  KALDI_ERR << "Invalid value block-dim=" << block_dim;
2187  offsets_.Resize(block_dim);
2188  offsets_.SetRandn();
2189  offsets_.Scale(param_stddev);
2190  offsets_.Add(param_mean);
2191  }
2192  use_natural_gradient_ = true;
2193  cfl->GetValue("use-natural-gradient", &use_natural_gradient_);
2194  if (cfl->HasUnusedValues())
2195  KALDI_ERR << "Could not process these elements in initializer: "
2196  << cfl->UnusedValues();
2197  // For now you can't modify these defaults of the natural gradient.
2198  // This code must be kept in sync with the code in Read().
2201 }
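// For reference, illustrative config lines: either "vector=<filename>", or
// e.g. "dim=512 block-dim=128" (param-mean and param-stddev default to 0.0,
// i.e. all-zero offsets).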
2202 
2203 void* PerElementOffsetComponent::Propagate(
2204  const ComponentPrecomputedIndexes *indexes,
2205  const CuMatrixBase<BaseFloat> &in,
2206  CuMatrixBase<BaseFloat> *out) const {
2207  if (in.Data() != out->Data())
2208  out->CopyFromMat(in);
2209  if (dim_ == offsets_.Dim()) {
2210  out->AddVecToRows(1.0, offsets_);
2211  } else {
2212  KALDI_ASSERT(out->Stride() == out->NumCols());
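  // block-dim was set smaller than dim, so reinterpret the (contiguous)
  // output as a matrix with 'multiple' times as many rows and offsets_.Dim()
  // columns, and add the offsets to each of those rows.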
2213  int32 block_dim = offsets_.Dim(), multiple = dim_ / block_dim,
2214  num_rows = out->NumRows() * multiple;
2215  CuSubMatrix<BaseFloat> out_rearranged(out->Data(), num_rows,
2216  block_dim, block_dim);
2217  out_rearranged.AddVecToRows(1.0, offsets_);
2218  }
2219  return NULL;
2220 }
2221 
2222 void PerElementOffsetComponent::Backprop(
2223  const std::string &debug_info,
2224  const ComponentPrecomputedIndexes *indexes,
2225  const CuMatrixBase<BaseFloat> &, // in_value
2226  const CuMatrixBase<BaseFloat> &, // out_value
2227  const CuMatrixBase<BaseFloat> &out_deriv,
2228  void *memo,
2229  Component *to_update_in,
2230  CuMatrixBase<BaseFloat> *in_deriv) const {
2231  NVTX_RANGE("PerElementOffsetComponent::Backprop");
2232  PerElementOffsetComponent *to_update =
2233  dynamic_cast<PerElementOffsetComponent*>(to_update_in);
2234 
2235  if (in_deriv && in_deriv->Data() != out_deriv.Data()) {
2236  // Propagate the derivative back to the input.
2237  in_deriv->CopyFromMat(out_deriv);
2238  }
2239 
2240  if (to_update != NULL) {
2241  // we may have to reshape out_deriv, if "block-dim" was set
2242  // in the config file when initializing the object, leading
2243  // to dim_ being a multiple >1 of offsets_.Dim().
2244  // To avoid having separate code paths we create a sub-matrix
2245  // in any case, but this may just be a copy of out_deriv.
2246  int32 block_dim = offsets_.Dim(), multiple = dim_ / block_dim,
2247  block_stride = (multiple == 1 ? out_deriv.Stride() : block_dim),
2248  num_rows = out_deriv.NumRows() * multiple;
2249  KALDI_ASSERT(multiple == 1 || out_deriv.Stride() == out_deriv.NumCols());
2250  CuSubMatrix<BaseFloat> out_deriv_reshaped(out_deriv.Data(), num_rows,
2251  block_dim, block_stride);
2252  if (!to_update->use_natural_gradient_ || to_update->is_gradient_) {
2253  KALDI_LOG << "Using non-NG update, lr = " << to_update->learning_rate_;
2254  to_update->offsets_.AddRowSumMat(to_update->learning_rate_,
2255  out_deriv_reshaped);
2256  } else {
2257  KALDI_LOG << "Using NG update, lr = " << to_update->learning_rate_;
2258  // make a copy as we don't want to modify the data of 'out_deriv', which
2259  // was const (even though CuSubMatrix does not respect const-ness in
2260  // this scenario)
2261  CuMatrix<BaseFloat> out_deriv_copy(out_deriv_reshaped);
2262  BaseFloat scale = 1.0;
2263  to_update->preconditioner_.PreconditionDirections(&out_deriv_copy,
2264  &scale);
2265  to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_,
2266  out_deriv_copy);
2267  }
2268  }
2269 }
2270 
2271 void PerElementOffsetComponent::Read(std::istream &is, bool binary) {
2272  ReadUpdatableCommon(is, binary); // Read opening tag and learning rate
2273  ExpectToken(is, binary, "<Offsets>");
2274  offsets_.Read(is, binary);
2275  if (PeekToken(is, binary) == 'I') {
2276  // for back compatibility; we don't write this here any
2277  // more as it's written and read in Write/ReadUpdatableCommon
2278  ExpectToken(is, binary, "<IsGradient>");
2279  ReadBasicType(is, binary, &is_gradient_);
2280  }
2281  if (PeekToken(is, binary) != '/') {
2282  ExpectToken(is, binary, "<Dim>");
2283  ReadBasicType(is, binary, &dim_);
2284  ExpectToken(is, binary, "<UseNaturalGradient>");
2285  ReadBasicType(is, binary, &use_natural_gradient_);
2286  } else {
2287  dim_ = offsets_.Dim();
2288  use_natural_gradient_ = true;
2289  }
2290  // For now you can't modify these defaults of the natural gradient.
2291  // This code must be kept in sync with the code in InitFromConfig().
2294  ExpectToken(is, binary, "</PerElementOffsetComponent>");
2295 }
2296 
2297 void PerElementOffsetComponent::Write(std::ostream &os, bool binary) const {
2298  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate
2299  WriteToken(os, binary, "<Offsets>");
2300  offsets_.Write(os, binary);
2301  WriteToken(os, binary, "<Dim>");
2302  WriteBasicType(os, binary, dim_);
2303  WriteToken(os, binary, "<UseNaturalGradient>");
2304  WriteBasicType(os, binary, use_natural_gradient_);
2305  WriteToken(os, binary, "</PerElementOffsetComponent>");
2306 }
2307 
2308 int32 PerElementOffsetComponent::NumParameters() const {
2309  return offsets_.Dim();
2310 }
2311 
2312 void PerElementOffsetComponent::Vectorize(VectorBase<BaseFloat> *params) const {
2313  params->CopyFromVec(offsets_);
2314 }
2315 
2316 void PerElementOffsetComponent::UnVectorize(
2317  const VectorBase<BaseFloat> &params) {
2318  offsets_.CopyFromVec(params);
2319 }
2320 
2321 std::string ScaleAndOffsetComponent::Info() const {
2322  std::ostringstream stream;
2323  stream << UpdatableComponent::Info()
2324  << ", rank=" << scale_preconditioner_.GetRank();
2325  if (dim_ != scales_.Dim())
2326  stream << ", block-size=" << scales_.Dim();
2327  PrintParameterStats(stream, "scales", scales_, true);
2328  PrintParameterStats(stream, "offsets", offsets_, true);
2329  return stream.str();
2330 }
2331 
2333 
2334 void ScaleAndOffsetComponent::InitFromConfig(ConfigLine *cfl) {
2335  if (!cfl->GetValue("dim", &dim_) || dim_ <= 0) {
2336  KALDI_ERR << "Dimension 'dim' must be specified and >0: "
2337  << cfl->WholeLine();
2338  }
2339  use_natural_gradient_ = true;
2340  cfl->GetValue("use-natural-gradient", &use_natural_gradient_);
2341  int32 block_dim = dim_,
2342  rank = 20;
2343  cfl->GetValue("block-dim", &block_dim);
2344  if (block_dim <= 0 || dim_ % block_dim != 0) {
2345  KALDI_ERR << "Invalid block-dim: " << cfl->WholeLine();
2346  }
2347  cfl->GetValue("rank", &rank);
2348  scales_.Resize(block_dim);
2349  scales_.Set(1.0);
2350  offsets_.Resize(block_dim);
2351  // offsets are all zero when initialized.
2352  if (cfl->HasUnusedValues())
2353  KALDI_ERR << "Could not process these elements in initializer: "
2354  << cfl->UnusedValues();
2355  offset_preconditioner_.SetRank(rank);
2356  scale_preconditioner_.SetRank(rank);
2357  // the update period can't be configured for now; we'll add an option if we
2358  // want to.
2359  offset_preconditioner_.SetUpdatePeriod(4);
2360  scale_preconditioner_.SetUpdatePeriod(4);
2361 }
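// For reference, an illustrative config line:
//   dim=1024 block-dim=256 rank=20
// Scales start at 1.0 and offsets at 0.0, so the component is initially an
// identity transform.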
2362 
2363 void ScaleAndOffsetComponent::Read(std::istream &is, bool binary) {
2364  ReadUpdatableCommon(is, binary); // Read opening tag and learning rate
2365  ExpectToken(is, binary, "<Dim>");
2366  ReadBasicType(is, binary, &dim_);
2367  ExpectToken(is, binary, "<Scales>");
2368  scales_.Read(is, binary);
2369  ExpectToken(is, binary, "<Offsets>");
2370  offsets_.Read(is, binary);
2371  ExpectToken(is, binary, "<UseNaturalGradient>");
2372  ReadBasicType(is, binary, &use_natural_gradient_);
2373  int32 rank;
2374  ExpectToken(is, binary, "<Rank>");
2375  ReadBasicType(is, binary, &rank);
2376  scale_preconditioner_.SetRank(rank);
2377  offset_preconditioner_.SetRank(rank);
2378  ExpectToken(is, binary, "</ScaleAndOffsetComponent>");
2379 }
2380 
2381 void ScaleAndOffsetComponent::Write(std::ostream &os, bool binary) const {
2382  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate
2383  WriteToken(os, binary, "<Dim>");
2384  WriteBasicType(os, binary, dim_);
2385  WriteToken(os, binary, "<Scales>");
2386  scales_.Write(os, binary);
2387  WriteToken(os, binary, "<Offsets>");
2388  offsets_.Write(os, binary);
2389  WriteToken(os, binary, "<UseNaturalGradient>");
2390  WriteBasicType(os, binary, use_natural_gradient_);
2391  WriteToken(os, binary, "<Rank>");
2392  WriteBasicType(os, binary, scale_preconditioner_.GetRank());
2393  WriteToken(os, binary, "</ScaleAndOffsetComponent>");
2394 }
2395 
2396 void ScaleAndOffsetComponent::Scale(BaseFloat scale) {
2397  if (scale == 0.0) {
2398  scales_.SetZero();
2399  offsets_.SetZero();
2400  } else {
2401  scales_.Scale(scale);
2402  offsets_.Scale(scale);
2403  }
2404 }
2405 
2406 void ScaleAndOffsetComponent::Add(BaseFloat alpha,
2407  const Component &other_in) {
2408  const ScaleAndOffsetComponent *other =
2409  dynamic_cast<const ScaleAndOffsetComponent*>(&other_in);
2410  KALDI_ASSERT(other != NULL);
2411  scales_.AddVec(alpha, other->scales_);
2412  offsets_.AddVec(alpha, other->offsets_);
2413 }
2414 
2415 ScaleAndOffsetComponent::ScaleAndOffsetComponent(
2416  const ScaleAndOffsetComponent &component):
2417  UpdatableComponent(component),
2418  dim_(component.dim_),
2419  scales_(component.scales_),
2420  offsets_(component.offsets_),
2421  use_natural_gradient_(component.use_natural_gradient_),
2422  scale_preconditioner_(component.scale_preconditioner_),
2423  offset_preconditioner_(component.offset_preconditioner_) { }
2424 
2425 void ScaleAndOffsetComponent::PerturbParams(BaseFloat stddev) {
2426  CuVector<BaseFloat> temp(scales_.Dim(), kUndefined);
2427  temp.SetRandn();
2428  scales_.AddVec(stddev, temp);
2429  temp.SetRandn();
2430  offsets_.AddVec(stddev, temp);
2431 }
2432 
2433 BaseFloat ScaleAndOffsetComponent::DotProduct(
2434  const UpdatableComponent &other_in) const {
2435  const ScaleAndOffsetComponent *other =
2436  dynamic_cast<const ScaleAndOffsetComponent*>(&other_in);
2437  return VecVec(other->scales_, scales_) + VecVec(other->offsets_, offsets_);
2438 }
2439 
2440 void ScaleAndOffsetComponent::Vectorize(VectorBase<BaseFloat> *params) const {
2441  int32 dim = scales_.Dim();
2442  params->Range(0, dim).CopyFromVec(scales_);
2443  params->Range(dim, dim).CopyFromVec(offsets_);
2444 }
2445 
2446 void ScaleAndOffsetComponent::UnVectorize(
2447  const VectorBase<BaseFloat> &params) {
2448  int32 dim = scales_.Dim();
2449  scales_.CopyFromVec(params.Range(0, dim));
2450  offsets_.CopyFromVec(params.Range(dim, dim));
2451 }
2452 
2453 void* ScaleAndOffsetComponent::Propagate(
2454  const ComponentPrecomputedIndexes *indexes,
2455  const CuMatrixBase<BaseFloat> &in,
2456  CuMatrixBase<BaseFloat> *out) const {
2457  if (dim_ == scales_.Dim()) {
2458  PropagateInternal(in, out);
2459  } else {
2460  int32 multiple = dim_ / scales_.Dim(),
2461  num_rows = in.NumRows(), block_dim = scales_.Dim();
2462  KALDI_ASSERT(in.NumCols() == in.Stride() &&
2463  SameDimAndStride(in, *out));
2464  // Reinterpret the data as matrices with more rows but fewer columns.
2465  CuSubMatrix<BaseFloat> in_rearranged(in.Data(), num_rows * multiple,
2466  block_dim, block_dim),
2467  out_rearranged(out->Data(), num_rows * multiple,
2468  block_dim, block_dim);
2469  PropagateInternal(in_rearranged, &out_rearranged);
2470  }
2471  return NULL;
2472 }
2473 
2474 void ScaleAndOffsetComponent::PropagateInternal(
2475  const CuMatrixBase<BaseFloat> &in,
2476  CuMatrixBase<BaseFloat> *out) const {
2477  if (out->Data() != in.Data())
2478  out->CopyFromMat(in);
2479  BaseFloat epsilon = Epsilon();
2480  int32 dim = scales_.Dim();
2481  CuVector<BaseFloat> scales_nonzero(dim, kUndefined);
2482  cu::EnsureNonzero(scales_, epsilon, &scales_nonzero);
2483  out->MulColsVec(scales_nonzero);
2484  out->AddVecToRows(1.0, offsets_);
2485 }
2486 
2487 void ScaleAndOffsetComponent::Backprop(
2488  const std::string &debug_info,
2489  const ComponentPrecomputedIndexes *indexes,
2490  const CuMatrixBase<BaseFloat> &, // in_value
2491  const CuMatrixBase<BaseFloat> &out_value,
2492  const CuMatrixBase<BaseFloat> &out_deriv,
2493  void *memo,
2494  Component *to_update_in,
2495  CuMatrixBase<BaseFloat> *in_deriv) const {
2496  NVTX_RANGE("ScaleAndOffsetComponent::Backprop");
2497  ScaleAndOffsetComponent *to_update =
2498  dynamic_cast<ScaleAndOffsetComponent*>(to_update_in);
2499 
2500  KALDI_ASSERT(SameDim(out_value, out_deriv));
2501 
2502  if (dim_ == scales_.Dim()) {
2503  BackpropInternal(debug_info, out_value, out_deriv,
2504  to_update, in_deriv);
2505  } else {
2506  KALDI_ASSERT(out_value.NumCols() == out_value.Stride() &&
2507  SameDimAndStride(out_value, out_deriv) &&
2508  (!in_deriv || SameDimAndStride(out_value, *in_deriv)));
2509  int32 multiple = dim_ / scales_.Dim(),
2510  num_rows = out_value.NumRows(),
2511  block_dim = scales_.Dim();
2512  CuSubMatrix<BaseFloat> out_value_rearranged(out_value.Data(),
2513  num_rows * multiple,
2514  block_dim, block_dim),
2515  out_deriv_rearranged(out_deriv.Data(), num_rows * multiple,
2516  block_dim, block_dim);
2517  if (in_deriv) {
2518  CuSubMatrix<BaseFloat> in_deriv_rearranged(in_deriv->Data(),
2519  num_rows * multiple,
2520  block_dim, block_dim);
2521  BackpropInternal(debug_info, out_value_rearranged,
2522  out_deriv_rearranged, to_update,
2523  &in_deriv_rearranged);
2524  } else {
2525  BackpropInternal(debug_info, out_value_rearranged,
2526  out_deriv_rearranged, to_update,
2527  NULL);
2528  }
2529  }
2530 }
2531 
2532 
2533  // Internal version of backprop, where the num-cols of the
2534  // argument matrices are equal to scales_.Dim().
2535 void ScaleAndOffsetComponent::BackpropInternal(
2536  const std::string &debug_info,
2537  const CuMatrixBase<BaseFloat> &out_value,
2538  const CuMatrixBase<BaseFloat> &out_deriv,
2539  ScaleAndOffsetComponent *to_update,
2540  CuMatrixBase<BaseFloat> *in_deriv) const {
2541  if (to_update) {
2542  if (!to_update->use_natural_gradient_ || to_update->is_gradient_) {
2543  to_update->offsets_.AddRowSumMat(to_update->learning_rate_,
2544  out_deriv);
2545  } else {
2546  BaseFloat scale = 1.0;
2547  CuMatrix<BaseFloat> out_deriv_copy(out_deriv);
2548  to_update->offset_preconditioner_.PreconditionDirections(
2549  &out_deriv_copy, &scale);
2550  to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_,
2551  out_deriv_copy);
2552  }
2553  // The backprop actually needs the input to the component, not the output;
2554  // but we make the output available because in the common topologies that
2555  // will already be required for backprop-- it's for memory efficiency.
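  // Since the forward computation was y = scale * x + offset (elementwise,
  // with EnsureNonzero applied to the scales), the input can be recovered as
  // x = (y - offset) / scale; the derivative w.r.t. each scale is then the
  // sum over frames of x * out-deriv.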
2556  CuMatrix<BaseFloat> in_value_reconstructed(out_value);
2557  int32 dim = scales_.Dim();
2558  CuVector<BaseFloat> scales_nonzero(dim, kUndefined);
2559  BaseFloat epsilon = Epsilon();
2560  cu::EnsureNonzero(scales_, epsilon, &scales_nonzero);
2561  scales_nonzero.InvertElements();
2562  in_value_reconstructed.AddVecToRows(-1.0, offsets_);
2563  // At this point scales_nonzero holds the inverses of the scales.
2564  in_value_reconstructed.MulColsVec(scales_nonzero);
2565  // OK, at this point in_value_reconstructed is the input to the component.
2566  // Multiply its elements by 'out_deriv' to get the derivatives
2567  // (for each frame) w.r.t. the scales.
2568  in_value_reconstructed.MulElements(out_deriv);
2569  BaseFloat scale = 1.0;
2570  if (to_update->use_natural_gradient_ && !to_update->is_gradient_) {
2571  to_update->scale_preconditioner_.PreconditionDirections(
2572  &in_value_reconstructed, &scale);
2573  }
2574  to_update->scales_.AddRowSumMat(scale * to_update->learning_rate_,
2575  in_value_reconstructed);
2576  }
2577  if (in_deriv) {
2578  if (in_deriv->Data() != out_deriv.Data())
2579  in_deriv->CopyFromMat(out_deriv);
2580  in_deriv->MulColsVec(scales_);
2581  }
2582 }
2583 
2584 void ScaleAndOffsetComponent::ConsolidateMemory() {
2585  OnlineNaturalGradient temp_scale(scale_preconditioner_);
2586  scale_preconditioner_.Swap(&temp_scale);
2587  OnlineNaturalGradient temp_offset(offset_preconditioner_);
2588  offset_preconditioner_.Swap(&temp_offset);
2589 }
2590 
2591 
2592 std::string ConstantFunctionComponent::Info() const {
2593  std::ostringstream stream;
2594  stream << UpdatableComponent::Info()
2595  << ", " << Type() << ", input-dim=" << InputDim()
2596  << ", output-dim=" << OutputDim()
2597  << ", is-updatable=" << std::boolalpha << is_updatable_
2598  << ", use-natural-gradient=" << std::boolalpha
2599  << use_natural_gradient_;
2600  PrintParameterStats(stream, "output", output_, true);
2601  return stream.str();
2602 }
2603 
2604 ConstantFunctionComponent::ConstantFunctionComponent():
2605  UpdatableComponent(), input_dim_(-1), is_updatable_(true),
2606  use_natural_gradient_(true) { }
2607 
2608 ConstantFunctionComponent::ConstantFunctionComponent(
2609  const ConstantFunctionComponent &other):
2610  UpdatableComponent(other), input_dim_(other.input_dim_),
2611  output_(other.output_), is_updatable_(other.is_updatable_),
2612  use_natural_gradient_(other.use_natural_gradient_),
2613  preconditioner_(other.preconditioner_) { }
2614 
2615 void* ConstantFunctionComponent::Propagate(
2616  const ComponentPrecomputedIndexes *indexes,
2617  const CuMatrixBase<BaseFloat> &in,
2618  CuMatrixBase<BaseFloat> *out) const {
2619  out->CopyRowsFromVec(output_);
2620  return NULL;
2621 }
2622 
2623 void ConstantFunctionComponent::Backprop(
2624  const std::string &debug_info,
2625  const ComponentPrecomputedIndexes *indexes,
2626  const CuMatrixBase<BaseFloat> &, // in_value
2627  const CuMatrixBase<BaseFloat> &, // out_value
2628  const CuMatrixBase<BaseFloat> &out_deriv,
2629  void *memo,
2630  Component *to_update_in,
2631  CuMatrixBase<BaseFloat> *in_deriv) const {
2632  NVTX_RANGE("ConstantFunctionComponent::Backprop");
2633  // we don't update in_deriv, since we set the flag
2634  // kBackpropAdds, and the output doesn't depend on the
2635  // input, so the input-derivative is zero.
2636  if (to_update_in) {
2637  ConstantFunctionComponent *to_update =
2638  dynamic_cast<ConstantFunctionComponent*>(to_update_in);
2639  if (to_update->is_updatable_) {
2640  // only do the update if the is_updatable_ flag is set.
2641  KALDI_ASSERT(to_update && to_update->is_updatable_);
2642  if (to_update->use_natural_gradient_ && !to_update->is_gradient_) {
2643  CuMatrix<BaseFloat> out_deriv_copy(out_deriv);
2644  BaseFloat scale = 1.0;
2645  to_update->preconditioner_.PreconditionDirections(&out_deriv_copy,
2646  &scale);
2647  to_update->output_.AddRowSumMat(scale * to_update->learning_rate_,
2648  out_deriv_copy);
2649  } else {
2650  to_update->output_.AddRowSumMat(to_update->learning_rate_,
2651  out_deriv);
2652  }
2653  }
2654  }
2655 }
2656 
2657 void ConstantFunctionComponent::Read(std::istream &is, bool binary) {
2658  std::string token;
2659  ReadToken(is, binary, &token);
2660  if (token == "<ConstantFunctionComponent>") {
2661  ReadToken(is, binary, &token);
2662  }
2663  if (token == "<LearningRateFactor>") {
2664  ReadBasicType(is, binary, &learning_rate_factor_);
2665  ReadToken(is, binary, &token);
2666  } else {
2667  learning_rate_factor_ = 1.0;
2668  }
2669  if (token == "<IsGradient>") {
2670  ReadBasicType(is, binary, &is_gradient_);
2671  ReadToken(is, binary, &token);
2672  } else {
2673  is_gradient_ = false;
2674  }
2675  if (token == "<LearningRate>") {
2676  ReadBasicType(is, binary, &learning_rate_);
2677  ReadToken(is, binary, &token);
2678  } else {
2679  learning_rate_ = 0.001;
2680  }
2681  if (token == "<InputDim>") {
2682  ReadBasicType(is, binary, &input_dim_);
2683  } else {
2684  KALDI_ERR << "Expected token <InputDim>, got "
2685  << token;
2686  }
2687  ExpectToken(is, binary, "<Output>");
2688  output_.Read(is, binary);
2689  ExpectToken(is, binary, "<IsUpdatable>");
2690  ReadBasicType(is, binary, &is_updatable_);
2691  ExpectToken(is, binary, "<UseNaturalGradient>");
2692  ReadBasicType(is, binary, &use_natural_gradient_);
2693  ExpectToken(is, binary, "</ConstantFunctionComponent>");
2694 }
2695 
2696 void ConstantFunctionComponent::Write(std::ostream &os, bool binary) const {
2697  WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate
2698  WriteToken(os, binary, "<InputDim>");
2699  WriteBasicType(os, binary, input_dim_);
2700  WriteToken(os, binary, "<Output>");
2701  output_.Write(os, binary);
2702  WriteToken(os, binary, "<IsUpdatable>");
2703  WriteBasicType(os, binary, is_updatable_);
2704  WriteToken(os, binary, "<UseNaturalGradient>");
2706  WriteToken(os, binary, "</ConstantFunctionComponent>");
2707 }
2708 
2709 Component* ConstantFunctionComponent::Copy() const {
2710  return new ConstantFunctionComponent(*this);
2711 }
2712 
2713 void ConstantFunctionComponent::Scale(BaseFloat scale) {
2714  if (is_updatable_) {
2715  if (scale == 0.0) {
2716  output_.SetZero();
2717  } else {
2718  output_.Scale(scale);
2719  }
2720  }
2721 }
2722 
2723 void ConstantFunctionComponent::Add(BaseFloat alpha, const Component &other_in) {
2724  if (is_updatable_) {
2725  const ConstantFunctionComponent *other =
2726  dynamic_cast<const ConstantFunctionComponent*>(&other_in);
2727  KALDI_ASSERT(other != NULL);
2728  output_.AddVec(alpha, other->output_);
2729  }
2730 }
2731 
2732 void ConstantFunctionComponent::PerturbParams(BaseFloat stddev) {
2733  CuVector<BaseFloat> temp_output(output_.Dim(), kUndefined);
2734  temp_output.SetRandn();
2735  output_.AddVec(stddev, temp_output);
2736 }
2737 
2738 BaseFloat ConstantFunctionComponent::DotProduct(
2739  const UpdatableComponent &other_in) const {
2740  KALDI_ASSERT(is_updatable_);
2741  const ConstantFunctionComponent *other =
2742  dynamic_cast<const ConstantFunctionComponent*>(&other_in);
2743  KALDI_ASSERT(other != NULL);
2744  return VecVec(output_, other->output_);
2745 }
2746 
2747 void ConstantFunctionComponent::InitFromConfig(ConfigLine *cfl) {
2748  int32 output_dim = 0;
2749  InitLearningRatesFromConfig(cfl);
2750  bool ok = cfl->GetValue("output-dim", &output_dim) &&
2751  cfl->GetValue("input-dim", &input_dim_);
2752  cfl->GetValue("is-updatable", &is_updatable_);
2753  cfl->GetValue("use-natural-gradient", &use_natural_gradient_);
2754  BaseFloat output_mean = 0.0, output_stddev = 0.0;
2755  cfl->GetValue("output-mean", &output_mean);
2756  cfl->GetValue("output-stddev", &output_stddev);
2757  if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 ||
2758  output_dim <= 0) {
2759  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
2760  }
2761  Vector<BaseFloat> output(output_dim);
2762  output.SetRandn();
2763  output.Scale(output_stddev);
2764  output.Add(output_mean);
2765  output_ = output;
2766 }
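// For reference, an illustrative config line:
//   input-dim=100 output-dim=256 output-mean=0.0 output-stddev=0.0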
2767 
2768 int32 ConstantFunctionComponent::NumParameters() const {
2769  KALDI_ASSERT(is_updatable_);
2770  return output_.Dim();
2771 }
2772 
2773 void ConstantFunctionComponent::Vectorize(VectorBase<BaseFloat> *params) const {
2774  params->CopyFromVec(output_);
2775 }
2776 
2777 void ConstantFunctionComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
2778  output_.CopyFromVec(params);
2779 }
2780 
2781 void ConstantFunctionComponent::ConsolidateMemory() {
2782  OnlineNaturalGradient temp(preconditioner_);
2783  preconditioner_.Swap(&temp);
2784 }
2785 
2786 void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) {
2787  ReadUpdatableCommon(is, binary); // Read the opening tag and learning rate
2788  ExpectToken(is, binary, "<LinearParams>");
2789  linear_params_.Read(is, binary);
2790  ExpectToken(is, binary, "<BiasParams>");
2791  bias_params_.Read(is, binary);
2792 
2793  BaseFloat num_samples_history, alpha;
2794  int32 rank_in, rank_out, update_period;
2795 
2796  ExpectToken(is, binary, "<RankIn>");
2797  ReadBasicType(is, binary, &rank_in);
2798  ExpectToken(is, binary, "<RankOut>");
2799  ReadBasicType(is, binary, &rank_out);
2800  if (PeekToken(is, binary) == 'O') {
2801  ExpectToken(is, binary, "<OrthonormalConstraint>");
2802  ReadBasicType(is, binary, &orthonormal_constraint_);
2803  } else {
2804  orthonormal_constraint_ = 0.0;
2805  }
2806  ExpectToken(is, binary, "<UpdatePeriod>");
2807  ReadBasicType(is, binary, &update_period);
2808  ExpectToken(is, binary, "<NumSamplesHistory>");
2809  ReadBasicType(is, binary, &num_samples_history);
2810  ExpectToken(is, binary, "<Alpha>");
2811  ReadBasicType(is, binary, &alpha);
2812 
2813  preconditioner_in_.SetNumSamplesHistory(num_samples_history);
2814  preconditioner_out_.SetNumSamplesHistory(num_samples_history);
2815  preconditioner_in_.SetAlpha(alpha);
2816  preconditioner_out_.SetAlpha(alpha);
2817  preconditioner_in_.SetRank(rank_in);
2818  preconditioner_out_.SetRank(rank_out);
2819  preconditioner_in_.SetUpdatePeriod(update_period);
2820  preconditioner_out_.SetUpdatePeriod(update_period);
2821 
2822  if (PeekToken(is, binary) == 'M') {
2823  // MaxChangePerSample, long ago removed; back compatibility.
2824  ExpectToken(is, binary, "<MaxChangePerSample>");
2825  BaseFloat temp;
2826  ReadBasicType(is, binary, &temp);
2827  }
2828  if (PeekToken(is, binary) == 'I') {
2829  // for back compatibility; we don't write this here any
2830  // more as it's written and read in Write/ReadUpdatableCommon
2831  ExpectToken(is, binary, "<IsGradient>");
2832  ReadBasicType(is, binary, &is_gradient_);
2833  }
2834  if (PeekToken(is, binary) == 'U') {
2835  ExpectToken(is, binary, "<UpdateCount>");
2836  // back-compatibility branch (these configs were added and then removed).
2837  double temp;
2838  ReadBasicType(is, binary, &temp);
2839  ExpectToken(is, binary, "<ActiveScalingCount>");
2840  ReadBasicType(is, binary, &temp);
2841  ExpectToken(is, binary, "<MaxChangeScaleStats>");
2842  ReadBasicType(is, binary, &temp);
2843  }
2844  std::string token;
2845  ReadToken(is, binary, &token);
2846  // the following has to handle a couple of variants of the final token.
2847  if (token.find("NaturalGradientAffineComponent>") == std::string::npos)
2848  KALDI_ERR << "Expected <NaturalGradientAffineComponent> or "
2849  << "</NaturalGradientAffineComponent>, got " << token;
2850 }
2851 
2852 
2853 NaturalGradientAffineComponent::NaturalGradientAffineComponent(
2854  const CuMatrixBase<BaseFloat> &linear_params,
2855  const CuVectorBase<BaseFloat> &bias_params):
2856  AffineComponent(linear_params, bias_params, 0.001) {
2857  KALDI_ASSERT(bias_params.Dim() == linear_params.NumRows() &&
2858  bias_params.Dim() != 0);
2859 
2860  // set some default natural gradient configs.
2861  preconditioner_in_.SetRank(20);
2862  preconditioner_out_.SetRank(80);
2863  preconditioner_in_.SetUpdatePeriod(4);
2864  preconditioner_out_.SetUpdatePeriod(4);
2865 }
2866 
2867 void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) {
2868  bool ok = true;
2869  std::string matrix_filename;
2870 
2871  is_gradient_ = false; // not configurable; there's no reason you'd want this
2872 
2873  InitLearningRatesFromConfig(cfl);
2874 
2875  if (cfl->GetValue("matrix", &matrix_filename)) {
2876  CuMatrix<BaseFloat> mat;
2877  ReadKaldiObject(matrix_filename, &mat); // will abort on failure.
2878  KALDI_ASSERT(mat.NumCols() >= 2);
2879  int32 input_dim = mat.NumCols() - 1, output_dim = mat.NumRows();
2880  linear_params_.Resize(output_dim, input_dim);
2881  bias_params_.Resize(output_dim);
2882  linear_params_.CopyFromMat(mat.Range(0, output_dim, 0, input_dim));
2883  bias_params_.CopyColFromMat(mat, input_dim);
2884  if (cfl->GetValue("input-dim", &input_dim))
2885  KALDI_ASSERT(input_dim == InputDim() &&
2886  "input-dim mismatch vs. matrix.");
2887  if (cfl->GetValue("output-dim", &output_dim))
2888  KALDI_ASSERT(output_dim == OutputDim() &&
2889  "output-dim mismatch vs. matrix.");
2890  } else {
2891  int32 input_dim = -1, output_dim = -1;
2892 
2893  ok = ok && cfl->GetValue("input-dim", &input_dim);
2894  ok = ok && cfl->GetValue("output-dim", &output_dim);
2895  if (!ok)
2896  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
2897  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
2898  bias_stddev = 1.0, bias_mean = 0.0;
2899  cfl->GetValue("param-stddev", &param_stddev);
2900  cfl->GetValue("bias-stddev", &bias_stddev);
2901  cfl->GetValue("bias-mean", &bias_mean);
2902  linear_params_.Resize(output_dim, input_dim);
2903  bias_params_.Resize(output_dim);
2904  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0 &&
2905  bias_stddev >= 0.0);
2906  linear_params_.SetRandn(); // sets to random normally distributed noise.
2907  linear_params_.Scale(param_stddev);
2908  bias_params_.SetRandn();
2909  bias_params_.Scale(bias_stddev);
2910  bias_params_.Add(bias_mean);
2911  }
2912 
2913  orthonormal_constraint_ = 0.0;
2914  cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_);
2915 
2916  // Set natural-gradient configs.
2917  BaseFloat num_samples_history = 2000.0,
2918  alpha = 4.0;
2919  int32 rank_in = -1, rank_out = -1,
2920  update_period = 4;
2921  cfl->GetValue("num-samples-history", &num_samples_history);
2922  cfl->GetValue("alpha", &alpha);
2923  cfl->GetValue("rank-in", &rank_in);
2924  cfl->GetValue("rank-out", &rank_out);
2925  cfl->GetValue("update-period", &update_period);
2926 
2927  if (rank_in < 0)
2928  rank_in = std::min<int32>(20, (InputDim() + 1) / 2);
2929  if (rank_out < 0)
2930  rank_out = std::min<int32>(80, (OutputDim() + 1) / 2);
2931 
2932  preconditioner_in_.SetNumSamplesHistory(num_samples_history);
2933  preconditioner_out_.SetNumSamplesHistory(num_samples_history);
2934  preconditioner_in_.SetAlpha(alpha);
2935  preconditioner_out_.SetAlpha(alpha);
2936  preconditioner_in_.SetRank(rank_in);
2937  preconditioner_out_.SetRank(rank_out);
2938  preconditioner_in_.SetUpdatePeriod(update_period);
2939  preconditioner_out_.SetUpdatePeriod(update_period);
2940 
2941  if (cfl->HasUnusedValues())
2942  KALDI_ERR << "Could not process these elements in initializer: "
2943  << cfl->UnusedValues();
2944  if (!ok)
2945  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
2946 }
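// For reference, an illustrative config line:
//   input-dim=512 output-dim=1536 bias-stddev=0.0
// The natural-gradient options (rank-in, rank-out, alpha,
// num-samples-history, update-period) take the defaults set above if not given.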
2947 
2948 void NaturalGradientAffineComponent::Write(std::ostream &os,
2949  bool binary) const {
2950  WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate
2951  WriteToken(os, binary, "<LinearParams>");
2952  linear_params_.Write(os, binary);
2953  WriteToken(os, binary, "<BiasParams>");
2954  bias_params_.Write(os, binary);
2955  WriteToken(os, binary, "<RankIn>");
2956  WriteBasicType(os, binary, preconditioner_in_.GetRank());
2957  WriteToken(os, binary, "<RankOut>");
2958  WriteBasicType(os, binary, preconditioner_out_.GetRank());
2959  if (orthonormal_constraint_ != 0.0) {
2960  WriteToken(os, binary, "<OrthonormalConstraint>");
2961  WriteBasicType(os, binary, orthonormal_constraint_);
2962  }
2963  WriteToken(os, binary, "<UpdatePeriod>");
2964  WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod());
2965  WriteToken(os, binary, "<NumSamplesHistory>");
2966  WriteBasicType(os, binary, preconditioner_in_.GetNumSamplesHistory());
2967  WriteToken(os, binary, "<Alpha>");
2968  WriteBasicType(os, binary, preconditioner_in_.GetAlpha());
2969  WriteToken(os, binary, "</NaturalGradientAffineComponent>");
2970 }
2971 
2972 std::string NaturalGradientAffineComponent::Info() const {
2973  std::ostringstream stream;
2974  stream << AffineComponent::Info();
2975  stream << ", rank-in=" << preconditioner_in_.GetRank()
2976  << ", rank-out=" << preconditioner_out_.GetRank()
2977  << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory()
2978  << ", update-period=" << preconditioner_in_.GetUpdatePeriod()
2979  << ", alpha=" << preconditioner_in_.GetAlpha();
2980  return stream.str();
2981 }
2982 
2983 Component* NaturalGradientAffineComponent::Copy() const {
2984  return new NaturalGradientAffineComponent(*this);
2985 }
2986 
2987 NaturalGradientAffineComponent::NaturalGradientAffineComponent(
2988  const NaturalGradientAffineComponent &other):
2989  AffineComponent(other),
2990  preconditioner_in_(other.preconditioner_in_),
2991  preconditioner_out_(other.preconditioner_out_) { }
2992 
2993 void NaturalGradientAffineComponent::Update(
2994  const std::string &debug_info,
2995  const CuMatrixBase<BaseFloat> &in_value,
2996  const CuMatrixBase<BaseFloat> &out_deriv) {
2997  CuMatrix<BaseFloat> in_value_temp;
2998 
2999  in_value_temp.Resize(in_value.NumRows(),
3000  in_value.NumCols() + 1, kUndefined);
3001  in_value_temp.Range(0, in_value.NumRows(),
3002  0, in_value.NumCols()).CopyFromMat(in_value);
3003 
3004  // Add the 1.0 at the end of each row "in_value_temp"
3005  in_value_temp.Range(0, in_value.NumRows(),
3006  in_value.NumCols(), 1).Set(1.0);
3007 
3008  CuMatrix<BaseFloat> out_deriv_temp(out_deriv);
3009 
3010  // These "scale" values will get multiplied into the learning rate (faster
3011  // than having the matrices scaled inside the preconditioning code).
3012  BaseFloat in_scale, out_scale;
3013 
3014  preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale);
3015  preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale);
3016 
3017  // "scale" is a scaling factor coming from the PreconditionDirections calls
3018  // (it's faster to have them output a scaling factor than to have them scale
3019  // their outputs).
3020  BaseFloat scale = in_scale * out_scale;
3021 
3022  CuSubMatrix<BaseFloat> in_value_precon_part(in_value_temp,
3023  0, in_value_temp.NumRows(),
3024  0, in_value_temp.NumCols() - 1);
3025  // this "precon_ones" is what happens to the vector of 1's representing
3026  // offsets, after multiplication by the preconditioner.
3027  CuVector<BaseFloat> precon_ones(in_value_temp.NumRows());
3028 
3029  precon_ones.CopyColFromMat(in_value_temp, in_value_temp.NumCols() - 1);
3030 
3031  BaseFloat local_lrate = scale * learning_rate_;
3032 
3033  bias_params_.AddMatVec(local_lrate, out_deriv_temp, kTrans,
3034  precon_ones, 1.0);
3035  linear_params_.AddMatMat(local_lrate, out_deriv_temp, kTrans,
3036  in_value_precon_part, kNoTrans, 1.0);
3037 }
3038 
3039 void NaturalGradientAffineComponent::Scale(BaseFloat scale) {
3040  if (scale == 0.0) {
3041  linear_params_.SetZero();
3042  bias_params_.SetZero();
3043  } else {
3044  linear_params_.Scale(scale);
3045  bias_params_.Scale(scale);
3046  }
3047 }
3048 
3048 void NaturalGradientAffineComponent::Add(BaseFloat alpha, const Component &other_in) {
3049  const NaturalGradientAffineComponent *other =
3051  dynamic_cast<const NaturalGradientAffineComponent*>(&other_in);
3052  KALDI_ASSERT(other != NULL);
3053  linear_params_.AddMat(alpha, other->linear_params_);
3054  bias_params_.AddVec(alpha, other->bias_params_);
3055 }
3056 
3057 void NaturalGradientAffineComponent::FreezeNaturalGradient(bool freeze) {
3058  preconditioner_in_.Freeze(freeze);
3059  preconditioner_out_.Freeze(freeze);
3060 }
3061 
3062 void NaturalGradientAffineComponent::ConsolidateMemory() {
3063  OnlineNaturalGradient temp_in(preconditioner_in_);
3064  preconditioner_in_.Swap(&temp_in);
3065  OnlineNaturalGradient temp_out(preconditioner_out_);
3066  preconditioner_out_.Swap(&temp_out);
3067 }
3068 
3069 void LinearComponent::Read(std::istream &is, bool binary) {
3070  std::string token = ReadUpdatableCommon(is, binary);
3071  KALDI_ASSERT(token == "");
3072  ExpectToken(is, binary, "<Params>");
3073  params_.Read(is, binary);
3074  if (PeekToken(is, binary) == 'O') {
3075  ExpectToken(is, binary, "<OrthonormalConstraint>");
3076  ReadBasicType(is, binary, &orthonormal_constraint_);
3077  } else {
3078  orthonormal_constraint_ = 0.0;
3079  }
3080  ExpectToken(is, binary, "<UseNaturalGradient>");
3081  ReadBasicType(is, binary, &use_natural_gradient_);
3082 
3083  // Read various natural-gradient-related configs.
3084  int32 rank_in, rank_out, update_period;
3085  BaseFloat alpha, num_samples_history;
3086  ExpectToken(is, binary, "<RankInOut>");
3087  ReadBasicType(is, binary, &rank_in);
3088  ReadBasicType(is, binary, &rank_out);
3089  ExpectToken(is, binary, "<Alpha>");
3090  ReadBasicType(is, binary, &alpha);
3091  ExpectToken(is, binary, "<NumSamplesHistory>");
3092  ReadBasicType(is, binary, &num_samples_history);
3093  ExpectToken(is, binary, "<UpdatePeriod>");
3094  ReadBasicType(is, binary, &update_period);
3095 
3096  preconditioner_in_.SetAlpha(alpha);
3097  preconditioner_out_.SetAlpha(alpha);
3098  preconditioner_in_.SetRank(rank_in);
3099  preconditioner_out_.SetRank(rank_out);
3100  preconditioner_in_.SetNumSamplesHistory(num_samples_history);
3101  preconditioner_out_.SetNumSamplesHistory(num_samples_history);
3102  preconditioner_in_.SetUpdatePeriod(update_period);
3103  preconditioner_out_.SetUpdatePeriod(update_period);
3104 
3105  ExpectToken(is, binary, "</LinearComponent>");
3106 }
3107 
3108 void LinearComponent::InitFromConfig(ConfigLine *cfl) {
3109  bool ok = true;
3110  std::string matrix_filename;
3111  is_gradient_ = false; // not configurable; there's no reason you'd want this
3112 
3113  InitLearningRatesFromConfig(cfl);
3114 
3115  int32 input_dim = -1, output_dim = -1;
3116  if (cfl->GetValue("matrix", &matrix_filename)) {
3117  ReadKaldiObject(matrix_filename, &params_); // will abort on failure.
3118  KALDI_ASSERT(params_.NumRows() != 0);
3119  if (cfl->GetValue("input-dim", &input_dim))
3120  KALDI_ASSERT(input_dim == InputDim() &&
3121  "input-dim mismatch vs. matrix.");
3122  if (cfl->GetValue("output-dim", &output_dim))
3123  KALDI_ASSERT(output_dim == OutputDim() &&
3124  "output-dim mismatch vs. matrix.");
3125  } else {
3126  ok = ok && cfl->GetValue("input-dim", &input_dim);
3127  ok = ok && cfl->GetValue("output-dim", &output_dim);
3128  if (!ok)
3129  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
3130  BaseFloat param_stddev = 1.0 / std::sqrt(input_dim);
3131  cfl->GetValue("param-stddev", &param_stddev);
3132  params_.Resize(output_dim, input_dim);
3133  KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
3134  params_.SetRandn(); // sets to random normally distributed noise.
3135  params_.Scale(param_stddev);
3136  }
3137  // Read various natural-gradient-related configs.
3138  int32 rank_in = -1, rank_out = -1, update_period = 4;
3139  BaseFloat alpha = 4.0,
3140  num_samples_history = 2000.0;
3141 
3142  use_natural_gradient_ = true;
3143 
3144  cfl->GetValue("num-samples-history", &num_samples_history);
3145  cfl->GetValue("alpha", &alpha);
3146  cfl->GetValue("rank-in", &rank_in);
3147  cfl->GetValue("rank-out", &rank_out);
3148  cfl->GetValue("update-period", &update_period);
3149  cfl->GetValue("use-natural-gradient", &use_natural_gradient_);
3150 
3151  if (rank_in < 0)
3152  rank_in = std::min<int32>(20, (InputDim() + 1) / 2);
3153  if (rank_out < 0)
3154  rank_out = std::min<int32>(80, (OutputDim() + 1) / 2);
3155 
3156  preconditioner_in_.SetAlpha(alpha);
3157  preconditioner_out_.SetAlpha(alpha);
3158  preconditioner_in_.SetRank(rank_in);
3159  preconditioner_out_.SetRank(rank_out);
3160  preconditioner_in_.SetNumSamplesHistory(num_samples_history);
3161  preconditioner_out_.SetNumSamplesHistory(num_samples_history);
3162  preconditioner_in_.SetUpdatePeriod(update_period);
3163  preconditioner_out_.SetUpdatePeriod(update_period);
3164 
3165  orthonormal_constraint_ = 0.0;
3166  cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_);
3167 
3168  if (cfl->HasUnusedValues())
3169  KALDI_ERR << "Could not process these elements in initializer: "
3170  << cfl->UnusedValues();
3171 }
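// Illustrative config line accepted by the initializer above (editorial;
// values are examples only):
//   input-dim=512 output-dim=256 rank-in=20 rank-out=80 \
//   num-samples-history=2000.0 alpha=4.0 use-natural-gradient=true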
3172 
3173 
3174 void LinearComponent::Write(std::ostream &os,
3175  bool binary) const {
3176  WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate
3177  WriteToken(os, binary, "<Params>");
3178  params_.Write(os, binary);
3179  if (orthonormal_constraint_ != 0.0) {
3180  WriteToken(os, binary, "<OrthonormalConstraint>");
3181  WriteBasicType(os, binary, orthonormal_constraint_);
3182  }
3183  WriteToken(os, binary, "<UseNaturalGradient>");
3184  WriteBasicType(os, binary, use_natural_gradient_);
3185 
3186  int32 rank_in = preconditioner_in_.GetRank(),
3187  rank_out = preconditioner_out_.GetRank(),
3188  update_period = preconditioner_in_.GetUpdatePeriod();
3189  BaseFloat alpha = preconditioner_in_.GetAlpha(),
3190  num_samples_history = preconditioner_in_.GetNumSamplesHistory();
3191  WriteToken(os, binary, "<RankInOut>");
3192  WriteBasicType(os, binary, rank_in);
3193  WriteBasicType(os, binary, rank_out);
3194  WriteToken(os, binary, "<Alpha>");
3195  WriteBasicType(os, binary, alpha);
3196  WriteToken(os, binary, "<NumSamplesHistory>");
3197  WriteBasicType(os, binary, num_samples_history);
3198  WriteToken(os, binary, "<UpdatePeriod>");
3199  WriteBasicType(os, binary, update_period);
3200  WriteToken(os, binary, "</LinearComponent>");
3201 }
3202 
3203 std::string LinearComponent::Info() const {
3204  std::ostringstream stream;
3205  stream << UpdatableComponent::Info();
3206  PrintParameterStats(stream, "params", params_,
3207  false, // include_mean
3208  true, // include_row_norms
3209  true, // include_column_norms
3210  GetVerboseLevel() >= 2); // include_singular_values
3211  if (orthonormal_constraint_ != 0.0)
3212  stream << ", orthonormal-constraint=" << orthonormal_constraint_;
3213  stream << ", use-natural-gradient="
3214  << (use_natural_gradient_ ? "true" : "false")
3215  << ", rank-in=" << preconditioner_in_.GetRank()
3216  << ", rank-out=" << preconditioner_out_.GetRank()
3217  << ", num-samples-history="
3218  << preconditioner_in_.GetNumSamplesHistory()
3219  << ", update-period=" << preconditioner_in_.GetUpdatePeriod()
3220  << ", alpha=" << preconditioner_in_.GetAlpha();
3221  return stream.str();
3222 }
3223 
3224 void* LinearComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3225  const CuMatrixBase<BaseFloat> &in,
3226  CuMatrixBase<BaseFloat> *out) const {
3227  out->AddMatMat(1.0, in, kNoTrans, params_, kTrans, 1.0);
3228  return NULL;
3229 }
3230 
3231 void LinearComponent::Backprop(const std::string &debug_info,
3232  const ComponentPrecomputedIndexes *indexes,
3233  const CuMatrixBase<BaseFloat> &in_value,
3234  const CuMatrixBase<BaseFloat> &, // out_value
3235  const CuMatrixBase<BaseFloat> &out_deriv,
3236  void *memo,
3237  Component *to_update_in,
3238  CuMatrixBase<BaseFloat> *in_deriv) const {
3239  NVTX_RANGE("LinearComponent::Backprop");
3240  LinearComponent *to_update = dynamic_cast<LinearComponent*>(to_update_in);
3241 
3242  // Propagate the derivative back to the input. add with coefficient 1.0 since
3243  // property kBackpropAdds is true. If we wanted to add with coefficient 0.0
3244  // we'd need to zero the in_deriv, in case of infinities.
3245  if (in_deriv)
3246  in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, params_, kNoTrans, 1.0);
3247 
3248  if (to_update != NULL) {
3249  if (!to_update->is_gradient_) {
3250  CuMatrix<BaseFloat> in_value_temp(in_value), out_deriv_temp(out_deriv);
3251  // These "scale" values will get multiplied into the learning rate (faster
3252  // than having the matrices scaled inside the preconditioning code).
3253  BaseFloat in_scale, out_scale;
3254  to_update->preconditioner_in_.PreconditionDirections(&in_value_temp,
3255  &in_scale);
3256  to_update->preconditioner_out_.PreconditionDirections(&out_deriv_temp,
3257  &out_scale);
3258  BaseFloat local_lrate = in_scale * out_scale * to_update->learning_rate_;
3259 
3260  to_update->params_.AddMatMat(local_lrate, out_deriv_temp, kTrans,
3261  in_value_temp, kNoTrans, 1.0);
3262  } else {
3263  to_update->params_.AddMatMat(to_update->learning_rate_,
3264  out_deriv, kTrans,
3265  in_value, kNoTrans, 1.0);
3266  }
3267  }
3268 }
3269 
3270 
3271 Component* LinearComponent::Copy() const {
3272  return new LinearComponent(*this);
3273 }
3274 
3275 LinearComponent::LinearComponent(
3276  const LinearComponent &other):
3277  UpdatableComponent(other),
3278  params_(other.params_),
3279  orthonormal_constraint_(other.orthonormal_constraint_),
3280  use_natural_gradient_(other.use_natural_gradient_),
3281  preconditioner_in_(other.preconditioner_in_),
3282  preconditioner_out_(other.preconditioner_out_) { }
3283 
3284 LinearComponent::LinearComponent(const CuMatrix<BaseFloat> &params):
3285  params_(params),
3286  orthonormal_constraint_(0.0),
3287  use_natural_gradient_(true) {
3288  // Set defaults for natural gradient.
3289  preconditioner_in_.SetRank(std::min<int32>(20,
3290  (params.NumCols() + 1) / 2));
3291  preconditioner_out_.SetRank(std::min<int32>(80,
3292  (params.NumRows() + 1) / 2));
3293  // the component-level defaults of alpha and num_samples_history, at 4.0 and
3294  // 2000.0, are the same as in the NaturalGradientOnline code, so there is no
3295  // need to set those here.
3296 }
3297 
3298 void LinearComponent::Scale(BaseFloat scale) {
3299  if (scale == 0.0) params_.SetZero();
3300  else params_.Scale(scale);
3301 }
3302 
3303 void LinearComponent::Add(BaseFloat alpha, const Component &other_in) {
3304  const LinearComponent *other =
3305  dynamic_cast<const LinearComponent*>(&other_in);
3306  KALDI_ASSERT(other != NULL);
3307  params_.AddMat(alpha, other->params_);
3308 }
3309 
3310 void LinearComponent::PerturbParams(BaseFloat stddev) {
3311  CuMatrix<BaseFloat> temp_params(params_);
3312  temp_params.SetRandn();
3313  params_.AddMat(stddev, temp_params);
3314 }
3315 int32 LinearComponent::NumParameters() const {
3316  return params_.NumRows() * params_.NumCols();
3317 }
3318 void LinearComponent::Vectorize(VectorBase<BaseFloat> *params) const {
3319  KALDI_ASSERT(params->Dim() == this->NumParameters());
3320  params->CopyRowsFromMat(params_);
3321 }
3322 void LinearComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
3323  KALDI_ASSERT(params.Dim() == this->NumParameters());
3324  params_.CopyRowsFromVec(params);
3325 }
3326 BaseFloat LinearComponent::DotProduct(const UpdatableComponent &other_in) const {
3327  const LinearComponent *other =
3328  dynamic_cast<const LinearComponent*>(&other_in);
3329  return TraceMatMat(params_, other->params_, kTrans);
3330 }
3331 
3332 void LinearComponent::FreezeNaturalGradient(bool freeze) {
3333  preconditioner_in_.Freeze(freeze);
3334  preconditioner_out_.Freeze(freeze);
3335 }
3336 
3337 void LinearComponent::ConsolidateMemory() {
3338  OnlineNaturalGradient temp_in(preconditioner_in_);
3339  preconditioner_in_.Swap(&temp_in);
3340  OnlineNaturalGradient temp_out(preconditioner_out_);
3341  preconditioner_out_.Swap(&temp_out);
3342 }
3343 
3344 std::string FixedAffineComponent::Info() const {
3345  std::ostringstream stream;
3346  stream << Component::Info();
3347  PrintParameterStats(stream, "linear-params", linear_params_);
3348  PrintParameterStats(stream, "bias", bias_params_, true);
3349  return stream.str();
3350 }
3351 
3352 void FixedAffineComponent::Init(const CuMatrixBase<BaseFloat> &mat) {
3353  KALDI_ASSERT(mat.NumCols() > 1);
3354  linear_params_ = mat.Range(0, mat.NumRows(), 0, mat.NumCols() - 1);
3355  bias_params_.Resize(mat.NumRows());
3356  bias_params_.CopyColFromMat(mat, mat.NumCols() - 1);
3357 }
3358 
3359 void FixedAffineComponent::InitFromConfig(ConfigLine *cfl) {
3360  std::string filename;
3361  // Two forms allowed: "matrix=<rxfilename>", or "input-dim=x output-dim=y"
3362  // (for testing purposes only).
3363  if (cfl->GetValue("matrix", &filename)) {
3364  if (cfl->HasUnusedValues())
3365  KALDI_ERR << "Invalid initializer for layer of type "
3366  << Type() << ": \"" << cfl->WholeLine() << "\"";
3367 
3368  bool binary;
3369  Input ki(filename, &binary);
3370  CuMatrix<BaseFloat> mat;
3371  mat.Read(ki.Stream(), binary);
3372  KALDI_ASSERT(mat.NumRows() != 0);
3373  Init(mat);
3374  } else {
3375  int32 input_dim = -1, output_dim = -1;
3376  if (!cfl->GetValue("input-dim", &input_dim) ||
3377  !cfl->GetValue("output-dim", &output_dim) || cfl->HasUnusedValues()) {
3378  KALDI_ERR << "Invalid initializer for layer of type "
3379  << Type() << ": \"" << cfl->WholeLine() << "\"";
3380  }
3381  CuMatrix<BaseFloat> mat(output_dim, input_dim + 1);
3382  mat.SetRandn();
3383  Init(mat);
3384  }
3385 }
3386 
3387 
3388 FixedAffineComponent::FixedAffineComponent(const AffineComponent &c):
3389  linear_params_(c.LinearParams()),
3390  bias_params_(c.BiasParams()) { }
3391 
3392 void* FixedAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3393  const CuMatrixBase<BaseFloat> &in,
3394  CuMatrixBase<BaseFloat> *out) const {
3395  out->CopyRowsFromVec(bias_params_); // Adds the bias term first.
3396  out->AddMatMat(1.0, in, kNoTrans, linear_params_, kTrans, 1.0);
3397  return NULL;
3398 }
3399 
3400 void FixedAffineComponent::Backprop(const std::string &debug_info,
3401  const ComponentPrecomputedIndexes *indexes,
3402  const CuMatrixBase<BaseFloat> &, //in_value
3403  const CuMatrixBase<BaseFloat> &, //out_value
3404  const CuMatrixBase<BaseFloat> &out_deriv,
3405  void *memo,
3406  Component *, //to_update
3407  CuMatrixBase<BaseFloat> *in_deriv) const {
3408  NVTX_RANGE("FixedAffineComponent::Backprop");
3409  // kBackpropAdds is true. It's the user's responsibility to zero out
3410  // <in_deriv> if they need it to be so.
3411  if (in_deriv)
3412  in_deriv->AddMatMat(1.0, out_deriv, kNoTrans,
3413  linear_params_, kNoTrans, 1.0);
3414 }
3415 
3416 Component* FixedAffineComponent::Copy() const {
3417  FixedAffineComponent *ans = new FixedAffineComponent();
3418  ans->linear_params_ = linear_params_;
3419  ans->bias_params_ = bias_params_;
3420  return ans;
3421 }
3422 
3423 void FixedAffineComponent::Write(std::ostream &os, bool binary) const {
3424  WriteToken(os, binary, "<FixedAffineComponent>");
3425  WriteToken(os, binary, "<LinearParams>");
3426  linear_params_.Write(os, binary);
3427  WriteToken(os, binary, "<BiasParams>");
3428  bias_params_.Write(os, binary);
3429  WriteToken(os, binary, "</FixedAffineComponent>");
3430 }
3431 
3432 void FixedAffineComponent::Read(std::istream &is, bool binary) {
3433  ExpectOneOrTwoTokens(is, binary, "<FixedAffineComponent>", "<LinearParams>");
3434  linear_params_.Read(is, binary);
3435  ExpectToken(is, binary, "<BiasParams>");
3436  bias_params_.Read(is, binary);
3437  ExpectToken(is, binary, "</FixedAffineComponent>");
3438 }
3439 
3440 void SumGroupComponent::Init(const std::vector<int32> &sizes) {
3441  KALDI_ASSERT(!sizes.empty());
3442  std::vector<Int32Pair> cpu_vec(sizes.size());
3443  std::vector<int32> reverse_cpu_vec;
3444  int32 cur_index = 0;
3445  for (size_t i = 0; i < sizes.size(); i++) {
3446  KALDI_ASSERT(sizes[i] > 0);
3447  cpu_vec[i].first = cur_index;
3448  cpu_vec[i].second = cur_index + sizes[i];
3449  cur_index += sizes[i];
3450  for (int32 j = cpu_vec[i].first; j < cpu_vec[i].second; j++)
3451  reverse_cpu_vec.push_back(i);
3452  }
3453  this->indexes_ = cpu_vec;
3454  this->reverse_indexes_ = reverse_cpu_vec;
3455  this->input_dim_ = cur_index;
3456  this->output_dim_ = sizes.size();
3457 }
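// Editorial example derived from the loop above: sizes = {2, 3} gives
// indexes_ = {(0,2), (2,5)} and reverse_indexes_ = {0, 0, 1, 1, 1},
// so input_dim_ = 5 and output_dim_ = 2.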
3458 
3459 void SumGroupComponent::Init(int32 input_dim, int32 output_dim) {
3460  const int32 num_groups = output_dim;
3461  KALDI_ASSERT(input_dim % num_groups == 0);
3462  const int32 group_size = input_dim / num_groups;
3463 
3464  std::vector<Int32Pair> cpu_vec(num_groups);
3465  std::vector<int32> reverse_cpu_vec;
3466  int32 cur_index = 0;
3467  for (size_t i = 0; i < num_groups; i++) {
3468  cpu_vec[i].first = cur_index;
3469  cpu_vec[i].second = cur_index + group_size;
3470  cur_index += group_size;
3471  for (int32 j = cpu_vec[i].first; j < cpu_vec[i].second; j++)
3472  reverse_cpu_vec.push_back(i);
3473  }
3474  this->indexes_ = cpu_vec;
3475  this->reverse_indexes_ = reverse_cpu_vec;
3476  this->input_dim_ = input_dim;
3477  this->output_dim_ = num_groups;
3478 }
3479 
3480 void SumGroupComponent::InitFromConfig(ConfigLine *cfl) {
3481  std::vector<int32> sizes;
3482  bool has_sizes = cfl->GetValue("sizes", &sizes);
3483  if (has_sizes) {
3484  if (cfl->HasUnusedValues() || sizes.empty())
3485  KALDI_ERR << "Invalid initializer for layer of type "
3486  << Type() << ": \"" << cfl->WholeLine() << "\"";
3487  this->Init(sizes);
3488  } else { // each group has the same size
3489  int32 input_dim = -1, output_dim = -1;
3490  if (!cfl->GetValue("input-dim", &input_dim) ||
3491  !cfl->GetValue("output-dim", &output_dim) || cfl->HasUnusedValues()) {
3492  KALDI_ERR << "Invalid initializer for layer of type "
3493  << Type() << ": \"" << cfl->WholeLine() << "\"";
3494  }
3495  Init(input_dim, output_dim);
3496  }
3497 }
3498 
3499 Component* SumGroupComponent::Copy() const {
3500  SumGroupComponent *ans = new SumGroupComponent();
3501  ans->indexes_ = indexes_;
3502  ans->reverse_indexes_ = reverse_indexes_;
3503  ans->input_dim_ = input_dim_;
3504  ans->output_dim_ = output_dim_;
3505  return ans;
3506 }
3507 
3508 void SumGroupComponent::Read(std::istream &is, bool binary) {
3509  ExpectOneOrTwoTokens(is, binary, "<SumGroupComponent>", "<Sizes>");
3510  std::vector<int32> sizes;
3511  ReadIntegerVector(is, binary, &sizes);
3512 
3513  std::string token;
3514  ReadToken(is, binary, &token);
3515  if (!(token == "<SumGroupComponent>" ||
3516  token == "</SumGroupComponent>")) {
3517  KALDI_ERR << "Expected </SumGroupComponent>, got " << token;
3518  }
3519  this->Init(sizes);
3520 }
3521 
3522 void SumGroupComponent::GetSizes(std::vector<int32> *sizes) const {
3523  std::vector<Int32Pair> indexes;
3524  indexes_.CopyToVec(&indexes);
3525  sizes->resize(indexes.size());
3526  for (size_t i = 0; i < indexes.size(); i++) {
3527  (*sizes)[i] = indexes[i].second - indexes[i].first;
3528  if (i == 0) { KALDI_ASSERT(indexes[i].first == 0); }
3529  else { KALDI_ASSERT(indexes[i].first == indexes[i-1].second); }
3530  KALDI_ASSERT(indexes[i].second > indexes[i].first);
3532  }
3533 }
3534 
3535 void SumGroupComponent::Write(std::ostream &os, bool binary) const {
3536  WriteToken(os, binary, "<SumGroupComponent>");
3537  WriteToken(os, binary, "<Sizes>");
3538  std::vector<int32> sizes;
3539  this->GetSizes(&sizes);
3540  WriteIntegerVector(os, binary, sizes);
3541  WriteToken(os, binary, "</SumGroupComponent>");
3542 }
3543 
3544 void* SumGroupComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3545  const CuMatrixBase<BaseFloat> &in,
3546  CuMatrixBase<BaseFloat> *out) const {
3547  out->SumColumnRanges(in, indexes_);
3548  return NULL;
3549 }
3550 
3551 void SumGroupComponent::Backprop(const std::string &debug_info,
3552  const ComponentPrecomputedIndexes *indexes,
3553  const CuMatrixBase<BaseFloat> &, // in_value,
3554  const CuMatrixBase<BaseFloat> &, // out_value
3555  const CuMatrixBase<BaseFloat> &out_deriv,
3556  void *memo,
3557  Component *to_update_in,
3558  CuMatrixBase<BaseFloat> *in_deriv) const {
3559  NVTX_RANGE("SumGroupComponent::Backprop");
3560  in_deriv->CopyCols(out_deriv, reverse_indexes_);
3561 }
3562 
3563 void* SoftmaxComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3564  const CuMatrixBase<BaseFloat> &in,
3565  CuMatrixBase<BaseFloat> *out) const {
3566  // Apply softmax function to each row of the output...
3567  // for that row, we do
3568  // x_i = exp(x_i) / sum_j exp(x_j).
3569  out->SoftMaxPerRow(in);
3570 
3571  // This floor on the output helps us deal with
3572  // almost-zeros in a way that doesn't lead to overflow.
3573  out->ApplyFloor(1.0e-20);
3574 
3575  return NULL;
3576 }
3577 
3578 void SoftmaxComponent::Backprop(const std::string &debug_info,
3579  const ComponentPrecomputedIndexes *indexes,
3580  const CuMatrixBase<BaseFloat> &, // in_value,
3581  const CuMatrixBase<BaseFloat> &out_value,
3582  const CuMatrixBase<BaseFloat> &out_deriv,
3583  void *memo,
3584  Component *to_update_in,
3585  CuMatrixBase<BaseFloat> *in_deriv) const {
3586  NVTX_RANGE("SoftmaxComponent::Backprop");
3587 
3588  if (to_update_in) {
3589  SoftmaxComponent *to_update =
3590  dynamic_cast<SoftmaxComponent*>(to_update_in);
3591  to_update->StoreBackpropStats(out_deriv);
3592  }
3593 
3594  if (in_deriv == NULL)
3595  return;
3596  /*
3597  Note on the derivative of the softmax function: let it be
3598  p_i = exp(x_i) / sum_j exp(x_j)
3599  The [matrix-valued] Jacobian of this function is
3600  diag(p) - p p^T
3601  Let the derivative vector at the output be e, and at the input be
3602  d. We have
3603  d = diag(p) e - p (p^T e).
3604  d_i = p_i e_i - p_i (p^T e).
3605  */
3606  in_deriv->DiffSoftmaxPerRow(out_value, out_deriv);
3607 }
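// Editorial worked example of the formula above: for p = (0.5, 0.5) and
// e = (1, 0), we get p^T e = 0.5, so
//   d = (0.5*1 - 0.5*0.5, 0.5*0 - 0.5*0.5) = (0.25, -0.25).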
3608 
3609 void SoftmaxComponent::StoreStats(const CuMatrixBase<BaseFloat> &in_value,
3610  const CuMatrixBase<BaseFloat> &out_value,
3611  void *memo) {
3612  // We don't store derivative stats for this component type, just activation
3613  // stats.
3614  StoreStatsInternal(out_value, NULL);
3615 }
3616 
3617 
3618 void* LogSoftmaxComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3619  const CuMatrixBase<BaseFloat> &in,
3620  CuMatrixBase<BaseFloat> *out) const {
3621  // Applies log softmax function to each row of the output. For each row, we do
3622  // x_i = x_i - log(sum_j exp(x_j))
3623  out->LogSoftMaxPerRow(in);
3624  return NULL;
3625 }
3626 
3627 void LogSoftmaxComponent::Backprop(const std::string &debug_info,
3628  const ComponentPrecomputedIndexes *indexes,
3629  const CuMatrixBase<BaseFloat> &, // in_value
3630  const CuMatrixBase<BaseFloat> &out_value,
3631  const CuMatrixBase<BaseFloat> &out_deriv,
3632  void *memo,
3633  Component *to_update_in,
3634  CuMatrixBase<BaseFloat> *in_deriv) const {
3635  NVTX_RANGE("LogSoftmaxComponent::Backprop");
3636  if (to_update_in) {
3637  LogSoftmaxComponent *to_update =
3638  dynamic_cast<LogSoftmaxComponent*>(to_update_in);
3639  to_update->StoreBackpropStats(out_deriv);
3640  }
3641  if (in_deriv == NULL)
3642  return;
3643  in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv);
3644 }
3645 
3646 
3647 void FixedScaleComponent::Init(const CuVectorBase<BaseFloat> &scales) {
3648  KALDI_ASSERT(scales.Dim() != 0);
3649  scales_ = scales;
3650 }
3651 
3652 
3653 void FixedScaleComponent::InitFromConfig(ConfigLine *cfl) {
3654  std::string filename;
3655  // Accepts "scales" config (for filename) or "dim" -> random init, for testing.
3656  if (cfl->GetValue("scales", &filename)) {
3657  if (cfl->HasUnusedValues())
3658  KALDI_ERR << "Invalid initializer for layer of type "
3659  << Type() << ": \"" << cfl->WholeLine() << "\"";
3660  CuVector<BaseFloat> vec;
3661  ReadKaldiObject(filename, &vec);
3662  Init(vec);
3663  } else {
3664  int32 dim;
3665  BaseFloat scale = 1.0;
3666  bool scale_is_set = cfl->GetValue("scale", &scale);
3667  if (!cfl->GetValue("dim", &dim) || cfl->HasUnusedValues())
3668  KALDI_ERR << "Invalid initializer for layer of type "
3669  << Type() << ": \"" << cfl->WholeLine() << "\"";
3670  KALDI_ASSERT(dim > 0);
3671  CuVector<BaseFloat> vec(dim);
3672  if (scale_is_set) {
3673  vec.Set(scale);
3674  } else {
3675  vec.SetRandn();
3676  }
3677  Init(vec);
3678  }
3679 }
3680 
3681 
3682 std::string FixedScaleComponent::Info() const {
3683  std::ostringstream stream;
3684  stream << Component::Info();
3685  PrintParameterStats(stream, "scales", scales_, true);
3686  return stream.str();
3687 }
3688 
3689 void* FixedScaleComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3690  const CuMatrixBase<BaseFloat> &in,
3691  CuMatrixBase<BaseFloat> *out) const {
3692  out->CopyFromMat(in); // does nothing if same matrix.
3693  out->MulColsVec(scales_);
3694  return NULL;
3695 }
3696 
3697 void FixedScaleComponent::Backprop(const std::string &debug_info,
3698  const ComponentPrecomputedIndexes *indexes,
3699  const CuMatrixBase<BaseFloat> &, // in_value
3700  const CuMatrixBase<BaseFloat> &, // out_value
3701  const CuMatrixBase<BaseFloat> &out_deriv,
3702  void *memo,
3703  Component *, // to_update
3704  CuMatrixBase<BaseFloat> *in_deriv) const {
3705  NVTX_RANGE("FixedScaleComponent::Backprop");
3706  in_deriv->CopyFromMat(out_deriv); // does nothing if same memory.
3707  in_deriv->MulColsVec(scales_);
3708 }
3709 
3710 Component* FixedScaleComponent::Copy() const {
3711  FixedScaleComponent *ans = new FixedScaleComponent();
3712  ans->scales_ = scales_;
3713  return ans;
3714 }
3715 
3716 
3717 void FixedScaleComponent::Write(std::ostream &os, bool binary) const {
3718  WriteToken(os, binary, "<FixedScaleComponent>");
3719  WriteToken(os, binary, "<Scales>");
3720  scales_.Write(os, binary);
3721  WriteToken(os, binary, "</FixedScaleComponent>");
3722 }
3723 
3724 void FixedScaleComponent::Read(std::istream &is, bool binary) {
3725  ExpectOneOrTwoTokens(is, binary, "<FixedScaleComponent>", "<Scales>");
3726  scales_.Read(is, binary);
3727  ExpectToken(is, binary, "</FixedScaleComponent>");
3728 }
3729 
3730 void FixedBiasComponent::Init(const CuVectorBase<BaseFloat> &bias) {
3731  KALDI_ASSERT(bias.Dim() != 0);
3732  bias_ = bias;
3733 }
3734 
3735 void FixedBiasComponent::InitFromConfig(ConfigLine *cfl) {
3736  std::string filename;
3737  // Accepts "bias" config (for filename) or "dim" -> random init, for testing.
3738  if (cfl->GetValue("bias", &filename)) {
3739  if (cfl->HasUnusedValues())
3740  KALDI_ERR << "Invalid initializer for layer of type "
3741  << Type() << ": \"" << cfl->WholeLine() << "\"";
3742  CuVector<BaseFloat> vec;
3743  ReadKaldiObject(filename, &vec);
3744  Init(vec);
3745  } else {
3746  int32 dim;
3747  if (!cfl->GetValue("dim", &dim) || cfl->HasUnusedValues())
3748  KALDI_ERR << "Invalid initializer for layer of type "
3749  << Type() << ": \"" << cfl->WholeLine() << "\"";
3750  KALDI_ASSERT(dim > 0);
3751  CuVector<BaseFloat> vec(dim);
3752  vec.SetRandn();
3753  Init(vec);
3754  }
3755 }
3756 
3757 std::string FixedBiasComponent::Info() const {
3758  std::ostringstream stream;
3759  stream << Component::Info();
3760  PrintParameterStats(stream, "bias", bias_, true);
3761  return stream.str();
3762 }
3763 
3764 void* FixedBiasComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3765  const CuMatrixBase<BaseFloat> &in,
3766  CuMatrixBase<BaseFloat> *out) const {
3767  out->CopyFromMat(in); // will do nothing if in and out have same memory.
3768  out->AddVecToRows(1.0, bias_, 1.0);
3769  return NULL;
3770 }
3771 
3772 void FixedBiasComponent::Backprop(const std::string &debug_info,
3773  const ComponentPrecomputedIndexes *indexes,
3774  const CuMatrixBase<BaseFloat> &, // in_value
3775  const CuMatrixBase<BaseFloat> &, // out_value
3776  const CuMatrixBase<BaseFloat> &out_deriv,
3777  void *memo,
3778  Component *, // to_update
3779  CuMatrixBase<BaseFloat> *in_deriv) const {
3780  NVTX_RANGE("FixedBiasComponent::Backprop");
3781  // the following statement will do nothing if in_deriv and out_deriv have same
3782  // memory.
3783  in_deriv->CopyFromMat(out_deriv);
3784 }
3785 
3786 Component* FixedBiasComponent::Copy() const {
3787  FixedBiasComponent *ans = new FixedBiasComponent();
3788  ans->bias_ = bias_;
3789  return ans;
3790 }
3791 
3792 
3793 void FixedBiasComponent::Write(std::ostream &os, bool binary) const {
3794  WriteToken(os, binary, "<FixedBiasComponent>");
3795  WriteToken(os, binary, "<Bias>");
3796  bias_.Write(os, binary);
3797  WriteToken(os, binary, "</FixedBiasComponent>");
3798 }
3799 
3800 void FixedBiasComponent::Read(std::istream &is, bool binary) {
3801  ExpectOneOrTwoTokens(is, binary, "<FixedBiasComponent>", "<Bias>");
3802  bias_.Read(is, binary);
3803  ExpectToken(is, binary, "</FixedBiasComponent>");
3804 }
3805 
3806 
3807 void NaturalGradientPerElementScaleComponent::Read(
3808  std::istream &is, bool binary) {
3809  ReadUpdatableCommon(is, binary); // Read the opening tag and learning rate
3810  ExpectToken(is, binary, "<Params>");
3811  scales_.Read(is, binary);
3812  ExpectToken(is, binary, "<IsGradient>");
3813  ReadBasicType(is, binary, &is_gradient_);
3814  int32 rank, update_period;
3815  ExpectToken(is, binary, "<Rank>");
3816  ReadBasicType(is, binary, &rank);
3817  preconditioner_.SetRank(rank);
3818  ExpectToken(is, binary, "<UpdatePeriod>");
3819  ReadBasicType(is, binary, &update_period);
3820  preconditioner_.SetUpdatePeriod(update_period);
3821  BaseFloat num_samples_history, alpha;
3822  ExpectToken(is, binary, "<NumSamplesHistory>");
3823  ReadBasicType(is, binary, &num_samples_history);
3824  preconditioner_.SetNumSamplesHistory(num_samples_history);
3825  ExpectToken(is, binary, "<Alpha>");
3826  ReadBasicType(is, binary, &alpha);
3827  preconditioner_.SetAlpha(alpha);
3828  std::string token;
3829  ReadToken(is, binary, &token);
3830  if (token == "<MaxChangePerMinibatch>") {
3831  // back compatibility; this was removed, it's now handled by the
3832  // 'max-change' config variable.
3833  BaseFloat temp;
3834  ReadBasicType(is, binary, &temp);
3835  ReadToken(is, binary, &token);
3836  }
3837  KALDI_ASSERT(token == "</NaturalGradientPerElementScaleComponent>");
3838 }
3839 
3840 void NaturalGradientPerElementScaleComponent::Write(std::ostream &os,
3841  bool binary) const {
3842  WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate
3843  WriteToken(os, binary, "<Params>");
3844  scales_.Write(os, binary);
3845  WriteToken(os, binary, "<IsGradient>");
3846  WriteBasicType(os, binary, is_gradient_);
3847  WriteToken(os, binary, "<Rank>");
3848  WriteBasicType(os, binary, preconditioner_.GetRank());
3849  WriteToken(os, binary, "<UpdatePeriod>");
3850  WriteBasicType(os, binary, preconditioner_.GetUpdatePeriod());
3851  WriteToken(os, binary, "<NumSamplesHistory>");
3852  WriteBasicType(os, binary, preconditioner_.GetNumSamplesHistory());
3853  WriteToken(os, binary, "<Alpha>");
3854  WriteBasicType(os, binary, preconditioner_.GetAlpha());
3855  WriteToken(os, binary, "</NaturalGradientPerElementScaleComponent>");
3856 }
3857 
3858 std::string NaturalGradientPerElementScaleComponent::Info() const {
3859  std::ostringstream stream;
3860  stream << PerElementScaleComponent::Info()
3861  << ", rank=" << preconditioner_.GetRank()
3862  << ", update-period=" << preconditioner_.GetUpdatePeriod()
3863  << ", num-samples-history=" << preconditioner_.GetNumSamplesHistory()
3864  << ", alpha=" << preconditioner_.GetAlpha();
3865  return stream.str();
3866 }
3867 
3868 void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) {
3869  // First set various configuration values that have defaults.
3870  int32 rank = 8, // Use a small rank because in this case the amount of memory
3871  // for the preconditioner actually exceeds the memory for the
3872  // parameters (by "rank").
3873  update_period = 10;
3874  BaseFloat num_samples_history = 2000.0, alpha = 4.0;
3875  cfl->GetValue("rank", &rank);
3876  cfl->GetValue("update-period", &update_period);
3877  cfl->GetValue("num-samples-history", &num_samples_history);
3878  cfl->GetValue("alpha", &alpha);
3879  InitLearningRatesFromConfig(cfl);
3880  std::string filename;
3881  // Accepts "scales" config (for filename) or "dim" -> random init, for testing.
3882  if (cfl->GetValue("scales", &filename)) {
3883  if (cfl->HasUnusedValues())
3884  KALDI_ERR << "Invalid initializer for layer of type "
3885  << Type() << ": \"" << cfl->WholeLine() << "\"";
3886  Init(filename, rank, update_period, num_samples_history, alpha);
3887 
3888  } else {
3889  BaseFloat param_mean = 1.0, param_stddev = 0.0;
3890  cfl->GetValue("param-mean", &param_mean);
3891  cfl->GetValue("param-stddev", &param_stddev);
3892 
3893  int32 dim;
3894  if (!cfl->GetValue("dim", &dim) || cfl->HasUnusedValues())
3895  KALDI_ERR << "Invalid initializer for layer of type "
3896  << Type() << ": \"" << cfl->WholeLine() << "\"";
3897  KALDI_ASSERT(dim > 0);
3898 
3899  Init(dim, param_mean, param_stddev, rank, update_period,
3900  num_samples_history, alpha);
3901  }
3902 }
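// Illustrative config line for the initializer above (editorial; values are
// examples only): dim=512 param-mean=1.0 param-stddev=0.0 rank=8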
3903 
3904 void NaturalGradientPerElementScaleComponent::Init(
3905  int32 dim, BaseFloat param_mean,
3906  BaseFloat param_stddev, int32 rank, int32 update_period,
3907  BaseFloat num_samples_history, BaseFloat alpha) {
3908  PerElementScaleComponent::Init(dim, param_mean,
3909  param_stddev);
3910  preconditioner_.SetRank(rank);
3911  preconditioner_.SetUpdatePeriod(update_period);
3912  preconditioner_.SetNumSamplesHistory(num_samples_history);
3913  preconditioner_.SetAlpha(alpha);
3914 }
3915 
3916 void NaturalGradientPerElementScaleComponent::Init(
3917  std::string vector_filename,
3918  int32 rank, int32 update_period, BaseFloat num_samples_history,
3919  BaseFloat alpha) {
3920  PerElementScaleComponent::Init(vector_filename);
3921  preconditioner_.SetRank(rank);
3922  preconditioner_.SetUpdatePeriod(update_period);
3923  preconditioner_.SetNumSamplesHistory(num_samples_history);
3924  preconditioner_.SetAlpha(alpha);
3925 }
3926 
3927 
3928 NaturalGradientPerElementScaleComponent::NaturalGradientPerElementScaleComponent(
3929  const NaturalGradientPerElementScaleComponent &other):
3930  PerElementScaleComponent(other),
3931  preconditioner_(other.preconditioner_) { }
3932 
3933 
3934 
3935 
3936 Component* NaturalGradientPerElementScaleComponent::Copy() const {
3937  return new NaturalGradientPerElementScaleComponent(*this);
3938 }
3939 
3940 void NaturalGradientPerElementScaleComponent::Update(
3941  const std::string &debug_info,
3942  const CuMatrixBase<BaseFloat> &in_value,
3943  const CuMatrixBase<BaseFloat> &out_deriv) {
3944 
3945  CuMatrix<BaseFloat> derivs_per_frame(in_value);
3946  derivs_per_frame.MulElements(out_deriv);
3947  // the non-natural-gradient update would just do
3948  // scales_.AddRowSumMat(learning_rate_, derivs_per_frame).
3949 
3950  BaseFloat scale;
3951  preconditioner_.PreconditionDirections(&derivs_per_frame, &scale);
3952 
3953  CuVector<BaseFloat> delta_scales(scales_.Dim());
3954  delta_scales.AddRowSumMat(scale * learning_rate_, derivs_per_frame);
3955  scales_.AddVec(1.0, delta_scales);
3956 }
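// Editorial note: equivalently, with s the rescaling factor returned by the
// preconditioner and .* elementwise multiplication, the update above is
//   scales_ += learning_rate_ * s * RowSum(Precondition(in_value .* out_deriv)),
// where RowSum sums over frames (rows); with preconditioning disabled this
// reduces to the simple update mentioned in the comment above.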
3957 
3958 void NaturalGradientPerElementScaleComponent::FreezeNaturalGradient(bool freeze) {
3959  preconditioner_.Freeze(freeze);
3960 }
3961 
3962 void NaturalGradientPerElementScaleComponent::ConsolidateMemory() {
3963  OnlineNaturalGradient temp(preconditioner_);
3964  preconditioner_.Swap(&temp);
3965 }
3966 
3967 void PermuteComponent::ComputeReverseColumnMap() {
3968  int32 dim = column_map_.Dim();
3969  KALDI_ASSERT(dim > 0);
3970  std::vector<int32> reverse_column_map_cpu(dim, -1),
3971  column_map_cpu(dim);
3972  column_map_.CopyToVec(&column_map_cpu);
3973  for (int32 i = 0; i < dim; i++) {
3974  int32 &dest = reverse_column_map_cpu[column_map_cpu[i]];
3975  if (dest != -1)
3976  KALDI_ERR << "Column map does not represent a permutation.";
3977  dest = i;
3978  }
3979  reverse_column_map_.Resize(dim);
3980  reverse_column_map_.CopyFromVec(reverse_column_map_cpu);
3981 }
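// Editorial example: column_map_ = {2, 0, 1} yields
// reverse_column_map_ = {1, 2, 0}, since reverse[column_map[i]] = i.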
3982 
3983 Component* PermuteComponent::Copy() const {
3984  PermuteComponent *ans = new PermuteComponent();
3985  ans->column_map_ = column_map_;
3986  ans->reverse_column_map_ = reverse_column_map_;
3987  return ans;
3988 }
3989 
3990 void* PermuteComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
3991  const CuMatrixBase<BaseFloat> &in,
3992  CuMatrixBase<BaseFloat> *out) const {
3993  out->CopyCols(in, column_map_);
3994  return NULL;
3995 }
3996 void PermuteComponent::Backprop(const std::string &debug_info,
3997  const ComponentPrecomputedIndexes *indexes,
3998  const CuMatrixBase<BaseFloat> &, //in_value
3999  const CuMatrixBase<BaseFloat> &, // out_value,
4000  const CuMatrixBase<BaseFloat> &out_deriv,
4001  void *memo,
4002  Component *to_update,
4003  CuMatrixBase<BaseFloat> *in_deriv) const {
4004  NVTX_RANGE("PermuteComponent::Backprop");
4005  in_deriv->CopyCols(out_deriv, reverse_column_map_);
4006 }
4007 
4008 void PermuteComponent::InitFromConfig(ConfigLine *cfl) {
4009  bool ok = true;
4010  std::string column_map_str;
4011  ok = ok && cfl->GetValue("column-map", &column_map_str);
4012  std::vector<int32> column_map;
4013  if (!SplitStringToIntegers(column_map_str, ",", true, &column_map))
4014  KALDI_ERR << "Bad initializer in PermuteComponent: column-map="
4015  << column_map_str;
4016  if (cfl->HasUnusedValues())
4017  KALDI_ERR << "Could not process these elements in initializer: "
4018  << cfl->UnusedValues();
4019  if (!ok)
4020  KALDI_ERR << "Invalid initializer for layer of type "
4021  << Type() << ": \"" << cfl->WholeLine() << "\"";
4022  Init(column_map);
4023 }
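// Illustrative config line (editorial): 'column-map=2,0,1'. In Propagate,
// output column j is copied from input column column_map_[j].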
4024 
4025 void PermuteComponent::Init(const std::vector<int32> &column_map) {
4026  KALDI_ASSERT(column_map.size() > 0);
4027  column_map_.CopyFromVec(column_map);
4028  ComputeReverseColumnMap();
4029 }
4030 
4031 void PermuteComponent::Read(std::istream &is, bool binary) {
4032  ExpectOneOrTwoTokens(is, binary, "<PermuteComponent>", "<ColumnMap>");
4033  std::vector<int32> column_map;
4034  if (binary && is.peek() == 'F') {
4035  // back-compatibility code [temporary]
4036  Vector<BaseFloat> float_map;
4037  float_map.Read(is, binary);
4038  column_map.resize(float_map.Dim());
4039  for (int32 i = 0; i < float_map.Dim(); i++) {
4040  // note: casting truncates toward zero: add 0.5 to approximate rounding.
4041  column_map[i] = static_cast<int32>(float_map(i) + 0.5);
4042  }
4043  // the next line is a workaround for a bug in the old
4044  // writing code, which now causes an assert failure. it's only
4045  // valid for the permutations we're currently using. anyway all this
4046  // code is only temporary.
4047  column_map.back() = float_map.Dim() - 1;
4048  } else {
4049  ReadIntegerVector(is, binary, &column_map);
4050  }
4051  column_map_.CopyFromVec(column_map);
4052  ExpectToken(is, binary, "</PermuteComponent>");
4053  ComputeReverseColumnMap();
4054 }
4055 
4056 void PermuteComponent::Write(std::ostream &os, bool binary) const {
4057  WriteToken(os, binary, "<PermuteComponent>");
4058  WriteToken(os, binary, "<ColumnMap>");
4060  std::vector<int32> column_map;
4061  column_map_.CopyToVec(&column_map);
4062  WriteIntegerVector(os, binary, column_map);
4063  WriteToken(os, binary, "</PermuteComponent>");
4064 }
4065 
4066 std::string PermuteComponent::Info() const {
4067  std::ostringstream stream;
4068  stream << Type() << ", dim=" << column_map_.Dim();
4069  stream << ", column-map=[ ";
4070  std::vector<int32> column_map(column_map_.Dim());
4071  column_map_.CopyToVec(&column_map);
4072  int32 max_size = 5;
4073  for (size_t i = 0; i < column_map.size() && i < max_size; i++)
4074  stream << column_map[i] << ' ';
4075  if (static_cast<int32>(column_map.size()) > max_size)
4076  stream << "... ";
4077  stream << "]";
4078  return stream.str();
4079 }
4080 
4081 
4082 bool CompositeComponent::IsUpdatable() const {
4083  for (std::vector<Component*>::const_iterator iter = components_.begin(),
4084  end = components_.end(); iter != end; ++iter)
4085  if (((*iter)->Properties() & kUpdatableComponent) != 0)
4086  return true;
4087  return false;
4088 }
4089 
4090 // virtual
4091 int32 CompositeComponent::InputDim() const {
4092  KALDI_ASSERT(!components_.empty());
4093  return components_.front()->InputDim();
4094 }
4095 
4096 // virtual
4097 int32 CompositeComponent::OutputDim() const {
4098  KALDI_ASSERT(!components_.empty());
4099  return components_.back()->OutputDim();
4100 }
4101 
4102 // virtual
4103 int32 CompositeComponent::Properties() const {
4104  KALDI_ASSERT(!components_.empty());
4105  int32 last_component_properties = components_.back()->Properties(),
4106  first_component_properties = components_.front()->Properties();
4107  // We always assume backprop needs the input, as this would be necessary to
4108  // get the activations at intermediate layers; if these were not needed in
4109  // backprop, there would be no reason to use a CompositeComponent.
4110  int32 ans = kSimpleComponent | kBackpropNeedsInput |
4111  (last_component_properties &
4112  (kPropagateAdds|kBackpropNeedsOutput|kOutputContiguous)) |
4113  (first_component_properties &
4114  (kBackpropAdds|kInputContiguous)) |
4115  (IsUpdatable() ? kUpdatableComponent : 0);
4116  // note, we don't return the kStoresStats property because that function is
4117  // not implemented; instead, for efficiency, we call StoreStats() on any
4118  // sub-components as part of the backprop phase.
4119  if (last_component_properties & kStoresStats)
4120  ans |= kBackpropNeedsOutput;
4121  return ans;
4122 }
4123 
4124 
4125 MatrixStrideType CompositeComponent::GetStrideType(int32 i) const {
4126  int32 num_components = components_.size();
4127  if ((components_[i]->Properties() & kOutputContiguous) ||
4128  (i + 1 < num_components &&
4129  (components_[i + 1]->Properties() & kInputContiguous)))
4130  return kStrideEqualNumCols;
4131  else
4132  return kDefaultStride;
4133 }
4134 
4135 
4136 // virtual
4137 void* CompositeComponent::Propagate(
4138  const ComponentPrecomputedIndexes *, // indexes
4139  const CuMatrixBase<BaseFloat> &in,
4140  CuMatrixBase<BaseFloat> *out) const {
4141  KALDI_ASSERT(in.NumRows() == out->NumRows() && in.NumCols() == InputDim() &&
4142  out->NumCols() == OutputDim());
4143  int32 num_rows = in.NumRows(),
4144  num_components = components_.size();
4145  if (max_rows_process_ > 0 && num_rows > max_rows_process_) {
4146  // recurse and process smaller parts of the data, to save memory.
4147  for (int32 row_offset = 0; row_offset < num_rows;
4148  row_offset += max_rows_process_) {
4149  int32 this_num_rows = std::min<int32>(max_rows_process_,
4150  num_rows - row_offset);
4151  const CuSubMatrix<BaseFloat> in_part(in, row_offset, this_num_rows,
4152  0, in.NumCols());
4153  CuSubMatrix<BaseFloat> out_part(*out, row_offset, this_num_rows,
4154  0, out->NumCols());
4155  this->Propagate(NULL, in_part, &out_part);
4156  }
4157  return NULL;
4158  }
4159  std::vector<CuMatrix<BaseFloat> > intermediate_outputs(num_components - 1);
4160  for (int32 i = 0; i < num_components; i++) {
4161  if (i + 1 < num_components) {
4162  MatrixResizeType resize_type =
4163  ((components_[i]->Properties() & kPropagateAdds) ?
4164  kSetZero : kUndefined);
4165  intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(),
4166  resize_type, GetStrideType(i));
4167  }
4168  const CuMatrixBase<BaseFloat> &this_in = (i == 0 ? in :
4169  intermediate_outputs[i-1]);
4170  CuMatrixBase<BaseFloat> *this_out = (i + 1 == num_components ?
4171  out : &(intermediate_outputs[i]));
4172  void *memo = components_[i]->Propagate(NULL, this_in, this_out);
4173  // we'll re-do the forward propagation in the backprop, and we can
4174  // regenerate any memos there, so no need to keep them.
4175  if (memo != NULL)
4176  components_[i]->DeleteMemo(memo);
4177  if (i > 0)
4178  intermediate_outputs[i-1].Resize(0, 0);
4179  }
4180  return NULL;
4181 }
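// Editorial example of the chunking above: with max_rows_process_ = 4096 and
// num_rows = 10000, Propagate recurses over row ranges [0,4096), [4096,8192)
// and [8192,10000), bounding the size of the intermediate matrices.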
4182 
4183 
4184 void CompositeComponent::Init(const std::vector<Component*> &components,
4185  int32 max_rows_process) {
4186  DeletePointers(&components_); // clean up.
4187  components_ = components;
4188  KALDI_ASSERT(!components.empty());
4189  max_rows_process_ = max_rows_process;
4190 
4191  for (size_t i = 0; i < components_.size(); i++) {
4192  // make sure all constituent components are simple.
4193  KALDI_ASSERT(components_[i]->Properties() & kSimpleComponent);
4194  if (i > 0) {
4195  // make sure all the internal dimensions match up.
4196  KALDI_ASSERT(components_[i]->InputDim() ==
4197  components_[i-1]->OutputDim());
4198  }
4199  }
4200 }
4201 
4202 // virtual
4203 void CompositeComponent::Read(std::istream &is, bool binary) {
4204  // Because we didn't previously write out the learning rate,
4205  // we need some temporary code.
4206  int32 max_rows_process;
4207  if (false) {
4208  ReadUpdatableCommon(is, binary);
4209  ExpectToken(is, binary, "<MaxRowsProcess>");
4210  ReadBasicType(is, binary, &max_rows_process);
4211  } else { // temporary code.
4212  std::string token;
4213  ReadToken(is, binary, &token);
4214  if (token == "<CompositeComponent>") {
4215  // if the first token is the opening tag, then
4216  // ignore it and get the next tag.
4217  ReadToken(is, binary, &token);
4218  }
4219  if (token == "<LearningRateFactor>") {
4220  ReadBasicType(is, binary, &learning_rate_factor_);
4221  ReadToken(is, binary, &token);
4222  } else {
4223  learning_rate_factor_ = 1.0;
4224  }
4225  if (token == "<IsGradient>") {
4226  ReadBasicType(is, binary, &is_gradient_);
4227  ReadToken(is, binary, &token);
4228  } else {
4229  is_gradient_ = false;
4230  }
4231  if (token == "<LearningRate>") {
4232  ReadBasicType(is, binary, &learning_rate_);
4233  ReadToken(is, binary, &token);
4234  }
4235  if (token != "<MaxRowsProcess>") {
4236  KALDI_ERR << "Expected token <MaxRowsProcess>, got "
4237  << token;
4238  }
4239  ReadBasicType(is, binary, &max_rows_process);
4240  }
4241  ExpectToken(is, binary, "<NumComponents>");
4242  int32 num_components;
4243  ReadBasicType(is, binary, &num_components); // Read dimension.
4244  if (num_components < 0 || num_components > 100000)
4245  KALDI_ERR << "Bad num-components";
4246  std::vector<Component*> components(num_components);
4247  for (int32 i = 0; i < num_components; i++)
4248  components[i] = ReadNew(is, binary);
4249  Init(components, max_rows_process);
4250  ExpectToken(is, binary, "</CompositeComponent>");
4251 }
4252 
4253 // virtual
4254 void CompositeComponent::ZeroStats() {
4255  // we call ZeroStats() on all components without checking their flags; this
4256  // will do nothing if the component doesn't store stats. (components like
4257  // ReLU and sigmoid and tanh store stats on activations).
4258  for (size_t i = 0; i < components_.size(); i++)
4259  components_[i]->ZeroStats();
4260 }
4261 
4262 // virtual
4263 void CompositeComponent::Write(std::ostream &os, bool binary) const {
4264  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate.
4265  WriteToken(os, binary, "<MaxRowsProcess>");
4266  WriteBasicType(os, binary, max_rows_process_);
4267  WriteToken(os, binary, "<NumComponents>");
4268  int32 num_components = components_.size();
4269  WriteBasicType(os, binary, num_components);
4270  for (int32 i = 0; i < num_components; i++)
4271  components_[i]->Write(os, binary);
4272  WriteToken(os, binary, "</CompositeComponent>");
4273 }
4274 
4275 
4276 // virtual
4277 void CompositeComponent::Backprop(const std::string &debug_info,
4278  const ComponentPrecomputedIndexes *indexes,
4279  const CuMatrixBase<BaseFloat> &in_value,
4280  const CuMatrixBase<BaseFloat> &out_value,
4281  const CuMatrixBase<BaseFloat> &out_deriv,
4282  void *memo,
4283  Component *to_update,
4284  CuMatrixBase<BaseFloat> *in_deriv) const {
4285  NVTX_RANGE("CompositeComponent::Backprop");
4286  KALDI_ASSERT(in_value.NumRows() == out_deriv.NumRows() &&
4287  in_value.NumCols() == InputDim() &&
4288  out_deriv.NumCols() == OutputDim());
4289  int32 num_rows = in_value.NumRows(),
4290  num_components = components_.size();
4291  if (max_rows_process_ > 0 && num_rows > max_rows_process_) {
4292  KALDI_ASSERT(max_rows_process_ > 0);
4293  // recurse and process smaller parts of the data, to save memory.
4294  for (int32 row_offset = 0; row_offset < num_rows;
4295  row_offset += max_rows_process_) {
4296  bool have_output_value = (out_value.NumRows() != 0);
4297  int32 this_num_rows = std::min<int32>(max_rows_process_,
4298  num_rows - row_offset);
4299  // out_value_part will only be used if out_value is nonempty; otherwise we
4300  // make it a submatrix of 'out_deriv' to avoid errors in the constructor.
4301  const CuSubMatrix<BaseFloat> out_value_part(have_output_value ? out_value : out_deriv,
4302  row_offset, this_num_rows,
4303  0, out_deriv.NumCols());
4304  // in_deriv_value_part will only be used if in_deriv != NULL; otherwise we
4305  // make it a submatrix of 'in_value' to avoid errors in the constructor.
4306  CuSubMatrix<BaseFloat> in_deriv_part(in_deriv != NULL ? *in_deriv : in_value,
4307  row_offset, this_num_rows,
4308  0, in_value.NumCols());
4309  CuSubMatrix<BaseFloat> in_value_part(in_value, row_offset, this_num_rows,
4310  0, in_value.NumCols());
4311  const CuSubMatrix<BaseFloat> out_deriv_part(out_deriv,
4312  row_offset, this_num_rows,
4313  0, out_deriv.NumCols());
4314  CuMatrix<BaseFloat> empty_mat;
4315  this->Backprop(debug_info, NULL, in_value_part,
4316  (have_output_value ? static_cast<const CuMatrixBase<BaseFloat>&>(out_value_part) :
4317  static_cast<const CuMatrixBase<BaseFloat>&>(empty_mat)),
4318  out_deriv_part, NULL, to_update,
4319  in_deriv != NULL ? &in_deriv_part : NULL);
4320  }
4321  return;
4322  }
4323  // For now, assume all intermediate values and derivatives need to be
4324  // computed. in_value and out_deriv will always be supplied.
4325 
4326  // intermediate_outputs[i] contains the output of component i.
4327  std::vector<CuMatrix<BaseFloat> > intermediate_outputs(num_components);
4328  // intermediate_derivs[i] contains the deriative at the output of component i.
4329  std::vector<CuMatrix<BaseFloat> > intermediate_derivs(num_components - 1);
4330 
4331  KALDI_ASSERT(memo == NULL);
4332  // note: only a very few components use memos, but we need to support them.
4333  std::vector<void*> memos(num_components, NULL);
4334 
4335  int32 num_components_to_propagate = num_components;
4336  if (!(components_[num_components - 1]->Properties() & kUsesMemo)) {
4337  // we only need to propagate the very last component if it uses a memo.
4338  num_components_to_propagate--;
4339  if (num_components > 1) {
4340  // skip the last-but-one component's propagate if the last component's
4341  // backprop doesn't need the input and the last-but-one component's
4342  // backprop doesn't need the output. This is the lowest hanging fruit for
4343  // optimization; other propagates might also be skippable.
4344  int32 properties = components_[num_components - 2]->Properties(),
4345  next_properties = components_[num_components - 1]->Properties();
4346  if (!(properties & (kBackpropNeedsOutput | kUsesMemo)) &&
4347  !(next_properties & kBackpropNeedsInput)) {
4348  num_components_to_propagate--;
4349  }
4350  }
4351  }
4352 
4353 
4354  // Do the propagation again.
4355  for (int32 i = 0; i < num_components_to_propagate; i++) {
4356  MatrixResizeType resize_type =
4357  ((components_[i]->Properties() & kPropagateAdds) ?
4358  kSetZero : kUndefined);
4359  intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(),
4360  resize_type, GetStrideType(i));
4361  memos[i] =
4362  components_[i]->Propagate(NULL,
4363  (i == 0 ? in_value : intermediate_outputs[i-1]),
4364  &(intermediate_outputs[i]));
4365  }
4366 
4367  for (int32 i = num_components - 1; i >= 0; i--) {
4368  const CuMatrixBase<BaseFloat> &this_in_value =
4369  (i == 0 ? in_value : intermediate_outputs[i-1]),
4370  &this_out_value =
4371  (i == num_components - 1 ? out_value : intermediate_outputs[i]);
4372 
4373  Component *component_to_update =
4374  (to_update == NULL ? NULL :
4375  dynamic_cast<CompositeComponent*>(to_update)->components_[i]);
4376 
4377  if (component_to_update != NULL &&
4378  components_[i]->Properties() & kStoresStats)
4379  component_to_update->StoreStats(this_in_value, this_out_value, memos[i]);
4380 
4381  if (i > 0) {
4382  MatrixResizeType resize_type =
4383  ((components_[i]->Properties() & kBackpropAdds) ?
4384  kSetZero : kUndefined);
4385  intermediate_derivs[i-1].Resize(num_rows, components_[i]->InputDim(),
4386  resize_type, GetStrideType(i - 1));
4387  }
4388  // skip the first component's backprop if it's not updatable and in_deriv is
4389  // not requested. Again, this is the lowest-hanging fruit to optimize.
4390  if (!(i == 0 && !(components_[0]->Properties() & kUpdatableComponent) &&
4391  in_deriv == NULL)) {
4392  components_[i]->Backprop(debug_info, NULL,
4393  this_in_value, this_out_value,
4394  (i + 1 == num_components ? out_deriv : intermediate_derivs[i]),
4395  memos[i], component_to_update,
4396  (i == 0 ? in_deriv : &(intermediate_derivs[i-1])));
4397  }
4398  if (memos[i] != NULL)
4399  components_[i]->DeleteMemo(memos[i]);
4400  }
4401 }
4402 
4403 
4404 // virtual
4405 std::string CompositeComponent::Info() const {
4406  std::ostringstream stream;
4407  stream << Type() << " ";
4408  for (size_t i = 0; i < components_.size(); i++) {
4409  if (i > 0) stream << ", ";
4410  stream << "sub-component" << (i+1) << " = { "
4411  << components_[i]->Info() << " }";
4412  }
4413  return stream.str();
4414 }
4415 
4416 // virtual
4417 void CompositeComponent::Scale(BaseFloat scale) {
4418  for (size_t i = 0; i < components_.size(); i++)
4419  components_[i]->Scale(scale);
4420 }
4421 
4422 // virtual
4423 void CompositeComponent::Add(BaseFloat alpha, const Component &other_in) {
4424  const CompositeComponent *other = dynamic_cast<const CompositeComponent*>(
4425  &other_in);
4426  KALDI_ASSERT(other != NULL && other->components_.size() ==
4427  components_.size() && "Mismatching nnet topologies");
4428  for (size_t i = 0; i < components_.size(); i++)
4429  components_[i]->Add(alpha, *(other->components_[i]));
4430 }
4431 
4432 // virtual
4433 void CompositeComponent::PerturbParams(BaseFloat stddev) {
4434  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4435  for (size_t i = 0; i < components_.size(); i++) {
4436  if (components_[i]->Properties() & kUpdatableComponent) {
4437  UpdatableComponent *uc =
4438  dynamic_cast<UpdatableComponent*>(components_[i]);
4439  uc->PerturbParams(stddev);
4440  }
4441  }
4442 }
4443 
4444 void CompositeComponent::SetUnderlyingLearningRate(BaseFloat lrate) {
4445  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4446  UpdatableComponent::SetUnderlyingLearningRate(lrate);
4447 
4448  // apply any learning-rate-factor that's set at this level (ill-advised, but
4449  // we'll do it.)
4450  BaseFloat effective_lrate = LearningRate();
4451  for (size_t i = 0; i < components_.size(); i++) {
4452  if (components_[i]->Properties() & kUpdatableComponent) {
4453  UpdatableComponent *uc =
4454  dynamic_cast<UpdatableComponent*>(components_[i]);
4455  uc->SetUnderlyingLearningRate(effective_lrate);
4456  }
4457  }
4458 }
4459 
4460 void CompositeComponent::SetActualLearningRate(BaseFloat lrate) {
4461  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4462  UpdatableComponent::SetActualLearningRate(lrate);
4463  for (size_t i = 0; i < components_.size(); i++) {
4464  if (components_[i]->Properties() & kUpdatableComponent) {
4465  UpdatableComponent *uc =
4466  dynamic_cast<UpdatableComponent*>(components_[i]);
4467  uc->SetActualLearningRate(lrate);
4468  }
4469  }
4470 }
4471 
4472 // virtual
4473 void CompositeComponent::SetAsGradient() {
4474  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4475  UpdatableComponent::SetAsGradient();
4476  for (size_t i = 0; i < components_.size(); i++) {
4477  if (components_[i]->Properties() & kUpdatableComponent) {
4478  UpdatableComponent *uc =
4479  dynamic_cast<UpdatableComponent*>(components_[i]);
4480  uc->SetAsGradient();
4481  }
4482  }
4483 }
4484 
4485 // virtual
4486 int32 CompositeComponent::NumParameters() const {
4487  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4488  int32 ans = 0;
4489  for (size_t i = 0; i < components_.size(); i++) {
4490  if (components_[i]->Properties() & kUpdatableComponent) {
4491  UpdatableComponent *uc =
4492  dynamic_cast<UpdatableComponent*>(components_[i]);
4493  ans += uc->NumParameters();
4494  }
4495  }
4496  return ans;
4497 }
4498 
4499 // virtual
4500 void CompositeComponent::Vectorize(VectorBase<BaseFloat> *params) const {
4501  int32 cur_offset = 0;
4502  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4503  for (size_t i = 0; i < components_.size(); i++) {
4504  if (components_[i]->Properties() & kUpdatableComponent) {
4505  UpdatableComponent *uc =
4506  dynamic_cast<UpdatableComponent*>(components_[i]);
4507  int32 this_size = uc->NumParameters();
4508  SubVector<BaseFloat> params_range(*params, cur_offset, this_size);
4509  uc->Vectorize(&params_range);
4510  cur_offset += this_size;
4511  }
4512  }
4513  KALDI_ASSERT(cur_offset == params->Dim());
4514 }
4515 
4516 // virtual
4517 void CompositeComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
4518  int32 cur_offset = 0;
4519  KALDI_ASSERT(this->IsUpdatable()); // or should not be called.
4520  for (size_t i = 0; i < components_.size(); i++) {
4521  if (components_[i]->Properties() & kUpdatableComponent) {
4522  UpdatableComponent *uc =
4523  dynamic_cast<UpdatableComponent*>(components_[i]);
4524  int32 this_size = uc->NumParameters();
4525  SubVector<BaseFloat> params_range(params, cur_offset, this_size);
4526  uc->UnVectorize(params_range);
4527  cur_offset += this_size;
4528  }
4529  }
4530  KALDI_ASSERT(cur_offset == params.Dim());
4531 }
4532 
4533 // virtual
4534 BaseFloat CompositeComponent::DotProduct(
4535  const UpdatableComponent &other_in) const {
4536  const CompositeComponent *other = dynamic_cast<const CompositeComponent*>(
4537  &other_in);
4538  KALDI_ASSERT(other != NULL && other->components_.size() ==
4539  components_.size() && "Mismatching nnet topologies");
4540  BaseFloat ans = 0.0;
4541  for (size_t i = 0; i < components_.size(); i++) {
4542  if (components_[i]->Properties() & kUpdatableComponent) {
4543  UpdatableComponent *uc =
4544  dynamic_cast<UpdatableComponent*>(components_[i]);
4545  const UpdatableComponent *uc_other =
4546  dynamic_cast<UpdatableComponent*>(other->components_[i]);
4547  KALDI_ASSERT(uc != NULL && uc_other != NULL);
4548  ans += uc->DotProduct(*uc_other);
4549  }
4550  }
4551  return ans;
4552 }
4553 
4554 // virtual
4555 void CompositeComponent::FreezeNaturalGradient(bool freeze) {
4556  for (size_t i = 0; i < components_.size(); i++) {
4557  if (components_[i]->Properties() & kUpdatableComponent) {
4558  UpdatableComponent *uc =
4559  dynamic_cast<UpdatableComponent*>(components_[i]);
4560  KALDI_ASSERT(uc != NULL);
4561  uc->FreezeNaturalGradient(freeze);
4562  }
4563  }
4564 }
4565 
4566 // virtual
4567 Component* CompositeComponent::Copy() const {
4568  std::vector<Component*> components(components_.size());
4569  for (size_t i = 0; i < components_.size(); i++)
4570  components[i] = components_[i]->Copy();
4571  CompositeComponent *ans = new CompositeComponent();
4572  ans->Init(components, max_rows_process_);
4573  return ans;
4574 }
4575 
4576 
4577 // virtual
4578 void CompositeComponent::InitFromConfig(ConfigLine *cfl) {
4579  int32 max_rows_process = 4096, num_components = -1;
4580  cfl->GetValue("max-rows-process", &max_rows_process);
4581  if (!cfl->GetValue("num-components", &num_components) ||
4582  num_components < 1)
4583  KALDI_ERR << "Expected num-components to be defined in "
4584  << "CompositeComponent config line '" << cfl->WholeLine() << "'";
4585  std::vector<Component*> components;
4586  for (int32 i = 1; i <= num_components; i++) {
4587  std::ostringstream name_stream;
4588  name_stream << "component" << i;
4589  std::string component_config;
4590  if (!cfl->GetValue(name_stream.str(), &component_config)) {
4591  DeletePointers(&components);
4592  KALDI_ERR << "Expected '" << name_stream.str() << "' to be defined in "
4593  << "CompositeComponent config line '" << cfl->WholeLine() << "'";
4594  }
4595  ConfigLine nested_line;
4596  // note: the nested line may not contain comments.
4597  std::string component_type;
4598  Component *this_component = NULL;
4599  if (!nested_line.ParseLine(component_config) ||
4600  !nested_line.GetValue("type", &component_type) ||
4601  !(this_component = NewComponentOfType(component_type)) ||
4602  nested_line.FirstToken() != "") {
4603  DeletePointers(&components);
4604  KALDI_ERR << "Could not parse config line for '" << name_stream.str()
4605  << "' (or undefined or bad component type [type=xxx]), in "
4606  << "CompositeComponent config line '" << cfl->WholeLine() << "'";
4607  }
4608  if (this_component->Type() == "CompositeComponent") {
4609  DeletePointers(&components);
4610  delete this_component;
4611  // This is not allowed. If memory is too much with just one
4612  // CompositeComponent, try decreasing max-rows-process instead.
4613  KALDI_ERR << "Found CompositeComponent nested within CompositeComponent. "
4614  << "Nested line: '" << nested_line.WholeLine() << "'\n"
4615  << "Toplevel CompositeComponent line '" << cfl->WholeLine()
4616  << "'";
4617  }
4618  this_component->InitFromConfig(&nested_line);
4619  int32 props = this_component->Properties();
4620  if ((props & kRandomComponent) != 0 ||
4621  (props & kSimpleComponent) == 0) {
4622  KALDI_ERR << "CompositeComponent contains disallowed component type: "
4623  << nested_line.WholeLine();
4624  }
4625  components.push_back(this_component);
4626  }
4627  if (cfl->HasUnusedValues())
4628  KALDI_ERR << "Could not process these elements in initializer: "
4629  << cfl->UnusedValues();
4630  this->Init(components, max_rows_process);
4631 }
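// Illustrative config line for the parser above (editorial; the component
// configs are examples only):
//   num-components=2 max-rows-process=2048 \
//   component1='type=FixedScaleComponent dim=100 scale=0.5' \
//   component2='type=FixedBiasComponent dim=100'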
4632 
4633 const Component* CompositeComponent::GetComponent(int32 i) const {
4634  KALDI_ASSERT(static_cast<size_t>(i) < components_.size());
4635  return components_[i];
4636 }
4637 
4638 void CompositeComponent::SetComponent(int32 i, Component *component) {
4639  KALDI_ASSERT(static_cast<size_t>(i) < components_.size());
4640  delete components_[i];
4641  components_[i] = component;
4642 }
4643 
4644 
4645 SumBlockComponent::SumBlockComponent(const SumBlockComponent &other):
4646  input_dim_(other.input_dim_), output_dim_(other.output_dim_),
4647  scale_(other.scale_) { }
4648 
4649 void SumBlockComponent::InitFromConfig(ConfigLine *cfl) {
4650  scale_ = 1.0;
4651  bool ok = cfl->GetValue("input-dim", &input_dim_) &&
4652  cfl->GetValue("output-dim", &output_dim_);
4653  if (!ok)
4654  KALDI_ERR << "input-dim and output-dim must both be provided.";
4655  if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0)
4656  KALDI_ERR << "Invalid values input-dim=" << input_dim_
4657  << " output-dim=" << output_dim_;
4658  cfl->GetValue("scale", &scale_);
4659  if (cfl->HasUnusedValues())
4660  KALDI_ERR << "Could not process these elements in initializer: "
4661  << cfl->UnusedValues();
4662 }
4663 
4664 void SumBlockComponent::Read(std::istream &is, bool binary) {
4665  ExpectOneOrTwoTokens(is, binary, "<SumBlockComponent>", "<InputDim>");
4666  ReadBasicType(is, binary, &input_dim_);
4667  ExpectToken(is, binary, "<OutputDim>");
4668  ReadBasicType(is, binary, &output_dim_);
4669  ExpectToken(is, binary, "<Scale>");
4670  ReadBasicType(is, binary, &scale_);
4671  ExpectToken(is, binary, "</SumBlockComponent>");
4672 }
4673 
4674 void SumBlockComponent::Write(std::ostream &os, bool binary) const {
4675  WriteToken(os, binary, "<SumBlockComponent>");
4676  WriteToken(os, binary, "<InputDim>");
4677  WriteBasicType(os, binary, input_dim_);
4678  WriteToken(os, binary, "<OutputDim>");
4679  WriteBasicType(os, binary, output_dim_);
4680  WriteToken(os, binary, "<Scale>");
4681  WriteBasicType(os, binary, scale_);
4682  WriteToken(os, binary, "</SumBlockComponent>");
4683 }
4684 
4685 std::string SumBlockComponent::Info() const {
4686  std::ostringstream stream;
4687  stream << Type() << ", input-dim=" << input_dim_
4688  << ", output-dim=" << output_dim_
4689  << ", scale=" << scale_;
4690  return stream.str();
4691 }
4692 
4693 void* SumBlockComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
4694  const CuMatrixBase<BaseFloat> &in,
4695  CuMatrixBase<BaseFloat> *out) const {
4696  KALDI_ASSERT(out->NumRows() == in.NumRows() &&
4697  out->NumCols() == output_dim_ &&
4698  in.NumCols() == input_dim_);
4699  out->AddMatBlocks(scale_, in, kNoTrans);
4700  return NULL;
4701 }
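// Editorial example: with input-dim=6 and output-dim=2, the input splits into
// 3 blocks of 2 columns, and out(r,j) = scale_ * (in(r,j) + in(r,j+2) + in(r,j+4)).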
4702 
4703 void SumBlockComponent::Backprop(
4704  const std::string &debug_info,
4705  const ComponentPrecomputedIndexes *indexes,
4706  const CuMatrixBase<BaseFloat> &, //in_value
4707  const CuMatrixBase<BaseFloat> &, // out_value,
4708  const CuMatrixBase<BaseFloat> &out_deriv,
4709  void *memo,
4710  Component *to_update,
4711  CuMatrixBase<BaseFloat> *in_deriv) const {
4712  NVTX_RANGE("SumBlockComponent::Backprop");
4713  if (in_deriv) {
4714  in_deriv->AddMatBlocks(scale_, out_deriv, kNoTrans);
4715  }
4716 }
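Since every output element is a scaled sum of one element from each block, the backward pass is a broadcast: in the sketch above, in_deriv[j] = scale * out_deriv[j % output_dim], i.e. each block of in_deriv receives the same scaled copy of out_deriv, which is what AddMatBlocks computes when the destination is the larger matrix.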
4717 
4718 
4719 
4720 } // namespace nnet3
4721 } // namespace kaldi