nnet-combined-component.cc
1 // nnet3/nnet-combined-component.cc
2 
3 // Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey)
4 // 2015 Daniel Galvez
5 // 2018 Hang Lyu
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABILITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #include <iterator>
23 #include <sstream>
24 #include <algorithm>
25 #include <iomanip>
26 #include "nnet3/nnet-combined-component.h"
27 #include "nnet3/nnet-parse.h"
28 #include "cudamatrix/cu-math.h"
29 
30 namespace kaldi {
31 namespace nnet3 {
32 
33 // Constructors for the convolution component
34 ConvolutionComponent::ConvolutionComponent():
35  UpdatableComponent(),
36  input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
37  filt_x_dim_(0), filt_y_dim_(0),
38  filt_x_step_(0), filt_y_step_(0),
39  input_vectorization_(kZyx) { }
40 
41 ConvolutionComponent::ConvolutionComponent(
42  const ConvolutionComponent &component):
43  UpdatableComponent(component),
44  input_x_dim_(component.input_x_dim_),
45  input_y_dim_(component.input_y_dim_),
46  input_z_dim_(component.input_z_dim_),
47  filt_x_dim_(component.filt_x_dim_),
48  filt_y_dim_(component.filt_y_dim_),
49  filt_x_step_(component.filt_x_step_),
50  filt_y_step_(component.filt_y_step_),
51  input_vectorization_(component.input_vectorization_),
52  filter_params_(component.filter_params_),
53  bias_params_(component.bias_params_) { }
54 
55 ConvolutionComponent::ConvolutionComponent(
56  const CuMatrixBase<BaseFloat> &filter_params,
57  const CuVectorBase<BaseFloat> &bias_params,
58  int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
59  int32 filt_x_dim, int32 filt_y_dim,
60  int32 filt_x_step, int32 filt_y_step,
61  TensorVectorizationType input_vectorization,
62  BaseFloat learning_rate):
63  input_x_dim_(input_x_dim),
64  input_y_dim_(input_y_dim),
65  input_z_dim_(input_z_dim),
66  filt_x_dim_(filt_x_dim),
67  filt_y_dim_(filt_y_dim),
68  filt_x_step_(filt_x_step),
69  filt_y_step_(filt_y_step),
70  input_vectorization_(input_vectorization),
71  filter_params_(filter_params),
72  bias_params_(bias_params){
73  KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() &&
74  bias_params.Dim() != 0);
75  KALDI_ASSERT(filter_params.NumCols() == filt_x_dim * filt_y_dim * input_z_dim);
76  SetUnderlyingLearningRate(learning_rate);
77  is_gradient_ = false;
78 }
79 
80 // acquire input dim
81 int32 ConvolutionComponent::InputDim() const {
82  return input_x_dim_ * input_y_dim_ * input_z_dim_;
83 }
84 
85 // acquire output dim
86 int32 ConvolutionComponent::OutputDim() const {
87  int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_);
88  int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_);
89  int32 num_filters = filter_params_.NumRows();
90  return num_x_steps * num_y_steps * num_filters;
91 }
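// For example (illustrative numbers, not from the original source): with a
// 10x10x3 input, 4x4 filters, steps of 2 in x and y, and 16 filters, we get
// num_x_steps = 1 + (10 - 4) / 2 = 4 and num_y_steps = 4, so
// OutputDim() = 4 * 4 * 16 = 256.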
92 
93 // initialize the component using hyperparameters
94 void ConvolutionComponent::Init(
95  int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
96  int32 filt_x_dim, int32 filt_y_dim,
97  int32 filt_x_step, int32 filt_y_step, int32 num_filters,
98  TensorVectorizationType input_vectorization,
99  BaseFloat param_stddev, BaseFloat bias_stddev) {
100  input_x_dim_ = input_x_dim;
101  input_y_dim_ = input_y_dim;
102  input_z_dim_ = input_z_dim;
103  filt_x_dim_ = filt_x_dim;
104  filt_y_dim_ = filt_y_dim;
105  filt_x_step_ = filt_x_step;
106  filt_y_step_ = filt_y_step;
107  input_vectorization_ = input_vectorization;
108  KALDI_ASSERT((input_x_dim_ - filt_x_dim_) % filt_x_step_ == 0);
109  KALDI_ASSERT((input_y_dim_ - filt_y_dim_) % filt_y_step_ == 0);
110  int32 filter_dim = filt_x_dim_ * filt_y_dim_ * input_z_dim_;
111  filter_params_.Resize(num_filters, filter_dim);
112  bias_params_.Resize(num_filters);
113  KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0);
114  filter_params_.SetRandn();
115  filter_params_.Scale(param_stddev);
116  bias_params_.SetRandn();
117  bias_params_.Scale(bias_stddev);
118 }
119 
120 // initialize the component using predefined matrix file
121 void ConvolutionComponent::Init(
122  int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
123  int32 filt_x_dim, int32 filt_y_dim,
124  int32 filt_x_step, int32 filt_y_step,
125  TensorVectorizationType input_vectorization,
126  std::string matrix_filename) {
127  input_x_dim_ = input_x_dim;
128  input_y_dim_ = input_y_dim;
129  input_z_dim_ = input_z_dim;
130  filt_x_dim_ = filt_x_dim;
131  filt_y_dim_ = filt_y_dim;
132  filt_x_step_ = filt_x_step;
133  filt_y_step_ = filt_y_step;
134  input_vectorization_ = input_vectorization;
135  Matrix<BaseFloat> mat;
136  ReadKaldiObject(matrix_filename, &mat);
137  int32 filter_dim = (filt_x_dim_ * filt_y_dim_ * input_z_dim_);
138  int32 num_filters = mat.NumRows();
139  KALDI_ASSERT(mat.NumCols() == (filter_dim + 1));
140  filter_params_.Resize(num_filters, filter_dim);
141  bias_params_.Resize(num_filters);
142  filter_params_.CopyFromMat(mat.Range(0, num_filters, 0, filter_dim));
143  bias_params_.CopyColFromMat(mat, filter_dim);
144 }
145 
146 // display information about component
147 std::string ConvolutionComponent::Info() const {
148  std::ostringstream stream;
149  stream << UpdatableComponent::Info()
150  << ", input-x-dim=" << input_x_dim_
151  << ", input-y-dim=" << input_y_dim_
152  << ", input-z-dim=" << input_z_dim_
153  << ", filt-x-dim=" << filt_x_dim_
154  << ", filt-y-dim=" << filt_y_dim_
155  << ", filt-x-step=" << filt_x_step_
156  << ", filt-y-step=" << filt_y_step_
157  << ", input-vectorization=" << input_vectorization_
158  << ", num-filters=" << filter_params_.NumRows();
159  PrintParameterStats(stream, "filter-params", filter_params_);
160  PrintParameterStats(stream, "bias-params", bias_params_, true);
161  return stream.str();
162 }
163 
164 // initialize the component using configuration file
165 void ConvolutionComponent::InitFromConfig(ConfigLine *cfl) {
166  bool ok = true;
167  std::string matrix_filename;
168  int32 input_x_dim = -1, input_y_dim = -1, input_z_dim = -1,
169  filt_x_dim = -1, filt_y_dim = -1,
170  filt_x_step = -1, filt_y_step = -1,
171  num_filters = -1;
172  std::string input_vectorization_order = "zyx";
173  InitLearningRatesFromConfig(cfl);
174  ok = ok && cfl->GetValue("input-x-dim", &input_x_dim);
175  ok = ok && cfl->GetValue("input-y-dim", &input_y_dim);
176  ok = ok && cfl->GetValue("input-z-dim", &input_z_dim);
177  ok = ok && cfl->GetValue("filt-x-dim", &filt_x_dim);
178  ok = ok && cfl->GetValue("filt-y-dim", &filt_y_dim);
179  ok = ok && cfl->GetValue("filt-x-step", &filt_x_step);
180  ok = ok && cfl->GetValue("filt-y-step", &filt_y_step);
181 
182  if (!ok)
183  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
184  // optional argument
185  TensorVectorizationType input_vectorization;
186  cfl->GetValue("input-vectorization-order", &input_vectorization_order);
187  if (input_vectorization_order.compare("zyx") == 0) {
188  input_vectorization = kZyx;
189  } else if (input_vectorization_order.compare("yzx") == 0) {
190  input_vectorization = kYzx;
191  } else {
192  KALDI_ERR << "Unknown or unsupported input vectorization order "
193  << input_vectorization_order
194  << " accepted candidates are 'yzx' and 'zyx'";
195  }
196 
197  if (cfl->GetValue("matrix", &matrix_filename)) {
198  // initialize from predefined parameter matrix
199  Init(input_x_dim, input_y_dim, input_z_dim,
200  filt_x_dim, filt_y_dim,
201  filt_x_step, filt_y_step,
202  input_vectorization,
203  matrix_filename);
204  } else {
205  ok = ok && cfl->GetValue("num-filters", &num_filters);
206  if (!ok)
207  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
208  // initialize from configuration
209  int32 filter_input_dim = filt_x_dim * filt_y_dim * input_z_dim;
210  BaseFloat param_stddev = 1.0 / std::sqrt(filter_input_dim), bias_stddev = 1.0;
211  cfl->GetValue("param-stddev", &param_stddev);
212  cfl->GetValue("bias-stddev", &bias_stddev);
213  Init(input_x_dim, input_y_dim, input_z_dim,
214  filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, num_filters,
215  input_vectorization, param_stddev, bias_stddev);
216  }
217  if (cfl->HasUnusedValues())
218  KALDI_ERR << "Could not process these elements in initializer: "
219  << cfl->UnusedValues();
220  if (!ok)
221  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
222 }
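// An illustrative config line this initializer would accept (example values,
// not from the original source; written on one line in an actual config file):
//   component name=conv1 type=ConvolutionComponent input-x-dim=10
//     input-y-dim=10 input-z-dim=3 filt-x-dim=4 filt-y-dim=4
//     filt-x-step=2 filt-y-step=2 num-filters=16
//     input-vectorization-order=zyx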
223 
224 // Inline methods to convert from tensor index i.e., (x,y,z) index
225 // to index in yzx or zyx vectorized tensors
226 static int32 YzxVectorIndex(int32 x, int32 y, int32 z,
227  int32 input_x_dim,
228  int32 input_y_dim,
229  int32 input_z_dim) {
230  KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim);
231  return (input_y_dim * input_z_dim) * x + (input_y_dim) * z + y;
232 }
233 
234 static int32 ZyxVectorIndex(int32 x, int32 y, int32 z,
235  int32 input_x_dim,
236  int32 input_y_dim,
237  int32 input_z_dim) {
238  KALDI_PARANOID_ASSERT(x < input_x_dim && y < input_y_dim && z < input_z_dim);
239  return (input_y_dim * input_z_dim) * x + (input_z_dim) * y + z;
240 }
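// Worked example (illustrative numbers): with input_y_dim = 3 and
// input_z_dim = 4, the formulas above give
//   YzxVectorIndex(x, y, z) = 12*x + 3*z + y   (y varies fastest),
//   ZyxVectorIndex(x, y, z) = 12*x + 4*y + z   (z varies fastest).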
241 
242 // Method to convert from a matrix representing a minibatch of vectorized
243 // 3D tensors to patches for convolution, each patch corresponds to
244 // one dot product in the convolution
245 void ConvolutionComponent::InputToInputPatches(
246  const CuMatrixBase<BaseFloat>& in,
247  CuMatrix<BaseFloat> *patches) const{
248  int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_);
249  int32 num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_);
250  const int32 filt_x_step = filt_x_step_,
251  filt_y_step = filt_y_step_,
252  filt_x_dim = filt_x_dim_,
253  filt_y_dim = filt_y_dim_,
254  input_x_dim = input_x_dim_,
255  input_y_dim = input_y_dim_,
256  input_z_dim = input_z_dim_,
257  filter_dim = filter_params_.NumCols();
258 
259  std::vector<int32> column_map(patches->NumCols());
260  int32 column_map_size = column_map.size();
261  for (int32 x_step = 0; x_step < num_x_steps; x_step++) {
262  for (int32 y_step = 0; y_step < num_y_steps; y_step++) {
263  int32 patch_number = x_step * num_y_steps + y_step;
264  int32 patch_start_index = patch_number * filter_dim;
265  for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) {
266  for (int32 y = 0; y < filt_y_dim; y++) {
267  for (int32 z = 0; z < input_z_dim; z++, index++) {
268  KALDI_ASSERT(index < column_map_size);
269  if (input_vectorization_ == kZyx) {
270  column_map[index] = ZyxVectorIndex(x_step * filt_x_step + x,
271  y_step * filt_y_step + y, z,
272  input_x_dim, input_y_dim,
273  input_z_dim);
274  } else if (input_vectorization_ == kYzx) {
275  column_map[index] = YzxVectorIndex(x_step * filt_x_step + x,
276  y_step * filt_y_step + y, z,
277  input_x_dim, input_y_dim,
278  input_z_dim);
279  }
280  }
281  }
282  }
283  }
284  }
285  CuArray<int32> cu_cols(column_map);
286  patches->CopyCols(in, cu_cols);
287 }
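// In effect this is an im2col-style gather: after CopyCols, column j of
// *patches equals column column_map[j] of 'in', so each consecutive block of
// filter_dim columns in *patches holds one vectorized input patch.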
288 
289 
290 // propagation function
291 // see function declaration in nnet-combined-component.h for details
292 void* ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
293  const CuMatrixBase<BaseFloat> &in,
294  CuMatrixBase<BaseFloat> *out) const {
295  const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_),
296  num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_),
297  num_filters = filter_params_.NumRows(),
298  num_frames = in.NumRows(),
299  filter_dim = filter_params_.NumCols();
300  KALDI_ASSERT((*out).NumRows() == num_frames &&
301  (*out).NumCols() == (num_filters * num_x_steps * num_y_steps));
302 
303  CuMatrix<BaseFloat> patches(num_frames,
304  num_x_steps * num_y_steps * filter_dim,
305  kUndefined);
306  InputToInputPatches(in, &patches);
307  CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
308  filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols());
309  std::vector<CuSubMatrix<BaseFloat>* > tgt_batch, patch_batch,
310  filter_params_batch;
311 
312  for (int32 x_step = 0; x_step < num_x_steps; x_step++) {
313  for (int32 y_step = 0; y_step < num_y_steps; y_step++) {
314  int32 patch_number = x_step * num_y_steps + y_step;
315  tgt_batch.push_back(new CuSubMatrix<BaseFloat>(
316  out->ColRange(patch_number * num_filters, num_filters)));
317  patch_batch.push_back(new CuSubMatrix<BaseFloat>(
318  patches.ColRange(patch_number * filter_dim, filter_dim)));
319  filter_params_batch.push_back(filter_params_elem);
320  tgt_batch[patch_number]->AddVecToRows(1.0, bias_params_, 1.0); // add bias
321  }
322  }
323  // apply all filters
324  AddMatMatBatched<BaseFloat>(1.0, tgt_batch, patch_batch,
325  kNoTrans, filter_params_batch,
326  kTrans, 1.0);
327  // release memory
328  delete filter_params_elem;
329  for (int32 p = 0; p < tgt_batch.size(); p++) {
330  delete tgt_batch[p];
331  delete patch_batch[p];
332  }
333  return NULL;
334 }
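// Viewed as plain matrix algebra, each patch position above computes
//   out_block = patches_block * filter_params_^T + bias,
// i.e. the standard im2col formulation of convolution; AddMatMatBatched
// simply performs that GEMM for all patch positions in one batched call.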
335 
336 // scale the parameters
337 void ConvolutionComponent::Scale(BaseFloat scale) {
338  if (scale == 0.0) {
339  filter_params_.SetZero();
340  bias_params_.SetZero();
341  } else {
342  filter_params_.Scale(scale);
343  bias_params_.Scale(scale);
344  }
345 }
346 
347 // add another convolution component
348 void ConvolutionComponent::Add(BaseFloat alpha, const Component &other_in) {
349  const ConvolutionComponent *other =
350  dynamic_cast<const ConvolutionComponent*>(&other_in);
351  KALDI_ASSERT(other != NULL);
352  filter_params_.AddMat(alpha, other->filter_params_);
353  bias_params_.AddVec(alpha, other->bias_params_);
354 }
355 
356 /*
357  This function transforms a vector of lists into a list of vectors,
358  padded with -1.
359  @param[in] The input vector of lists. Let in.size() be D, and let
360  the longest list length (i.e. the max of in[i].size()) be L.
361  @param[out] The output list of vectors. The length of the list will
362  be L, each vector-dimension will be D (i.e. out[i].size() == D),
363  and if in[i][j] == n, then we will have that
364  out[j][i] == n. The output vectors are padded with -1
365  where necessary if not all the input lists have the same size.
366 */
367 void RearrangeIndexes(const std::vector<std::vector<int32> > &in,
368  std::vector<std::vector<int32> > *out) {
369  int32 D = in.size();
370  int32 L = 0;
371  for (int32 i = 0; i < D; i++)
372  if (in[i].size() > L)
373  L = in[i].size();
374  out->resize(L);
375  for (int32 i = 0; i < L; i++)
376  (*out)[i].resize(D, -1);
377  for (int32 i = 0; i < D; i++) {
378  for (int32 j = 0; j < in[i].size(); j++) {
379  (*out)[j][i] = in[i][j];
380  }
381  }
382 }
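// For example: in = {{0, 1}, {2}} (so D = 2, L = 2) produces
// out = {{0, 2}, {1, -1}}, since out[j][i] = in[i][j] wherever defined.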
383 
384 // Method to compute the input derivative matrix from the input derivatives
385 // for patches, where each patch corresponds to one dot product
386 // in the convolution
387 void ConvolutionComponent::InderivPatchesToInderiv(
388  const CuMatrix<BaseFloat>& in_deriv_patches,
389  CuMatrixBase<BaseFloat> *in_deriv) const {
390 
391  const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_),
392  num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_),
393  filt_x_step = filt_x_step_,
394  filt_y_step = filt_y_step_,
395  filt_x_dim = filt_x_dim_,
396  filt_y_dim = filt_y_dim_,
397  input_x_dim = input_x_dim_,
398  input_y_dim = input_y_dim_,
399  input_z_dim = input_z_dim_,
400  filter_dim = filter_params_.NumCols();
401 
402  // Compute the reverse column_map from the matrix with input
403  // derivative patches to input derivative matrix
404  std::vector<std::vector<int32> > reverse_column_map(in_deriv->NumCols());
405  int32 rev_col_map_size = reverse_column_map.size();
406  for (int32 x_step = 0; x_step < num_x_steps; x_step++) {
407  for (int32 y_step = 0; y_step < num_y_steps; y_step++) {
408  int32 patch_number = x_step * num_y_steps + y_step;
409  int32 patch_start_index = patch_number * filter_dim;
410  for (int32 x = 0, index = patch_start_index; x < filt_x_dim; x++) {
411  for (int32 y = 0; y < filt_y_dim; y++) {
412  for (int32 z = 0; z < input_z_dim; z++, index++) {
413  int32 vector_index;
414  if (input_vectorization_ == kZyx) {
415  vector_index = ZyxVectorIndex(x_step * filt_x_step + x,
416  y_step * filt_y_step + y, z,
417  input_x_dim, input_y_dim,
418  input_z_dim);
419  } else {
421  vector_index = YzxVectorIndex(x_step * filt_x_step + x,
422  y_step * filt_y_step + y, z,
423  input_x_dim, input_y_dim,
424  input_z_dim);
425  }
426  KALDI_ASSERT(vector_index < rev_col_map_size);
427  reverse_column_map[vector_index].push_back(index);
428  }
429  }
430  }
431  }
432  }
433  std::vector<std::vector<int32> > rearranged_column_map;
434  RearrangeIndexes(reverse_column_map, &rearranged_column_map);
435  for (int32 p = 0; p < rearranged_column_map.size(); p++) {
436  CuArray<int32> cu_cols(rearranged_column_map[p]);
437  in_deriv->AddCols(in_deriv_patches, cu_cols);
438  }
439 }
440 
441 // back propagation function
442 // see function declaration in nnet-combined-component.h for details
443 void ConvolutionComponent::Backprop(const std::string &debug_info,
444  const ComponentPrecomputedIndexes *indexes,
445  const CuMatrixBase<BaseFloat> &in_value,
446  const CuMatrixBase<BaseFloat> &, // out_value,
447  const CuMatrixBase<BaseFloat> &out_deriv,
448  void *memo,
449  Component *to_update_in,
450  CuMatrixBase<BaseFloat> *in_deriv) const {
451  NVTX_RANGE("ConvolutionComponent::Backprop");
452  ConvolutionComponent *to_update =
453  dynamic_cast<ConvolutionComponent*>(to_update_in);
454  const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_),
455  num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_),
456  num_filters = filter_params_.NumRows(),
457  num_frames = out_deriv.NumRows(),
458  filter_dim = filter_params_.NumCols();
459 
460  KALDI_ASSERT(out_deriv.NumRows() == num_frames &&
461  out_deriv.NumCols() ==
462  (num_filters * num_x_steps * num_y_steps));
463 
464  // Compute inderiv patches
465  CuMatrix<BaseFloat> in_deriv_patches(num_frames,
466  num_x_steps * num_y_steps * filter_dim,
467  kSetZero);
468 
469  std::vector<CuSubMatrix<BaseFloat>* > patch_deriv_batch, out_deriv_batch,
470  filter_params_batch;
471  CuSubMatrix<BaseFloat>* filter_params_elem = new CuSubMatrix<BaseFloat>(
472  filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols());
473 
474  for (int32 x_step = 0; x_step < num_x_steps; x_step++) {
475  for (int32 y_step = 0; y_step < num_y_steps; y_step++) {
476  int32 patch_number = x_step * num_y_steps + y_step;
477 
478  patch_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(
479  in_deriv_patches.ColRange(
480  patch_number * filter_dim, filter_dim)));
481  out_deriv_batch.push_back(new CuSubMatrix<BaseFloat>(out_deriv.ColRange(
482  patch_number * num_filters, num_filters)));
483  filter_params_batch.push_back(filter_params_elem);
484  }
485  }
486  AddMatMatBatched<BaseFloat>(1.0, patch_deriv_batch,
487  out_deriv_batch, kNoTrans,
488  filter_params_batch, kNoTrans, 0.0);
489 
490  if (in_deriv) {
491  // combine the derivatives from the individual input deriv patches
492  // to compute input deriv matrix
493  InderivPatchesToInderiv(in_deriv_patches, in_deriv);
494  }
495 
496  if (to_update != NULL) {
497  to_update->Update(debug_info, in_value, out_deriv, out_deriv_batch);
498  }
499 
500  // release memory
501  delete filter_params_elem;
502  for (int32 p = 0; p < patch_deriv_batch.size(); p++) {
503  delete patch_deriv_batch[p];
504  delete out_deriv_batch[p];
505  }
506 }
507 
508 
509 // update parameters
510 // see function declaration in nnet-combined-component.h for details
511 void ConvolutionComponent::Update(const std::string &debug_info,
512  const CuMatrixBase<BaseFloat> &in_value,
513  const CuMatrixBase<BaseFloat> &out_deriv,
514  const std::vector<CuSubMatrix<BaseFloat> *>& out_deriv_batch) {
515  // useful dims
516  const int32 num_x_steps = (1 + (input_x_dim_ - filt_x_dim_) / filt_x_step_),
517  num_y_steps = (1 + (input_y_dim_ - filt_y_dim_) / filt_y_step_),
518  num_filters = filter_params_.NumRows(),
519  num_frames = out_deriv.NumRows(),
520  filter_dim = filter_params_.NumCols();
521  KALDI_ASSERT(out_deriv.NumRows() == num_frames &&
522  out_deriv.NumCols() ==
523  (num_filters * num_x_steps * num_y_steps));
524 
525 
526  CuMatrix<BaseFloat> filters_grad;
527  CuVector<BaseFloat> bias_grad;
528 
529  CuMatrix<BaseFloat> input_patches(num_frames,
530  filter_dim * num_x_steps * num_y_steps,
531  kUndefined);
532  InputToInputPatches(in_value, &input_patches);
533 
534  filters_grad.Resize(num_filters, filter_dim, kSetZero); // reset
535  bias_grad.Resize(num_filters, kSetZero); // reset
536 
537  // create a single large matrix holding the smaller matrices
538  // from the vector container filters_grad_batch along the rows
539  CuMatrix<BaseFloat> filters_grad_blocks_batch(
540  num_x_steps * num_y_steps * filters_grad.NumRows(),
541  filters_grad.NumCols());
542 
543  std::vector<CuSubMatrix<BaseFloat>* > filters_grad_batch, input_patch_batch;
544 
545  for (int32 x_step = 0; x_step < num_x_steps; x_step++) {
546  for (int32 y_step = 0; y_step < num_y_steps; y_step++) {
547  int32 patch_number = x_step * num_y_steps + y_step;
548  filters_grad_batch.push_back(new CuSubMatrix<BaseFloat>(
549  filters_grad_blocks_batch.RowRange(
550  patch_number * filters_grad.NumRows(), filters_grad.NumRows())));
551 
552  input_patch_batch.push_back(new CuSubMatrix<BaseFloat>(
553  input_patches.ColRange(patch_number * filter_dim, filter_dim)));
554  }
555  }
556 
557  AddMatMatBatched<BaseFloat>(1.0, filters_grad_batch, out_deriv_batch, kTrans,
558  input_patch_batch, kNoTrans, 1.0);
559 
560  // add the row blocks together to filters_grad
561  filters_grad.AddMatBlocks(1.0, filters_grad_blocks_batch);
562 
563  // create a matrix holding the col blocks sum of out_deriv
564  CuMatrix<BaseFloat> out_deriv_col_blocks_sum(out_deriv.NumRows(),
565  num_filters);
566 
567  // add the col blocks together to out_deriv_col_blocks_sum
568  out_deriv_col_blocks_sum.AddMatBlocks(1.0, out_deriv);
569 
570  bias_grad.AddRowSumMat(1.0, out_deriv_col_blocks_sum, 1.0);
571 
572  // release memory
573  for (int32 p = 0; p < input_patch_batch.size(); p++) {
574  delete filters_grad_batch[p];
575  delete input_patch_batch[p];
576  }
577 
578  //
579  // update
580  //
581  filter_params_.AddMat(learning_rate_, filters_grad);
582  bias_params_.AddVec(learning_rate_, bias_grad);
583 }
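// In matrix terms (a summary, not from the original source): the loop and
// batched call above compute, for each patch position p,
//   filters_grad += out_deriv_p^T * input_patches_p,
// while bias_grad sums out_deriv over frames and patch positions; the final
// AddMat/AddVec lines apply a plain gradient step scaled by learning_rate_.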
584 
585 void ConvolutionComponent::Read(std::istream &is, bool binary) {
586  ReadUpdatableCommon(is, binary); // Read opening tag and learning rate.
587  ExpectToken(is, binary, "<InputXDim>");
588  ReadBasicType(is, binary, &input_x_dim_);
589  ExpectToken(is, binary, "<InputYDim>");
590  ReadBasicType(is, binary, &input_y_dim_);
591  ExpectToken(is, binary, "<InputZDim>");
592  ReadBasicType(is, binary, &input_z_dim_);
593  ExpectToken(is, binary, "<FiltXDim>");
594  ReadBasicType(is, binary, &filt_x_dim_);
595  ExpectToken(is, binary, "<FiltYDim>");
596  ReadBasicType(is, binary, &filt_y_dim_);
597  ExpectToken(is, binary, "<FiltXStep>");
598  ReadBasicType(is, binary, &filt_x_step_);
599  ExpectToken(is, binary, "<FiltYStep>");
600  ReadBasicType(is, binary, &filt_y_step_);
601  ExpectToken(is, binary, "<InputVectorization>");
602  int32 input_vectorization;
603  ReadBasicType(is, binary, &input_vectorization);
604  input_vectorization_ = static_cast<TensorVectorizationType>(input_vectorization);
605  ExpectToken(is, binary, "<FilterParams>");
606  filter_params_.Read(is, binary);
607  ExpectToken(is, binary, "<BiasParams>");
608  bias_params_.Read(is, binary);
609  std::string tok;
610  ReadToken(is, binary, &tok);
611  if (tok == "<IsGradient>") {
612  ReadBasicType(is, binary, &is_gradient_);
613  ExpectToken(is, binary, "</ConvolutionComponent>");
614  } else {
615  is_gradient_ = false;
616  KALDI_ASSERT(tok == "</ConvolutionComponent>");
617  }
618 }
619 
620 void ConvolutionComponent::Write(std::ostream &os, bool binary) const {
621  WriteUpdatableCommon(os, binary); // write opening tag and learning rate.
622  WriteToken(os, binary, "<InputXDim>");
623  WriteBasicType(os, binary, input_x_dim_);
624  WriteToken(os, binary, "<InputYDim>");
625  WriteBasicType(os, binary, input_y_dim_);
626  WriteToken(os, binary, "<InputZDim>");
627  WriteBasicType(os, binary, input_z_dim_);
628  WriteToken(os, binary, "<FiltXDim>");
629  WriteBasicType(os, binary, filt_x_dim_);
630  WriteToken(os, binary, "<FiltYDim>");
631  WriteBasicType(os, binary, filt_y_dim_);
632  WriteToken(os, binary, "<FiltXStep>");
633  WriteBasicType(os, binary, filt_x_step_);
634  WriteToken(os, binary, "<FiltYStep>");
635  WriteBasicType(os, binary, filt_y_step_);
636  WriteToken(os, binary, "<InputVectorization>");
637  WriteBasicType(os, binary, static_cast<int32>(input_vectorization_));
638  WriteToken(os, binary, "<FilterParams>");
639  filter_params_.Write(os, binary);
640  WriteToken(os, binary, "<BiasParams>");
641  bias_params_.Write(os, binary);
642  WriteToken(os, binary, "<IsGradient>");
643  WriteBasicType(os, binary, is_gradient_);
644  WriteToken(os, binary, "</ConvolutionComponent>");
645 }
646 
647 BaseFloat ConvolutionComponent::DotProduct(const UpdatableComponent &other_in) const {
648  const ConvolutionComponent *other =
649  dynamic_cast<const ConvolutionComponent*>(&other_in);
650  return TraceMatMat(filter_params_, other->filter_params_, kTrans)
651  + VecVec(bias_params_, other->bias_params_);
652 }
653 
654 Component* ConvolutionComponent::Copy() const {
655  ConvolutionComponent *ans = new ConvolutionComponent(*this);
656  return ans;
657 }
658 
659 void ConvolutionComponent::PerturbParams(BaseFloat stddev) {
660  CuMatrix<BaseFloat> temp_filter_params(filter_params_);
661  temp_filter_params.SetRandn();
662  filter_params_.AddMat(stddev, temp_filter_params);
663 
664  CuVector<BaseFloat> temp_bias_params(bias_params_);
665  temp_bias_params.SetRandn();
666  bias_params_.AddVec(stddev, temp_bias_params);
667 }
668 
669 void ConvolutionComponent::SetParams(const VectorBase<BaseFloat> &bias,
670  const MatrixBase<BaseFloat> &filter) {
671  bias_params_ = bias;
672  filter_params_ = filter;
673  KALDI_ASSERT(bias_params_.Dim() == filter_params_.NumRows());
674 }
675 
676 int32 ConvolutionComponent::NumParameters() const {
677  return (filter_params_.NumCols() + 1) * filter_params_.NumRows();
678 }
679 
680 void ConvolutionComponent::Vectorize(VectorBase<BaseFloat> *params) const {
681  KALDI_ASSERT(params->Dim() == this->NumParameters());
682  int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows();
683  params->Range(0, num_filter_params).CopyRowsFromMat(filter_params_);
684  params->Range(num_filter_params, bias_params_.Dim()).CopyFromVec(bias_params_);
685 }
686 void ConvolutionComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
687  KALDI_ASSERT(params.Dim() == this->NumParameters());
688  int32 num_filter_params = filter_params_.NumCols() * filter_params_.NumRows();
689  filter_params_.CopyRowsFromVec(params.Range(0, num_filter_params));
690  bias_params_.CopyFromVec(params.Range(num_filter_params, bias_params_.Dim()));
691 }
692 
693 // acquire input dim
694 int32 MaxpoolingComponent::InputDim() const {
695  return input_x_dim_ * input_y_dim_ * input_z_dim_;
696 }
697 
698 MaxpoolingComponent::MaxpoolingComponent(
699  const MaxpoolingComponent &component):
700  input_x_dim_(component.input_x_dim_),
701  input_y_dim_(component.input_y_dim_),
702  input_z_dim_(component.input_z_dim_),
703  pool_x_size_(component.pool_x_size_),
704  pool_y_size_(component.pool_y_size_),
705  pool_z_size_(component.pool_z_size_),
706  pool_x_step_(component.pool_x_step_),
707  pool_y_step_(component.pool_y_step_),
708  pool_z_step_(component.pool_z_step_) { }
709 
710 // acquire output dim
711 int32 MaxpoolingComponent::OutputDim() const {
712  int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_;
713  int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_;
714  int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_;
715  return num_pools_x * num_pools_y * num_pools_z;
716 }
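// For example (illustrative numbers): a 10x10x3 input with 2x2x1 pools and
// 2x2x1 steps gives num_pools_x = 5, num_pools_y = 5, num_pools_z = 3, so
// OutputDim() = 75.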
717 
718 // check the component parameters
719 void MaxpoolingComponent::Check() const {
720  // sanity check of the max pooling parameters
721  KALDI_ASSERT(input_x_dim_ > 0);
722  KALDI_ASSERT(input_y_dim_ > 0);
723  KALDI_ASSERT(input_z_dim_ > 0);
724  KALDI_ASSERT(pool_x_size_ > 0);
725  KALDI_ASSERT(pool_y_size_ > 0);
726  KALDI_ASSERT(pool_z_size_ > 0);
727  KALDI_ASSERT(pool_x_step_ > 0);
728  KALDI_ASSERT(pool_y_step_ > 0);
729  KALDI_ASSERT(pool_z_step_ > 0);
730  KALDI_ASSERT(input_x_dim_ >= pool_x_size_);
731  KALDI_ASSERT(input_y_dim_ >= pool_y_size_);
732  KALDI_ASSERT(input_z_dim_ >= pool_z_size_);
733  KALDI_ASSERT(pool_x_size_ >= pool_x_step_);
734  KALDI_ASSERT(pool_y_size_ >= pool_y_step_);
735  KALDI_ASSERT(pool_z_size_ >= pool_z_step_);
736  KALDI_ASSERT((input_x_dim_ - pool_x_size_) % pool_x_step_ == 0);
737  KALDI_ASSERT((input_y_dim_ - pool_y_size_) % pool_y_step_ == 0);
738  KALDI_ASSERT((input_z_dim_ - pool_z_size_) % pool_z_step_ == 0);
739 }
740 
741 // initialize the component using configuration file
742 void MaxpoolingComponent::InitFromConfig(ConfigLine *cfl) {
743  bool ok = true;
744 
745  ok = ok && cfl->GetValue("input-x-dim", &input_x_dim_);
746  ok = ok && cfl->GetValue("input-y-dim", &input_y_dim_);
747  ok = ok && cfl->GetValue("input-z-dim", &input_z_dim_);
748  ok = ok && cfl->GetValue("pool-x-size", &pool_x_size_);
749  ok = ok && cfl->GetValue("pool-y-size", &pool_y_size_);
750  ok = ok && cfl->GetValue("pool-z-size", &pool_z_size_);
751  ok = ok && cfl->GetValue("pool-x-step", &pool_x_step_);
752  ok = ok && cfl->GetValue("pool-y-step", &pool_y_step_);
753  ok = ok && cfl->GetValue("pool-z-step", &pool_z_step_);
754 
755  if (cfl->HasUnusedValues())
756  KALDI_ERR << "Could not process these elements in initializer: "
757  << cfl->UnusedValues();
758  if (!ok)
759  KALDI_ERR << "Bad initializer " << cfl->WholeLine();
760 
761  Check();
762 }
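// An illustrative config line this initializer would accept (example values,
// not from the original source; written on one line in an actual config file):
//   component name=pool1 type=MaxpoolingComponent input-x-dim=10
//     input-y-dim=10 input-z-dim=3 pool-x-size=2 pool-y-size=2
//     pool-z-size=1 pool-x-step=2 pool-y-step=2 pool-z-step=1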
763 
764 // Method to convert from a matrix representing a minibatch of vectorized
765 // 3D tensors to patches for 3d max pooling, each patch corresponds to
766 // the nodes having the same local coordinates from each pool
767 void MaxpoolingComponent::InputToInputPatches(
768  const CuMatrixBase<BaseFloat>& in,
769  CuMatrix<BaseFloat> *patches) const{
770  int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_;
771  int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_;
772  int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_;
773 
774  std::vector<int32> column_map(patches->NumCols());
775  int32 column_map_size = column_map.size();
776  for (int32 x = 0, index =0; x < pool_x_size_; x++) {
777  for (int32 y = 0; y < pool_y_size_; y++) {
778  for (int32 z = 0; z < pool_z_size_; z++) {
779  // given the local node coordinate, group them from each pool
780  // to form a patch
781  for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) {
782  for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) {
783  for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) {
784  KALDI_ASSERT(index < column_map_size);
785  column_map[index] = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ +
786  (y_pool * pool_y_step_ + y) * input_z_dim_ +
787  (z_pool * pool_z_step_ + z);
788 
789  }
790  }
791  }
792  }
793  }
794  }
795  CuArray<int32> cu_cols(column_map);
796  patches->CopyCols(in, cu_cols);
797 }
798 
799 /*
800  This is the 3d max pooling propagate function.
801  It is assumed that each row of the input matrix
802  is a vectorized 3D-tensor of type zyx.
803  Similar to the propagate function of ConvolutionComponent,
804  the input matrix is first arranged into patches so that
805  pools (with / without overlapping) could be
806  processed in a parallelizable manner.
807  The output matrix is also a vectorized 3D-tensor of type zyx.
808 */
809 
810 void* MaxpoolingComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
811  const CuMatrixBase<BaseFloat> &in,
812  CuMatrixBase<BaseFloat> *out) const {
813  int32 num_frames = in.NumRows();
814  int32 num_pools = OutputDim();
815  int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_;
816  CuMatrix<BaseFloat> patches(num_frames, num_pools * pool_size, kUndefined);
817  InputToInputPatches(in, &patches);
818 
819  out->Set(-1e20); // initialize to a very large negative value
820  for (int32 q = 0; q < pool_size; q++)
821  out->Max(patches.ColRange(q * num_pools, num_pools));
822  return NULL;
823 }
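// That is, out(i, j) = max over q of patches(i, q * num_pools + j): after the
// rearrangement above, the q-th column block holds the q-th member of every
// pool, so the pool-wise max reduces to an element-wise max over column blocks.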
824 
825 // Method to compute the input derivative matrix from the input derivatives
826 // for patches, where each patch corresponds to
827 // the nodes having the same local coordinates from each pool
828 void MaxpoolingComponent::InderivPatchesToInderiv(
829  const CuMatrix<BaseFloat>& in_deriv_patches,
830  CuMatrixBase<BaseFloat> *in_deriv) const {
831 
832  int32 num_pools_x = 1 + (input_x_dim_ - pool_x_size_) / pool_x_step_;
833  int32 num_pools_y = 1 + (input_y_dim_ - pool_y_size_) / pool_y_step_;
834  int32 num_pools_z = 1 + (input_z_dim_ - pool_z_size_) / pool_z_step_;
835 
836  std::vector<std::vector<int32> > reverse_column_map(in_deriv->NumCols());
837  int32 rev_col_map_size = reverse_column_map.size();
838  for (int32 x = 0, index = 0; x < pool_x_size_; x++) {
839  for (int32 y = 0; y < pool_y_size_; y++) {
840  for (int32 z = 0; z < pool_z_size_; z++) {
841 
842  for (int32 x_pool = 0; x_pool < num_pools_x; x_pool++) {
843  for (int32 y_pool = 0; y_pool < num_pools_y; y_pool++) {
844  for (int32 z_pool = 0; z_pool < num_pools_z; z_pool++, index++) {
845  int32 vector_index = (x_pool * pool_x_step_ + x) * input_y_dim_ * input_z_dim_ +
846  (y_pool * pool_y_step_ + y) * input_z_dim_ +
847  (z_pool * pool_z_step_ + z);
848 
849  KALDI_ASSERT(vector_index < rev_col_map_size);
850  reverse_column_map[vector_index].push_back(index);
851  }
852  }
853  }
854  }
855  }
856  }
857  std::vector<std::vector<int32> > rearranged_column_map;
858  RearrangeIndexes(reverse_column_map, &rearranged_column_map);
859  for (int32 p = 0; p < rearranged_column_map.size(); p++) {
860  CuArray<int32> cu_cols(rearranged_column_map[p]);
861  in_deriv->AddCols(in_deriv_patches, cu_cols);
862  }
863 }
864 
865 /*
866  3d max pooling backpropagate function
867  This function backpropagates the error from
868  out_deriv to in_deriv.
869  In order to select the node in each pool to
870  backpropagate the error, it has to compare
871  the output pool value stored in the out_value
872  matrix with each of its input pool member nodes
873  stored in the in_value matrix.
874 */
875 void MaxpoolingComponent::Backprop(const std::string &debug_info,
876  const ComponentPrecomputedIndexes *indexes,
877  const CuMatrixBase<BaseFloat> &in_value,
878  const CuMatrixBase<BaseFloat> &out_value,
879  const CuMatrixBase<BaseFloat> &out_deriv,
880  void *memo,
881  Component *, // to_update,
882  CuMatrixBase<BaseFloat> *in_deriv) const {
883  NVTX_RANGE("MaxpoolingComponent::Backprop");
884  if (!in_deriv)
885  return;
886 
887  int32 num_frames = in_value.NumRows();
888  int32 num_pools = OutputDim();
889  int32 pool_size = pool_x_size_ * pool_y_size_ * pool_z_size_;
890  CuMatrix<BaseFloat> patches(num_frames, num_pools * pool_size, kUndefined);
891  InputToInputPatches(in_value, &patches);
892 
893  for (int32 q = 0; q < pool_size; q++) {
894  // zero-out mask
895  CuMatrix<BaseFloat> mask;
896  out_value.EqualElementMask(patches.ColRange(q * num_pools, num_pools), &mask);
897  mask.MulElements(out_deriv);
898  patches.ColRange(q * num_pools, num_pools).CopyFromMat(mask);
899  }
900 
901  // combine the derivatives from the individual input deriv patches
902  // to compute input deriv matrix
903  InderivPatchesToInderiv(patches, in_deriv);
904 }
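// Note on the masking step above: EqualElementMask sets mask(i, j) = 1.0
// exactly where a pool member equals the pooled maximum, so the derivative is
// routed back to the argmax input; if several members tie for the maximum,
// each tied input receives a copy of the derivative.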
905 
906 void MaxpoolingComponent::Read(std::istream &is, bool binary) {
907  ExpectOneOrTwoTokens(is, binary, "<MaxpoolingComponent>", "<InputXDim>");
908  ReadBasicType(is, binary, &input_x_dim_);
909  ExpectToken(is, binary, "<InputYDim>");
910  ReadBasicType(is, binary, &input_y_dim_);
911  ExpectToken(is, binary, "<InputZDim>");
912  ReadBasicType(is, binary, &input_z_dim_);
913  ExpectToken(is, binary, "<PoolXSize>");
914  ReadBasicType(is, binary, &pool_x_size_);
915  ExpectToken(is, binary, "<PoolYSize>");
916  ReadBasicType(is, binary, &pool_y_size_);
917  ExpectToken(is, binary, "<PoolZSize>");
918  ReadBasicType(is, binary, &pool_z_size_);
919  ExpectToken(is, binary, "<PoolXStep>");
920  ReadBasicType(is, binary, &pool_x_step_);
921  ExpectToken(is, binary, "<PoolYStep>");
922  ReadBasicType(is, binary, &pool_y_step_);
923  ExpectToken(is, binary, "<PoolZStep>");
924  ReadBasicType(is, binary, &pool_z_step_);
925  ExpectToken(is, binary, "</MaxpoolingComponent>");
926  Check();
927 }
928 
929 void MaxpoolingComponent::Write(std::ostream &os, bool binary) const {
930  WriteToken(os, binary, "<MaxpoolingComponent>");
931  WriteToken(os, binary, "<InputXDim>");
932  WriteBasicType(os, binary, input_x_dim_);
933  WriteToken(os, binary, "<InputYDim>");
934  WriteBasicType(os, binary, input_y_dim_);
935  WriteToken(os, binary, "<InputZDim>");
936  WriteBasicType(os, binary, input_z_dim_);
937  WriteToken(os, binary, "<PoolXSize>");
938  WriteBasicType(os, binary, pool_x_size_);
939  WriteToken(os, binary, "<PoolYSize>");
940  WriteBasicType(os, binary, pool_y_size_);
941  WriteToken(os, binary, "<PoolZSize>");
942  WriteBasicType(os, binary, pool_z_size_);
943  WriteToken(os, binary, "<PoolXStep>");
944  WriteBasicType(os, binary, pool_x_step_);
945  WriteToken(os, binary, "<PoolYStep>");
946  WriteBasicType(os, binary, pool_y_step_);
947  WriteToken(os, binary, "<PoolZStep>");
948  WriteBasicType(os, binary, pool_z_step_);
949  WriteToken(os, binary, "</MaxpoolingComponent>");
950 }
951 
952 // display information about component
953 std::string MaxpoolingComponent::Info() const {
954  std::ostringstream stream;
955  stream << Type()
956  << ", input-x-dim=" << input_x_dim_
957  << ", input-y-dim=" << input_y_dim_
958  << ", input-z-dim=" << input_z_dim_
959  << ", pool-x-size=" << pool_x_size_
960  << ", pool-y-size=" << pool_y_size_
961  << ", pool-z-size=" << pool_z_size_
962  << ", pool-x-step=" << pool_x_step_
963  << ", pool-y-step=" << pool_y_step_
964  << ", pool-z-step=" << pool_z_step_;
965  return stream.str();
966 }
967 
968 
969 int32 LstmNonlinearityComponent::InputDim() const {
970  int32 cell_dim = value_sum_.NumCols();
971  return cell_dim * 5 + (use_dropout_ ? 3 : 0);
972 }
973 
974 int32 LstmNonlinearityComponent::OutputDim() const {
975  int32 cell_dim = value_sum_.NumCols();
976  return cell_dim * 2;
977 }
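// Layout (a summary, as documented where this class is declared in
// nnet-combined-component.h): the input is the concatenation
// (i_part, f_part, c_part, o_part, c_{t-1}), each of dimension cell_dim, plus
// 3 per-frame dropout scales when use-dropout is set; the output is the
// concatenation (c_t, m_t), hence 2 * cell_dim.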
978 
979 
980 void LstmNonlinearityComponent::Read(std::istream &is, bool binary) {
981  ReadUpdatableCommon(is, binary); // Read opening tag and learning rate.
982  ExpectToken(is, binary, "<Params>");
983  params_.Read(is, binary);
984  ExpectToken(is, binary, "<ValueAvg>");
985  value_sum_.Read(is, binary);
986  ExpectToken(is, binary, "<DerivAvg>");
987  deriv_sum_.Read(is, binary);
988  ExpectToken(is, binary, "<SelfRepairConfig>");
989  self_repair_config_.Read(is, binary);
990  ExpectToken(is, binary, "<SelfRepairProb>");
991  self_repair_total_.Read(is, binary);
992 
993  std::string tok;
994  ReadToken(is, binary, &tok);
995  if (tok == "<UseDropout>") {
996  ReadBasicType(is, binary, &use_dropout_);
997  ReadToken(is, binary, &tok);
998  } else {
999  use_dropout_ = false;
1000  }
1001  KALDI_ASSERT(tok == "<Count>");
1002  ReadBasicType(is, binary, &count_);
1003 
1004  // For the on-disk format, we normalize value_sum_, deriv_sum_ and
1005  // self_repair_total_ by dividing by the count, but in memory they are scaled
1006  // by the count. [for self_repair_total_, the scaling factor is count_ *
1007  // cell_dim].
1008  value_sum_.Scale(count_);
1009  deriv_sum_.Scale(count_);
1010  int32 cell_dim = params_.NumCols();
1011  self_repair_total_.Scale(count_ * cell_dim);
1012 
1013  InitNaturalGradient();
1014 
1015  ExpectToken(is, binary, "</LstmNonlinearityComponent>");
1016 
1017 }
1018 
1019 void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const {
1020  WriteUpdatableCommon(os, binary); // Write opening tag and learning rate.
1021 
1022  WriteToken(os, binary, "<Params>");
1023  params_.Write(os, binary);
1024  WriteToken(os, binary, "<ValueAvg>");
1025  {
1026  Matrix<BaseFloat> value_avg(value_sum_);
1027  if (count_ != 0.0)
1028  value_avg.Scale(1.0 / count_);
1029  value_avg.Write(os, binary);
1030  }
1031  WriteToken(os, binary, "<DerivAvg>");
1032  {
1033  Matrix<BaseFloat> deriv_avg(deriv_sum_);
1034  if (count_ != 0.0)
1035  deriv_avg.Scale(1.0 / count_);
1036  deriv_avg.Write(os, binary);
1037  }
1038  WriteToken(os, binary, "<SelfRepairConfig>");
1039  self_repair_config_.Write(os, binary);
1040  WriteToken(os, binary, "<SelfRepairProb>");
1041  {
1042  int32 cell_dim = params_.NumCols();
1043  Vector<BaseFloat> self_repair_prob(self_repair_total_);
1044  if (count_ != 0.0)
1045  self_repair_prob.Scale(1.0 / (count_ * cell_dim));
1046  self_repair_prob.Write(os, binary);
1047  }
1048  if (use_dropout_) {
1049  // only write this if true; we have back-compat code in reading anyway.
1050  // this makes the models without dropout easier to read with older code.
1051  WriteToken(os, binary, "<UseDropout>");
1052  WriteBasicType(os, binary, use_dropout_);
1053  }
1054  WriteToken(os, binary, "<Count>");
1055  WriteBasicType(os, binary, count_);
1056  WriteToken(os, binary, "</LstmNonlinearityComponent>");
1057 }
1058 
1059 
1060 
1061 std::string LstmNonlinearityComponent::Info() const {
1062  std::ostringstream stream;
1063  int32 cell_dim = params_.NumCols();
1064  stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim
1065  << ", use-dropout=" << (use_dropout_ ? "true" : "false");
1066  PrintParameterStats(stream, "w_ic", params_.Row(0));
1067  PrintParameterStats(stream, "w_fc", params_.Row(1));
1068  PrintParameterStats(stream, "w_oc", params_.Row(2));
1069 
1070  // Note: some of the following code mirrors the code in
1071  // UpdatableComponent::Info(), in nnet-component-itf.cc.
1072  if (count_ > 0) {
1073  stream << ", count=" << std::setprecision(3) << count_
1074  << std::setprecision(6);
1075  }
1076  static const char *nonlin_names[] = { "i_t_sigmoid", "f_t_sigmoid", "c_t_tanh",
1077  "o_t_sigmoid", "m_t_tanh" };
1078  for (int32 i = 0; i < 5; i++) {
1079  stream << ", " << nonlin_names[i] << "={";
1080  stream << " self-repair-lower-threshold=" << self_repair_config_(i)
1081  << ", self-repair-scale=" << self_repair_config_(i + 5);
1082 
1083  if (count_ != 0) {
1084  BaseFloat self_repaired_proportion =
1085  self_repair_total_(i) / (count_ * cell_dim);
1086  stream << ", self-repaired-proportion=" << self_repaired_proportion;
1087  Vector<double> value_sum(value_sum_.Row(i)),
1088  deriv_sum(deriv_sum_.Row(i));
1089  Vector<BaseFloat> value_avg(value_sum), deriv_avg(deriv_sum);
1090  value_avg.Scale(1.0 / count_);
1091  deriv_avg.Scale(1.0 / count_);
1092  stream << ", value-avg=" << SummarizeVector(value_avg)
1093  << ", deriv-avg=" << SummarizeVector(deriv_avg);
1094  }
1095  stream << " }";
1096  }
1097  return stream.str();
1098 }
1099 
1100 
1101 Component* LstmNonlinearityComponent::Copy() const {
1102  return new LstmNonlinearityComponent(*this);
1103 }
1104 
1105 void LstmNonlinearityComponent::ZeroStats() {
1106  value_sum_.SetZero();
1107  deriv_sum_.SetZero();
1108  self_repair_total_.SetZero();
1109  count_ = 0.0;
1110 }
1111 
1112 void LstmNonlinearityComponent::Scale(BaseFloat scale) {
1113  if (scale == 0.0) {
1114  params_.SetZero();
1115  value_sum_.SetZero();
1116  deriv_sum_.SetZero();
1117  self_repair_total_.SetZero();
1118  count_ = 0.0;
1119  } else {
1120  params_.Scale(scale);
1121  value_sum_.Scale(scale);
1122  deriv_sum_.Scale(scale);
1123  self_repair_total_.Scale(scale);
1124  count_ *= scale;
1125  }
1126 }
1127 
1128 void LstmNonlinearityComponent::Add(BaseFloat alpha,
1129  const Component &other_in) {
1130  const LstmNonlinearityComponent *other =
1131  dynamic_cast<const LstmNonlinearityComponent*>(&other_in);
1132  KALDI_ASSERT(other != NULL);
1133  params_.AddMat(alpha, other->params_);
1134  value_sum_.AddMat(alpha, other->value_sum_);
1135  deriv_sum_.AddMat(alpha, other->deriv_sum_);
1136  self_repair_total_.AddVec(alpha, other->self_repair_total_);
1137  count_ += alpha * other->count_;
1138 }
1139 
1140 void LstmNonlinearityComponent::PerturbParams(BaseFloat stddev) {
1141  CuMatrix<BaseFloat> temp_params(params_.NumRows(), params_.NumCols());
1142  temp_params.SetRandn();
1143  params_.AddMat(stddev, temp_params);
1144 }
1145 
1146 BaseFloat LstmNonlinearityComponent::DotProduct(
1147  const UpdatableComponent &other_in) const {
1148  const LstmNonlinearityComponent *other =
1149  dynamic_cast<const LstmNonlinearityComponent*>(&other_in);
1150  KALDI_ASSERT(other != NULL);
1151  return TraceMatMat(params_, other->params_, kTrans);
1152 }
1153 
1154 int32 LstmNonlinearityComponent::NumParameters() const {
1155  return params_.NumRows() * params_.NumCols();
1156 }
1157 
1158 void LstmNonlinearityComponent::Vectorize(VectorBase<BaseFloat> *params) const {
1159  KALDI_ASSERT(params->Dim() == NumParameters());
1160  params->CopyRowsFromMat(params_);
1161 }
1162 
1163 
1164 void LstmNonlinearityComponent::UnVectorize(
1165  const VectorBase<BaseFloat> &params) {
1166  KALDI_ASSERT(params.Dim() == NumParameters());
1167  params_.CopyRowsFromVec(params);
1168 }
1169 
1170 
1171 void* LstmNonlinearityComponent::Propagate(
1172  const ComponentPrecomputedIndexes *, // indexes
1173  const CuMatrixBase<BaseFloat> &in,
1174  CuMatrixBase<BaseFloat> *out) const {
1175  cu::ComputeLstmNonlinearity(in, params_, out);
1176  return NULL;
1177 }
1178 
1179 
1180 void LstmNonlinearityComponent::Backprop(
1181  const std::string &debug_info,
1182  const ComponentPrecomputedIndexes *indexes,
1183  const CuMatrixBase<BaseFloat> &in_value,
1184  const CuMatrixBase<BaseFloat> &, // out_value,
1185  const CuMatrixBase<BaseFloat> &out_deriv,
1186  void *memo,
1187  Component *to_update_in,
1188  CuMatrixBase<BaseFloat> *in_deriv) const {
1189  NVTX_RANGE("LstmNonlinearityComponent::Backprop");
1190 
1191  if (to_update_in == NULL) {
1192  cu::BackpropLstmNonlinearity(in_value, params_, out_deriv,
1193  deriv_sum_, self_repair_config_,
1194  count_, in_deriv,
1195  (CuMatrixBase<BaseFloat>*) NULL,
1196  (CuMatrixBase<double>*) NULL,
1197  (CuMatrixBase<double>*) NULL,
1198  (CuMatrixBase<BaseFloat>*) NULL);
1199  } else {
1200  LstmNonlinearityComponent *to_update =
1201  dynamic_cast<LstmNonlinearityComponent*>(to_update_in);
1202  KALDI_ASSERT(to_update != NULL);
1203 
1204  int32 cell_dim = params_.NumCols();
1205  CuMatrix<BaseFloat> params_deriv(3, cell_dim, kUndefined);
1206  CuMatrix<BaseFloat> self_repair_total(5, cell_dim, kUndefined);
1207 
1208  cu::BackpropLstmNonlinearity(in_value, params_, out_deriv,
1209  deriv_sum_, self_repair_config_,
1210  count_, in_deriv, &params_deriv,
1211  &(to_update->value_sum_),
1212  &(to_update->deriv_sum_),
1213  &self_repair_total);
1214 
1215  CuVector<BaseFloat> self_repair_total_sum(5);
1216  self_repair_total_sum.AddColSumMat(1.0, self_repair_total, 0.0);
1217  to_update->self_repair_total_.AddVec(1.0, self_repair_total_sum);
1218  to_update->count_ += static_cast<double>(in_value.NumRows());
1219 
1220  BaseFloat scale = 1.0;
1221  if (!to_update->is_gradient_) {
1222  to_update->preconditioner_.PreconditionDirections(
1223  &params_deriv, &scale);
1224  }
1225  to_update->params_.AddMat(to_update->learning_rate_ * scale,
1226  params_deriv);
1227  }
1228 }
1229 
1230 LstmNonlinearityComponent::LstmNonlinearityComponent(
1231  const LstmNonlinearityComponent &other):
1232  UpdatableComponent(other),
1233  params_(other.params_),
1234  use_dropout_(other.use_dropout_),
1235  value_sum_(other.value_sum_),
1236  deriv_sum_(other.deriv_sum_),
1237  self_repair_config_(other.self_repair_config_),
1238  self_repair_total_(other.self_repair_total_),
1239  count_(other.count_),
1240  preconditioner_(other.preconditioner_) { }
1241 
1242 void LstmNonlinearityComponent::Init(
1243  int32 cell_dim, bool use_dropout,
1244  BaseFloat param_stddev,
1245  BaseFloat tanh_self_repair_threshold,
1246  BaseFloat sigmoid_self_repair_threshold,
1247  BaseFloat self_repair_scale) {
1248  KALDI_ASSERT(cell_dim > 0 && param_stddev >= 0.0 &&
1249  tanh_self_repair_threshold >= 0.0 &&
1250  tanh_self_repair_threshold <= 1.0 &&
1251  sigmoid_self_repair_threshold >= 0.0 &&
1252  sigmoid_self_repair_threshold <= 0.25 &&
1253  self_repair_scale >= 0.0 && self_repair_scale <= 0.1);
1254  use_dropout_ = use_dropout;
1255  params_.Resize(3, cell_dim);
1256  params_.SetRandn();
1257  params_.Scale(param_stddev);
1258  value_sum_.Resize(5, cell_dim);
1259  deriv_sum_.Resize(5, cell_dim);
1260  self_repair_config_.Resize(10);
1261  self_repair_config_.Range(0, 5).Set(sigmoid_self_repair_threshold);
1262  self_repair_config_(2) = tanh_self_repair_threshold;
1263  self_repair_config_(4) = tanh_self_repair_threshold;
1264  self_repair_config_.Range(5, 5).Set(self_repair_scale);
1265  self_repair_total_.Resize(5);
1266  count_ = 0.0;
1267  InitNaturalGradient();
1268 
1269 }
1270 
1271 void LstmNonlinearityComponent::InitNaturalGradient() {
1272  // As regards the configuration for the natural-gradient preconditioner, we
1273  // don't make it configurable from the command line-- it's unlikely that any
1274  // differences from changing this would be substantial enough to effectively
1275  // tune the configuration. Because the preconditioning code doesn't 'see' the
1276  // derivatives from individual frames, but only averages over the minibatch,
1277  // there is a fairly small amount of data available to estimate the Fisher
1278  // information matrix, so we set the rank, update period and
1279  // num-samples-history to smaller values than normal.
1280  preconditioner_.SetRank(20);
1281  preconditioner_.SetUpdatePeriod(2);
1282  preconditioner_.SetNumSamplesHistory(1000.0);
1283 }
1284 
1286 void LstmNonlinearityComponent::FreezeNaturalGradient(bool freeze) {
1287  preconditioner_.Freeze(freeze);
1288 }
1289 
1290 void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) {
1291  InitLearningRatesFromConfig(cfl);
1292  bool ok = true;
1293  bool use_dropout = false;
1294  int32 cell_dim;
1295  // these self-repair thresholds are the normal defaults for tanh and sigmoid
1296  // respectively. If, later on, we decide that we want to support different
1297  // self-repair config values for the individual sigmoid and tanh
1298  // nonlinearities, we can modify this code then.
1299  BaseFloat tanh_self_repair_threshold = 0.2,
1300  sigmoid_self_repair_threshold = 0.05,
1301  self_repair_scale = 1.0e-05;
1302  // param_stddev is the stddev of the parameters. it may be better to
1303  // use a smaller value but this was the default in the python scripts
1304  // for a while.
1305  BaseFloat param_stddev = 1.0;
1306  ok = ok && cfl->GetValue("cell-dim", &cell_dim);
1307  cfl->GetValue("param-stddev", &param_stddev);
1308  cfl->GetValue("tanh-self-repair-threshold",
1309  &tanh_self_repair_threshold);
1310  cfl->GetValue("sigmoid-self-repair-threshold",
1311  &sigmoid_self_repair_threshold);
1312  cfl->GetValue("self-repair-scale", &self_repair_scale);
1313  cfl->GetValue("use-dropout", &use_dropout);
1314 
1315  // We may later on want to make it possible to initialize the different
1316  // parameters w_ic, w_fc and w_oc with different biases. We'll implement
1317  // that when and if it's needed.
1318 
1319  if (cfl->HasUnusedValues())
1320  KALDI_ERR << "Could not process these elements in initializer: "
1321  << cfl->UnusedValues();
1322  if (ok) {
1323  Init(cell_dim, use_dropout, param_stddev, tanh_self_repair_threshold,
1324  sigmoid_self_repair_threshold, self_repair_scale);
1325  } else {
1326  KALDI_ERR << "Invalid initializer for layer of type "
1327  << Type() << ": \"" << cfl->WholeLine() << "\"";
1328  }
1329 }
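// An illustrative config line this initializer would accept (example values,
// not from the original source; written on one line in an actual config file):
//   component name=lstm1 type=LstmNonlinearityComponent cell-dim=512
//     use-dropout=false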
1330 
1331 void LstmNonlinearityComponent::ConsolidateMemory() {
1332  OnlineNaturalGradient preconditioner_temp(preconditioner_);
1333  preconditioner_.Swap(&preconditioner_temp);
1334 }
1335 
1336 
1337 int32 GruNonlinearityComponent::InputDim() const {
1338  if (recurrent_dim_ == cell_dim_) {
1339  // non-projected GRU.
1340  return 4 * cell_dim_;
1341  } else {
1342  return 3 * cell_dim_ + 2 * recurrent_dim_;
1343  }
1344 }
1345 
1346 int32 GruNonlinearityComponent::OutputDim() const {
1347  return 2 * cell_dim_;
1348 }
1349 
1350 
1351 std::string GruNonlinearityComponent::Info() const {
1352  std::ostringstream stream;
1353  stream << UpdatableComponent::Info()
1354  << ", cell-dim=" << cell_dim_
1355  << ", recurrent-dim=" << recurrent_dim_;
1356  PrintParameterStats(stream, "w_h", w_h_);
1357  stream << ", self-repair-threshold=" << self_repair_threshold_
1358  << ", self-repair-scale=" << self_repair_scale_;
1359  if (count_ > 0) { // c.f. NonlinearComponent::Info().
1360  stream << ", count=" << std::setprecision(3) << count_
1361  << std::setprecision(6);
1362  stream << ", self-repaired-proportion="
1363  << (self_repair_total_ / (count_ * cell_dim_));
1364  Vector<double> value_avg_dbl(value_sum_);
1365  Vector<BaseFloat> value_avg(value_avg_dbl);
1366  value_avg.Scale(1.0 / count_);
1367  stream << ", value-avg=" << SummarizeVector(value_avg);
1368  Vector<double> deriv_avg_dbl(deriv_sum_);
1369  Vector<BaseFloat> deriv_avg(deriv_avg_dbl);
1370  deriv_avg.Scale(1.0 / count_);
1371  stream << ", deriv-avg=" << SummarizeVector(deriv_avg);
1372  }
1373  // natural-gradient parameters.
1374  stream << ", alpha=" << preconditioner_in_.GetAlpha()
1375  << ", rank-in=" << preconditioner_in_.GetRank()
1376  << ", rank-out=" << preconditioner_out_.GetRank()
1377  << ", update-period="
1378  << preconditioner_in_.GetUpdatePeriod();
1379  return stream.str();
1380 }
1381 
1382 void GruNonlinearityComponent::InitFromConfig(ConfigLine *cfl) {
1383  cell_dim_ = -1;
1384  recurrent_dim_ = -1;
1385  self_repair_threshold_ = 0.2;
1386  self_repair_scale_ = 1.0e-05;
1387 
1388  InitLearningRatesFromConfig(cfl);
1389  if (!cfl->GetValue("cell-dim", &cell_dim_) || cell_dim_ <= 0)
1390  KALDI_ERR << "cell-dim > 0 is required for GruNonlinearityComponent.";
1391 
1392  BaseFloat param_stddev = 1.0 / std::sqrt(cell_dim_),
1393  alpha = 4.0;
1394  int32 rank_in = 20, rank_out = 80,
1395  update_period = 4;
1396 
1397  cfl->GetValue("recurrent-dim", &recurrent_dim_);
1398  cfl->GetValue("self-repair-threshold", &self_repair_threshold_);
1399  cfl->GetValue("self-repair-scale", &self_repair_scale_);
1400  cfl->GetValue("param-stddev", &param_stddev);
1401  cfl->GetValue("alpha", &alpha);
1402  cfl->GetValue("rank-in", &rank_in);
1403  cfl->GetValue("rank-out", &rank_out);
1404  cfl->GetValue("update-period", &update_period);
1405 
1406  if (recurrent_dim_ < 0)
1407  recurrent_dim_ = cell_dim_;
1408  if (recurrent_dim_ == 0 || recurrent_dim_ > cell_dim_)
1409  KALDI_ERR << "Invalid values for cell-dim and recurrent-dim";
1410 
1411  w_h_.Resize(cell_dim_, recurrent_dim_);
1412  w_h_.SetRandn();
1413  w_h_.Scale(param_stddev);
1414 
1415  preconditioner_in_.SetAlpha(alpha);
1416  preconditioner_in_.SetRank(rank_in);
1417  preconditioner_in_.SetUpdatePeriod(update_period);
1418  preconditioner_out_.SetAlpha(alpha);
1419  preconditioner_out_.SetRank(rank_out);
1420  preconditioner_out_.SetUpdatePeriod(update_period);
1421 
1422  count_ = 0.0;
1423  self_repair_total_ = 0.0;
1424  value_sum_.Resize(cell_dim_);
1425  deriv_sum_.Resize(cell_dim_);
1426 
1427  Check();
1428 }
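// An illustrative config line this initializer would accept (example values,
// not from the original source; written on one line in an actual config file):
//   component name=gru1 type=GruNonlinearityComponent cell-dim=768
//     recurrent-dim=256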
1429 
1430 void* GruNonlinearityComponent::Propagate(
1431  const ComponentPrecomputedIndexes *indexes,
1432  const CuMatrixBase<BaseFloat> &in,
1433  CuMatrixBase<BaseFloat> *out) const {
1434  KALDI_ASSERT(in.NumRows() == out->NumRows() &&
1435  in.NumCols() == InputDim() &&
1436  out->NumCols() == OutputDim());
1437  // If recurrent_dim_ != cell_dim_, this is projected GRU and we
1438  // are computing:
1439  // (z_t, r_t, hpart_t, c_{t-1}, s_{t-1}) -> (h_t, c_t).
1440  // Otherwise (no projection), it's
1441  // (z_t, r_t, hpart_t, y_{t-1}) -> (h_t, y_t).
1442  // but to understand this code, it's better to rename y to c:
1443  // (z_t, r_t, hpart_t, c_{t-1}) -> (h_t, c_t).
1444  int32 num_rows = in.NumRows(),
1445  c = cell_dim_,
1446  r = recurrent_dim_;
1447  CuSubMatrix<BaseFloat> z_t(in, 0, num_rows, 0, c),
1448  r_t(in, 0, num_rows, c, r),
1449  hpart_t(in, 0, num_rows, c + r, c),
1450  c_t1(in, 0, num_rows, c + r + c, c);
1451  // note: the variable named 'c_t1' actually represents
1452  // y_{t-1} for non-projected GRUs.
1453 
1454  // By setting s_t1 to the last recurrent_dim_ rows of 'in', we get something
1455  // that represents s_{t-1} for recurrent setups and y_{t-1} (which we're
1456  // renaming to c_{t-1}) for non-projected GRUs. The key thing is that
1457  // in the non-projected case, the variables c_t1 and s_t1 point to the
1458  // same memory.
1459  CuSubMatrix<BaseFloat> s_t1(in, 0, num_rows, in.NumCols() - r, r);
1460 
1461  // note: for non-projected GRUs, c_t below is actually y_t.
1462  CuSubMatrix<BaseFloat> h_t(*out, 0, num_rows, 0, c),
1463  c_t(*out, 0, num_rows, c, c);
1464 
1465  // sdotr is the only temporary storage we need in the forward pass.
1466  CuMatrix<BaseFloat> sdotr(num_rows, r);
1467  sdotr.AddMatMatElements(1.0, r_t, s_t1, 0.0);
1468  // now sdotr = r_t \dot s_{t-1}.
1469  h_t.CopyFromMat(hpart_t);
1470  // now h_t = hpart_t (note: hpart_t actually means U^h x_t).
1471  h_t.AddMatMat(1.0, sdotr, kNoTrans, w_h_, kTrans, 1.0);
1472  // now h_t = hpart_t + W^h (s_{t-1} \dot r_t).
1473  h_t.Tanh(h_t);
1474  // now, h_t = tanh(hpart_t + W^h (s_{t-1} \dot r_t)).
1475 
1476  c_t.CopyFromMat(h_t);
1477  // now c_t = h_t
1478  c_t.AddMatMatElements(-1.0, z_t, h_t, 1.0);
1479  // now c_t = (1 - z_t) \dot h_t.
1480  c_t.AddMatMatElements(1.0, z_t, c_t1, 1.0);
1481  // now c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}.
1482  return NULL;
1483 }
1484 
1485 void GruNonlinearityComponent::Backprop(
1486  const std::string &debug_info,
1487  const ComponentPrecomputedIndexes *, // indexes
1488  const CuMatrixBase<BaseFloat> &in_value,
1489  const CuMatrixBase<BaseFloat> &out_value,
1490  const CuMatrixBase<BaseFloat> &out_deriv,
1491  void *memo,
1492  Component *to_update_in,
1493  CuMatrixBase<BaseFloat> *in_deriv) const {
1494  NVTX_RANGE("GruNonlinearityComponent::Backprop");
1495  KALDI_ASSERT(SameDim(out_value, out_deriv) &&
1496  in_value.NumRows() == out_value.NumRows() &&
1497  in_value.NumCols() == InputDim() &&
1498  out_value.NumCols() == OutputDim() &&
1499  (in_deriv == NULL || SameDim(in_value, *in_deriv)) &&
1500  memo == NULL);
1501  GruNonlinearityComponent *to_update =
1502  dynamic_cast<GruNonlinearityComponent*>(to_update_in);
1503  KALDI_ASSERT(in_deriv != NULL || to_update != NULL);
1504  int32 num_rows = in_value.NumRows(),
1505  c = cell_dim_,
1506  r = recurrent_dim_;
1507 
1508  // To understand what's going on here, compare this code with the
1509  // corresponding 'forward' code in Propagate().
1510 
1511 
1512  CuSubMatrix<BaseFloat> z_t(in_value, 0, num_rows, 0, c),
1513  r_t(in_value, 0, num_rows, c, r),
1514  hpart_t(in_value, 0, num_rows, c + r, c),
1515  c_t1(in_value, 0, num_rows, c + r + c, c),
1516  s_t1(in_value, 0, num_rows, in_value.NumCols() - r, r);
1517 
1518 
1519  // The purpose of this 'in_deriv_ptr' is so that we can create submatrices
1520  // like z_t_deriv without the code crashing. If in_deriv is NULL these point
1521  // to 'in_value', and we'll be careful never to actually write to these
1522  // sub-matrices, which aside from being conceptually wrong would violate the
1523  // const semantics of this function.
1524  const CuMatrixBase<BaseFloat> *in_deriv_ptr =
1525  (in_deriv == NULL ? &in_value : in_deriv);
1526  CuSubMatrix<BaseFloat> z_t_deriv(*in_deriv_ptr, 0, num_rows, 0, c),
1527  r_t_deriv(*in_deriv_ptr, 0, num_rows, c, r),
1528  hpart_t_deriv(*in_deriv_ptr, 0, num_rows, c + r, c),
1529  c_t1_deriv(*in_deriv_ptr, 0, num_rows, c + r + c, c),
1530  s_t1_deriv(*in_deriv_ptr, 0, num_rows, in_value.NumCols() - r, r);
1531 
1532  // Note: the output h_t is never actually used in the GRU computation (we only
1533  // output it because we want the value to be cached to save computation in the
1534  // backprop), so we expect that the 'h_t_deriv', if we extracted it in the
1535  // obvious way, would be all zeros.
1536  // We create a different, local h_t_deriv
1537  // variable that backpropagates the derivative from c_t_deriv.
1538  CuSubMatrix<BaseFloat> h_t(out_value, 0, num_rows, 0, c),
1539  c_t(out_value, 0, num_rows, c, c),
1540  c_t_deriv(out_deriv, 0, num_rows, c, c);
1541  CuMatrix<BaseFloat> h_t_deriv(num_rows, c, kUndefined);
1542 
1543  { // we initialize h_t_deriv with the derivative from 'out_deriv'.
1544  // In real life in a GRU, this would always be zero; but in testing
1545  // code it may be nonzero and we include this term so that
1546  // the tests don't fail. Note: if you were to remove these
1547  // lines, you'd have to change 'h_t_deriv.AddMat(1.0, c_t_deriv);' below
1548  // to a CopyFromMat() call.
1549  CuSubMatrix<BaseFloat> h_t_deriv_in(out_deriv, 0, num_rows, 0, c);
1550  h_t_deriv.CopyFromMat(h_t_deriv_in);
1551  }
1552 
1553 
1554  // sdotr is the same variable as used in the forward pass; it will contain
1555  // r_t \dot s_{t-1}.
1556  CuMatrix<BaseFloat> sdotr(num_rows, r);
1557  sdotr.AddMatMatElements(1.0, r_t, s_t1, 0.0);
1558 
1559 
1560  { // This block does the
1561  // backprop corresponding to the
1562  // forward-pass expression: c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}.
1563 
1564  // First do: h_t_deriv = c_t_deriv \dot (1 - z_t).
1565  h_t_deriv.AddMat(1.0, c_t_deriv);
1566  h_t_deriv.AddMatMatElements(-1.0, c_t_deriv, z_t, 1.0);
1567 
1568  if (in_deriv) {
1569  // these should be self-explanatory if you study
1570  // the expression "c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}".
1571  z_t_deriv.AddMatMatElements(-1.0, c_t_deriv, h_t, 1.0);
1572  z_t_deriv.AddMatMatElements(1.0, c_t_deriv, c_t1, 1.0);
1573  c_t1_deriv.AddMatMatElements(1.0, c_t_deriv, z_t, 1.0);
1574  }
1575  }
1576 
1577  h_t_deriv.DiffTanh(h_t, h_t_deriv);
1578  if (to_update)
1579  to_update->TanhStatsAndSelfRepair(h_t, &h_t_deriv);
1580 
1581 
1582  if (to_update)
1583  to_update->UpdateParameters(sdotr, h_t_deriv);
1584 
1585  // At this point, 'h_t_deriv' contains the derivative w.r.t.
1586  // the argument of the tanh function, i.e. w.r.t. the expression:
1587  // hpart_t + W^h (s_{t-1} \dot r_t).
1588  // The next block propagates this to the derivatives for
1589  // hpart_t, s_{t-1} and r_t.
1590  if (in_deriv) {
1591  hpart_t_deriv.AddMat(1.0, h_t_deriv);
1592 
1593  // We re-use the memory that previously held s_{t-1} \dot r_t
1594  // for its derivative.
1595  CuMatrix<BaseFloat> &sdotr_deriv(sdotr);
1596  sdotr_deriv.AddMatMat(1.0, h_t_deriv, kNoTrans, w_h_, kNoTrans, 0.0);
1597 
1598  // we add to all the input-derivatives instead of setting them,
1599  // because we chose to export the flag kBackpropAdds.
1600  r_t_deriv.AddMatMatElements(1.0, sdotr_deriv, s_t1, 1.0);
1601  s_t1_deriv.AddMatMatElements(1.0, sdotr_deriv, r_t, 1.0);
1602  }
1603 }
1604 
1605 
1606 void GruNonlinearityComponent::TanhStatsAndSelfRepair(
1607  const CuMatrixBase<BaseFloat> &h_t,
1608  CuMatrixBase<BaseFloat> *h_t_deriv) {
1609  KALDI_ASSERT(SameDim(h_t, *h_t_deriv));
1610 
1611  // we use this probability (hardcoded for now) to limit the stats accumulation
1612  // and self-repair code to running on about half of the minibatches.
1613  BaseFloat repair_and_stats_probability = 0.5;
1614  if (RandUniform() > repair_and_stats_probability)
1615  return;
1616 
1617  // OK, accumulate stats.
1618  // For the next few lines, compare with TanhComponent::StoreStats(), which is where
1619  // we got this code.
1620  // tanh_deriv is the function derivative of the tanh function,
1621  // tanh'(x) = 1.0 - tanh(x)^2. h_t corresponds to tanh(x).
1622  CuMatrix<BaseFloat> tanh_deriv(h_t);
1623  tanh_deriv.ApplyPow(2.0);
1624  tanh_deriv.Scale(-1.0);
1625  tanh_deriv.Add(1.0);
1626 
1627  count_ += h_t.NumRows();
1628  CuVector<BaseFloat> temp(cell_dim_);
1629  temp.AddRowSumMat(1.0, h_t, 0.0);
1630  value_sum_.AddVec(1.0, temp);
1631  temp.AddRowSumMat(1.0, tanh_deriv, 0.0);
1632  deriv_sum_.AddVec(1.0, temp);
1633 
1634  if (count_ <= 0.0) {
1635  // this would be rather pathological if it happened.
1636  return;
1637  }
1638 
1639  // The rest of this function contains code modified from
1640  // TanhComponent::RepairGradients().
1641 
1642  // 'thresholds' is actually a 1-row matrix, and thresholds_vec is a view of
1643  // its single row (the ApplyHeaviside function isn't defined for vectors).
1644  CuMatrix<BaseFloat> thresholds(1, cell_dim_);
1645  CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
1646  thresholds_vec.AddVec(-1.0, deriv_sum_);
1647  thresholds_vec.Add(self_repair_threshold_ * count_);
1648  thresholds.ApplyHeaviside();
1649  self_repair_total_ += thresholds_vec.Sum();
1650 
1651  // there is a comment explaining what we are doing with
1652  // 'thresholds_vec', at this point in TanhComponent::RepairGradients().
1653  // We won't repeat it here.
1654 
1655  h_t_deriv->AddMatDiagVec(-self_repair_scale_ / repair_and_stats_probability,
1656  h_t, kNoTrans, thresholds_vec);
1657 }
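// To make the self-repair logic above concrete: deriv_sum_ / count_ is the
// per-dimension average of tanh'(x) = 1 - tanh(x)^2, which becomes small when
// a unit saturates. After the ApplyHeaviside() call, thresholds_vec equals
//   heaviside(self_repair_threshold_ * count_ - deriv_sum_),
// i.e. 1.0 exactly for those dimensions whose average derivative fell below
// self_repair_threshold_, and 0.0 elsewhere. The final AddMatDiagVec() call
// therefore subtracts (self_repair_scale_ / repair_and_stats_probability) * h_t
// from the gradient of the saturated dimensions only, acting as a small
// penalty that pulls those activations back toward the linear region of the
// tanh; dividing by the probability compensates for only running this on
// about half of the minibatches.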
1658 
1659 void GruNonlinearityComponent::UpdateParameters(
1660  const CuMatrixBase<BaseFloat> &sdotr,
1661  const CuMatrixBase<BaseFloat> &h_t_deriv) {
1662  if (is_gradient_) {
1663  // 'simple' update, no natural gradient. Compare
1664  // with AffineComponent::UpdateSimple().
1665  w_h_.AddMatMat(learning_rate_, h_t_deriv, kTrans,
1666  sdotr, kNoTrans, 1.0);
1667  } else {
1668  // the natural-gradient update.
1669  CuMatrix<BaseFloat> in_value_temp(sdotr),
1670  out_deriv_temp(h_t_deriv);
1671 
1672  // These "scale" values will get multiplied into the learning rate.
1673  BaseFloat in_scale, out_scale;
1674 
1675  preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale);
1676  preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale);
1677 
1678  BaseFloat local_lrate = learning_rate_ * in_scale * out_scale;
1679  w_h_.AddMatMat(local_lrate, out_deriv_temp, kTrans,
1680  in_value_temp, kNoTrans, 1.0);
1681  }
1682 }
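// A compact way to read the update above: with A = h_t_deriv (num_rows by
// cell_dim) holding dL/da_t, and B = sdotr (num_rows by recurrent_dim)
// holding s_{t-1} \dot r_t, the parameter gradient is dL/dW^h = A^T B, so the
// 'simple' branch is just w_h_ += learning_rate_ * A^T B. The
// natural-gradient branch preconditions copies of A and B in place and folds
// the two returned scales into the learning rate before doing the same
// matrix product.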
1683 
1684 
1685 
1686 void GruNonlinearityComponent::Read(std::istream &is, bool binary) {
1687  ReadUpdatableCommon(is, binary);
1688  ExpectToken(is, binary, "<CellDim>");
1689  ReadBasicType(is, binary, &cell_dim_);
1690  ExpectToken(is, binary, "<RecurrentDim>");
1691  ReadBasicType(is, binary, &recurrent_dim_);
1692  ExpectToken(is, binary, "<w_h>");
1693  w_h_.Read(is, binary);
1694  ExpectToken(is, binary, "<ValueAvg>");
1695  value_sum_.Read(is, binary);
1696  ExpectToken(is, binary, "<DerivAvg>");
1697  deriv_sum_.Read(is, binary);
1698  ExpectToken(is, binary, "<SelfRepairTotal>");
1699  ReadBasicType(is, binary, &self_repair_total_);
1700  ExpectToken(is, binary, "<Count>");
1701  ReadBasicType(is, binary, &count_);
1702  value_sum_.Scale(count_); // we read in the averages, not the sums.
1703  deriv_sum_.Scale(count_);
1704  ExpectToken(is, binary, "<SelfRepairThreshold>");
1705  ReadBasicType(is, binary, &self_repair_threshold_);
1706  ExpectToken(is, binary, "<SelfRepairScale>");
1707  ReadBasicType(is, binary, &self_repair_scale_);
1708  BaseFloat alpha;
1709  int32 rank_in, rank_out, update_period;
1710  ExpectToken(is, binary, "<Alpha>");
1711  ReadBasicType(is, binary, &alpha);
1712  ExpectToken(is, binary, "<RankInOut>");
1713  ReadBasicType(is, binary, &rank_in);
1714  ReadBasicType(is, binary, &rank_out);
1715  ExpectToken(is, binary, "<UpdatePeriod>");
1716  ReadBasicType(is, binary, &update_period);
1717  preconditioner_in_.SetRank(rank_in);
1718  preconditioner_out_.SetRank(rank_out);
1719  preconditioner_in_.SetAlpha(alpha);
1720  preconditioner_out_.SetAlpha(alpha);
1721  preconditioner_in_.SetUpdatePeriod(update_period);
1722  preconditioner_out_.SetUpdatePeriod(update_period);
1723  ExpectToken(is, binary, "</GruNonlinearityComponent>");
1724 }
1725 
1726 void GruNonlinearityComponent::Write(std::ostream &os, bool binary) const {
1727  WriteUpdatableCommon(os, binary);
1728  WriteToken(os, binary, "<CellDim>");
1729  WriteBasicType(os, binary, cell_dim_);
1730  WriteToken(os, binary, "<RecurrentDim>");
1731  WriteBasicType(os, binary, recurrent_dim_);
1732  WriteToken(os, binary, "<w_h>");
1733  w_h_.Write(os, binary);
1734  {
1735  // Write the value and derivative stats in a count-normalized way, for
1736  // greater readability in text form.
1737  WriteToken(os, binary, "<ValueAvg>");
1738  CuVector<BaseFloat> temp(value_sum_);
1739  if (count_ != 0.0) temp.Scale(1.0 / count_);
1740  temp.Write(os, binary);
1741  WriteToken(os, binary, "<DerivAvg>");
1742  temp.CopyFromVec(deriv_sum_);
1743  if (count_ != 0.0) temp.Scale(1.0 / count_);
1744  temp.Write(os, binary);
1745  }
1746  WriteToken(os, binary, "<SelfRepairTotal>");
1747  WriteBasicType(os, binary, self_repair_total_);
1748  WriteToken(os, binary, "<Count>");
1749  WriteBasicType(os, binary, count_);
1750  WriteToken(os, binary, "<SelfRepairThreshold>");
1751  WriteBasicType(os, binary, self_repair_threshold_);
1752  WriteToken(os, binary, "<SelfRepairScale>");
1753  WriteBasicType(os, binary, self_repair_scale_);
1754 
1755  BaseFloat alpha = preconditioner_in_.GetAlpha();
1756  int32 rank_in = preconditioner_in_.GetRank(),
1757  rank_out = preconditioner_out_.GetRank(),
1758  update_period = preconditioner_in_.GetUpdatePeriod();
1759  WriteToken(os, binary, "<Alpha>");
1760  WriteBasicType(os, binary, alpha);
1761  WriteToken(os, binary, "<RankInOut>");
1762  WriteBasicType(os, binary, rank_in);
1763  WriteBasicType(os, binary, rank_out);
1764  WriteToken(os, binary, "<UpdatePeriod>");
1765  WriteBasicType(os, binary, update_period);
1766  WriteToken(os, binary, "</GruNonlinearityComponent>");
1767 }
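// For reference, the on-disk layout produced above (and what Read() expects),
// after the header written by WriteUpdatableCommon(), is, in order:
// <CellDim> <RecurrentDim> <w_h> <ValueAvg> <DerivAvg> <SelfRepairTotal>
// <Count> <SelfRepairThreshold> <SelfRepairScale> <Alpha> <RankInOut>
// <UpdatePeriod> </GruNonlinearityComponent>. The value/deriv stats are
// stored count-normalized, which is why Read() multiplies them back up
// by count_.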
1768 
1769 void GruNonlinearityComponent::Scale(BaseFloat scale) {
1770  if (scale == 0.0) {
1771  w_h_.SetZero();
1772  value_sum_.SetZero();
1773  deriv_sum_.SetZero();
1774  self_repair_total_ = 0.0;
1775  count_ = 0.0;
1776  } else {
1777  w_h_.Scale(scale);
1778  value_sum_.Scale(scale);
1779  deriv_sum_.Scale(scale);
1780  self_repair_total_ *= scale;
1781  count_ *= scale;
1782  }
1783 }
1784 
1785 void GruNonlinearityComponent::Add(BaseFloat alpha,
1786  const Component &other_in) {
1787  const GruNonlinearityComponent *other =
1788  dynamic_cast<const GruNonlinearityComponent*>(&other_in);
1789  KALDI_ASSERT(other != NULL);
1790  w_h_.AddMat(alpha, other->w_h_);
1791  value_sum_.AddVec(alpha, other->value_sum_);
1792  deriv_sum_.AddVec(alpha, other->deriv_sum_);
1793  self_repair_total_ += alpha * other->self_repair_total_;
1794  count_ += alpha * other->count_;
1795 }
1796 
1797 void GruNonlinearityComponent::ZeroStats() {
1798  value_sum_.SetZero();
1799  deriv_sum_.SetZero();
1800  self_repair_total_ = 0.0;
1801  count_ = 0.0;
1802 }
1803 
1804 void GruNonlinearityComponent::Check() const {
1805  KALDI_ASSERT(cell_dim_ > 0 && recurrent_dim_ > 0 &&
1806  recurrent_dim_ <= cell_dim_ &&
1807  self_repair_threshold_ >= 0.0 &&
1808  self_repair_scale_ >= 0.0 );
1809  KALDI_ASSERT(w_h_.NumRows() == cell_dim_ &&
1810  w_h_.NumCols() == recurrent_dim_);
1811  KALDI_ASSERT(value_sum_.Dim() == cell_dim_ &&
1812  deriv_sum_.Dim() == cell_dim_);
1813 }
1814 
1815 void GruNonlinearityComponent::PerturbParams(BaseFloat stddev) {
1816  CuMatrix<BaseFloat> temp_params(w_h_.NumRows(), w_h_.NumCols());
1817  temp_params.SetRandn();
1818  w_h_.AddMat(stddev, temp_params);
1819 }
1820 
1821 BaseFloat GruNonlinearityComponent::DotProduct(
1822  const UpdatableComponent &other_in) const {
1823  const GruNonlinearityComponent *other =
1824  dynamic_cast<const GruNonlinearityComponent*>(&other_in);
1825  KALDI_ASSERT(other != NULL);
1826  return TraceMatMat(w_h_, other->w_h_, kTrans);
1827 }
1828 
1829 int32 GruNonlinearityComponent::NumParameters() const {
1830  return w_h_.NumRows() * w_h_.NumCols();
1831 }
1832 
1833 void GruNonlinearityComponent::Vectorize(VectorBase<BaseFloat> *params) const {
1834  KALDI_ASSERT(params->Dim() == NumParameters());
1835  params->CopyRowsFromMat(w_h_);
1836 }
1837 
1838 
1839 void GruNonlinearityComponent::UnVectorize(
1840  const VectorBase<BaseFloat> &params) {
1841  KALDI_ASSERT(params.Dim() == NumParameters());
1842  w_h_.CopyRowsFromVec(params);
1843 }
1844 
1845 void GruNonlinearityComponent::FreezeNaturalGradient(bool freeze) {
1846  preconditioner_in_.Freeze(freeze);
1847  preconditioner_out_.Freeze(freeze);
1848 }
1849 
1850 GruNonlinearityComponent::GruNonlinearityComponent(
1851  const GruNonlinearityComponent &other):
1852  UpdatableComponent(other),
1853  cell_dim_(other.cell_dim_),
1854  recurrent_dim_(other.recurrent_dim_),
1855  w_h_(other.w_h_),
1856  value_sum_(other.value_sum_),
1857  deriv_sum_(other.deriv_sum_),
1858  self_repair_total_(other.self_repair_total_),
1859  count_(other.count_),
1860  self_repair_threshold_(other.self_repair_threshold_),
1861  self_repair_scale_(other.self_repair_scale_),
1862  preconditioner_in_(other.preconditioner_in_),
1863  preconditioner_out_(other.preconditioner_out_) {
1864  Check();
1865 }
1866 
1867 
1868 int32 OutputGruNonlinearityComponent::InputDim() const {
1869  return 3 * cell_dim_;
1870 }
1871 
1872 int32 OutputGruNonlinearityComponent::OutputDim() const {
1873  return 2 * cell_dim_;
1874 }
1875 
1876 
1877 std::string OutputGruNonlinearityComponent::Info() const {
1878  std::ostringstream stream;
1879  stream << UpdatableComponent::Info()
1880  << ", cell-dim=" << cell_dim_;
1881  PrintParameterStats(stream, "w_h", w_h_);
1882  stream << ", self-repair-threshold=" << self_repair_threshold_
1883  << ", self-repair-scale=" << self_repair_scale_;
1884  if (count_ > 0) { // c.f. NonlinearComponent::Info().
1885  stream << ", count=" << std::setprecision(3) << count_
1886  << std::setprecision(6);
1887  stream << ", self-repaired-proportion="
1888  << (self_repair_total_ / (count_ * cell_dim_));
1889  Vector<double> value_avg_dbl(value_sum_);
1890  Vector<BaseFloat> value_avg(value_avg_dbl);
1891  value_avg.Scale(1.0 / count_);
1892  stream << ", value-avg=" << SummarizeVector(value_avg);
1893  Vector<double> deriv_avg_dbl(deriv_sum_);
1894  Vector<BaseFloat> deriv_avg(deriv_avg_dbl);
1895  deriv_avg.Scale(1.0 / count_);
1896  stream << ", deriv-avg=" << SummarizeVector(deriv_avg);
1897  }
1898  // natural-gradient parameters.
1899  stream << ", alpha=" << preconditioner_.GetAlpha()
1900  << ", rank=" << preconditioner_.GetRank()
1901  << ", update-period="
1902  << preconditioner_.GetUpdatePeriod();
1903  return stream.str();
1904 }
1905 
1906 void OutputGruNonlinearityComponent::InitFromConfig(ConfigLine *cfl) {
1907  cell_dim_ = -1;
1908  self_repair_threshold_ = 0.2;
1909  self_repair_scale_ = 1.0e-05;
1910 
1911  InitLearningRatesFromConfig(cfl);
1912  if (!cfl->GetValue("cell-dim", &cell_dim_) || cell_dim_ <= 0)
1913  KALDI_ERR << "cell-dim > 0 is required for OutputGruNonlinearityComponent.";
1914 
1915  BaseFloat param_mean = 0.0, param_stddev = 1.0,
1916  alpha = 4.0;
1917  int32 rank = 8,
1918  update_period = 10;
1919 
1920  cfl->GetValue("self-repair-threshold", &self_repair_threshold_);
1921  cfl->GetValue("self-repair-scale", &self_repair_scale_);
1922  cfl->GetValue("param-mean", &param_mean);
1923  cfl->GetValue("param-stddev", &param_stddev);
1924  cfl->GetValue("alpha", &alpha);
1925  cfl->GetValue("rank", &rank);
1926  cfl->GetValue("update-period", &update_period);
1927 
1928 
1929  w_h_.Resize(cell_dim_);
1930  w_h_.SetRandn();
1931  w_h_.Scale(param_stddev);
1932  w_h_.Add(param_mean);
1933 
1934  preconditioner_.SetAlpha(alpha);
1935  preconditioner_.SetRank(rank);
1936  preconditioner_.SetUpdatePeriod(update_period);
1937 
1938  count_ = 0.0;
1939  self_repair_total_ = 0.0;
1940  value_sum_.Resize(cell_dim_);
1941  deriv_sum_.Resize(cell_dim_);
1942 
1943  Check();
1944 }
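// A hypothetical example of the key=value settings this InitFromConfig()
// would accept (every key here is read above; the values are only
// illustrative, not tuned recommendations):
//   cell-dim=512 param-mean=0.0 param-stddev=0.3 alpha=4.0 rank=8 \
//   update-period=10 self-repair-threshold=0.2 self-repair-scale=1e-05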
1945 
1946 void* OutputGruNonlinearityComponent::Propagate(
1947  const ComponentPrecomputedIndexes *indexes,
1948  const CuMatrixBase<BaseFloat> &in,
1949  CuMatrixBase<BaseFloat> *out) const {
1950  KALDI_ASSERT(in.NumRows() == out->NumRows() &&
1951  in.NumCols() == InputDim() &&
1952  out->NumCols() == OutputDim());
1953  // This component implements the function
1954  // (z_t, hpart_t, c_{t-1}) -> (h_t, c_t)
1955  // of dimensions
1956  // (cell_dim, cell_dim, cell_dim) -> (cell_dim, cell_dim),
1957  // where:
1958  // h_t = \tanh( hpart_t + W^h \dot c_{t-1} )
1959  // c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}.
1960  int32 num_rows = in.NumRows(),
1961  c = cell_dim_;
1962  CuSubMatrix<BaseFloat> z_t(in, 0, num_rows, 0, c),
1963  hpart_t(in, 0, num_rows, c, c),
1964  c_t1(in, 0, num_rows, c + c, c);
1965 
1966  CuSubMatrix<BaseFloat> h_t(*out, 0, num_rows, 0, c),
1967  c_t(*out, 0, num_rows, c, c);
1968 
1969  h_t.CopyFromMat(c_t1);
1970  // now h_t = c_{t-1}
1971  h_t.MulColsVec(w_h_);
1972  // now h_t = W^h \dot c_{t-1}
1973  h_t.AddMat(1.0, hpart_t, kNoTrans);
1974  // now h_t = hpart_t + W^h \dot c_{t-1} (note: hpart_t actually means U^h x_t).
1975  h_t.Tanh(h_t);
1976  // now, h_t = tanh(hpart_t + W^h \dot c_{t-1}).
1977 
1978  c_t.CopyFromMat(h_t);
1979  // now c_t = h_t
1980  c_t.AddMatMatElements(-1.0, z_t, h_t, 1.0);
1981  // now c_t = (1 - z_t) \dot h_t.
1982  c_t.AddMatMatElements(1.0, z_t, c_t1, 1.0);
1983  // now c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}.
1984  return NULL;
1985 }
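// Note the contrast with GruNonlinearityComponent::Propagate(): here w_h_ is
// a vector, so W^h acts as a diagonal matrix and the recurrence is purely
// element-wise,
//   h_t[i] = tanh(hpart_t[i] + w_h_[i] * c_{t-1}[i]),
// which is why the code uses MulColsVec(w_h_) rather than a full AddMatMat().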
1986 
1987 void OutputGruNonlinearityComponent::Backprop(
1988  const std::string &debug_info,
1989  const ComponentPrecomputedIndexes *, // indexes
1990  const CuMatrixBase<BaseFloat> &in_value,
1991  const CuMatrixBase<BaseFloat> &out_value,
1992  const CuMatrixBase<BaseFloat> &out_deriv,
1993  void *memo,
1994  Component *to_update_in,
1995  CuMatrixBase<BaseFloat> *in_deriv) const {
1996  NVTX_RANGE("OutputGruNonlinearityComponent::Backprop");
1997  KALDI_ASSERT(SameDim(out_value, out_deriv) &&
1998  in_value.NumRows() == out_value.NumRows() &&
1999  in_value.NumCols() == InputDim() &&
2000  out_value.NumCols() == OutputDim() &&
2001  (in_deriv == NULL || SameDim(in_value, *in_deriv)) &&
2002  memo == NULL);
2003  OutputGruNonlinearityComponent *to_update =
2004  dynamic_cast<OutputGruNonlinearityComponent*>(to_update_in);
2005  KALDI_ASSERT(in_deriv != NULL || to_update != NULL);
2006  int32 num_rows = in_value.NumRows(),
2007  c = cell_dim_;
2008 
2009  // To understand what's going on here, compare this code with the
2010  // corresponding 'forward' code in Propagate().
2011 
2012 
2013  CuSubMatrix<BaseFloat> z_t(in_value, 0, num_rows, 0, c),
2014  hpart_t(in_value, 0, num_rows, c, c),
2015  c_t1(in_value, 0, num_rows, c + c, c);
2016 
2017  // The purpose of this 'in_deriv_ptr' is so that we can create submatrices
2018  // like z_t_deriv without the code crashing. If in_deriv is NULL these point
2019  // to 'in_value', and we'll be careful never to actually write to these
2020  // sub-matrices, which aside from being conceptually wrong would violate the
2021  // const semantics of this function.
2022  const CuMatrixBase<BaseFloat> *in_deriv_ptr =
2023  (in_deriv == NULL ? &in_value : in_deriv);
2024  CuSubMatrix<BaseFloat> z_t_deriv(*in_deriv_ptr, 0, num_rows, 0, c),
2025  hpart_t_deriv(*in_deriv_ptr, 0, num_rows, c, c),
2026  c_t1_deriv(*in_deriv_ptr, 0, num_rows, c + c, c);
2027 
2028  // Note: the output h_t is never actually used in the GRU computation (we only
2029  // output it because we want the value to be cached to save computation in the
2030  // backprop), so we expect that the 'h_t_deriv', if we extracted it in the
2031  // obvious way, would be all zeros.
2032  // We create a different, local h_t_deriv
2033  // variable that backpropagates the derivative from c_t_deriv.
2034  CuSubMatrix<BaseFloat> h_t(out_value, 0, num_rows, 0, c),
2035  c_t(out_value, 0, num_rows, c, c),
2036  c_t_deriv(out_deriv, 0, num_rows, c, c);
2037  CuMatrix<BaseFloat> h_t_deriv(num_rows, c, kUndefined);
2038 
2039  { // we initialize h_t_deriv with the derivative from 'out_deriv'.
2040  // In real life in a GRU, this would always be zero; but in testing
2041  // code it may be nonzero and we include this term so that
2042  // the tests don't fail. Note: if you were to remove these
2043  // lines, you'd have to change 'h_t_deriv.AddMat(1.0, c_t_deriv);' below
2044  // to a CopyFromMat() call.
2045  CuSubMatrix<BaseFloat> h_t_deriv_in(out_deriv, 0, num_rows, 0, c);
2046  h_t_deriv.CopyFromMat(h_t_deriv_in);
2047  }
2048 
2049 
2050  { // This block does the
2051  // backprop corresponding to the
2052  // forward-pass expression: c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}.
2053 
2054  // First do: h_t_deriv = c_t_deriv \dot (1 - z_t).
2055  h_t_deriv.AddMat(1.0, c_t_deriv);
2056  h_t_deriv.AddMatMatElements(-1.0, c_t_deriv, z_t, 1.0);
2057 
2058  if (in_deriv) {
2059  // these should be self-explanatory if you study
2060  // the expression "c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}".
2061  z_t_deriv.AddMatMatElements(-1.0, c_t_deriv, h_t, 1.0);
2062  z_t_deriv.AddMatMatElements(1.0, c_t_deriv, c_t1, 1.0);
2063  c_t1_deriv.AddMatMatElements(1.0, c_t_deriv, z_t, 1.0);
2064  }
2065  }
2066 
2067  h_t_deriv.DiffTanh(h_t, h_t_deriv);
2068  if (to_update)
2069  to_update->TanhStatsAndSelfRepair(h_t, &h_t_deriv);
2070 
2071  if (to_update)
2072  to_update->UpdateParameters(c_t1, h_t_deriv);
2073  // At this point, 'h_t_deriv' contains the derivative w.r.t.
2074  // the argument of the tanh function, i.e. w.r.t. the expression:
2075  // hpart_t + W^h \dot c_{t-1}.
2076  // The next block propagates this to the derivatives for hpart_t and c_{t-1}.
2077  // The derivative w.r.t. z_t has already been computed above.
2078  if (in_deriv) {
2079  hpart_t_deriv.AddMat(1.0, h_t_deriv);
2080 
2081  // Currently, c_t1_deriv contains the derivative from
2082  // c_t = (1 - z_t) \dot h_t + z_t \dot c_{t-1}
2083  // Now compute the h_t = \tanh(hpart_t + W^h \dot c_{t-1}) part
2084  h_t_deriv.MulColsVec(w_h_);
2085  // Combine the two parts
2086  c_t1_deriv.AddMat(1.0, h_t_deriv);
2087  }
2088 }
2089 
2090 
2091 void OutputGruNonlinearityComponent::TanhStatsAndSelfRepair(
2092  const CuMatrixBase<BaseFloat> &h_t,
2093  CuMatrixBase<BaseFloat> *h_t_deriv) {
2094  KALDI_ASSERT(SameDim(h_t, *h_t_deriv));
2095 
2096  // we use this probability (hardcoded for now) to limit the stats accumulation
2097  // and self-repair code to running on about half of the minibatches.
2098  BaseFloat repair_and_stats_probability = 0.5;
2099  if (RandUniform() > repair_and_stats_probability)
2100  return;
2101 
2102  // OK, accumulate stats.
2103  // For the next few lines, compare with TanhComponent::StoreStats(), which is where
2104  // we got this code.
2105  // tanh_deriv is the function derivative of the tanh function,
2106  // tanh'(x) = 1.0 - tanh(x)^2. h_t corresponds to tanh(x).
2107  CuMatrix<BaseFloat> tanh_deriv(h_t);
2108  tanh_deriv.ApplyPow(2.0);
2109  tanh_deriv.Scale(-1.0);
2110  tanh_deriv.Add(1.0);
2111 
2112  count_ += h_t.NumRows();
2113  CuVector<BaseFloat> temp(cell_dim_);
2114  temp.AddRowSumMat(1.0, h_t, 0.0);
2115  value_sum_.AddVec(1.0, temp);
2116  temp.AddRowSumMat(1.0, tanh_deriv, 0.0);
2117  deriv_sum_.AddVec(1.0, temp);
2118 
2119  if (count_ <= 0.0) {
2120  // this would be rather pathological if it happened.
2121  return;
2122  }
2123 
2124  // The rest of this function contains code modified from
2125  // TanhComponent::RepairGradients().
2126 
2127  // 'thresholds' is actually a 1-row matrix, and thresholds_vec is a view of
2128  // its single row (the ApplyHeaviside function isn't defined for vectors).
2129  CuMatrix<BaseFloat> thresholds(1, cell_dim_);
2130  CuSubVector<BaseFloat> thresholds_vec(thresholds, 0);
2131  thresholds_vec.AddVec(-1.0, deriv_sum_);
2132  thresholds_vec.Add(self_repair_threshold_ * count_);
2133  thresholds.ApplyHeaviside();
2134  self_repair_total_ += thresholds_vec.Sum();
2135 
2136  // there is a comment explaining what we are doing with
2137  // 'thresholds_vec', at this point in TanhComponent::RepairGradients().
2138  // We won't repeat it here.
2139 
2140  h_t_deriv->AddMatDiagVec(-self_repair_scale_ / repair_and_stats_probability,
2141  h_t, kNoTrans, thresholds_vec);
2142 }
2143 
2144 void OutputGruNonlinearityComponent::UpdateParameters(
2145  const CuMatrixBase<BaseFloat> &c_t1_value,
2146  const CuMatrixBase<BaseFloat> &h_t_deriv) {
2147  if (is_gradient_) {
2148  // 'simple' update, no natural gradient. Compare
2149  // with PerElementScaleComponent::UpdateSimple().
2150  w_h_.AddDiagMatMat(learning_rate_, h_t_deriv, kTrans,
2151  c_t1_value, kNoTrans, 1.0);
2152  } else {
2153  // the natural-gradient update.
2154  CuMatrix<BaseFloat> derivs_per_frame(c_t1_value);
2155  derivs_per_frame.MulElements(h_t_deriv);
2156 
2157  // This "scale" value will get multiplied into the learning rate.
2158  BaseFloat scale;
2159 
2160  preconditioner_.PreconditionDirections(&derivs_per_frame, &scale);
2161 
2162  CuVector<BaseFloat> delta_w_h(w_h_.Dim());
2163  delta_w_h.AddRowSumMat(scale * learning_rate_, derivs_per_frame);
2164  w_h_.AddVec(1.0, delta_w_h);
2165  }
2166 }
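// Sketch of the gradient used above: since a_t[i] depends on w_h_[i] only
// through the product w_h_[i] * c_{t-1}[i], summing over the rows (frames) n
// gives
//   dL/dw_h_[i] = \sum_n h_t_deriv(n, i) * c_t1_value(n, i),
// which is exactly what AddDiagMatMat() accumulates in the 'simple' branch.
// The natural-gradient branch instead forms the per-frame products
// explicitly in derivs_per_frame, preconditions them, and then sums over
// rows with AddRowSumMat().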
2167 
2168 
2169 
2170 void OutputGruNonlinearityComponent::Read(std::istream &is, bool binary) {
2171  ReadUpdatableCommon(is, binary);
2172  ExpectToken(is, binary, "<CellDim>");
2173  ReadBasicType(is, binary, &cell_dim_);
2174  ExpectToken(is, binary, "<w_h>");
2175  w_h_.Read(is, binary);
2176  ExpectToken(is, binary, "<ValueAvg>");
2177  value_sum_.Read(is, binary);
2178  ExpectToken(is, binary, "<DerivAvg>");
2179  deriv_sum_.Read(is, binary);
2180  ExpectToken(is, binary, "<SelfRepairTotal>");
2181  ReadBasicType(is, binary, &self_repair_total_);
2182  ExpectToken(is, binary, "<Count>");
2183  ReadBasicType(is, binary, &count_);
2184  value_sum_.Scale(count_); // we read in the averages, not the sums.
2185  deriv_sum_.Scale(count_);
2186  ExpectToken(is, binary, "<SelfRepairThreshold>");
2187  ReadBasicType(is, binary, &self_repair_threshold_);
2188  ExpectToken(is, binary, "<SelfRepairScale>");
2189  ReadBasicType(is, binary, &self_repair_scale_);
2190  BaseFloat alpha;
2191  int32 rank, update_period;
2192  ExpectToken(is, binary, "<Alpha>");
2193  ReadBasicType(is, binary, &alpha);
2194  ExpectToken(is, binary, "<Rank>");
2195  ReadBasicType(is, binary, &rank);
2196  ExpectToken(is, binary, "<UpdatePeriod>");
2197  ReadBasicType(is, binary, &update_period);
2198  preconditioner_.SetRank(rank);
2199  preconditioner_.SetAlpha(alpha);
2200  preconditioner_.SetUpdatePeriod(update_period);
2201  ExpectToken(is, binary, "</OutputGruNonlinearityComponent>");
2202 }
2203 
2204 void OutputGruNonlinearityComponent::Write(std::ostream &os, bool binary) const {
2205  WriteUpdatableCommon(os, binary);
2206  WriteToken(os, binary, "<CellDim>");
2207  WriteBasicType(os, binary, cell_dim_);
2208  WriteToken(os, binary, "<w_h>");
2209  w_h_.Write(os, binary);
2210  {
2211  // Write the value and derivative stats in a count-normalized way, for
2212  // greater readability in text form.
2213  WriteToken(os, binary, "<ValueAvg>");
2214  CuVector<BaseFloat> temp(value_sum_);
2215  if (count_ != 0.0) temp.Scale(1.0 / count_);
2216  temp.Write(os, binary);
2217  WriteToken(os, binary, "<DerivAvg>");
2218  temp.CopyFromVec(deriv_sum_);
2219  if (count_ != 0.0) temp.Scale(1.0 / count_);
2220  temp.Write(os, binary);
2221  }
2222  WriteToken(os, binary, "<SelfRepairTotal>");
2223  WriteBasicType(os, binary, self_repair_total_);
2224  WriteToken(os, binary, "<Count>");
2225  WriteBasicType(os, binary, count_);
2226  WriteToken(os, binary, "<SelfRepairThreshold>");
2227  WriteBasicType(os, binary, self_repair_threshold_);
2228  WriteToken(os, binary, "<SelfRepairScale>");
2229  WriteBasicType(os, binary, self_repair_scale_);
2230 
2231  BaseFloat alpha = preconditioner_.GetAlpha();
2232  int32 rank = preconditioner_.GetRank(),
2233  update_period = preconditioner_.GetUpdatePeriod();
2234  WriteToken(os, binary, "<Alpha>");
2235  WriteBasicType(os, binary, alpha);
2236  WriteToken(os, binary, "<Rank>");
2237  WriteBasicType(os, binary, rank);
2238  WriteToken(os, binary, "<UpdatePeriod>");
2239  WriteBasicType(os, binary, update_period);
2240  WriteToken(os, binary, "</OutputGruNonlinearityComponent>");
2241 }
2242 
2243 void OutputGruNonlinearityComponent::Scale(BaseFloat scale) {
2244  if (scale == 0.0) {
2245  w_h_.SetZero();
2246  value_sum_.SetZero();
2247  deriv_sum_.SetZero();
2248  self_repair_total_ = 0.0;
2249  count_ = 0.0;
2250  } else {
2251  w_h_.Scale(scale);
2252  value_sum_.Scale(scale);
2253  deriv_sum_.Scale(scale);
2254  self_repair_total_ *= scale;
2255  count_ *= scale;
2256  }
2257 }
2258 
2259 void OutputGruNonlinearityComponent::Add(BaseFloat alpha,
2260  const Component &other_in) {
2261  const OutputGruNonlinearityComponent *other =
2262  dynamic_cast<const OutputGruNonlinearityComponent*>(&other_in);
2263  KALDI_ASSERT(other != NULL);
2264  w_h_.AddVec(alpha, other->w_h_);
2265  value_sum_.AddVec(alpha, other->value_sum_);
2266  deriv_sum_.AddVec(alpha, other->deriv_sum_);
2267  self_repair_total_ += alpha * other->self_repair_total_;
2268  count_ += alpha * other->count_;
2269 }
2270 
2271 void OutputGruNonlinearityComponent::ZeroStats() {
2272  value_sum_.SetZero();
2273  deriv_sum_.SetZero();
2274  self_repair_total_ = 0.0;
2275  count_ = 0.0;
2276 }
2277 
2278 void OutputGruNonlinearityComponent::Check() const {
2279  KALDI_ASSERT(cell_dim_ > 0 &&
2280  self_repair_threshold_ >= 0.0 &&
2281  self_repair_scale_ >= 0.0 );
2282  KALDI_ASSERT(w_h_.Dim() == cell_dim_);
2283  KALDI_ASSERT(value_sum_.Dim() == cell_dim_ &&
2284  deriv_sum_.Dim() == cell_dim_);
2285 }
2286 
2287 void OutputGruNonlinearityComponent::PerturbParams(BaseFloat stddev) {
2288  CuVector<BaseFloat> temp_params(w_h_.Dim());
2289  temp_params.SetRandn();
2290  w_h_.AddVec(stddev, temp_params);
2291 }
2292 
2293 BaseFloat OutputGruNonlinearityComponent::DotProduct(
2294  const UpdatableComponent &other_in) const {
2295  const OutputGruNonlinearityComponent *other =
2296  dynamic_cast<const OutputGruNonlinearityComponent*>(&other_in);
2297  KALDI_ASSERT(other != NULL);
2298  return VecVec(w_h_, other->w_h_);
2299 }
2300 
2301 int32 OutputGruNonlinearityComponent::NumParameters() const {
2302  return w_h_.Dim();
2303 }
2304 
2305 void OutputGruNonlinearityComponent::Vectorize(VectorBase<BaseFloat> *params) const {
2306  KALDI_ASSERT(params->Dim() == NumParameters());
2307  params->CopyFromVec(w_h_);
2308 }
2309 
2310 
2311 void OutputGruNonlinearityComponent::UnVectorize(
2312  const VectorBase<BaseFloat> &params) {
2313  KALDI_ASSERT(params.Dim() == NumParameters());
2314  w_h_.CopyFromVec(params);
2315 }
2316 
2317 void OutputGruNonlinearityComponent::FreezeNaturalGradient(bool freeze) {
2318  preconditioner_.Freeze(freeze);
2319 }
2320 
2321 OutputGruNonlinearityComponent::OutputGruNonlinearityComponent(
2322  const OutputGruNonlinearityComponent &other):
2323  UpdatableComponent(other),
2324  cell_dim_(other.cell_dim_),
2325  w_h_(other.w_h_),
2326  value_sum_(other.value_sum_),
2327  deriv_sum_(other.deriv_sum_),
2328  self_repair_total_(other.self_repair_total_),
2329  count_(other.count_),
2330  self_repair_threshold_(other.self_repair_threshold_),
2331  self_repair_scale_(other.self_repair_scale_),
2332  preconditioner_(other.preconditioner_) {
2333  Check();
2334 }
2335 
2336 } // namespace nnet3
2337 } // namespace kaldi