nnet-blstm-projected.h
Go to the documentation of this file.
1 // nnet/nnet-blstm-projected-streams.h
2 
3 // Copyright 2016 Brno University of Technology (author: Karel Vesely)
4 // Copyright 2015 Chongjia Ni
5 // Copyright 2014 Jiayu DU (Jerry), Wei Li
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABILITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 
22 #ifndef KALDI_NNET_NNET_BLSTM_PROJECTED_H_
23 #define KALDI_NNET_NNET_BLSTM_PROJECTED_H_
24 
25 #include <string>
26 #include <vector>
27 
28 #include "nnet/nnet-component.h"
29 #include "nnet/nnet-utils.h"
30 #include "cudamatrix/cu-math.h"
31 
32 /*************************************
33  * x: input neuron
34  * g: squashing neuron near input
35  * i: Input gate
36  * f: Forget gate
37  * o: Output gate
38  * c: memory Cell (CEC)
39  * h: squashing neuron near output
40  * m: output neuron of Memory block
41  * r: recurrent projection neuron
42  * y: output neuron of LSTMP
43  * f-*: forward direction
44  * b-*: backward direction
45  *************************************/
46 
47 namespace kaldi {
48 namespace nnet1 {
49 
 public:
  /// Constructor. 'output_dim' is the concatenation of the forward and
  /// backward projections, hence proj_dim_ is half of it.
  BlstmProjected(int32 input_dim, int32 output_dim):
    MultistreamComponent(input_dim, output_dim),
    cell_dim_(0),           // set later by InitData() or ReadData(),
    proj_dim_(static_cast<int32>(output_dim/2)),  // per-direction projection size,
    cell_clip_(50.0),       // clipping threshold for cell activations,
    diff_clip_(1.0),        // clipping threshold for per-frame gate derivatives,
    cell_diff_clip_(0.0),   // 0.0 disables clipping of cell derivatives,
    grad_clip_(250.0)       // gradient clipping threshold (use not shown in this view),
  { }
61 
63  { }
64 
  /// Clone the component via the (default) copy-constructor.
  Component* Copy() const { return new BlstmProjected(*this); }
67 
  /// Initialize the component from a prototype line, e.g.
  /// "<CellDim> 512 <ParamRange> 0.1 ...". The weights and biases are then
  /// filled from a uniform distribution and the forget-gate bias is shifted.
  void InitData(std::istream &is) {
    // define options,
    float param_range = 0.1;
    // parse the line from prototype,
    std::string token;
    while (is >> std::ws, !is.eof()) {
      ReadToken(is, false, &token);
      if (token == "<ParamRange>") ReadBasicType(is, false, &param_range);
      else if (token == "<CellDim>") ReadBasicType(is, false, &cell_dim_);
      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
      else if (token == "<BiasLearnRateCoef>") ReadBasicType(is, false, &bias_learn_rate_coef_);
      else if (token == "<CellClip>") ReadBasicType(is, false, &cell_clip_);
      else if (token == "<DiffClip>") ReadBasicType(is, false, &diff_clip_);
      else if (token == "<CellDiffClip>") ReadBasicType(is, false, &cell_diff_clip_);
      else if (token == "<GradClip>") ReadBasicType(is, false, &grad_clip_);
      // NOTE(review): "<CellDiffClip>" is accepted above but missing from
      // this error hint,
      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
                     << " (ParamRange|CellDim|LearnRateCoef|BiasLearnRateCoef|CellClip|DiffClip|GradClip)";
    }

    // init the weights and biases (from uniform dist.),
    // forward direction,
    // NOTE(review): this listing elides the Resize() calls for f_w_gifo_x_,
    // f_w_gifo_r_, the f_peephole_*_c_ vectors and f_w_r_m_; only the bias
    // resize is visible here.
    f_bias_.Resize(4*cell_dim_, kUndefined);
    // (mean), (range)
    RandUniform(0.0, 2.0 * param_range, &f_w_gifo_x_);
    RandUniform(0.0, 2.0 * param_range, &f_w_gifo_r_);
    RandUniform(0.0, 2.0 * param_range, &f_bias_);
    RandUniform(0.0, 2.0 * param_range, &f_peephole_i_c_);
    RandUniform(0.0, 2.0 * param_range, &f_peephole_f_c_);
    RandUniform(0.0, 2.0 * param_range, &f_peephole_o_c_);
    RandUniform(0.0, 2.0 * param_range, &f_w_r_m_);

    // Add 1.0 to forget-gate bias
    // (encourages remembering at the start of training),
    // [Miao IS16: AN EMPIRICAL EXPLORATION...]
    f_bias_.Range(2*cell_dim_, cell_dim_).Add(1.0);

    // backward direction,
    // NOTE(review): the corresponding b_* Resize() calls are elided here too.
    b_bias_.Resize(4*cell_dim_, kUndefined);

    RandUniform(0.0, 2.0 * param_range, &b_w_gifo_x_);
    RandUniform(0.0, 2.0 * param_range, &b_w_gifo_r_);
    RandUniform(0.0, 2.0 * param_range, &b_bias_);
    RandUniform(0.0, 2.0 * param_range, &b_peephole_i_c_);
    RandUniform(0.0, 2.0 * param_range, &b_peephole_f_c_);
    RandUniform(0.0, 2.0 * param_range, &b_peephole_o_c_);
    RandUniform(0.0, 2.0 * param_range, &b_w_r_m_);

    // Add 1.0 to forget-gate bias,
    // [Miao IS16: AN EMPIRICAL EXPLORATION...]
    b_bias_.Range(2*cell_dim_, cell_dim_).Add(1.0);

    KALDI_ASSERT(cell_dim_ > 0);  // <CellDim> is mandatory in the prototype,
  }
134 
135  void ReadData(std::istream &is, bool binary) {
136  // Read all the '<Tokens>' in arbitrary order,
137  while ('<' == Peek(is, binary)) {
138  std::string token;
139  int first_char = PeekToken(is, binary);
140  switch (first_char) {
141  case 'C': ReadToken(is, false, &token);
142  if (token == "<CellDim>") ReadBasicType(is, binary, &cell_dim_);
143  else if (token == "<CellClip>") ReadBasicType(is, binary, &cell_clip_);
144  else if (token == "<CellDiffClip>") ReadBasicType(is, binary, &cell_diff_clip_);
145  else if (token == "<ClipGradient>") ReadBasicType(is, binary, &grad_clip_); // bwd-compat.
146  else KALDI_ERR << "Unknown token: " << token;
147  break;
148  case 'L': ExpectToken(is, binary, "<LearnRateCoef>");
149  ReadBasicType(is, binary, &learn_rate_coef_);
150  break;
151  case 'B': ExpectToken(is, binary, "<BiasLearnRateCoef>");
152  ReadBasicType(is, binary, &bias_learn_rate_coef_);
153  break;
154  case 'D': ExpectToken(is, binary, "<DiffClip>");
155  ReadBasicType(is, binary, &diff_clip_);
156  break;
157  case 'G': ExpectToken(is, binary, "<GradClip>");
158  ReadBasicType(is, binary, &grad_clip_);
159  break;
160  default: ReadToken(is, false, &token);
161  KALDI_ERR << "Unknown token: " << token;
162  }
163  }
164  KALDI_ASSERT(cell_dim_ != 0);
165  // Read the data (data follow the tokens),
166 
167  // reading parameters corresponding to forward direction
168  f_w_gifo_x_.Read(is, binary);
169  f_w_gifo_r_.Read(is, binary);
170  f_bias_.Read(is, binary);
171 
172  f_peephole_i_c_.Read(is, binary);
173  f_peephole_f_c_.Read(is, binary);
174  f_peephole_o_c_.Read(is, binary);
175 
176  f_w_r_m_.Read(is, binary);
177 
178  // reading parameters corresponding to backward direction
179  b_w_gifo_x_.Read(is, binary);
180  b_w_gifo_r_.Read(is, binary);
181  b_bias_.Read(is, binary);
182 
183  b_peephole_i_c_.Read(is, binary);
184  b_peephole_f_c_.Read(is, binary);
185  b_peephole_o_c_.Read(is, binary);
186 
187  b_w_r_m_.Read(is, binary);
188  }
189 
190  void WriteData(std::ostream &os, bool binary) const {
191  WriteToken(os, binary, "<CellDim>");
192  WriteBasicType(os, binary, cell_dim_);
193 
194  WriteToken(os, binary, "<LearnRateCoef>");
195  WriteBasicType(os, binary, learn_rate_coef_);
196  WriteToken(os, binary, "<BiasLearnRateCoef>");
198 
199  WriteToken(os, binary, "<CellClip>");
200  WriteBasicType(os, binary, cell_clip_);
201  WriteToken(os, binary, "<DiffClip>");
202  WriteBasicType(os, binary, diff_clip_);
203  WriteToken(os, binary, "<CellDiffClip>");
204  WriteBasicType(os, binary, cell_diff_clip_);
205  WriteToken(os, binary, "<GradClip>");
206  WriteBasicType(os, binary, grad_clip_);
207 
208  if (!binary) os << "\n";
209  // writing parameters, forward direction,
210  f_w_gifo_x_.Write(os, binary);
211  f_w_gifo_r_.Write(os, binary);
212  f_bias_.Write(os, binary);
213 
214  f_peephole_i_c_.Write(os, binary);
215  f_peephole_f_c_.Write(os, binary);
216  f_peephole_o_c_.Write(os, binary);
217 
218  f_w_r_m_.Write(os, binary);
219 
220  if (!binary) os << "\n";
221  // writing parameters, backward direction,
222  b_w_gifo_x_.Write(os, binary);
223  b_w_gifo_r_.Write(os, binary);
224  b_bias_.Write(os, binary);
225 
226  b_peephole_i_c_.Write(os, binary);
227  b_peephole_f_c_.Write(os, binary);
228  b_peephole_o_c_.Write(os, binary);
229 
230  b_w_r_m_.Write(os, binary);
231  }
232 
233  int32 NumParams() const {
234  return 2 * ( f_w_gifo_x_.NumRows() * f_w_gifo_x_.NumCols() +
235  f_w_gifo_r_.NumRows() * f_w_gifo_r_.NumCols() +
236  f_bias_.Dim() +
237  f_peephole_i_c_.Dim() +
238  f_peephole_f_c_.Dim() +
239  f_peephole_o_c_.Dim() +
240  f_w_r_m_.NumRows() * f_w_r_m_.NumCols() );
241  }
242 
243  void GetGradient(VectorBase<BaseFloat>* gradient) const {
244  KALDI_ASSERT(gradient->Dim() == NumParams());
245  int32 offset, len;
246 
247  // Copying parameters corresponding to forward direction
248  offset = 0; len = f_w_gifo_x_.NumRows() * f_w_gifo_x_.NumCols();
249  gradient->Range(offset, len).CopyRowsFromMat(f_w_gifo_x_corr_);
250 
251  offset += len; len = f_w_gifo_r_.NumRows() * f_w_gifo_r_.NumCols();
252  gradient->Range(offset, len).CopyRowsFromMat(f_w_gifo_r_corr_);
253 
254  offset += len; len = f_bias_.Dim();
255  gradient->Range(offset, len).CopyFromVec(f_bias_corr_);
256 
257  offset += len; len = f_peephole_i_c_.Dim();
258  gradient->Range(offset, len).CopyFromVec(f_peephole_i_c_corr_);
259 
260  offset += len; len = f_peephole_f_c_.Dim();
261  gradient->Range(offset, len).CopyFromVec(f_peephole_f_c_corr_);
262 
263  offset += len; len = f_peephole_o_c_.Dim();
264  gradient->Range(offset, len).CopyFromVec(f_peephole_o_c_corr_);
265 
266  offset += len; len = f_w_r_m_.NumRows() * f_w_r_m_.NumCols();
267  gradient->Range(offset, len).CopyRowsFromMat(f_w_r_m_corr_);
268 
269  // Copying parameters corresponding to backward direction
270  offset += len; len = b_w_gifo_x_.NumRows() * b_w_gifo_x_.NumCols();
271  gradient->Range(offset, len).CopyRowsFromMat(b_w_gifo_x_corr_);
272 
273  offset += len; len = b_w_gifo_r_.NumRows() * b_w_gifo_r_.NumCols();
274  gradient->Range(offset, len).CopyRowsFromMat(b_w_gifo_r_corr_);
275 
276  offset += len; len = b_bias_.Dim();
277  gradient->Range(offset, len).CopyFromVec(b_bias_corr_);
278 
279  offset += len; len = b_peephole_i_c_.Dim();
280  gradient->Range(offset, len).CopyFromVec(b_peephole_i_c_corr_);
281 
282  offset += len; len = b_peephole_f_c_.Dim();
283  gradient->Range(offset, len).CopyFromVec(b_peephole_f_c_corr_);
284 
285  offset += len; len = b_peephole_o_c_.Dim();
286  gradient->Range(offset, len).CopyFromVec(b_peephole_o_c_corr_);
287 
288  offset += len; len = b_w_r_m_.NumRows() * b_w_r_m_.NumCols();
289  gradient->Range(offset, len).CopyRowsFromMat(b_w_r_m_corr_);
290 
291  // check the dim,
292  offset += len;
293  KALDI_ASSERT(offset == NumParams());
294  }
295 
296  void GetParams(VectorBase<BaseFloat>* params) const {
297  KALDI_ASSERT(params->Dim() == NumParams());
298  int32 offset, len;
299 
300  // Copying parameters corresponding to forward direction
301  offset = 0; len = f_w_gifo_x_.NumRows() * f_w_gifo_x_.NumCols();
302  params->Range(offset, len).CopyRowsFromMat(f_w_gifo_x_);
303 
304  offset += len; len = f_w_gifo_r_.NumRows() * f_w_gifo_r_.NumCols();
305  params->Range(offset, len).CopyRowsFromMat(f_w_gifo_r_);
306 
307  offset += len; len = f_bias_.Dim();
308  params->Range(offset, len).CopyFromVec(f_bias_);
309 
310  offset += len; len = f_peephole_i_c_.Dim();
311  params->Range(offset, len).CopyFromVec(f_peephole_i_c_);
312 
313  offset += len; len = f_peephole_f_c_.Dim();
314  params->Range(offset, len).CopyFromVec(f_peephole_f_c_);
315 
316  offset += len; len = f_peephole_o_c_.Dim();
317  params->Range(offset, len).CopyFromVec(f_peephole_o_c_);
318 
319  offset += len; len = f_w_r_m_.NumRows() * f_w_r_m_.NumCols();
320  params->Range(offset, len).CopyRowsFromMat(f_w_r_m_);
321 
322  // Copying parameters corresponding to backward direction
323  offset += len; len = b_w_gifo_x_.NumRows() * b_w_gifo_x_.NumCols();
324  params->Range(offset, len).CopyRowsFromMat(b_w_gifo_x_);
325 
326  offset += len; len = b_w_gifo_r_.NumRows() * b_w_gifo_r_.NumCols();
327  params->Range(offset, len).CopyRowsFromMat(b_w_gifo_r_);
328 
329  offset += len; len = b_bias_.Dim();
330  params->Range(offset, len).CopyFromVec(b_bias_);
331 
332  offset += len; len = b_peephole_i_c_.Dim();
333  params->Range(offset, len).CopyFromVec(b_peephole_i_c_);
334 
335  offset += len; len = b_peephole_f_c_.Dim();
336  params->Range(offset, len).CopyFromVec(b_peephole_f_c_);
337 
338  offset += len; len = b_peephole_o_c_.Dim();
339  params->Range(offset, len).CopyFromVec(b_peephole_o_c_);
340 
341  offset += len; len = b_w_r_m_.NumRows() * b_w_r_m_.NumCols();
342  params->Range(offset, len).CopyRowsFromMat(b_w_r_m_);
343 
344  // check the dim,
345  offset += len;
346  KALDI_ASSERT(offset == NumParams());
347  }
348 
349  void SetParams(const VectorBase<BaseFloat>& params) {
350  KALDI_ASSERT(params.Dim() == NumParams());
351  int32 offset, len;
352 
353  // Copying parameters corresponding to forward direction
354  offset = 0; len = f_w_gifo_x_.NumRows() * f_w_gifo_x_.NumCols();
355  f_w_gifo_x_.CopyRowsFromVec(params.Range(offset, len));
356 
357  offset += len; len = f_w_gifo_r_.NumRows() * f_w_gifo_r_.NumCols();
358  f_w_gifo_r_.CopyRowsFromVec(params.Range(offset, len));
359 
360  offset += len; len = f_bias_.Dim();
361  f_bias_.CopyFromVec(params.Range(offset, len));
362 
363  offset += len; len = f_peephole_i_c_.Dim();
364  f_peephole_i_c_.CopyFromVec(params.Range(offset, len));
365 
366  offset += len; len = f_peephole_f_c_.Dim();
367  f_peephole_f_c_.CopyFromVec(params.Range(offset, len));
368 
369  offset += len; len = f_peephole_o_c_.Dim();
370  f_peephole_o_c_.CopyFromVec(params.Range(offset, len));
371 
372  offset += len; len = f_w_r_m_.NumRows() * f_w_r_m_.NumCols();
373  f_w_r_m_.CopyRowsFromVec(params.Range(offset, len));
374 
375  // Copying parameters corresponding to backward direction
376  offset += len; len = b_w_gifo_x_.NumRows() * b_w_gifo_x_.NumCols();
377  b_w_gifo_x_.CopyRowsFromVec(params.Range(offset, len));
378 
379  offset += len; len = b_w_gifo_r_.NumRows() * b_w_gifo_r_.NumCols();
380  b_w_gifo_r_.CopyRowsFromVec(params.Range(offset, len));
381 
382  offset += len; len = b_bias_.Dim();
383  b_bias_.CopyFromVec(params.Range(offset, len));
384 
385  offset += len; len = b_peephole_i_c_.Dim();
386  b_peephole_i_c_.CopyFromVec(params.Range(offset, len));
387 
388  offset += len; len = b_peephole_f_c_.Dim();
389  b_peephole_f_c_.CopyFromVec(params.Range(offset, len));
390 
391  offset += len; len = b_peephole_o_c_.Dim();
392  b_peephole_o_c_.CopyFromVec(params.Range(offset, len));
393 
394  offset += len; len = b_w_r_m_.NumRows() * b_w_r_m_.NumCols();
395  b_w_r_m_.CopyRowsFromVec(params.Range(offset, len));
396 
397  // check the dim,
398  offset += len;
399  KALDI_ASSERT(offset == NumParams());
400  }
401 
402 
403  std::string Info() const {
404  return std::string("cell-dim 2x") + ToString(cell_dim_) + " " +
405  "( learn_rate_coef_ " + ToString(learn_rate_coef_) +
406  ", bias_learn_rate_coef_ " + ToString(bias_learn_rate_coef_) +
407  ", cell_clip_ " + ToString(cell_clip_) +
408  ", diff_clip_ " + ToString(diff_clip_) +
409  ", grad_clip_ " + ToString(grad_clip_) + " )" +
410  "\n Forward Direction weights:" +
411  "\n f_w_gifo_x_ " + MomentStatistics(f_w_gifo_x_) +
412  "\n f_w_gifo_r_ " + MomentStatistics(f_w_gifo_r_) +
413  "\n f_bias_ " + MomentStatistics(f_bias_) +
414  "\n f_peephole_i_c_ " + MomentStatistics(f_peephole_i_c_) +
415  "\n f_peephole_f_c_ " + MomentStatistics(f_peephole_f_c_) +
416  "\n f_peephole_o_c_ " + MomentStatistics(f_peephole_o_c_) +
417  "\n f_w_r_m_ " + MomentStatistics(f_w_r_m_) +
418  "\n Backward Direction weights:" +
419  "\n b_w_gifo_x_ " + MomentStatistics(b_w_gifo_x_) +
420  "\n b_w_gifo_r_ " + MomentStatistics(b_w_gifo_r_) +
421  "\n b_bias_ " + MomentStatistics(b_bias_) +
422  "\n b_peephole_i_c_ " + MomentStatistics(b_peephole_i_c_) +
423  "\n b_peephole_f_c_ " + MomentStatistics(b_peephole_f_c_) +
424  "\n b_peephole_o_c_ " + MomentStatistics(b_peephole_o_c_) +
425  "\n b_w_r_m_ " + MomentStatistics(b_w_r_m_);
426  }
427 
428 
  /// Human-readable summary of the accumulated gradients, plus moment
  /// statistics of the internal activations and derivatives from the last
  /// Propagate/Backpropagate call.
  std::string InfoGradient() const {
    // forward-direction activations,
    // NOTE(review): this listing elides the CuSubMatrix column-view
    // declarations (YG_FW..YR_FW, and the blocks below) into
    // f_propagate_buf_ / b_propagate_buf_ / the backpropagate buffers
    // that the MomentStatistics() calls below refer to.

    // forward-direction derivatives,
    // (DG_FW..DR_FW views elided here,)

    // backward-direction activations,
    // (YG_BW..YR_BW views elided here,)

    // backward-direction derivatives,
    // (DG_BW..DR_BW views elided here,)

    return std::string("") +
      "( learn_rate_coef_ " + ToString(learn_rate_coef_) +
      ", bias_learn_rate_coef_ " + ToString(bias_learn_rate_coef_) +
      ", cell_clip_ " + ToString(cell_clip_) +
      ", diff_clip_ " + ToString(diff_clip_) +
      ", grad_clip_ " + ToString(grad_clip_) + " )" +
      "\n ### Gradients " +
      "\n f_w_gifo_x_corr_ " + MomentStatistics(f_w_gifo_x_corr_) +
      "\n f_w_gifo_r_corr_ " + MomentStatistics(f_w_gifo_r_corr_) +
      "\n f_bias_corr_ " + MomentStatistics(f_bias_corr_) +
      "\n f_peephole_i_c_corr_ " + MomentStatistics(f_peephole_i_c_corr_) +
      "\n f_peephole_f_c_corr_ " + MomentStatistics(f_peephole_f_c_corr_) +
      "\n f_peephole_o_c_corr_ " + MomentStatistics(f_peephole_o_c_corr_) +
      "\n f_w_r_m_corr_ " + MomentStatistics(f_w_r_m_corr_) +
      "\n ---" +
      "\n b_w_gifo_x_corr_ " + MomentStatistics(b_w_gifo_x_corr_) +
      "\n b_w_gifo_r_corr_ " + MomentStatistics(b_w_gifo_r_corr_) +
      "\n b_bias_corr_ " + MomentStatistics(b_bias_corr_) +
      "\n b_peephole_i_c_corr_ " + MomentStatistics(b_peephole_i_c_corr_) +
      "\n b_peephole_f_c_corr_ " + MomentStatistics(b_peephole_f_c_corr_) +
      "\n b_peephole_o_c_corr_ " + MomentStatistics(b_peephole_o_c_corr_) +
      "\n b_w_r_m_corr_ " + MomentStatistics(b_w_r_m_corr_) +
      "\n" +
      // '^' marks sigmoid outputs, '*' marks optionally-clipped values,
      // the (lo..hi) annotations give the expected value ranges,
      "\n ### Activations (mostly after non-linearities)" +
      "\n YI_FW(0..1)^ " + MomentStatistics(YI_FW) +
      "\n YF_FW(0..1)^ " + MomentStatistics(YF_FW) +
      "\n YO_FW(0..1)^ " + MomentStatistics(YO_FW) +
      "\n YG_FW(-1..1) " + MomentStatistics(YG_FW) +
      "\n YC_FW(-R..R)* " + MomentStatistics(YC_FW) +
      "\n YH_FW(-1..1) " + MomentStatistics(YH_FW) +
      "\n YM_FW(-1..1) " + MomentStatistics(YM_FW) +
      "\n YR_FW(-R..R) " + MomentStatistics(YR_FW) +
      "\n ---" +
      "\n YI_BW(0..1)^ " + MomentStatistics(YI_BW) +
      "\n YF_BW(0..1)^ " + MomentStatistics(YF_BW) +
      "\n YO_BW(0..1)^ " + MomentStatistics(YO_BW) +
      "\n YG_BW(-1..1) " + MomentStatistics(YG_BW) +
      "\n YC_BW(-R..R)* " + MomentStatistics(YC_BW) +
      "\n YH_BW(-1..1) " + MomentStatistics(YH_BW) +
      "\n YM_BW(-1..1) " + MomentStatistics(YM_BW) +
      "\n YR_BW(-R..R) " + MomentStatistics(YR_BW) +
      "\n" +
      "\n ### Derivatives (w.r.t. inputs of non-linearities)" +
      "\n DI_FW^ " + MomentStatistics(DI_FW) +
      "\n DF_FW^ " + MomentStatistics(DF_FW) +
      "\n DO_FW^ " + MomentStatistics(DO_FW) +
      "\n DG_FW " + MomentStatistics(DG_FW) +
      "\n DC_FW* " + MomentStatistics(DC_FW) +
      "\n DH_FW " + MomentStatistics(DH_FW) +
      "\n DM_FW " + MomentStatistics(DM_FW) +
      "\n DR_FW " + MomentStatistics(DR_FW) +
      "\n ---" +
      "\n DI_BW^ " + MomentStatistics(DI_BW) +
      "\n DF_BW^ " + MomentStatistics(DF_BW) +
      "\n DO_BW^ " + MomentStatistics(DO_BW) +
      "\n DG_BW " + MomentStatistics(DG_BW) +
      "\n DC_BW* " + MomentStatistics(DC_BW) +
      "\n DH_BW " + MomentStatistics(DH_BW) +
      "\n DM_BW " + MomentStatistics(DM_BW) +
      "\n DR_BW " + MomentStatistics(DR_BW);
  }
531 
    // NOTE(review): this listing elides the 'PropagateFnc(in, out)' signature
    // and the per-gate CuSubMatrix column-views (F_YG, F_YI, F_YF, F_YO,
    // F_YC, F_YH, F_YM, F_YR and their B_* counterparts) into the propagate
    // buffers; only the F_YGIFO / B_YGIFO views are visible below.

    // Rows are laid out as (time-step) x (stream): row t*S+s is frame t of
    // stream s, with dummy padding rows at t=0 and t=T+1.
    KALDI_ASSERT(in.NumRows() % NumStreams() == 0);
    int32 S = NumStreams();
    int32 T = in.NumRows() / NumStreams();

    // buffers (one per direction); columns hold the per-gate activations
    // (g,i,f,o in the first 4*cell_dim_ columns) plus cell/output state,
    // with the projection 'r' in the last proj_dim_ columns (see the
    // ColRange(7*cell_dim_, proj_dim_) reads at the bottom),
    f_propagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
    b_propagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);

    // forward-direction activations,
    CuSubMatrix<BaseFloat> F_YGIFO(f_propagate_buf_.ColRange(0, 4*cell_dim_));

    // backward-direction activations,
    CuSubMatrix<BaseFloat> B_YGIFO(b_propagate_buf_.ColRange(0, 4*cell_dim_));

    // FORWARD DIRECTION,
    // x -> g, i, f, o, not recurrent, do it all in once
    F_YGIFO.RowRange(1*S, T*S).AddMatMat(1.0, in, kNoTrans, f_w_gifo_x_, kTrans, 0.0);

    // bias -> g, i, f, o
    F_YGIFO.RowRange(1*S, T*S).AddVecToRows(1.0, f_bias_);

    // BufferPadding [T0]:dummy, [1, T]:current sequence, [T+1]:dummy
    for (int t = 1; t <= T; t++) {
      // multistream buffers for current time-step,
      CuSubMatrix<BaseFloat> y_all(f_propagate_buf_.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_g(F_YG.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_i(F_YI.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_f(F_YF.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_o(F_YO.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_c(F_YC.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_h(F_YH.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_m(F_YM.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_r(F_YR.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_gifo(F_YGIFO.RowRange(t*S, S));

      // r(t-1) -> g, i, f, o (the recurrent connection),
      y_gifo.AddMatMat(1.0, F_YR.RowRange((t-1)*S, S), kNoTrans, f_w_gifo_r_, kTrans, 1.0);

      // c(t-1) -> i(t) via peephole
      y_i.AddMatDiagVec(1.0, F_YC.RowRange((t-1)*S, S), kNoTrans, f_peephole_i_c_, 1.0);

      // c(t-1) -> f(t) via peephole
      y_f.AddMatDiagVec(1.0, F_YC.RowRange((t-1)*S, S), kNoTrans, f_peephole_f_c_, 1.0);

      // i, f sigmoid squashing
      y_i.Sigmoid(y_i);
      y_f.Sigmoid(y_f);

      // g tanh squashing
      y_g.Tanh(y_g);

      // g * i -> c
      y_c.AddMatMatElements(1.0, y_g, y_i, 0.0);
      // c(t-1) * f -> c(t) via forget-gate
      y_c.AddMatMatElements(1.0, F_YC.RowRange((t-1)*S, S), y_f, 1.0);

      if (cell_clip_ > 0.0) {
        y_c.ApplyFloor(-cell_clip_);  // Optional clipping of cell activation,
        y_c.ApplyCeiling(cell_clip_);  // Google paper Interspeech2014: LSTM for LVCSR
      }

      // c(t) -> o(t) via peephole (not recurrent, using c(t))
      y_o.AddMatDiagVec(1.0, y_c, kNoTrans, f_peephole_o_c_, 1.0);

      // o sigmoid squashing,
      y_o.Sigmoid(y_o);

      // c -> h, tanh squashing,
      y_h.Tanh(y_c);

      // h * o -> m via output gate,
      y_m.AddMatMatElements(1.0, y_h, y_o, 0.0);

      // m -> r (recurrent projection, dimensionality reduction),
      y_r.AddMatMat(1.0, y_m, kNoTrans, f_w_r_m_, kTrans, 0.0);

      // set zeros to padded frames (streams shorter than t frames),
      if (sequence_lengths_.size() > 0) {
        for (int s = 0; s < S; s++) {
          if (t > sequence_lengths_[s]) {
            y_all.Row(s).SetZero();
          }
        }
      }
    }

    // BACKWARD DIRECTION (same recurrence, time reversed),
    // x -> g, i, f, o, not recurrent, do it all in once
    B_YGIFO.RowRange(1*S, T*S).AddMatMat(1.0, in, kNoTrans, b_w_gifo_x_, kTrans, 0.0);

    // bias -> g, i, f, o
    B_YGIFO.RowRange(1*S, T*S).AddVecToRows(1.0, b_bias_);

    // BufferPadding [T0]:dummy, [1, T]:current sequence, [T+1]:dummy
    for (int t = T; t >= 1; t--) {
      // multistream buffers for current time-step,
      CuSubMatrix<BaseFloat> y_all(b_propagate_buf_.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_g(B_YG.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_i(B_YI.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_f(B_YF.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_o(B_YO.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_c(B_YC.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_h(B_YH.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_m(B_YM.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_r(B_YR.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_gifo(B_YGIFO.RowRange(t*S, S));

      // r(t+1) -> g, i, f, o (recurrence runs backwards in time),
      y_gifo.AddMatMat(1.0, B_YR.RowRange((t+1)*S, S), kNoTrans, b_w_gifo_r_, kTrans, 1.0);

      // c(t+1) -> i(t) via peephole
      y_i.AddMatDiagVec(1.0, B_YC.RowRange((t+1)*S, S), kNoTrans, b_peephole_i_c_, 1.0);

      // c(t+1) -> f(t) via peephole
      y_f.AddMatDiagVec(1.0, B_YC.RowRange((t+1)*S, S), kNoTrans, b_peephole_f_c_, 1.0);

      // i, f sigmoid squashing
      y_i.Sigmoid(y_i);
      y_f.Sigmoid(y_f);

      // g tanh squashing
      y_g.Tanh(y_g);

      // g * i -> c
      y_c.AddMatMatElements(1.0, y_g, y_i, 0.0);
      // c(t+1) * f -> c(t) via forget-gate
      y_c.AddMatMatElements(1.0, B_YC.RowRange((t+1)*S, S), y_f, 1.0);

      if (cell_clip_ > 0.0) {
        y_c.ApplyFloor(-cell_clip_);  // optional clipping of cell activation,
        y_c.ApplyCeiling(cell_clip_);  // google paper Interspeech2014: LSTM for LVCSR
      }

      // c(t) -> o(t) via peephole (not recurrent, using c(t))
      y_o.AddMatDiagVec(1.0, y_c, kNoTrans, b_peephole_o_c_, 1.0);

      // o sigmoid squashing,
      y_o.Sigmoid(y_o);

      // h tanh squashing,
      y_h.Tanh(y_c);

      // h * o -> m via output gate,
      y_m.AddMatMatElements(1.0, y_h, y_o, 0.0);

      // m -> r
      y_r.AddMatMat(1.0, y_m, kNoTrans, b_w_r_m_, kTrans, 0.0);

      // set zeros to padded frames,
      if (sequence_lengths_.size() > 0) {
        for (int s = 0; s < S; s++) {
          if (t > sequence_lengths_[s]) {
            y_all.Row(s).SetZero();
          }
        }
      }
    }

    // Concatenate the forward and backward projections side-by-side,
    CuMatrix<BaseFloat> YR_FB;
    YR_FB.Resize((T+2)*S, 2 * proj_dim_, kSetZero);
    // forward part
    YR_FB.ColRange(0, proj_dim_).CopyFromMat(f_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
    // backward part
    YR_FB.ColRange(proj_dim_, proj_dim_).CopyFromMat(b_propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
    // recurrent projection layer is also feed-forward as BLSTM output
    // (drop the two padding time-steps),
    out->CopyFromMat(YR_FB.RowRange(1*S, T*S));
  }
718 
719 
721  const CuMatrixBase<BaseFloat> &out,
722  const CuMatrixBase<BaseFloat> &out_diff,
723  CuMatrixBase<BaseFloat> *in_diff) {
724 
725  // the number of sequences to be processed in parallel
726  int32 T = in.NumRows() / NumStreams();
727  int32 S = NumStreams();
728 
729  // buffers,
730  f_backpropagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
731  b_backpropagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
732 
733  // FORWARD DIRECTION,
734  // forward-direction activations,
743 
744  // forward-direction derivatives,
753  CuSubMatrix<BaseFloat> F_DGIFO(f_backpropagate_buf_.ColRange(0, 4*cell_dim_));
754 
755  // pre-copy partial derivatives from the BLSTM output,
756  F_DR.RowRange(1*S, T*S).CopyFromMat(out_diff.ColRange(0, proj_dim_));
757 
758  // BufferPadding [T0]:dummy, [1,T]:current sequence, [T+1]: dummy,
759  for (int t = T; t >= 1; t--) {
760  CuSubMatrix<BaseFloat> y_g(F_YG.RowRange(t*S, S));
761  CuSubMatrix<BaseFloat> y_i(F_YI.RowRange(t*S, S));
762  CuSubMatrix<BaseFloat> y_f(F_YF.RowRange(t*S, S));
763  CuSubMatrix<BaseFloat> y_o(F_YO.RowRange(t*S, S));
764  // CuSubMatrix<BaseFloat> y_c(F_YC.RowRange(t*S, S));
765  CuSubMatrix<BaseFloat> y_h(F_YH.RowRange(t*S, S));
766  // CuSubMatrix<BaseFloat> y_m(F_YM.RowRange(t*S, S));
767  // CuSubMatrix<BaseFloat> y_r(F_YR.RowRange(t*S, S));
768 
769  CuSubMatrix<BaseFloat> d_all(f_backpropagate_buf_.RowRange(t*S, S));
770  CuSubMatrix<BaseFloat> d_g(F_DG.RowRange(t*S, S));
771  CuSubMatrix<BaseFloat> d_i(F_DI.RowRange(t*S, S));
772  CuSubMatrix<BaseFloat> d_f(F_DF.RowRange(t*S, S));
773  CuSubMatrix<BaseFloat> d_o(F_DO.RowRange(t*S, S));
774  CuSubMatrix<BaseFloat> d_c(F_DC.RowRange(t*S, S));
775  CuSubMatrix<BaseFloat> d_h(F_DH.RowRange(t*S, S));
776  CuSubMatrix<BaseFloat> d_m(F_DM.RowRange(t*S, S));
777  CuSubMatrix<BaseFloat> d_r(F_DR.RowRange(t*S, S));
778  CuSubMatrix<BaseFloat> d_gifo(F_DGIFO.RowRange(t*S, S));
779 
780  // r
781  // Version 1 (precise gradients):
782  // backprop error from g(t+1), i(t+1), f(t+1), o(t+1) to r(t)
783  d_r.AddMatMat(1.0, F_DGIFO.RowRange((t+1)*S, S), kNoTrans, f_w_gifo_r_, kNoTrans, 1.0);
784 
785  /*
786  // Version 2 (Alex Graves' PhD dissertation):
787  // only backprop g(t+1) to r(t)
788  CuSubMatrix<BaseFloat> w_g_r_(w_gifo_r_.RowRange(0, cell_dim_));
789  d_r.AddMatMat(1.0, DG.RowRange((t+1)*S,S), kNoTrans, w_g_r_, kNoTrans, 1.0);
790  */
791 
792  /*
793  // Version 3 (Felix Gers' PhD dissertation):
794  // truncate gradients of g(t+1), i(t+1), f(t+1), o(t+1) once they leak out memory block
795  // CEC(with forget connection) is the only "error-bridge" through time
796  ;
797  */
798 
799  // r -> m
800  d_m.AddMatMat(1.0, d_r, kNoTrans, f_w_r_m_, kNoTrans, 0.0);
801 
802  // m -> h, via output gate
803  d_h.AddMatMatElements(1.0, d_m, y_o, 0.0);
804  d_h.DiffTanh(y_h, d_h);
805 
806  // o
807  d_o.AddMatMatElements(1.0, d_m, y_h, 0.0);
808  d_o.DiffSigmoid(y_o, d_o);
809 
810  // c
811  // 1. diff from h(t)
812  // 2. diff from c(t+1) (via forget-gate between CEC)
813  // 3. diff from i(t+1) (via peephole)
814  // 4. diff from f(t+1) (via peephole)
815  // 5. diff from o(t) (via peephole, not recurrent)
816  d_c.AddMat(1.0, d_h);
817  d_c.AddMatMatElements(1.0, F_DC.RowRange((t+1)*S, S), F_YF.RowRange((t+1)*S, S), 1.0);
818  d_c.AddMatDiagVec(1.0, F_DI.RowRange((t+1)*S, S), kNoTrans, f_peephole_i_c_, 1.0);
819  d_c.AddMatDiagVec(1.0, F_DF.RowRange((t+1)*S, S), kNoTrans, f_peephole_f_c_, 1.0);
820  d_c.AddMatDiagVec(1.0, d_o , kNoTrans, f_peephole_o_c_, 1.0);
821  // optionally clip the cell_derivative,
822  if (cell_diff_clip_ > 0.0) {
823  d_c.ApplyFloor(-cell_diff_clip_);
824  d_c.ApplyCeiling(cell_diff_clip_);
825  }
826 
827  // f
828  d_f.AddMatMatElements(1.0, d_c, F_YC.RowRange((t-1)*S, S), 0.0);
829  d_f.DiffSigmoid(y_f, d_f);
830 
831  // i
832  d_i.AddMatMatElements(1.0, d_c, y_g, 0.0);
833  d_i.DiffSigmoid(y_i, d_i);
834 
835  // c -> g, via input gate
836  d_g.AddMatMatElements(1.0, d_c, y_i, 0.0);
837  d_g.DiffTanh(y_g, d_g);
838 
839  // Clipping per-frame derivatives for the next `t'.
840  // Clipping applied to gates and input gate (as done in Google).
841  // [ICASSP2015, Sak, Learning acoustic frame labelling...],
842  //
843  // The path from 'out_diff' to 'd_c' via 'd_h' is unclipped,
844  // which is probably important for the 'Constant Error Carousel'
845  // to work well.
846  //
847  if (diff_clip_ > 0.0) {
848  d_gifo.ApplyFloor(-diff_clip_);
849  d_gifo.ApplyCeiling(diff_clip_);
850  }
851 
852  // set zeros to padded frames,
853  if (sequence_lengths_.size() > 0) {
854  for (int s = 0; s < S; s++) {
855  if (t > sequence_lengths_[s]) {
856  d_all.Row(s).SetZero();
857  }
858  }
859  }
860  }
861 
862  // BACKWARD DIRECTION,
863  // backward-direction activations,
872 
873  // backward-direction derivatives,
882  CuSubMatrix<BaseFloat> B_DGIFO(b_backpropagate_buf_.ColRange(0, 4*cell_dim_));
883 
884  // pre-copy partial derivatives from the BLSTM output,
885  B_DR.RowRange(1*S, T*S).CopyFromMat(out_diff.ColRange(proj_dim_, proj_dim_));
886 
887  // BufferPadding [T0]:dummy, [1,T]:current sequence, [T+1]: dummy,
888  for (int t = 1; t <= T; t++) {
889  CuSubMatrix<BaseFloat> y_g(B_YG.RowRange(t*S, S));
890  CuSubMatrix<BaseFloat> y_i(B_YI.RowRange(t*S, S));
891  CuSubMatrix<BaseFloat> y_f(B_YF.RowRange(t*S, S));
892  CuSubMatrix<BaseFloat> y_o(B_YO.RowRange(t*S, S));
893  // CuSubMatrix<BaseFloat> y_c(B_YC.RowRange(t*S, S));
894  CuSubMatrix<BaseFloat> y_h(B_YH.RowRange(t*S, S));
895  // CuSubMatrix<BaseFloat> y_m(B_YM.RowRange(t*S, S));
896  // CuSubMatrix<BaseFloat> y_r(B_YR.RowRange(t*S, S));
897 
898  CuSubMatrix<BaseFloat> d_all(b_backpropagate_buf_.RowRange(t*S, S));
899  CuSubMatrix<BaseFloat> d_g(B_DG.RowRange(t*S, S));
900  CuSubMatrix<BaseFloat> d_i(B_DI.RowRange(t*S, S));
901  CuSubMatrix<BaseFloat> d_f(B_DF.RowRange(t*S, S));
902  CuSubMatrix<BaseFloat> d_o(B_DO.RowRange(t*S, S));
903  CuSubMatrix<BaseFloat> d_c(B_DC.RowRange(t*S, S));
904  CuSubMatrix<BaseFloat> d_h(B_DH.RowRange(t*S, S));
905  CuSubMatrix<BaseFloat> d_m(B_DM.RowRange(t*S, S));
906  CuSubMatrix<BaseFloat> d_r(B_DR.RowRange(t*S, S));
907  CuSubMatrix<BaseFloat> d_gifo(B_DGIFO.RowRange(t*S, S));
908 
909  // r
910  // Version 1 (precise gradients):
911  // backprop error from g(t-1), i(t-1), f(t-1), o(t-1) to r(t)
912  d_r.AddMatMat(1.0, B_DGIFO.RowRange((t-1)*S, S), kNoTrans, b_w_gifo_r_, kNoTrans, 1.0);
913 
914  /*
915  // Version 2 (Alex Graves' PhD dissertation):
916  // only backprop g(t+1) to r(t)
917  CuSubMatrix<BaseFloat> w_g_r_(w_gifo_r_.RowRange(0, cell_dim_));
918  d_r.AddMatMat(1.0, DG.RowRange((t+1)*S,S), kNoTrans, w_g_r_, kNoTrans, 1.0);
919  */
920 
921  /*
922  // Version 3 (Felix Gers' PhD dissertation):
923  // truncate gradients of g(t+1), i(t+1), f(t+1), o(t+1) once they leak out memory block
924  // CEC(with forget connection) is the only "error-bridge" through time
925  */
926 
927  // r -> m
928  d_m.AddMatMat(1.0, d_r, kNoTrans, b_w_r_m_, kNoTrans, 0.0);
929 
930  // m -> h via output gate
931  d_h.AddMatMatElements(1.0, d_m, y_o, 0.0);
932  d_h.DiffTanh(y_h, d_h);
933 
934  // o
935  d_o.AddMatMatElements(1.0, d_m, y_h, 0.0);
936  d_o.DiffSigmoid(y_o, d_o);
937 
938  // c
939  // 1. diff from h(t)
940  // 2. diff from c(t+1) (via forget-gate between CEC)
941  // 3. diff from i(t+1) (via peephole)
942  // 4. diff from f(t+1) (via peephole)
943  // 5. diff from o(t) (via peephole, not recurrent)
944  d_c.AddMat(1.0, d_h);
945  d_c.AddMatMatElements(1.0, B_DC.RowRange((t-1)*S, S), B_YF.RowRange((t-1)*S, S), 1.0);
946  d_c.AddMatDiagVec(1.0, B_DI.RowRange((t-1)*S, S), kNoTrans, b_peephole_i_c_, 1.0);
947  d_c.AddMatDiagVec(1.0, B_DF.RowRange((t-1)*S, S), kNoTrans, b_peephole_f_c_, 1.0);
948  d_c.AddMatDiagVec(1.0, d_o , kNoTrans, b_peephole_o_c_, 1.0);
949  // optionally clip the cell_derivative,
950  if (cell_diff_clip_ > 0.0) {
951  d_c.ApplyFloor(-cell_diff_clip_);
952  d_c.ApplyCeiling(cell_diff_clip_);
953  }
954 
955  // f
956  d_f.AddMatMatElements(1.0, d_c, B_YC.RowRange((t-1)*S, S), 0.0);
957  d_f.DiffSigmoid(y_f, d_f);
958 
959  // i
960  d_i.AddMatMatElements(1.0, d_c, y_g, 0.0);
961  d_i.DiffSigmoid(y_i, d_i);
962 
963  // c -> g, via input gate,
964  d_g.AddMatMatElements(1.0, d_c, y_i, 0.0);
965  d_g.DiffTanh(y_g, d_g);
966 
967  // Clipping per-frame derivatives for the next `t'.
968  // Clipping applied to gates and input gate (as done in Google).
969  // [ICASSP2015, Sak, Learning acoustic frame labelling...],
970  //
971  // The path from 'out_diff' to 'd_c' via 'd_h' is unclipped,
972  // which is probably important for the 'Constant Error Carousel'
973  // to work well.
974  //
975  if (diff_clip_ > 0.0) {
976  d_gifo.ApplyFloor(-diff_clip_);
977  d_gifo.ApplyCeiling(diff_clip_);
978  }
979 
980  // set zeros to padded frames,
981  if (sequence_lengths_.size() > 0) {
982  for (int s = 0; s < S; s++) {
983  if (t > sequence_lengths_[s]) {
984  d_all.Row(s).SetZero();
985  }
986  }
987  }
988  }
989 
990  // g,i,f,o -> x, calculating input derivatives,
991  // forward direction difference
992  in_diff->AddMatMat(1.0, F_DGIFO.RowRange(1*S, T*S), kNoTrans, f_w_gifo_x_, kNoTrans, 0.0);
993  // backward direction difference
994  in_diff->AddMatMat(1.0, B_DGIFO.RowRange(1*S, T*S), kNoTrans, b_w_gifo_x_, kNoTrans, 1.0);
995 
 996  // lazy initialization of update buffers,
997  if (f_w_gifo_x_corr_.NumRows() == 0) {
998  // init delta buffers,
999  // forward direction,
1002  f_bias_corr_.Resize(4*cell_dim_, kSetZero);
1007 
1008  // backward direction,
1011  b_bias_corr_.Resize(4*cell_dim_, kSetZero);
1016  }
1017 
1018  // calculate delta
1019  const BaseFloat mmt = opts_.momentum;
1020 
1021  // forward direction
1022  // weight x -> g, i, f, o
1023  f_w_gifo_x_corr_.AddMatMat(1.0, F_DGIFO.RowRange(1*S, T*S), kTrans,
1024  in, kNoTrans, mmt);
1025  // recurrent weight r -> g, i, f, o
1026  f_w_gifo_r_corr_.AddMatMat(1.0, F_DGIFO.RowRange(1*S, T*S), kTrans,
1027  F_YR.RowRange(0*S, T*S), kNoTrans, mmt);
1028  // bias of g, i, f, o
1029  f_bias_corr_.AddRowSumMat(1.0, F_DGIFO.RowRange(1*S, T*S), mmt);
1030 
1031  // recurrent peephole c -> i
1032  f_peephole_i_c_corr_.AddDiagMatMat(1.0, F_DI.RowRange(1*S, T*S), kTrans,
1033  F_YC.RowRange(0*S, T*S), kNoTrans, mmt);
1034  // recurrent peephole c -> f
1035  f_peephole_f_c_corr_.AddDiagMatMat(1.0, F_DF.RowRange(1*S, T*S), kTrans,
1036  F_YC.RowRange(0*S, T*S), kNoTrans, mmt);
1037  // peephole c -> o
1038  f_peephole_o_c_corr_.AddDiagMatMat(1.0, F_DO.RowRange(1*S, T*S), kTrans,
1039  F_YC.RowRange(1*S, T*S), kNoTrans, mmt);
1040 
1041  f_w_r_m_corr_.AddMatMat(1.0, F_DR.RowRange(1*S, T*S), kTrans,
1042  F_YM.RowRange(1*S, T*S), kNoTrans, mmt);
1043 
1044  // backward direction backpropagate
1045  // weight x -> g, i, f, o
1046  b_w_gifo_x_corr_.AddMatMat(1.0, B_DGIFO.RowRange(1*S, T*S), kTrans, in, kNoTrans, mmt);
1047  // recurrent weight r -> g, i, f, o
1048  b_w_gifo_r_corr_.AddMatMat(1.0, B_DGIFO.RowRange(1*S, T*S), kTrans,
1049  B_YR.RowRange(0*S, T*S) , kNoTrans, mmt);
1050  // bias of g, i, f, o
1051  b_bias_corr_.AddRowSumMat(1.0, B_DGIFO.RowRange(1*S, T*S), mmt);
1052 
1053  // recurrent peephole c -> i, c(t+1) --> i
1054  b_peephole_i_c_corr_.AddDiagMatMat(1.0, B_DI.RowRange(1*S, T*S), kTrans,
1055  B_YC.RowRange(2*S, T*S), kNoTrans, mmt);
1056  // recurrent peephole c -> f, c(t+1) --> f
1057  b_peephole_f_c_corr_.AddDiagMatMat(1.0, B_DF.RowRange(1*S, T*S), kTrans,
1058  B_YC.RowRange(2*S, T*S), kNoTrans, mmt);
1059  // peephole c -> o
1060  b_peephole_o_c_corr_.AddDiagMatMat(1.0, B_DO.RowRange(1*S, T*S), kTrans,
1061  B_YC.RowRange(1*S, T*S), kNoTrans, mmt);
1062 
1063  b_w_r_m_corr_.AddMatMat(1.0, B_DR.RowRange(1*S, T*S), kTrans,
1064  B_YM.RowRange(1*S, T*S), kNoTrans, mmt);
1065  }
1066 
1067  void Update(const CuMatrixBase<BaseFloat> &input,
1068  const CuMatrixBase<BaseFloat> &diff) {
1069 
1070  // apply the gradient clipping,
1071  if (grad_clip_ > 0.0) {
1072  f_w_gifo_x_corr_.ApplyFloor(-grad_clip_);
1073  f_w_gifo_x_corr_.ApplyCeiling(grad_clip_);
1074  f_w_gifo_r_corr_.ApplyFloor(-grad_clip_);
1075  f_w_gifo_r_corr_.ApplyCeiling(grad_clip_);
1076  f_bias_corr_.ApplyFloor(-grad_clip_);
1077  f_bias_corr_.ApplyCeiling(grad_clip_);
1078  f_w_r_m_corr_.ApplyFloor(-grad_clip_);
1079  f_w_r_m_corr_.ApplyCeiling(grad_clip_);
1080  f_peephole_i_c_corr_.ApplyFloor(-grad_clip_);
1081  f_peephole_i_c_corr_.ApplyCeiling(grad_clip_);
1082  f_peephole_f_c_corr_.ApplyFloor(-grad_clip_);
1083  f_peephole_f_c_corr_.ApplyCeiling(grad_clip_);
1084  f_peephole_o_c_corr_.ApplyFloor(-grad_clip_);
1085  f_peephole_o_c_corr_.ApplyCeiling(grad_clip_);
1086 
1087  b_w_gifo_x_corr_.ApplyFloor(-grad_clip_);
1088  b_w_gifo_x_corr_.ApplyCeiling(grad_clip_);
1089  b_w_gifo_r_corr_.ApplyFloor(-grad_clip_);
1090  b_w_gifo_r_corr_.ApplyCeiling(grad_clip_);
1091  b_bias_corr_.ApplyFloor(-grad_clip_);
1092  b_bias_corr_.ApplyCeiling(grad_clip_);
1093  b_w_r_m_corr_.ApplyFloor(-grad_clip_);
1094  b_w_r_m_corr_.ApplyCeiling(grad_clip_);
1095  b_peephole_i_c_corr_.ApplyFloor(-grad_clip_);
1096  b_peephole_i_c_corr_.ApplyCeiling(grad_clip_);
1097  b_peephole_f_c_corr_.ApplyFloor(-grad_clip_);
1098  b_peephole_f_c_corr_.ApplyCeiling(grad_clip_);
1099  b_peephole_o_c_corr_.ApplyFloor(-grad_clip_);
1100  b_peephole_o_c_corr_.ApplyCeiling(grad_clip_);
1101  }
1102 
1103  const BaseFloat lr = opts_.learn_rate;
1104 
1105  // forward direction update
1107  f_w_gifo_r_.AddMat(-lr * learn_rate_coef_, f_w_gifo_r_corr_);
1108  f_bias_.AddVec(-lr * bias_learn_rate_coef_, f_bias_corr_, 1.0);
1109 
1110  f_peephole_i_c_.AddVec(-lr * bias_learn_rate_coef_, f_peephole_i_c_corr_, 1.0);
1111  f_peephole_f_c_.AddVec(-lr * bias_learn_rate_coef_, f_peephole_f_c_corr_, 1.0);
1112  f_peephole_o_c_.AddVec(-lr * bias_learn_rate_coef_, f_peephole_o_c_corr_, 1.0);
1113 
1114  f_w_r_m_.AddMat(-lr * learn_rate_coef_, f_w_r_m_corr_);
1115 
1116  // backward direction update
1117  b_w_gifo_x_.AddMat(-lr * learn_rate_coef_, b_w_gifo_x_corr_);
1118  b_w_gifo_r_.AddMat(-lr * learn_rate_coef_, b_w_gifo_r_corr_);
1119  b_bias_.AddVec(-lr * bias_learn_rate_coef_, b_bias_corr_, 1.0);
1120 
1121  b_peephole_i_c_.AddVec(-lr * bias_learn_rate_coef_, b_peephole_i_c_corr_, 1.0);
1122  b_peephole_f_c_.AddVec(-lr * bias_learn_rate_coef_, b_peephole_f_c_corr_, 1.0);
1123  b_peephole_o_c_.AddVec(-lr * bias_learn_rate_coef_, b_peephole_o_c_corr_, 1.0);
1124 
1125  b_w_r_m_.AddMat(-lr * learn_rate_coef_, b_w_r_m_corr_);
1126  }
1127 
1128  private:
1129  // dims
1132 
1137 
1138  // feed-forward connections: from x to [g, i, f, o]
1139  // forward direction
1142  // backward direction
1145 
1146  // recurrent projection connections: from r to [g, i, f, o]
1147  // forward direction
1150  // backward direction
1153 
1154  // biases of [g, i, f, o]
1155  // forward direction
1158  // backward direction
1161 
1162  // peephole from c to i, f, g
1163  // peephole connections are diagonal, so we use vector form,
1164  // forward direction
1168  // backward direction
1172 
1173  // forward direction
1177  // backward direction
1181 
1182  // projection layer r: from m to r
1183  // forward direction
1186  // backward direction
1189 
1190  // propagate buffer: output of [g, i, f, o, c, h, m, r]
1191  // forward direction
1193  // backward direction
1195 
1196  // back-propagate buffer: diff-input of [g, i, f, o, c, h, m, r]
1197  // forward direction
1199  // backward direction
1201 }; // class BlstmProjected
1202 
1203 } // namespace nnet1
1204 } // namespace kaldi
1205 
1206 #endif // KALDI_NNET_NNET_BLSTM_PROJECTED_H_
void GetParams(VectorBase< BaseFloat > *params) const
Get the trainable parameters reshaped as a vector,.
std::string ToString(const T &t)
Convert basic type to a string (please don't overuse),.
Definition: nnet-utils.h:52
CuVector< BaseFloat > b_peephole_i_c_
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
CuMatrix< BaseFloat > b_propagate_buf_
CuVector< BaseFloat > b_peephole_o_c_corr_
CuMatrix< BaseFloat > b_w_gifo_x_corr_
BaseFloat diff_clip_
Clipping of 'derivatives' in backprop (per-frame),.
NnetTrainOptions opts_
Option-class with training hyper-parameters,.
std::string MomentStatistics(const VectorBase< Real > &vec)
Get a string with statistics of the data in a vector, so we can print them easily.
Definition: nnet-utils.h:63
int32 input_dim_
Data members,.
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:55
BaseFloat bias_learn_rate_coef_
Scalar applied to learning rate for bias (to be used in ::Update method),.
BaseFloat learn_rate_coef_
Scalar applied to learning rate for weight matrices (to be used in ::Update method),.
CuVector< BaseFloat > f_peephole_o_c_corr_
std::string InfoGradient() const
Print some additional info about gradient (after <...> and dims),.
void Update(const CuMatrixBase< BaseFloat > &input, const CuMatrixBase< BaseFloat > &diff)
Compute gradient and update parameters,.
CuVector< BaseFloat > b_peephole_i_c_corr_
CuMatrix< BaseFloat > f_w_gifo_x_corr_
void RandUniform(BaseFloat mu, BaseFloat range, CuMatrixBase< Real > *mat, struct RandomState *state=NULL)
Fill CuMatrix with random numbers (Uniform distribution): mu = the mean value, range = the 'width' of...
Definition: nnet-utils.h:188
kaldi::int32 int32
void ReadToken(std::istream &is, bool binary, std::string *str)
ReadToken gets the next token and puts it in str (exception on failure).
Definition: io-funcs.cc:154
void WriteData(std::ostream &os, bool binary) const
Writes the component content.
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
CuMatrix< BaseFloat > f_backpropagate_buf_
CuVector< BaseFloat > f_peephole_o_c_
BaseFloat cell_diff_clip_
Clipping of 'cell-derivatives' accumulated over CEC (per-frame),.
void ReadData(std::istream &is, bool binary)
Reads the component content.
int Peek(std::istream &is, bool binary)
Peek consumes whitespace (if binary == false) and then returns the peek() value of the stream...
Definition: io-funcs.cc:145
ComponentType
Component type identification mechanism,.
CuVector< BaseFloat > f_bias_corr_
int32 proj_dim_
recurrent projection layer dim,
void InitData(std::istream &is)
Initialize the content of the component by the &#39;line&#39; from the prototype,.
void SetParams(const VectorBase< BaseFloat > &params)
Set the trainable parameters from, reshaped as a vector,.
std::string Info() const
Print some additional info (after <ComponentName> and the dims),.
CuVector< BaseFloat > b_peephole_f_c_
BaseFloat grad_clip_
Clipping of the updates,.
void PropagateFnc(const CuMatrixBase< BaseFloat > &in, CuMatrixBase< BaseFloat > *out)
Abstract interface for propagation/backpropagation.
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
Definition: io-funcs.cc:191
void BackpropagateFnc(const CuMatrixBase< BaseFloat > &in, const CuMatrixBase< BaseFloat > &out, const CuMatrixBase< BaseFloat > &out_diff, CuMatrixBase< BaseFloat > *in_diff)
Backward pass transformation (to be implemented by descending class...)
void GetGradient(VectorBase< BaseFloat > *gradient) const
Get gradient reshaped as a vector,.
CuVector< BaseFloat > f_peephole_i_c_corr_
#define KALDI_ERR
Definition: kaldi-error.h:147
CuVector< BaseFloat > f_peephole_f_c_
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C.
Definition: cu-matrix.cc:1291
This class is used for a piece of a CuMatrix.
Definition: matrix-common.h:70
ComponentType GetType() const
Get Type Identification of the component,.
CuVector< BaseFloat > b_peephole_o_c_
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
Definition: io-funcs.cc:134
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
int PeekToken(std::istream &is, bool binary)
PeekToken will return the first character of the next token, or -1 if end of file.
Definition: io-funcs.cc:170
CuSubMatrix< Real > RowRange(const MatrixIndexT row_offset, const MatrixIndexT num_rows) const
Definition: cu-matrix.h:660
CuVector< BaseFloat > b_peephole_f_c_corr_
Class MultistreamComponent is an extension of UpdatableComponent for recurrent networks, which are trained with parallel sequences.
CuVector< BaseFloat > f_peephole_i_c_
BlstmProjected(int32 input_dim, int32 output_dim)
CuMatrix< BaseFloat > b_w_gifo_r_corr_
CuSubMatrix< Real > ColRange(const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
Definition: cu-matrix.h:665
CuMatrix< BaseFloat > f_w_r_m_corr_
Matrix for CUDA computing.
Definition: matrix-common.h:69
int32 NumParams() const
Number of trainable parameters,.
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
CuVector< BaseFloat > f_peephole_f_c_corr_
Component * Copy() const
Copy component (deep copy),.
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:34
Abstract class, building block of the network.
CuMatrix< BaseFloat > f_propagate_buf_
int32 cell_dim_
the number of memory-cell blocks,
std::vector< int32 > sequence_lengths_
BaseFloat cell_clip_
Clipping of 'cell-values' in forward pass (per-frame),.
CuMatrix< BaseFloat > b_w_r_m_corr_
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
CuMatrix< BaseFloat > f_w_gifo_r_corr_
CuMatrix< BaseFloat > b_backpropagate_buf_
void Resize(MatrixIndexT rows, MatrixIndexT cols, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Allocate the memory.
Definition: cu-matrix.cc:50
SubVector< Real > Range(const MatrixIndexT o, const MatrixIndexT l)
Returns a sub-vector of a vector (a range of elements).
Definition: kaldi-vector.h:94
CuVector< BaseFloat > b_bias_corr_