// nnet/nnet-lstm-projected.h

// Copyright 2015-2016 Brno University of Technology (author: Karel Vesely)
// Copyright 2014 Jiayu DU (Jerry), Wei Li

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_NNET_NNET_LSTM_PROJECTED_H_
#define KALDI_NNET_NNET_LSTM_PROJECTED_H_

#include <string>
#include <vector>

#include "nnet/nnet-component.h"
#include "nnet/nnet-utils.h"
#include "cudamatrix/cu-math.h"

/*************************************
 * x: input neuron
 * g: squashing neuron near input
 * i: Input gate
 * f: Forget gate
 * o: Output gate
 * c: memory Cell (CEC)
 * h: squashing neuron near output
 * m: output neuron of Memory block
 * r: recurrent projection neuron
 * y: output neuron of LSTMP
 *************************************/
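/*************************************
 * For orientation, the per-step recurrence these neurons implement
 * (as coded in PropagateFnc() below; '.' denotes element-wise product):
 *   g(t) = tanh(   W_gx x(t) + W_gr r(t-1) + b_g)
 *   i(t) = sigmoid(W_ix x(t) + W_ir r(t-1) + p_i . c(t-1) + b_i)
 *   f(t) = sigmoid(W_fx x(t) + W_fr r(t-1) + p_f . c(t-1) + b_f)
 *   c(t) = f(t) . c(t-1) + i(t) . g(t)   (optionally clipped to +/-cell_clip_)
 *   o(t) = sigmoid(W_ox x(t) + W_or r(t-1) + p_o . c(t) + b_o)
 *   h(t) = tanh(c(t))
 *   m(t) = o(t) . h(t)
 *   y(t) = r(t) = W_rm m(t)
 * The W_*x blocks are stacked row-wise in w_gifo_x_ (order g, i, f, o),
 * the W_*r blocks in w_gifo_r_, the biases in bias_; p_i, p_f, p_o are
 * the peephole vectors.
 *************************************/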

namespace kaldi {
namespace nnet1 {

class LstmProjected : public MultistreamComponent {
 public:
  LstmProjected(int32 input_dim, int32 output_dim):
    MultistreamComponent(input_dim, output_dim),
    cell_dim_(0),
    proj_dim_(output_dim),
    cell_clip_(50.0),
    diff_clip_(1.0),
    cell_diff_clip_(0.0),
    grad_clip_(250.0)
  { }

  ~LstmProjected()
  { }

  Component* Copy() const { return new LstmProjected(*this); }
  ComponentType GetType() const { return kLstmProjected; }

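  // A hypothetical prototype 'line' that InitData() below can parse
  // (<InputDim>/<OutputDim> are consumed by the generic component factory;
  // the dims here are examples only, not recommendations):
  //   <LstmProjected> <InputDim> 40 <OutputDim> 512 <CellDim> 800 <ParamRange> 0.1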
  void InitData(std::istream &is) {
    // define options,
    float param_range = 0.1;
    // parse the line from prototype,
    std::string token;
    while (is >> std::ws, !is.eof()) {
      ReadToken(is, false, &token);
      if (token == "<ParamRange>") ReadBasicType(is, false, &param_range);
      else if (token == "<CellDim>") ReadBasicType(is, false, &cell_dim_);
      else if (token == "<LearnRateCoef>") ReadBasicType(is, false, &learn_rate_coef_);
      else if (token == "<BiasLearnRateCoef>") ReadBasicType(is, false, &bias_learn_rate_coef_);
      else if (token == "<CellClip>") ReadBasicType(is, false, &cell_clip_);
      else if (token == "<DiffClip>") ReadBasicType(is, false, &diff_clip_);
      else if (token == "<CellDiffClip>") ReadBasicType(is, false, &cell_diff_clip_);
      else if (token == "<GradClip>") ReadBasicType(is, false, &grad_clip_);
      else KALDI_ERR << "Unknown token " << token << ", a typo in config?"
                     << " (ParamRange|CellDim|LearnRateCoef|BiasLearnRateCoef|"
                        "CellClip|DiffClip|CellDiffClip|GradClip)";
    }

    // initialize the weights and biases (from uniform distribution),
    w_gifo_x_.Resize(4*cell_dim_, input_dim_, kUndefined);
    w_gifo_r_.Resize(4*cell_dim_, proj_dim_, kUndefined);
    bias_.Resize(4*cell_dim_, kUndefined);
    peephole_i_c_.Resize(cell_dim_, kUndefined);
    peephole_f_c_.Resize(cell_dim_, kUndefined);
    peephole_o_c_.Resize(cell_dim_, kUndefined);
    w_r_m_.Resize(proj_dim_, cell_dim_, kUndefined);
    // (mean), (range),
    RandUniform(0.0, 2.0 * param_range, &w_gifo_x_);
    RandUniform(0.0, 2.0 * param_range, &w_gifo_r_);
    RandUniform(0.0, 2.0 * param_range, &bias_);
    RandUniform(0.0, 2.0 * param_range, &peephole_i_c_);
    RandUniform(0.0, 2.0 * param_range, &peephole_f_c_);
    RandUniform(0.0, 2.0 * param_range, &peephole_o_c_);
    RandUniform(0.0, 2.0 * param_range, &w_r_m_);

    KALDI_ASSERT(cell_dim_ > 0);
    KALDI_ASSERT(learn_rate_coef_ >= 0.0);
    KALDI_ASSERT(bias_learn_rate_coef_ >= 0.0);
  }

  void ReadData(std::istream &is, bool binary) {
    // Read all the '<Tokens>' in arbitrary order,
    while ('<' == Peek(is, binary)) {
      std::string token;
      int first_char = PeekToken(is, binary);
      switch (first_char) {
        case 'C': ReadToken(is, false, &token);
          if (token == "<CellDim>") ReadBasicType(is, binary, &cell_dim_);
          else if (token == "<CellClip>") ReadBasicType(is, binary, &cell_clip_);
          else if (token == "<CellDiffClip>") ReadBasicType(is, binary, &cell_diff_clip_);
          else if (token == "<ClipGradient>") ReadBasicType(is, binary, &grad_clip_);  // bwd-compat.
          else KALDI_ERR << "Unknown token: " << token;
          break;
        case 'L': ExpectToken(is, binary, "<LearnRateCoef>");
          ReadBasicType(is, binary, &learn_rate_coef_);
          break;
        case 'B': ExpectToken(is, binary, "<BiasLearnRateCoef>");
          ReadBasicType(is, binary, &bias_learn_rate_coef_);
          break;
        case 'D': ExpectToken(is, binary, "<DiffClip>");
          ReadBasicType(is, binary, &diff_clip_);
          break;
        case 'G': ExpectToken(is, binary, "<GradClip>");
          ReadBasicType(is, binary, &grad_clip_);
          break;
        default: ReadToken(is, false, &token);
          KALDI_ERR << "Unknown token: " << token;
      }
    }
    KALDI_ASSERT(cell_dim_ != 0);

    // Read the model parameters,
    w_gifo_x_.Read(is, binary);
    w_gifo_r_.Read(is, binary);
    bias_.Read(is, binary);

    peephole_i_c_.Read(is, binary);
    peephole_f_c_.Read(is, binary);
    peephole_o_c_.Read(is, binary);

    w_r_m_.Read(is, binary);
  }

  void WriteData(std::ostream &os, bool binary) const {
    WriteToken(os, binary, "<CellDim>");
    WriteBasicType(os, binary, cell_dim_);

    WriteToken(os, binary, "<LearnRateCoef>");
    WriteBasicType(os, binary, learn_rate_coef_);
    WriteToken(os, binary, "<BiasLearnRateCoef>");
    WriteBasicType(os, binary, bias_learn_rate_coef_);

    WriteToken(os, binary, "<CellClip>");
    WriteBasicType(os, binary, cell_clip_);
    WriteToken(os, binary, "<DiffClip>");
    WriteBasicType(os, binary, diff_clip_);
    WriteToken(os, binary, "<CellDiffClip>");
    WriteBasicType(os, binary, cell_diff_clip_);
    WriteToken(os, binary, "<GradClip>");
    WriteBasicType(os, binary, grad_clip_);

    // write model parameters,
    if (!binary) os << "\n";
    w_gifo_x_.Write(os, binary);
    w_gifo_r_.Write(os, binary);
    bias_.Write(os, binary);

    peephole_i_c_.Write(os, binary);
    peephole_f_c_.Write(os, binary);
    peephole_o_c_.Write(os, binary);

    w_r_m_.Write(os, binary);
  }

  int32 NumParams() const {
    return ( w_gifo_x_.NumRows() * w_gifo_x_.NumCols() +
             w_gifo_r_.NumRows() * w_gifo_r_.NumCols() +
             bias_.Dim() +
             peephole_i_c_.Dim() +
             peephole_f_c_.Dim() +
             peephole_o_c_.Dim() +
             w_r_m_.NumRows() * w_r_m_.NumCols() );
  }

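  // NumParams() in closed form, with C = cell_dim_, I = input_dim_,
  // P = proj_dim_:  4C*I + 4C*P + 4C + 3C + C*P,
  // e.g. the hypothetical I = 40, P = 512, C = 800 gives 2,181,600 parameters.
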
  void GetGradient(VectorBase<BaseFloat>* gradient) const {
    KALDI_ASSERT(gradient->Dim() == NumParams());
    int32 offset, len;

    offset = 0;    len = w_gifo_x_.NumRows() * w_gifo_x_.NumCols();
    gradient->Range(offset, len).CopyRowsFromMat(w_gifo_x_corr_);

    offset += len; len = w_gifo_r_.NumRows() * w_gifo_r_.NumCols();
    gradient->Range(offset, len).CopyRowsFromMat(w_gifo_r_corr_);

    offset += len; len = bias_.Dim();
    gradient->Range(offset, len).CopyFromVec(bias_corr_);

    offset += len; len = peephole_i_c_.Dim();
    gradient->Range(offset, len).CopyFromVec(peephole_i_c_corr_);

    offset += len; len = peephole_f_c_.Dim();
    gradient->Range(offset, len).CopyFromVec(peephole_f_c_corr_);

    offset += len; len = peephole_o_c_.Dim();
    gradient->Range(offset, len).CopyFromVec(peephole_o_c_corr_);

    offset += len; len = w_r_m_.NumRows() * w_r_m_.NumCols();
    gradient->Range(offset, len).CopyRowsFromMat(w_r_m_corr_);

    offset += len;
    KALDI_ASSERT(offset == NumParams());
  }

  void GetParams(VectorBase<BaseFloat>* params) const {
    KALDI_ASSERT(params->Dim() == NumParams());
    int32 offset, len;

    offset = 0;    len = w_gifo_x_.NumRows() * w_gifo_x_.NumCols();
    params->Range(offset, len).CopyRowsFromMat(w_gifo_x_);

    offset += len; len = w_gifo_r_.NumRows() * w_gifo_r_.NumCols();
    params->Range(offset, len).CopyRowsFromMat(w_gifo_r_);

    offset += len; len = bias_.Dim();
    params->Range(offset, len).CopyFromVec(bias_);

    offset += len; len = peephole_i_c_.Dim();
    params->Range(offset, len).CopyFromVec(peephole_i_c_);

    offset += len; len = peephole_f_c_.Dim();
    params->Range(offset, len).CopyFromVec(peephole_f_c_);

    offset += len; len = peephole_o_c_.Dim();
    params->Range(offset, len).CopyFromVec(peephole_o_c_);

    offset += len; len = w_r_m_.NumRows() * w_r_m_.NumCols();
    params->Range(offset, len).CopyRowsFromMat(w_r_m_);

    offset += len;
    KALDI_ASSERT(offset == NumParams());
  }

  void SetParams(const VectorBase<BaseFloat>& params) {
    KALDI_ASSERT(params.Dim() == NumParams());
    int32 offset, len;

    offset = 0;    len = w_gifo_x_.NumRows() * w_gifo_x_.NumCols();
    w_gifo_x_.CopyRowsFromVec(params.Range(offset, len));

    offset += len; len = w_gifo_r_.NumRows() * w_gifo_r_.NumCols();
    w_gifo_r_.CopyRowsFromVec(params.Range(offset, len));

    offset += len; len = bias_.Dim();
    bias_.CopyFromVec(params.Range(offset, len));

    offset += len; len = peephole_i_c_.Dim();
    peephole_i_c_.CopyFromVec(params.Range(offset, len));

    offset += len; len = peephole_f_c_.Dim();
    peephole_f_c_.CopyFromVec(params.Range(offset, len));

    offset += len; len = peephole_o_c_.Dim();
    peephole_o_c_.CopyFromVec(params.Range(offset, len));

    offset += len; len = w_r_m_.NumRows() * w_r_m_.NumCols();
    w_r_m_.CopyRowsFromVec(params.Range(offset, len));

    offset += len;
    KALDI_ASSERT(offset == NumParams());
  }

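  // A hypothetical round-trip through the vectorized parameters, e.g. for
  // gradient checking ('lstm' is an LstmProjected instance; sketch only):
  //   Vector<BaseFloat> p(lstm.NumParams());
  //   lstm.GetParams(&p);  // flatten all weights into 'p',
  //   p.Add(1e-4);         // perturb,
  //   lstm.SetParams(p);   // write the perturbed values back,
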
  std::string Info() const {
    return std::string("cell-dim ") + ToString(cell_dim_) + " " +
      "( learn_rate_coef_ " + ToString(learn_rate_coef_) +
      ", bias_learn_rate_coef_ " + ToString(bias_learn_rate_coef_) +
      ", cell_clip_ " + ToString(cell_clip_) +
      ", diff_clip_ " + ToString(diff_clip_) +
      ", grad_clip_ " + ToString(grad_clip_) + " )" +
      "\n w_gifo_x_ " + MomentStatistics(w_gifo_x_) +
      "\n w_gifo_r_ " + MomentStatistics(w_gifo_r_) +
      "\n bias_ " + MomentStatistics(bias_) +
      "\n peephole_i_c_ " + MomentStatistics(peephole_i_c_) +
      "\n peephole_f_c_ " + MomentStatistics(peephole_f_c_) +
      "\n peephole_o_c_ " + MomentStatistics(peephole_o_c_) +
      "\n w_r_m_ " + MomentStatistics(w_r_m_);
  }

  std::string InfoGradient() const {
    // disassemble forward-propagation buffer into different neurons,
    const CuSubMatrix<BaseFloat> YG(propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> YI(propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> YF(propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> YO(propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> YC(propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> YH(propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> YM(propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> YR(propagate_buf_.ColRange(7*cell_dim_, proj_dim_));

    // disassemble back-propagation buffer into different neurons,
    const CuSubMatrix<BaseFloat> DG(backpropagate_buf_.ColRange(0*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> DI(backpropagate_buf_.ColRange(1*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> DF(backpropagate_buf_.ColRange(2*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> DO(backpropagate_buf_.ColRange(3*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> DC(backpropagate_buf_.ColRange(4*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> DH(backpropagate_buf_.ColRange(5*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> DM(backpropagate_buf_.ColRange(6*cell_dim_, cell_dim_));
    const CuSubMatrix<BaseFloat> DR(backpropagate_buf_.ColRange(7*cell_dim_, proj_dim_));

    return std::string("") +
      "( learn_rate_coef_ " + ToString(learn_rate_coef_) +
      ", bias_learn_rate_coef_ " + ToString(bias_learn_rate_coef_) +
      ", cell_clip_ " + ToString(cell_clip_) +
      ", diff_clip_ " + ToString(diff_clip_) +
      ", grad_clip_ " + ToString(grad_clip_) + " )" +
      "\n ### Gradients " +
      "\n w_gifo_x_corr_ " + MomentStatistics(w_gifo_x_corr_) +
      "\n w_gifo_r_corr_ " + MomentStatistics(w_gifo_r_corr_) +
      "\n bias_corr_ " + MomentStatistics(bias_corr_) +
      "\n peephole_i_c_corr_ " + MomentStatistics(peephole_i_c_corr_) +
      "\n peephole_f_c_corr_ " + MomentStatistics(peephole_f_c_corr_) +
      "\n peephole_o_c_corr_ " + MomentStatistics(peephole_o_c_corr_) +
      "\n w_r_m_corr_ " + MomentStatistics(w_r_m_corr_) +
      "\n ### Activations (mostly after non-linearities)" +
      "\n YI(0..1)^ " + MomentStatistics(YI) +
      "\n YF(0..1)^ " + MomentStatistics(YF) +
      "\n YO(0..1)^ " + MomentStatistics(YO) +
      "\n YG(-1..1) " + MomentStatistics(YG) +
      "\n YC(-R..R)* " + MomentStatistics(YC) +
      "\n YH(-1..1) " + MomentStatistics(YH) +
      "\n YM(-1..1) " + MomentStatistics(YM) +
      "\n YR(-R..R) " + MomentStatistics(YR) +
      "\n ### Derivatives (w.r.t. inputs of non-linearities)" +
      "\n DI^ " + MomentStatistics(DI) +
      "\n DF^ " + MomentStatistics(DF) +
      "\n DO^ " + MomentStatistics(DO) +
      "\n DG " + MomentStatistics(DG) +
      "\n DC* " + MomentStatistics(DC) +
      "\n DH " + MomentStatistics(DH) +
      "\n DM " + MomentStatistics(DM) +
      "\n DR " + MomentStatistics(DR);
  }

  /**
   * TODO: Do we really need this?
   */
  void ResetStreams(const std::vector<int32>& stream_reset_flag) {
    KALDI_ASSERT(NumStreams() == stream_reset_flag.size());
    if (prev_nnet_state_.NumRows() != stream_reset_flag.size()) {
      prev_nnet_state_.Resize(NumStreams(), 7*cell_dim_ + 1*proj_dim_, kSetZero);  // lazy init,
    } else {
      for (int s = 0; s < NumStreams(); s++) {
        if (stream_reset_flag[s] == 1) {  // reset the streams marked by '1',
          prev_nnet_state_.Row(s).SetZero();
        }
      }
    }
  }

  void PropagateFnc(const CuMatrixBase<BaseFloat> &in,
                    CuMatrixBase<BaseFloat> *out) {

    // reset context on each sentence if 'sequence_lengths_' not set
    // (happens in 'nnet-forward' or 'single-stream' training),
    if (sequence_lengths_.size() == 0) {
      ResetStreams(std::vector<int32>(1, 1));
    }

    KALDI_ASSERT(in.NumRows() % NumStreams() == 0);
    int32 T = in.NumRows() / NumStreams();
    int32 S = NumStreams();

    // buffers,
    propagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);
    if (prev_nnet_state_.NumRows() != NumStreams()) {
      prev_nnet_state_.Resize(NumStreams(), 7*cell_dim_ + 1*proj_dim_, kSetZero);  // lazy init,
    } else {
      propagate_buf_.RowRange(0, S).CopyFromMat(prev_nnet_state_);  // use the 'previous-state',
    }

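    // Layout of one row of propagate_buf_ (and backpropagate_buf_):
    //   [ g | i | f | o | c | h | m | r ]
    //   7 blocks of 'cell_dim_' columns, plus 'proj_dim_' columns for 'r'.
    // Rows are time-major with S interleaved streams: row (t*S + s) is
    // stream 's' at time 't'; rows t=0 and t=T+1 are zero padding.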
    // split activations by neuron types,
    CuSubMatrix<BaseFloat> YG(propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YI(propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YF(propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YO(propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YC(propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YH(propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YM(propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YR(propagate_buf_.ColRange(7*cell_dim_, proj_dim_));
    CuSubMatrix<BaseFloat> YGIFO(propagate_buf_.ColRange(0, 4*cell_dim_));

    // x -> g, i, f, o, not recurrent, do it all at once,
    YGIFO.RowRange(1*S, T*S).AddMatMat(1.0, in, kNoTrans, w_gifo_x_, kTrans, 0.0);

    // bias -> g, i, f, o
    YGIFO.RowRange(1*S, T*S).AddVecToRows(1.0, bias_);

    // BufferPadding [T0]: dummy, [1, T]: current sequence, [T+1]: dummy,
    for (int t = 1; t <= T; t++) {
      // multistream buffers for current time-step,
      CuSubMatrix<BaseFloat> y_all(propagate_buf_.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_g(YG.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_i(YI.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_f(YF.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_o(YO.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_c(YC.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_h(YH.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_m(YM.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_r(YR.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_gifo(YGIFO.RowRange(t*S, S));

      // r(t-1) -> g, i, f, o
      y_gifo.AddMatMat(1.0, YR.RowRange((t-1)*S, S), kNoTrans, w_gifo_r_, kTrans, 1.0);

      // c(t-1) -> i(t) via peephole
      y_i.AddMatDiagVec(1.0, YC.RowRange((t-1)*S, S), kNoTrans, peephole_i_c_, 1.0);

      // c(t-1) -> f(t) via peephole
      y_f.AddMatDiagVec(1.0, YC.RowRange((t-1)*S, S), kNoTrans, peephole_f_c_, 1.0);

      // i, f sigmoid squashing
      y_i.Sigmoid(y_i);
      y_f.Sigmoid(y_f);

      // g tanh squashing
      y_g.Tanh(y_g);

      // g * i -> c
      y_c.AddMatMatElements(1.0, y_g, y_i, 0.0);
      // c(t-1) * f -> c(t) via forget-gate
      y_c.AddMatMatElements(1.0, YC.RowRange((t-1)*S, S), y_f, 1.0);

      if (cell_clip_ > 0.0) {
        y_c.ApplyFloor(-cell_clip_);   // optional clipping of cell activation,
        y_c.ApplyCeiling(cell_clip_);  // Google paper, Interspeech 2014: LSTM for LVCSR,
      }

      // c(t) -> o(t) via peephole (non-recurrent, using c(t))
      y_o.AddMatDiagVec(1.0, y_c, kNoTrans, peephole_o_c_, 1.0);

      // o sigmoid squashing,
      y_o.Sigmoid(y_o);

      // h tanh squashing,
      y_h.Tanh(y_c);

      // h * o -> m via output gate,
      y_m.AddMatMatElements(1.0, y_h, y_o, 0.0);

      // m -> r
      y_r.AddMatMat(1.0, y_m, kNoTrans, w_r_m_, kTrans, 0.0);

      // set zeros to padded frames,
      if (sequence_lengths_.size() > 0) {
        for (int s = 0; s < S; s++) {
          if (t > sequence_lengths_[s]) {
            y_all.Row(s).SetZero();
          }
        }
      }
    }

    // set the 'projection layer' output as the LSTM output,
    out->CopyFromMat(YR.RowRange(1*S, T*S));

    // the state in the last 'frame' is transferred (can be a zero vector),
    prev_nnet_state_.CopyFromMat(propagate_buf_.RowRange(T*S, S));
  }
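
  // Note on the multistream layout used above: with S = NumStreams(), the
  // rows of 'in' interleave the parallel sequences frame-by-frame, so rows
  // [s, S+s, 2*S+s, ...] form the s-th sequence; the final LSTM state of
  // each chunk is carried to the next one through 'prev_nnet_state_'.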

  void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in,
                        const CuMatrixBase<BaseFloat> &out,
                        const CuMatrixBase<BaseFloat> &out_diff,
                        CuMatrixBase<BaseFloat> *in_diff) {

    // 'T' frames per sequence, 'S' sequences processed in parallel,
    int32 T = in.NumRows() / NumStreams();
    int32 S = NumStreams();

    // buffer,
    backpropagate_buf_.Resize((T+2)*S, 7 * cell_dim_ + proj_dim_, kSetZero);

    // split activations by neuron types,
    CuSubMatrix<BaseFloat> YG(propagate_buf_.ColRange(0*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YI(propagate_buf_.ColRange(1*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YF(propagate_buf_.ColRange(2*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YO(propagate_buf_.ColRange(3*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YC(propagate_buf_.ColRange(4*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YH(propagate_buf_.ColRange(5*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YM(propagate_buf_.ColRange(6*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> YR(propagate_buf_.ColRange(7*cell_dim_, proj_dim_));

    // split derivatives by neuron types,
    CuSubMatrix<BaseFloat> DG(backpropagate_buf_.ColRange(0*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> DI(backpropagate_buf_.ColRange(1*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> DF(backpropagate_buf_.ColRange(2*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> DO(backpropagate_buf_.ColRange(3*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> DC(backpropagate_buf_.ColRange(4*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> DH(backpropagate_buf_.ColRange(5*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> DM(backpropagate_buf_.ColRange(6*cell_dim_, cell_dim_));
    CuSubMatrix<BaseFloat> DR(backpropagate_buf_.ColRange(7*cell_dim_, proj_dim_));
    CuSubMatrix<BaseFloat> DGIFO(backpropagate_buf_.ColRange(0, 4*cell_dim_));

    // pre-copy partial derivatives from the LSTM output,
    DR.RowRange(1*S, T*S).CopyFromMat(out_diff);

    // BufferPadding [T0]: dummy, [1, T]: current sequence, [T+1]: dummy,
    for (int t = T; t >= 1; t--) {
      CuSubMatrix<BaseFloat> y_g(YG.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_i(YI.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_f(YF.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_o(YO.RowRange(t*S, S));
      // CuSubMatrix<BaseFloat> y_c(YC.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> y_h(YH.RowRange(t*S, S));
      // CuSubMatrix<BaseFloat> y_m(YM.RowRange(t*S, S));
      // CuSubMatrix<BaseFloat> y_r(YR.RowRange(t*S, S));

      CuSubMatrix<BaseFloat> d_all(backpropagate_buf_.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> d_g(DG.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> d_i(DI.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> d_f(DF.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> d_o(DO.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> d_c(DC.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> d_h(DH.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> d_m(DM.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> d_r(DR.RowRange(t*S, S));
      CuSubMatrix<BaseFloat> d_gifo(DGIFO.RowRange(t*S, S));

      // r
      // Version 1 (precise gradients):
      // backprop error from g(t+1), i(t+1), f(t+1), o(t+1) to r(t),
      d_r.AddMatMat(1.0, DGIFO.RowRange((t+1)*S, S), kNoTrans, w_gifo_r_, kNoTrans, 1.0);

      /*
      // Version 2 (Alex Graves' PhD dissertation):
      // only backprop g(t+1) to r(t),
      CuSubMatrix<BaseFloat> w_g_r_(w_gifo_r_.RowRange(0, cell_dim_));
      d_r.AddMatMat(1.0, DG.RowRange((t+1)*S, S), kNoTrans, w_g_r_, kNoTrans, 1.0);
      */

      /*
      // Version 3 (Felix Gers' PhD dissertation):
      // truncate the gradients of g(t+1), i(t+1), f(t+1), o(t+1) once they
      // leak out of the memory block; the CEC (with its forget connection)
      // is then the only "error-bridge" through time,
      */

      // r -> m
      d_m.AddMatMat(1.0, d_r, kNoTrans, w_r_m_, kNoTrans, 0.0);

      // m -> h via output gate
      d_h.AddMatMatElements(1.0, d_m, y_o, 0.0);
      d_h.DiffTanh(y_h, d_h);

      // o
      d_o.AddMatMatElements(1.0, d_m, y_h, 0.0);
      d_o.DiffSigmoid(y_o, d_o);

      // c
      // the error to c(t) accumulates from:
      // 1. diff from h(t),
      // 2. diff from c(t+1) (via forget-gate between CEC),
      // 3. diff from i(t+1) (via peephole),
      // 4. diff from f(t+1) (via peephole),
      // 5. diff from o(t) (via peephole, not recurrent),
      d_c.AddMat(1.0, d_h);
      d_c.AddMatMatElements(1.0, DC.RowRange((t+1)*S, S), YF.RowRange((t+1)*S, S), 1.0);
      d_c.AddMatDiagVec(1.0, DI.RowRange((t+1)*S, S), kNoTrans, peephole_i_c_, 1.0);
      d_c.AddMatDiagVec(1.0, DF.RowRange((t+1)*S, S), kNoTrans, peephole_f_c_, 1.0);
      d_c.AddMatDiagVec(1.0, d_o, kNoTrans, peephole_o_c_, 1.0);
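      // in equation form (with '.' denoting element-wise product):
      //   dc(t) = dh(t) + f(t+1) . dc(t+1)
      //           + p_i . di(t+1) + p_f . df(t+1) + p_o . do(t)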
      // optionally clip the cell-derivative,
      if (cell_diff_clip_ > 0.0) {
        d_c.ApplyFloor(-cell_diff_clip_);
        d_c.ApplyCeiling(cell_diff_clip_);
      }

      // f
      d_f.AddMatMatElements(1.0, d_c, YC.RowRange((t-1)*S, S), 0.0);
      d_f.DiffSigmoid(y_f, d_f);

      // i
      d_i.AddMatMatElements(1.0, d_c, y_g, 0.0);
      d_i.DiffSigmoid(y_i, d_i);

      // c -> g via input gate
      d_g.AddMatMatElements(1.0, d_c, y_i, 0.0);
      d_g.DiffTanh(y_g, d_g);

      // Clipping the per-frame derivatives for the next 't',
      // applied to the gate activations and the cell input 'g' (as done
      // at Google) [ICASSP2015, Sak, "Learning acoustic frame labelling..."].
      //
      // The path from 'out_diff' to 'd_c' via 'd_h' is unclipped,
      // which is probably important for the 'Constant Error Carousel'
      // to work well.
      //
      if (diff_clip_ > 0.0) {
        d_gifo.ApplyFloor(-diff_clip_);
        d_gifo.ApplyCeiling(diff_clip_);
      }

      // set zeros to padded frames,
      if (sequence_lengths_.size() > 0) {
        for (int s = 0; s < S; s++) {
          if (t > sequence_lengths_[s]) {
            d_all.Row(s).SetZero();
          }
        }
      }
    }

    // g,i,f,o -> x, calculating input derivatives,
    in_diff->AddMatMat(1.0, DGIFO.RowRange(1*S, T*S), kNoTrans, w_gifo_x_, kNoTrans, 0.0);

    // lazy initialization of update buffers,
    if (w_gifo_x_corr_.NumRows() == 0) {
      w_gifo_x_corr_.Resize(4*cell_dim_, input_dim_, kSetZero);
      w_gifo_r_corr_.Resize(4*cell_dim_, proj_dim_, kSetZero);
      bias_corr_.Resize(4*cell_dim_, kSetZero);
      peephole_i_c_corr_.Resize(cell_dim_, kSetZero);
      peephole_f_c_corr_.Resize(cell_dim_, kSetZero);
      peephole_o_c_corr_.Resize(cell_dim_, kSetZero);
      w_r_m_corr_.Resize(proj_dim_, cell_dim_, kSetZero);
    }

    // calculate the deltas,
    const BaseFloat mmt = opts_.momentum;

    // weight x -> g, i, f, o
    w_gifo_x_corr_.AddMatMat(1.0, DGIFO.RowRange(1*S, T*S), kTrans,
                                  in, kNoTrans, mmt);
    // recurrent weight r -> g, i, f, o
    w_gifo_r_corr_.AddMatMat(1.0, DGIFO.RowRange(1*S, T*S), kTrans,
                                  YR.RowRange(0*S, T*S), kNoTrans, mmt);
    // bias of g, i, f, o
    bias_corr_.AddRowSumMat(1.0, DGIFO.RowRange(1*S, T*S), mmt);

    // recurrent peephole c -> i
    peephole_i_c_corr_.AddDiagMatMat(1.0, DI.RowRange(1*S, T*S), kTrans,
                                          YC.RowRange(0*S, T*S), kNoTrans, mmt);
    // recurrent peephole c -> f
    peephole_f_c_corr_.AddDiagMatMat(1.0, DF.RowRange(1*S, T*S), kTrans,
                                          YC.RowRange(0*S, T*S), kNoTrans, mmt);
    // peephole c -> o
    peephole_o_c_corr_.AddDiagMatMat(1.0, DO.RowRange(1*S, T*S), kTrans,
                                          YC.RowRange(1*S, T*S), kNoTrans, mmt);

    // projection m -> r
    w_r_m_corr_.AddMatMat(1.0, DR.RowRange(1*S, T*S), kTrans,
                               YM.RowRange(1*S, T*S), kNoTrans, mmt);
  }

  void Update(const CuMatrixBase<BaseFloat> &input,
              const CuMatrixBase<BaseFloat> &diff) {

    // apply the gradient clipping,
    if (grad_clip_ > 0.0) {
      w_gifo_x_corr_.ApplyFloor(-grad_clip_);
      w_gifo_x_corr_.ApplyCeiling(grad_clip_);
      w_gifo_r_corr_.ApplyFloor(-grad_clip_);
      w_gifo_r_corr_.ApplyCeiling(grad_clip_);
      bias_corr_.ApplyFloor(-grad_clip_);
      bias_corr_.ApplyCeiling(grad_clip_);
      w_r_m_corr_.ApplyFloor(-grad_clip_);
      w_r_m_corr_.ApplyCeiling(grad_clip_);
      peephole_i_c_corr_.ApplyFloor(-grad_clip_);
      peephole_i_c_corr_.ApplyCeiling(grad_clip_);
      peephole_f_c_corr_.ApplyFloor(-grad_clip_);
      peephole_f_c_corr_.ApplyCeiling(grad_clip_);
      peephole_o_c_corr_.ApplyFloor(-grad_clip_);
      peephole_o_c_corr_.ApplyCeiling(grad_clip_);
    }

    const BaseFloat lr = opts_.learn_rate;

    w_gifo_x_.AddMat(-lr * learn_rate_coef_, w_gifo_x_corr_);
    w_gifo_r_.AddMat(-lr * learn_rate_coef_, w_gifo_r_corr_);
    bias_.AddVec(-lr * bias_learn_rate_coef_, bias_corr_, 1.0);

    peephole_i_c_.AddVec(-lr * bias_learn_rate_coef_, peephole_i_c_corr_, 1.0);
    peephole_f_c_.AddVec(-lr * bias_learn_rate_coef_, peephole_f_c_corr_, 1.0);
    peephole_o_c_.AddVec(-lr * bias_learn_rate_coef_, peephole_o_c_corr_, 1.0);

    w_r_m_.AddMat(-lr * learn_rate_coef_, w_r_m_corr_);
  }

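  // The two steps above amount to SGD with momentum folded into the '_corr_'
  // accumulators: corr := grad + momentum * corr (in BackpropagateFnc),
  // then param := param - learn_rate * coef * corr (here).
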
 private:
  // dims,
  int32 cell_dim_;  ///< the number of memory cells,
  int32 proj_dim_;  ///< recurrent projection layer dim,

  BaseFloat cell_clip_;  ///< Clipping of 'cell-values' in forward pass (per-frame),
  BaseFloat diff_clip_;  ///< Clipping of 'derivatives' in backprop (per-frame),
  BaseFloat cell_diff_clip_;  ///< Clipping of 'cell-derivatives' accumulated over CEC (per-frame),
  BaseFloat grad_clip_;  ///< Clipping of the updates,

  // buffer for transferring state across batches,
  CuMatrix<BaseFloat> prev_nnet_state_;

  // feed-forward connections: from x to [g, i, f, o]
  CuMatrix<BaseFloat> w_gifo_x_;
  CuMatrix<BaseFloat> w_gifo_x_corr_;

  // recurrent projection connections: from r to [g, i, f, o]
  CuMatrix<BaseFloat> w_gifo_r_;
  CuMatrix<BaseFloat> w_gifo_r_corr_;

  // biases of [g, i, f, o]
  CuVector<BaseFloat> bias_;
  CuVector<BaseFloat> bias_corr_;

  // peepholes from c to i, f, o,
  // (peephole connections are block-internal, so we use vector form),
  CuVector<BaseFloat> peephole_i_c_;
  CuVector<BaseFloat> peephole_f_c_;
  CuVector<BaseFloat> peephole_o_c_;

  CuVector<BaseFloat> peephole_i_c_corr_;
  CuVector<BaseFloat> peephole_f_c_corr_;
  CuVector<BaseFloat> peephole_o_c_corr_;

  // projection layer r: from m to r,
  CuMatrix<BaseFloat> w_r_m_;
  CuMatrix<BaseFloat> w_r_m_corr_;

  // propagate buffer: output of [g, i, f, o, c, h, m, r],
  CuMatrix<BaseFloat> propagate_buf_;

  // back-propagate buffer: diff-input of [g, i, f, o, c, h, m, r],
  CuMatrix<BaseFloat> backpropagate_buf_;
};  // class LstmProjected

}  // namespace nnet1
}  // namespace kaldi

#endif  // KALDI_NNET_NNET_LSTM_PROJECTED_H_