// nnet3/natural-gradient-online.cc

// Copyright 2013  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "nnet3/natural-gradient-online.h"
#include "nnet3/nnet-parse.h"

namespace kaldi {
namespace nnet3 {


OnlineNaturalGradient::OnlineNaturalGradient():
    rank_(40), update_period_(1), num_samples_history_(2000.0),
    num_minibatches_history_(0.0), alpha_(4.0),
    epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0),
    self_debug_(false), rho_t_(-1.0e+10) { }


/**
  This function creates a matrix with orthonormal rows that is like the
  following matrix, except with each row normalized to have unit 2-norm:
    [ 1.1  0    1    0    1    0
      0    1.1  0    1    0    1 ]
  The reason why the first element in each row is 1.1 and not 1, is for
  symmetry-breaking...
*/
// static
void OnlineNaturalGradient::InitOrthonormalSpecial(CuMatrixBase<BaseFloat> *R) {
  int32 num_rows = R->NumRows(), num_cols = R->NumCols();
  KALDI_ASSERT(num_cols >= num_rows);
  R->SetZero();
  std::vector<MatrixElement<BaseFloat> > elems;
  elems.reserve(num_cols);
  BaseFloat first_elem = 1.1;
  for (int32 r = 0; r < num_rows; r++) {
    std::vector<int32> cols;  // columns that have an entry for this row
    for (int32 c = r; c < num_cols; c += num_rows)
      cols.push_back(c);
    BaseFloat normalizer = 1.0 / sqrt(first_elem * first_elem +
                                      cols.size() - 1);
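    // (Added note: each row contains first_elem once and 1.0 in the remaining
    // cols.size() - 1 positions, so this normalizer gives the row a unit
    // 2-norm.)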
    for (size_t i = 0; i < cols.size(); i++) {
      int32 c = cols[i];
      MatrixElement<BaseFloat> e = { r, c,
                                     normalizer * (i == 0 ? first_elem :
                                                   BaseFloat(1.0)) };
      elems.push_back(e);
    }
  }
  R->AddElements(1.0, elems);
}


void OnlineNaturalGradient::InitDefault(int32 D) {
  if (rank_ >= D) {
    KALDI_WARN << "Rank " << rank_ << " of online preconditioner is >= dim " << D
               << ", setting it to "
               << (D - 1) << " (but this is probably still too high)";
    rank_ = D - 1;
  }
  if (rank_ == 0) {
    // Dimension of input data was 1, so the natural gradient preconditioner
    // would always be the unit matrix.
    // We'll handle this as a special case, for generality.
    return;
  }
  KALDI_ASSERT(num_samples_history_ > 0.0 &&
               num_samples_history_ <= 1.0e+06);
  KALDI_ASSERT((num_minibatches_history_ == 0.0 ||
                num_minibatches_history_ > 1.0) &&
               num_minibatches_history_ < 1.0e+06);
  KALDI_ASSERT(alpha_ >= 0.0);
  KALDI_ASSERT(rank_ > 0);
  KALDI_ASSERT(epsilon_ > 0.0 && epsilon_ <= 1.0e-05);  // plausible values.
  KALDI_ASSERT(delta_ > 0.0 && delta_ <= 1.0e-02);  // plausible values.

  // To initialize, in the equation
  //   F_t =(def) R_t^T D_t R_t + \rho_t I
  // we will set the orthogonal R_t to a special orthogonal matrix with no zero
  // rows or columns (see the function InitOrthonormalSpecial()), rho_t to
  // epsilon, and D_t to epsilon. But we don't store R_t directly. Instead, we
  // store
  //   W_t =(def) E_t^{0.5} R_t,
  // where E_t =(def) 1/\beta_t (D_t^{-1} + 1/\beta_t I)^{-1}.
  // From (eqn:tii),
  //   e_{tii} = 1/(\beta_t/d_{tii} + 1),
  // where
  //   \beta_t =(def) \rho_t + \alpha/D tr(F_t)
  //           = epsilon + alpha/D * (epsilon * D + epsilon * rank)
  //           = epsilon * (1 + alpha * (D + rank) / D).
  // And d_{tii} is epsilon, so
  //   e_{tii} = 1/((1 + alpha * (D + rank) / D) + 1)    [for each i]
  //           = 1/(2 + alpha * (D + rank) / D).
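  // (Added worked example, not in the original: with the default alpha_ = 4.0
  // and, say, D = 512 and rank = 40, this gives
  //   e_{tii} = 1/(2 + 4.0 * 552 / 512) = 1/6.3125 ~= 0.158,
  // so each element of W_t starts out at about sqrt(0.158) ~= 0.4 times the
  // corresponding element of R_t.)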
  BaseFloat epsilon = epsilon_;  // we could make this a bit more.
  rho_t_ = epsilon;
  d_t_.Resize(rank_, kUndefined);
  d_t_.Set(epsilon);
  W_t_.Resize(rank_, D, kUndefined);
  // after the next line, W_t_ will store the orthogonal matrix R_t.
  InitOrthonormalSpecial(&W_t_);
  BaseFloat E_tii = 1.0 / ( 2.0 + (D + rank_) * alpha_ / D );
  // W_t =(def) E_t^{0.5} R_t.
  W_t_.Scale(sqrt(E_tii));
  t_ = 0;
}

void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &X0) {
  int32 D = X0.NumCols();
  // for locking reasons it's better to use a different object.
  OnlineNaturalGradient this_copy(*this);
  this_copy.InitDefault(D);
  this_copy.t_ = 1;  // Prevent recursion to Init() again.

  CuMatrix<BaseFloat> X0_copy(X0.NumRows(), X0.NumCols(), kUndefined);
  // 'num_init_iters' is the number of iterations with the same data from a
  // pseudorandom start. This is a faster way of starting than doing an
  // eigenvalue decomposition.
  //
  // Note: we only do three iterations of initialization if we have enough data
  // that it's reasonably possible to estimate the subspace of dimension
  // this_copy.rank_. If we don't have more than that many rows in our initial
  // minibatch X0, we just do one iteration... this gives us almost exactly
  // (barring small effects due to epsilon_ > 0) the row subspace of X0 after
  // one iteration anyway.
  int32 num_init_iters;
  if (X0.NumRows() <= this_copy.rank_)
    num_init_iters = 1;
  else
    num_init_iters = 3;

  this_copy.frozen_ = false;  // un-freeze if it was frozen, so we can
                              // initialize.
  for (int32 i = 0; i < num_init_iters; i++) {
    BaseFloat scale;
    X0_copy.CopyFromMat(X0);
    this_copy.PreconditionDirections(&X0_copy, &scale);
  }
  rank_ = this_copy.rank_;
  W_t_.Swap(&this_copy.W_t_);
  d_t_.Swap(&this_copy.d_t_);
  rho_t_ = this_copy.rho_t_;
}

void OnlineNaturalGradient::PreconditionDirections(
    CuMatrixBase<BaseFloat> *X_t,
    BaseFloat *scale) {
  NVTX_RANGE(__func__);
  if (X_t->NumCols() == 1) {
    // If the dimension of the space equals one then our natural gradient update
    // with rescaling becomes a no-op, but the code wouldn't naturally handle it
    // because the rank would be zero. Support this as a special case.
    if (scale)
      *scale = 1.0;
    return;
  }

  if (t_ == 0)  // not initialized
    Init(*X_t);

  int32 R = W_t_.NumRows(), D = W_t_.NumCols();
  // space for W_t, J_t, K_t, L_t.
  CuMatrix<BaseFloat> WJKL_t(2 * R, D + R);
  WJKL_t.Range(0, R, 0, D).CopyFromMat(W_t_);
  BaseFloat rho_t(rho_t_);
  Vector<BaseFloat> d_t(d_t_);

  bool updating = Updating();

  BaseFloat initial_product;
  initial_product = TraceMatMat(*X_t, *X_t, kTrans);

  PreconditionDirectionsInternal(rho_t, initial_product,
                                 updating, d_t, &WJKL_t, X_t);

  if (scale) {
    if (initial_product <= 0.0) {
      *scale = 1.0;
    } else {
      BaseFloat final_product = TraceMatMat(*X_t, *X_t, kTrans);
      *scale = sqrt(initial_product / final_product);
    }
  }
  t_ += 1;
}
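
// Example of the intended call pattern (a minimal sketch, not from the
// original source; 'dim' and 'derivs' are hypothetical):
//
//   OnlineNaturalGradient ng;
//   ng.SetRank(40);                        // rank of the low-rank Fisher factor
//   CuMatrix<BaseFloat> derivs(128, dim);  // e.g. one row per sample
//   BaseFloat scale;
//   ng.PreconditionDirections(&derivs, &scale);
//   derivs.Scale(scale);  // the caller applies 'scale', which is chosen so
//                         // that the Frobenius norm of 'derivs' is unchanged.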

void OnlineNaturalGradient::ReorthogonalizeRt1(
    const VectorBase<BaseFloat> &d_t1,
    BaseFloat rho_t1,
    CuMatrixBase<BaseFloat> *W_t1,
    CuMatrixBase<BaseFloat> *temp_W,
    CuMatrixBase<BaseFloat> *temp_O) {
  // threshold is a configuration value: a desired threshold on orthogonality,
  // below which we won't reorthogonalize.
  const BaseFloat threshold = 1.0e-03;

  int32 R = W_t1->NumRows(), D = W_t1->NumCols();
  BaseFloat beta_t1 = rho_t1 * (1.0 + alpha_) + alpha_ * d_t1.Sum() / D;
  Vector<BaseFloat> e_t1(R, kUndefined), sqrt_e_t1(R, kUndefined),
      inv_sqrt_e_t1(R, kUndefined);
  ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);

  temp_O->SymAddMat2(1.0, *W_t1, kNoTrans, 0.0);
  // O_{t+1} = E_{t+1}^{-0.5} W_{t+1} W_{t+1}^T E_{t+1}^{-0.5}
  Matrix<BaseFloat> O_mat(*temp_O);
  SpMatrix<BaseFloat> O(O_mat, kTakeLower);
  for (int32 i = 0; i < R; i++) {
    BaseFloat i_factor = inv_sqrt_e_t1(i);
    for (int32 j = 0; j <= i; j++) {
      BaseFloat j_factor = inv_sqrt_e_t1(j);
      O(i, j) *= i_factor * j_factor;
    }
  }
  if (O.IsUnit(threshold)) {
    if (self_debug_) {
      KALDI_WARN << "Not reorthogonalizing since already orthogonal: " << O;
    }
    return;
  }
  TpMatrix<BaseFloat> C(R);
  bool cholesky_ok = true;
  try {
    // one of the following two calls may throw an exception.
    C.Cholesky(O);
    C.Invert();  // Now it's C^{-1}.
    if (!(C.Max() < 100.0)) {
      KALDI_WARN << "Cholesky out of expected range, "
                 << "reorthogonalizing with Gram-Schmidt";
      cholesky_ok = false;
    }
  } catch (...) {
    // We do a Gram-Schmidt orthogonalization, which is a bit less efficient
    // but more robust than the method using Cholesky.
    KALDI_WARN << "Cholesky or Invert() failed while re-orthogonalizing R_t. "
               << "Re-orthogonalizing on CPU.";
    cholesky_ok = false;
  }
  if (!cholesky_ok) {
    Matrix<BaseFloat> cpu_W_t1(*W_t1);
    cpu_W_t1.OrthogonalizeRows();
    W_t1->CopyFromMat(cpu_W_t1);
    // at this point cpu_W_t1 represents R_{t+1} - it has orthonormal
    // rows. Do: W_{t+1} = E_{t+1}^{0.5} R_{t+1}
    CuVector<BaseFloat> sqrt_e_t1_gpu(sqrt_e_t1);
    W_t1->MulRowsVec(sqrt_e_t1_gpu);
    return;
  }
  // Next, compute (E_{t+1}^{0.5} C^{-1} E_{t+1}^{-0.5}).
  for (int32 i = 0; i < R; i++) {
    BaseFloat i_factor = sqrt_e_t1(i);
    for (int32 j = 0; j < i; j++) {
      // skip j == i because i_factor * j_factor == 1 for j == i.
      BaseFloat j_factor = inv_sqrt_e_t1(j);
      C(i, j) *= i_factor * j_factor;
    }
  }
  O_mat.CopyFromTp(C);
  temp_O->CopyFromMat(O_mat);
  temp_W->CopyFromMat(*W_t1);
  W_t1->AddMatMat(1.0, *temp_O, kNoTrans, *temp_W, kNoTrans, 0.0);
}

// makes sure certain invariants are being preserved
void OnlineNaturalGradient::SelfTest() const {
  KALDI_ASSERT(rho_t_ >= epsilon_);
  BaseFloat d_t_max = d_t_.Max(), d_t_min = d_t_.Min();
  KALDI_ASSERT(d_t_min >= epsilon_);
  KALDI_ASSERT(d_t_min > 0.9 * delta_ * d_t_max);
  KALDI_ASSERT(rho_t_ > 0.9 * delta_ * d_t_max);

  int32 D = W_t_.NumCols(), R = W_t_.NumRows();
  BaseFloat beta_t = rho_t_ * (1.0 + alpha_) + alpha_ * d_t_.Sum() / D;
  Vector<BaseFloat> e_t(R, kUndefined), sqrt_e_t(R, kUndefined),
      inv_sqrt_e_t(R, kUndefined);
  ComputeEt(d_t_, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);

  CuSpMatrix<BaseFloat> S(R);
  S.AddMat2(1.0, W_t_, kNoTrans, 0.0);
  SpMatrix<BaseFloat> O(S);
  for (int32 i = 0; i < R; i++) {
    BaseFloat i_factor = inv_sqrt_e_t(i);
    for (int32 j = 0; j <= i; j++) {
      BaseFloat j_factor = inv_sqrt_e_t(j);
      O(i, j) *= i_factor * j_factor;
    }
  }
  if (!O.IsUnit(1.0e-04) || O(0, 0) != O(0, 0)) {  // (x != x) tests for NaN.
    BaseFloat worst_error = 0.0;
    int32 worst_i = 0, worst_j = 0;
    for (int32 i = 0; i < R; i++) {
      for (int32 j = 0; j < R; j++) {
        BaseFloat elem = O(i, j);
        BaseFloat error = fabs(elem - (i == j ? 1.0 : 0.0));
        if (error > worst_error || error != error) {
          worst_error = error;
          worst_i = i;
          worst_j = j;
        }
      }
    }
    if (worst_error > 1.0e-02 || worst_error != worst_error) {
      KALDI_WARN << "Failed to verify W_t (worst error: O[" << worst_i << ','
                 << worst_j << "] = " << O(worst_i, worst_j)
                 << ", d_t = " << d_t_;
    }
  }
}

void OnlineNaturalGradient::PreconditionDirectionsInternal(
    const BaseFloat rho_t,
    const BaseFloat tr_X_Xt,
    bool updating,
    const Vector<BaseFloat> &d_t,
    CuMatrixBase<BaseFloat> *WJKL_t,
    CuMatrixBase<BaseFloat> *X_t) {
  NVTX_RANGE(__func__);
  int32 N = X_t->NumRows(),  // Minibatch size.
      D = X_t->NumCols(),  // Dimensions of vectors we're preconditioning.
      R = rank_;  // Rank of correction to unit matrix.
  KALDI_ASSERT(R > 0 && R < D);
  BaseFloat eta = Eta(N);

  CuMatrix<BaseFloat> H_t(N, R);
  const CuSubMatrix<BaseFloat> W_t(*WJKL_t, 0, R, 0, D);
  // Below, WJ_t and LK_t are combinations of two matrices,
  // which we define in order to combine two separate multiplications into one.
  CuSubMatrix<BaseFloat> J_t(*WJKL_t, R, R, 0, D),
      L_t(*WJKL_t, 0, R, D, R),
      K_t(*WJKL_t, R, R, D, R),
      WJ_t(*WJKL_t, 0, 2 * R, 0, D),
      LK_t(*WJKL_t, 0, 2 * R, D, R);

  H_t.AddMatMat(1.0, *X_t, kNoTrans, W_t, kTrans, 0.0);  // H_t = X_t W_t^T

  if (!updating) {
    // We're not updating the estimate of the Fisher matrix; we just apply the
    // preconditioning and return.
    // X_hat_t = X_t - H_t W_t
    X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);
    return;
  }
  J_t.AddMatMat(1.0, H_t, kTrans, *X_t, kNoTrans, 0.0);  // J_t = H_t^T X_t

  bool compute_lk_together = (N > D);
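  // (Added note: L_t can be computed either as H_t^T H_t, a contraction over
  // N, or as W_t J_t^T, a contraction over D; they are equal since
  // H_t = X_t W_t^T and J_t = H_t^T X_t. When N > D the D-contraction is
  // cheaper, and it also lets us fuse the L_t and K_t products into the
  // single GEMM below.)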

  if (compute_lk_together) {
    // Do the following two multiplies in one operation:
    //   L_t = W_t J_t^T
    //   K_t = J_t J_t^T
    // Note: L_t was defined as L_t = J_t W_t^T, but it's actually symmetric,
    // so we can compute it as L_t = W_t J_t^T.
    LK_t.AddMatMat(1.0, WJ_t, kNoTrans, J_t, kTrans, 0.0);
  } else {
    K_t.SymAddMat2(1.0, J_t, kNoTrans, 0.0);
    L_t.SymAddMat2(1.0, H_t, kTrans, 0.0);
  }

  Matrix<BaseFloat> LK_cpu(LK_t);  // contains L and K on the CPU.
  SubMatrix<BaseFloat> L_t_cpu(LK_cpu, 0, R, 0, R),
      K_t_cpu(LK_cpu, R, R, 0, R);
  if (!compute_lk_together) {
    // the SymAddMat2 operations only set the lower triangle and diagonal.
    L_t_cpu.CopyLowerToUpper();
    K_t_cpu.CopyLowerToUpper();
  }

  // beta_t = \rho_t(1+\alpha) + \alpha/D tr(D_t)
  BaseFloat beta_t = rho_t * (1.0 + alpha_) + alpha_ * d_t.Sum() / D;
  Vector<BaseFloat> e_t(R), sqrt_e_t(R), inv_sqrt_e_t(R);
  ComputeEt(d_t, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);
  KALDI_VLOG(5) << "e_t = " << e_t;

  // The double-precision Z_t here, and the scaling, is to avoid potential
  // overflow, because Z_t is proportional to the fourth power of data.
  SpMatrix<double> Z_t_double(R);
  ComputeZt(N, rho_t, d_t, inv_sqrt_e_t, K_t_cpu, L_t_cpu, &Z_t_double);
  BaseFloat z_t_scale = std::max<double>(1.0, Z_t_double.Trace());
  Z_t_double.Scale(1.0 / z_t_scale);
  SpMatrix<BaseFloat> Z_t_scaled(Z_t_double);

  Matrix<BaseFloat> U_t(R, R);
  Vector<BaseFloat> c_t(R);
  // do the symmetric eigenvalue decomposition Z_t = U_t C_t U_t^T.
  Z_t_scaled.Eig(&c_t, &U_t);
  SortSvd(&c_t, &U_t);
  c_t.Scale(z_t_scale);

  const BaseFloat condition_threshold = 1.0e+06;
  // must_reorthogonalize will be true if the last diagonal element of c_t is
  // negative, since we don't take the absolute value, but this is the right
  // thing anyway.
  bool must_reorthogonalize = (c_t(0) > condition_threshold * c_t(R - 1));

  BaseFloat c_t_floor = pow(rho_t * (1 - eta), 2);
  int32 nf;
  c_t.ApplyFloor(c_t_floor, &nf);
  if (nf > 0)
    must_reorthogonalize = true;
  if (nf > 0 && self_debug_) {
    KALDI_WARN << "Floored " << nf << " elements of C_t.";
  }

  X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);  // X_hat_t = X_t - H_t W_t

  Vector<BaseFloat> sqrt_c_t(c_t);
  sqrt_c_t.ApplyPow(0.5);

  // \rho_{t+1} = 1/(D - R) (\eta/N tr(X_t X_t^T) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})).
  BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_X_Xt
                                      + (1-eta)*(D * rho_t + d_t.Sum())
                                      - sqrt_c_t.Sum());
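  // (Added note: this choice of rho_{t+1} makes the trace bookkeeping
  // consistent: with D_{t+1} = C_t^{0.5} - rho_{t+1} I, we get
  //   tr(F_{t+1}) = D rho_{t+1} + tr(D_{t+1})
  //               = eta/N tr(X_t X_t^T) + (1-eta) tr(F_t),
  // i.e. an exponentially-weighted moving average of the Fisher-matrix
  // trace, at least before the flooring below.)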
  // D_{t+1} = C_t^{0.5} - \rho_{t+1} I
  Vector<BaseFloat> d_t1(sqrt_c_t);
  d_t1.Add(-rho_t1);
  BaseFloat floor_val = std::max(epsilon_, delta_ * sqrt_c_t.Max());
  if (rho_t1 < floor_val)
    rho_t1 = floor_val;
  d_t1.ApplyFloor(floor_val);

  CuMatrix<BaseFloat> W_t1(R, D);  // W_{t+1}
  ComputeWt1(N, d_t, d_t1, rho_t, rho_t1, U_t, sqrt_c_t, inv_sqrt_e_t,
             W_t, &J_t, &W_t1);

  if (must_reorthogonalize) {
    if (self_debug_) {
      KALDI_WARN << "Reorthogonalizing.";
    }
    ReorthogonalizeRt1(d_t1,
                       rho_t1,
                       &W_t1,
                       &J_t,
                       &L_t);
  }

  W_t_.Swap(&W_t1);
  d_t_.CopyFromVec(d_t1);
  rho_t_ = rho_t1;

  if (self_debug_)
    SelfTest();
}

bool OnlineNaturalGradient::Updating() const {
  // Just hard-code it here that we do 10 initial updates before skipping any.
  // This must be > 'num_init_iters = 3' from Init().
  const int num_initial_updates = 10;

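  // (Added example, not in the original: with update_period_ == 4, the
  // Fisher estimate is updated on each of minibatches t_ = 0..10, and then
  // only on t_ = 14, 18, 22, and so on.)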
  return (!frozen_ &&
          (t_ <= num_initial_updates ||
           (t_ - num_initial_updates) % update_period_ == 0));
}


BaseFloat OnlineNaturalGradient::Eta(int32 N) const {
  if (num_minibatches_history_ > 0.0) {
    KALDI_ASSERT(num_minibatches_history_ > 1.0);
    return 1.0 / num_minibatches_history_;
  } else {
    KALDI_ASSERT(num_samples_history_ > 0.0);
    BaseFloat ans = 1.0 - exp(-N / num_samples_history_);
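    // (Added worked value, not in the original: with the default
    // num_samples_history_ of 2000.0 and a minibatch of N = 128 rows,
    // eta = 1 - exp(-128/2000) ~= 0.062.)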
    // Don't let eta approach 1 too closely, as it can lead to NaN's appearing
    // if the input is all zero.
    if (ans > 0.9) ans = 0.9;
    return ans;
  }
}

void OnlineNaturalGradient::ComputeWt1(int32 N,
                                       const VectorBase<BaseFloat> &d_t,
                                       const VectorBase<BaseFloat> &d_t1,
                                       BaseFloat rho_t,
                                       BaseFloat rho_t1,
                                       const MatrixBase<BaseFloat> &U_t,
                                       const VectorBase<BaseFloat> &sqrt_c_t,
                                       const VectorBase<BaseFloat> &inv_sqrt_e_t,
                                       const CuMatrixBase<BaseFloat> &W_t,
                                       CuMatrixBase<BaseFloat> *J_t,
                                       CuMatrixBase<BaseFloat> *W_t1) const {

  int32 R = d_t.Dim(), D = W_t.NumCols();
  BaseFloat eta = Eta(N);

  // \beta_{t+1} = \rho_{t+1} (1+\alpha) + \alpha/D tr(D_{t+1})
  BaseFloat beta_t1 = rho_t1 * (1.0 + alpha_) + alpha_ * d_t1.Sum() / D;
  KALDI_ASSERT(beta_t1 > 0.0);
  Vector<BaseFloat> e_t1(R, kUndefined), sqrt_e_t1(R, kUndefined),
      inv_sqrt_e_t1(R, kUndefined);
  ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);
  Vector<BaseFloat> inv_sqrt_c_t(sqrt_c_t);
  inv_sqrt_c_t.InvertElements();

  Vector<BaseFloat> w_t_coeff(R);
  for (int32 i = 0; i < R; i++)
    w_t_coeff(i) = (1.0 - eta) / (eta/N) * (d_t(i) + rho_t);
  CuVector<BaseFloat> w_t_coeff_gpu(w_t_coeff);
  // B_t = J_t + (1-\eta)/(\eta/N) (D_t + \rho_t I) W_t
  J_t->AddDiagVecMat(1.0, w_t_coeff_gpu, W_t, kNoTrans, 1.0);

  // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5}
  Matrix<BaseFloat> A_t(U_t, kTrans);
  for (int32 i = 0; i < R; i++) {
    BaseFloat i_factor = (eta / N) * sqrt_e_t1(i) * inv_sqrt_c_t(i);
    for (int32 j = 0; j < R; j++) {
      BaseFloat j_factor = inv_sqrt_e_t(j);
      A_t(i, j) *= i_factor * j_factor;
    }
  }
  // W_{t+1} = A_t B_t
  CuMatrix<BaseFloat> A_t_gpu(A_t);
  W_t1->AddMatMat(1.0, A_t_gpu, kNoTrans, *J_t, kNoTrans, 0.0);
}

void OnlineNaturalGradient::ComputeZt(int32 N,
                                      BaseFloat rho_t,
                                      const VectorBase<BaseFloat> &d_t,
                                      const VectorBase<BaseFloat> &inv_sqrt_e_t,
                                      const MatrixBase<BaseFloat> &K_t,
                                      const MatrixBase<BaseFloat> &L_t,
                                      SpMatrix<double> *Z_t) const {
  // Use doubles because the range of quantities in Z_t can get large (fourth
  // power of data), and we want to avoid overflow. This routine is fast.
  BaseFloat eta = Eta(N);
  Vector<BaseFloat> d_t_rho_t(d_t);
  d_t_rho_t.Add(rho_t);  // now d_t_rho_t is diag(D_t + \rho_t I).
  double etaN = eta / N, eta1 = 1.0 - eta,
      etaN_sq = etaN * etaN, eta1_sq = eta1 * eta1,
      etaN_eta1 = etaN * eta1;
  int32 R = d_t.Dim();
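  // (Restating (eqn:Zt) from the header, which the loop below computes
  // element-by-element:
  //   Z_t = (eta/N)^2 E_t^{-0.5} K_t E_t^{-0.5}
  //       + (eta/N)(1-eta) E_t^{-0.5} L_t E_t^{-0.5} (D_t + rho_t I)
  //       + (eta/N)(1-eta) (D_t + rho_t I) E_t^{-0.5} L_t E_t^{-0.5}
  //       + (1-eta)^2 (D_t + rho_t I)^2.)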
  for (int32 i = 0; i < R; i++) {
    double inv_sqrt_e_t_i = inv_sqrt_e_t(i), d_t_rho_t_i = d_t_rho_t(i);
    for (int32 j = 0; j <= i; j++) {
      double inv_sqrt_e_t_j = inv_sqrt_e_t(j), d_t_rho_t_j = d_t_rho_t(j),
          L_t_i_j = 0.5 * (L_t(i, j) + L_t(j, i)),
          K_t_i_j = 0.5 * (K_t(i, j) + K_t(j, i));
      // See (eqn:Zt) in the header.
      (*Z_t)(i, j) = etaN_sq * inv_sqrt_e_t_i * K_t_i_j * inv_sqrt_e_t_j
          + etaN_eta1 * inv_sqrt_e_t_i * L_t_i_j * inv_sqrt_e_t_j * d_t_rho_t_j
          + etaN_eta1 * d_t_rho_t_i * inv_sqrt_e_t_i * L_t_i_j * inv_sqrt_e_t_j
          + (i == j ? eta1_sq * d_t_rho_t_i * d_t_rho_t_i : 0.0);
    }
  }
}

void OnlineNaturalGradient::ComputeEt(const VectorBase<BaseFloat> &d_t,
                                      BaseFloat beta_t,
                                      VectorBase<BaseFloat> *e_t,
                                      VectorBase<BaseFloat> *sqrt_e_t,
                                      VectorBase<BaseFloat> *inv_sqrt_e_t) const {
  // e_{tii} = 1/(\beta_t/d_{tii} + 1)
  int32 D = d_t.Dim();
  const BaseFloat *d = d_t.Data();
  BaseFloat *e = e_t->Data();
  for (int32 i = 0; i < D; i++)
    e[i] = 1.0 / (beta_t / d[i] + 1);
  sqrt_e_t->CopyFromVec(*e_t);
  sqrt_e_t->ApplyPow(0.5);
  inv_sqrt_e_t->CopyFromVec(*sqrt_e_t);
  inv_sqrt_e_t->InvertElements();
}


OnlineNaturalGradient::OnlineNaturalGradient(const OnlineNaturalGradient &other):
    rank_(other.rank_), update_period_(other.update_period_),
    num_samples_history_(other.num_samples_history_),
    num_minibatches_history_(other.num_minibatches_history_),
    alpha_(other.alpha_), epsilon_(other.epsilon_), delta_(other.delta_),
    frozen_(other.frozen_), t_(other.t_),
    self_debug_(other.self_debug_), W_t_(other.W_t_),
    rho_t_(other.rho_t_), d_t_(other.d_t_) { }


OnlineNaturalGradient& OnlineNaturalGradient::operator = (
    const OnlineNaturalGradient &other) {
  rank_ = other.rank_;
  update_period_ = other.update_period_;
  num_samples_history_ = other.num_samples_history_;
  // also copy num_minibatches_history_ and frozen_, to keep operator= in
  // sync with the copy constructor and Swap().
  num_minibatches_history_ = other.num_minibatches_history_;
  frozen_ = other.frozen_;
  alpha_ = other.alpha_;
  epsilon_ = other.epsilon_;
  delta_ = other.delta_;
  t_ = other.t_;
  self_debug_ = other.self_debug_;
  W_t_ = other.W_t_;
  rho_t_ = other.rho_t_;
  d_t_ = other.d_t_;
  return *this;
}

void OnlineNaturalGradient::SetRank(int32 rank) {
  KALDI_ASSERT(rank > 0);
  rank_ = rank;
}

void OnlineNaturalGradient::SetUpdatePeriod(int32 update_period) {
  KALDI_ASSERT(update_period > 0);
  update_period_ = update_period;
}

void OnlineNaturalGradient::SetNumSamplesHistory(BaseFloat num_samples_history) {
  KALDI_ASSERT(num_samples_history > 0.0 &&
               num_samples_history < 1.0e+06);
  num_samples_history_ = num_samples_history;
}

void OnlineNaturalGradient::SetNumMinibatchesHistory(
    BaseFloat num_minibatches_history) {
  KALDI_ASSERT(num_minibatches_history > 1.0);
  num_minibatches_history_ = num_minibatches_history;
}

void OnlineNaturalGradient::SetAlpha(BaseFloat alpha) {
  KALDI_ASSERT(alpha >= 0.0);
  alpha_ = alpha;
}

void OnlineNaturalGradient::Swap(OnlineNaturalGradient *other) {
  std::swap(rank_, other->rank_);
  std::swap(update_period_, other->update_period_);
  std::swap(num_samples_history_, other->num_samples_history_);
  std::swap(num_minibatches_history_, other->num_minibatches_history_);
  std::swap(alpha_, other->alpha_);
  std::swap(epsilon_, other->epsilon_);
  std::swap(delta_, other->delta_);
  std::swap(frozen_, other->frozen_);
  std::swap(t_, other->t_);
  std::swap(self_debug_, other->self_debug_);
  W_t_.Swap(&(other->W_t_));
  std::swap(rho_t_, other->rho_t_);
  d_t_.Swap(&(other->d_t_));
}

}  // namespace nnet3
}  // namespace kaldi