nnet-precondition-online.cc
Go to the documentation of this file.
1 // nnet2/nnet-precondition-online.cc
2 
3 // Copyright 2013-2015 Johns Hopkins University (author: Daniel Povey)
4 // 2015 Xiaohui Zhang
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
22 
23 namespace kaldi {
24 namespace nnet2 {
25 
26 
28  rank_(40), update_period_(1), num_samples_history_(2000.0), alpha_(4.0),
29  epsilon_(1.0e-10), delta_(5.0e-04), t_(-1),
30  num_updates_skipped_(0), self_debug_(false) { }
31 
32 
44 // static
46  int32 num_rows = R->NumRows(), num_cols = R->NumCols();
47  KALDI_ASSERT(num_cols >= num_rows);
48  R->SetZero();
49  std::vector<MatrixElement<BaseFloat> > elems;
50  elems.reserve(num_cols);
51  BaseFloat first_elem = 1.1;
52  for (int32 r = 0; r < num_rows; r++) {
53  std::vector<int32> cols; // columns that have an entry for this row
54  for (int32 c = r; c < num_cols; c += num_rows)
55  cols.push_back(c);
56  BaseFloat normalizer = 1.0 / sqrt(first_elem * first_elem +
57  cols.size() - 1);
58  for (size_t i = 0; i < cols.size(); i++) {
59  int32 c = cols[i];
60  MatrixElement<BaseFloat> e = { r, c,
61  normalizer * (i == 0 ? first_elem :
62  BaseFloat(1.0)) };
63  elems.push_back(e);
64  }
65  }
66  R->AddElements(1.0, elems);
67  { // TODO: remove this testing code.
68  CuMatrix<BaseFloat> prod(num_rows, num_rows);
69  prod.AddMatMat(1.0, *R, kNoTrans, *R, kTrans, 0.0);
70  KALDI_ASSERT(prod.IsUnit());
71  }
72 }
73 
74 
76  if (rank_ >= D) {
77  KALDI_WARN << "Rank " << rank_ << " of online preconditioner is >= dim " << D
78  << ", setting it to "
79  << (D - 1) << " (but this is probably still too high)";
80  rank_ = D - 1;
81  }
82  if (rank_ == 0) {
83  // Dimension of input data was 1, so the natural gradient preconditioner
84  // would always be the unit matrix.
85  // We'll handle this as a special case, for generality.
86  return;
87  }
89  KALDI_ASSERT(alpha_ >= 0.0);
90  KALDI_ASSERT(rank_ > 0);
91  KALDI_ASSERT(epsilon_ > 0.0 && epsilon_ <= 1.0e-05); // plausible values.
92  KALDI_ASSERT(delta_ > 0.0 && delta_ <= 1.0e-02); // plausible values.
93 
94  // to initialize, in the equation
95  // F_t =(def) R_t^T D_t R_t + \rho_t I
96  // we will set the orthogonal R_t to a special orthogonal matrix with no zero
97  // rows or columns (see the function), rho_t to epsilon,
98  // and D_t to epsilon. But we don't store R_t directly. Instead, we store
99  // W_t =(def) E_t^{0.5} R_t,
100  // where E_t =(def) 1/\beta_t (D_t^{-1} + 1/\beta_t I)^{-1}
101  // from (eqn:tii),
102  // e_{tii} = 1/(\beta_t/d_{tii} + 1),
103  // where
104  // \beta_t =(def) \rho_t + \alpha/D tr(F_t)
105  // = epsilon + alpha/D * (epsilon * D + epsilon * rank)
106  // = epsilon * (1 + alpha * (D + rank) / D)
107  // And d_{tii} is epsilon, so
108  // e_{tii} = 1/((1 + alpha * (D + rank) / D) + 1) [for each i.]
109  // = 1/(2 + alpha * (D + rank) / D)).
110  BaseFloat epsilon = epsilon_; // we could make this a bit more.
111  rho_t_ = epsilon;
112  d_t_.Resize(rank_, kUndefined);
113  d_t_.Set(epsilon);
114  W_t_.Resize(rank_, D, kUndefined);
115  // after the next line, W_ will store the orthogonal matrix R_t.
117  BaseFloat E_tii = 1.0 / ( 2.0 + (D + rank_) * alpha_ / D );
118  // W_t =(def) E_t^{0.5} R_t.
119  W_t_.Scale(sqrt(E_tii));
120  t_ = 0;
121 }
122 
124  int32 D = R0.NumCols();
125  // for locking reasons it's better to use a different object.
126  OnlinePreconditioner this_copy(*this);
127  this_copy.InitDefault(D);
128 
129  CuMatrix<BaseFloat> R0_copy(R0.NumRows(), R0.NumCols(), kUndefined);
130  // number of iterations with the same data from a pseudorandom start.
131  // this is a faster way of starting than doing eigenvalue decomposition.
132  int32 num_init_iters = 3;
133  for (int32 i = 0; i < num_init_iters; i++) {
134  BaseFloat scale;
135  R0_copy.CopyFromMat(R0);
136  this_copy.PreconditionDirections(&R0_copy, NULL, &scale);
137  }
138  rank_ = this_copy.rank_;
139  W_t_.Swap(&this_copy.W_t_);
140  d_t_.Swap(&this_copy.d_t_);
141  rho_t_ = this_copy.rho_t_;
142  t_ = 0;
143 }
144 
147  CuVectorBase<BaseFloat> *row_prod,
148  BaseFloat *scale) {
149  if (X_t->NumCols() == 1) {
150  // If the dimension of the space equals one then our natural gradient update
151  // with rescaling becomes a no-op, but the code wouldn't naturally handle it
152  // because rank would be zero. Support this as a special case.
153  if (row_prod)
154  row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
155  *scale = 1.0;
156  return;
157  }
158 
159  if (row_prod == NULL) {
160  CuVector<BaseFloat> row_prod_tmp(X_t->NumRows());
161  PreconditionDirections(X_t, &row_prod_tmp, scale);
162  return;
163  }
164 
165  read_write_mutex_.lock();
166  if (t_ == -1) // not initialized
167  Init(*X_t);
168 
169  // Now t_ >= 0.
170  // We create local copies of the class variables... this is intended for
171  // multi-threaded safety so we can't read them in an inconsistent state,
172  // but we don't really waste anything here (a copy of W_t is needed anyway,
173  // if we're to update it).
174  int32 t = t_, R = W_t_.NumRows(), D = W_t_.NumCols();
175  // space for W_t, J_t, K_t, L_t.
176  CuMatrix<BaseFloat> WJKL_t(2 * R, D + R);
177  WJKL_t.Range(0, R, 0, D).CopyFromMat(W_t_);
178  BaseFloat rho_t(rho_t_);
179  Vector<BaseFloat> d_t(d_t_);
180  read_write_mutex_.unlock();
181  PreconditionDirectionsInternal(t, rho_t, d_t, &WJKL_t, X_t, row_prod, scale);
182 }
183 
185  const VectorBase<BaseFloat> &d_t1,
186  BaseFloat rho_t1,
188  CuMatrixBase<BaseFloat> *temp_W,
189  CuMatrixBase<BaseFloat> *temp_O) {
190  // threshold is a configuration value: a desired threshold on orthogonality,
191  // below which we won't reorthogonalize.
192  const BaseFloat threshold = 1.0e-03;
193 
194  int32 R = W_t1->NumRows(), D = W_t1->NumCols();
195  BaseFloat beta_t1 = rho_t1 * (1.0 + alpha_) + alpha_ * d_t1.Sum() / D;
196  Vector<BaseFloat> e_t1(R, kUndefined), sqrt_e_t1(R, kUndefined),
197  inv_sqrt_e_t1(R, kUndefined);
198  ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);
199 
200  temp_O->SymAddMat2(1.0, *W_t1, kNoTrans, 0.0);
201  // O_t = E_t^{-0.5} W_t W_t^T E_t^{-0.5}
202  Matrix<BaseFloat> O_mat(*temp_O);
204  for (int32 i = 0; i < R; i++) {
205  BaseFloat i_factor = inv_sqrt_e_t1(i);
206  for (int32 j = 0; j <= i; j++) {
207  BaseFloat j_factor = inv_sqrt_e_t1(j);
208  O(i, j) *= i_factor * j_factor;
209  }
210  }
211  if (O.IsUnit(threshold)) {
212  if (self_debug_) {
213  KALDI_WARN << "Not reorthogonalizing since already orthognoal: " << O;
214  }
215  return;
216  }
217  TpMatrix<BaseFloat> C(R);
218  try {
219  C.Cholesky(O);
220  C.Invert(); // Now it's C^{-1}.
221  if (!(C.Max() < 100.0))
222  KALDI_ERR << "Cholesky out of expected range, "
223  << "reorthogonalizing with Gram-Schmidt";
224  } catch (...) {
225  // We do a Gram-Schmidt orthogonalization, which is a bit less efficient but
226  // more robust than the method using Cholesky.
227  KALDI_WARN << "Cholesky or Invert() failed while re-orthogonalizing R_t. "
228  << "Re-orthogonalizing on CPU.";
229  Matrix<BaseFloat> cpu_W_t1(*W_t1);
230  cpu_W_t1.OrthogonalizeRows();
231  W_t1->CopyFromMat(cpu_W_t1);
232  // at this point cpu_W_t1 represents R_{t+1}- it has orthonormal
233  // rows. Do: W_{t+1} = E_{t+1}^{0.5} R_{t+1}
234  CuVector<BaseFloat> sqrt_e_t1_gpu(sqrt_e_t1);
235  W_t1->MulRowsVec(sqrt_e_t1_gpu);
236  return;
237  }
238  // Next, compute (E_t^{0.5} C^{-1} E_t^{-0.5})
239  // but it's really t+1, not t.
240  for (int32 i = 0; i < R; i++) {
241  BaseFloat i_factor = sqrt_e_t1(i);
242  for (int32 j = 0; j < i; j++) {
243  // skip j == i because i_factor * j_factor == 1 for j == i.
244  BaseFloat j_factor = inv_sqrt_e_t1(j);
245  C(i, j) *= i_factor * j_factor;
246  }
247  }
248  O_mat.CopyFromTp(C);
249  temp_O->CopyFromMat(O_mat);
250  temp_W->CopyFromMat(*W_t1);
251  W_t1->AddMatMat(1.0, *temp_O, kNoTrans, *temp_W, kNoTrans, 0.0);
252 }
253 
254 // makes sure certain invariants are being preserved
257  BaseFloat d_t_max = d_t_.Max(), d_t_min = d_t_.Min();
258  KALDI_ASSERT(d_t_min >= epsilon_);
259  KALDI_ASSERT(d_t_min > 0.9 * delta_ * d_t_max);
260  KALDI_ASSERT(rho_t_ > 0.9 * delta_ * d_t_max);
261 
262  int32 D = W_t_.NumCols(), R = W_t_.NumRows();
263  BaseFloat beta_t = rho_t_ * (1.0 + alpha_) + alpha_ * d_t_.Sum() / D;
264  Vector<BaseFloat> e_t(R, kUndefined), sqrt_e_t(R, kUndefined),
265  inv_sqrt_e_t(R, kUndefined);
266  ComputeEt(d_t_, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);
267 
269  S.AddMat2(1.0, W_t_, kNoTrans, 0.0);
270  SpMatrix<BaseFloat> O(S);
271  for (int32 i = 0; i < R; i++) {
272  BaseFloat i_factor = inv_sqrt_e_t(i);
273  for (int32 j = 0; j <= i; j++) {
274  BaseFloat j_factor = inv_sqrt_e_t(j);
275  O(i, j) *= i_factor * j_factor;
276  }
277  }
278  if (!O.IsUnit(1.0e-04) || O(0, 0) != O(0, 0)) {
279  BaseFloat worst_error = 0.0;
280  int32 worst_i = 0, worst_j = 0;
281  for (int32 i = 0; i < R; i++) {
282  for (int32 j = 0; j < R; j++) {
283  BaseFloat elem = O(i, j);
284  BaseFloat error = fabs(elem - (i == j ? 1.0 : 0.0));
285  if (error > worst_error || error != error) {
286  worst_error = error;
287  worst_i = i;
288  worst_j = j;
289  }
290  }
291  }
292  if (worst_error > 1.0e-02 || worst_error != worst_error) {
293  KALDI_WARN << "Failed to verify W_t (worst error: O[" << worst_i << ','
294  << worst_j << "] = " << O(worst_i, worst_j)
295  << ", d_t = " << d_t_;
296  }
297  }
298 }
299 
301  const int32 t,
302  const BaseFloat rho_t,
303  const Vector<BaseFloat> &d_t,
304  CuMatrixBase<BaseFloat> *WJKL_t,
306  CuVectorBase<BaseFloat> *row_prod,
307  BaseFloat *scale) {
308  int32 N = X_t->NumRows(), // Minibatch size.
309  D = X_t->NumCols(), // Dimensions of vectors we're preconditioning
310  R = rank_; // Rank of correction to unit matrix.
311  KALDI_ASSERT(R > 0 && R < D);
312  BaseFloat eta = Eta(N);
313 
314  CuMatrix<BaseFloat> H_t(N, R);
315  const CuSubMatrix<BaseFloat> W_t(*WJKL_t, 0, R, 0, D);
316  // Below, WJ_t and LK_t are combinations of two matrices,
317  // which we define in order to combine two separate multiplications into one.
318  CuSubMatrix<BaseFloat> J_t(*WJKL_t, R, R, 0, D),
319  L_t(*WJKL_t, 0, R, D, R),
320  K_t(*WJKL_t, R, R, D, R),
321  WJ_t(*WJKL_t, 0, 2 * R, 0, D),
322  LK_t(*WJKL_t, 0, 2 * R, D, R);
323 
324  H_t.AddMatMat(1.0, *X_t, kNoTrans, W_t, kTrans, 0.0); // H_t = X_t W_t^T
325 
326  bool locked = update_mutex_.try_lock();
327  if (locked) {
328  // Just hard-code it here that we do 10 updates before skipping any.
329  const int num_initial_updates = 10;
330  if (t_ > t || (num_updates_skipped_ < update_period_ - 1 &&
331  t_ >= num_initial_updates)) {
332  update_mutex_.unlock();
333  // We got the lock but we were already beaten to it by another thread, or
334  // we don't want to update yet due to update_period_ > 1 (this saves
335  // compute), so release the lock.
336  locked = false;
337  }
338  }
339 
340  if (!locked) {
341  // We're not updating the parameters, either because another thread is
342  // working on updating them, or because another thread already did so from
343  // the same or later starting point (making our update stale), or because
344  // update_period_ > 1. We just apply the preconditioning and return.
345 
346  // note: we don't bother with any locks before incrementing
347  // num_updates_skipped_ below, because the worst that could happen is that,
348  // on very rare occasions, we could skip one or two more updates than we
349  // intended.
351 
352  BaseFloat tr_Xt_XtT = TraceMatMat(*X_t, *X_t, kTrans);
353  // X_hat_t = X_t - H_t W_t
354  X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);
355  // each element i of row_prod will be inner product of row i of X_hat_t with
356  // itself.
357  row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
358  BaseFloat tr_Xhat_XhatT = row_prod->Sum();
359  KALDI_ASSERT(tr_Xhat_XhatT == tr_Xhat_XhatT); // Check for NaN.
360  BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 :
361  sqrt(tr_Xt_XtT / tr_Xhat_XhatT));
362  *scale = gamma_t;
363  return;
364  }
365  J_t.AddMatMat(1.0, H_t, kTrans, *X_t, kNoTrans, 0.0); // J_t = H_t^T X_t
366 
367  bool compute_lk_together = (N > D);
368 
369  if (compute_lk_together) {
370  // do the following two multiplies in one operation...
371  // note
372  // L_t = W_t J_t^T
373  // K_t = J_t J_t^T
374  // Note: L_t was defined as L_t = J_t W_t^T, but it's actually symmetric,
375  // so we can compute it as L_t = W_t J_t^T.
376  LK_t.AddMatMat(1.0, WJ_t, kNoTrans, J_t, kTrans, 0.0);
377  } else {
378  K_t.SymAddMat2(1.0, J_t, kNoTrans, 0.0);
379  L_t.SymAddMat2(1.0, H_t, kTrans, 0.0);
380  }
381 
382  Matrix<BaseFloat> LK_cpu(LK_t); // contains L and K on the CPU.
383  SubMatrix<BaseFloat> L_t_cpu(LK_cpu, 0, R, 0, R),
384  K_t_cpu(LK_cpu, R, R, 0, R);
385  if (!compute_lk_together) {
386  // the SymAddMat2 operations only set the lower triangle and diagonal.
387  L_t_cpu.CopyLowerToUpper();
388  K_t_cpu.CopyLowerToUpper();
389  }
390 
391  // beta_t = \rho_t(1+\alpha) + \alpha/D tr(D_t)
392  BaseFloat beta_t = rho_t * (1.0 + alpha_) + alpha_ * d_t.Sum() / D;
393  Vector<BaseFloat> e_t(R), sqrt_e_t(R), inv_sqrt_e_t(R);
394  ComputeEt(d_t, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);
395  KALDI_VLOG(5) << "e_t = " << e_t;
396 
397  // The double-precision Z_t here, and the scaling, is to avoid potential
398  // overflow, because Z_t is proportional to the fourth power of data.
399  SpMatrix<double> Z_t_double(R);
400  ComputeZt(N, rho_t, d_t, inv_sqrt_e_t, K_t_cpu, L_t_cpu, &Z_t_double);
401  BaseFloat z_t_scale = std::max<double>(1.0, Z_t_double.Trace());
402  Z_t_double.Scale(1.0 / z_t_scale);
403  SpMatrix<BaseFloat> Z_t_scaled(Z_t_double);
404 
405  Matrix<BaseFloat> U_t(R, R);
406  Vector<BaseFloat> c_t(R);
407  // do the symmetric eigenvalue decomposition Z_t = U_t C_t U_t^T.
408  Z_t_scaled.Eig(&c_t, &U_t);
409  SortSvd(&c_t, &U_t);
410  c_t.Scale(z_t_scale);
411 
412  const BaseFloat condition_threshold = 1.0e+06;
413  // must_reorthogonalize will be true if the last diagonal element of c_t is
414  // negative, since we don't take the absolute value, but this is the right
415  // thing anyway.
416  bool must_reorthogonalize = (c_t(0) > condition_threshold * c_t(R - 1));
417 
418  BaseFloat c_t_floor = pow(rho_t * (1 - eta), 2);
419  int32 nf;
420  c_t.ApplyFloor(c_t_floor, &nf);
421  if (nf > 0)
422  must_reorthogonalize = true;
423  if (nf > 0 && self_debug_) {
424  KALDI_WARN << "Floored " << nf << " elements of C_t.";
425  }
426  BaseFloat tr_Xt_XtT_check;
427  if (self_debug_)
428  tr_Xt_XtT_check = TraceMatMat(*X_t, *X_t, kTrans);
429 
430  X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0); // X_hat_t = X_t - H_t W_t
431  // set *row_prod to inner products of each row of X_hat_t with itself.
432  row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
433 
434  BaseFloat tr_Xhat_XhatT = row_prod->Sum();
435  // tr(X_t X_t^T) = tr(X_hat_t X_hat_t^T) - tr(L_t E_t) + 2 tr(L_t)
436  double tr_Xt_XtT = tr_Xhat_XhatT;
437  for (int32 i = 0; i < R; i++)
438  tr_Xt_XtT += L_t_cpu(i, i) * (2.0 - e_t(i));
439  if (self_debug_) {
440  KALDI_ASSERT(ApproxEqual(tr_Xt_XtT, tr_Xt_XtT_check));
441  }
442  BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 :
443  sqrt(tr_Xt_XtT / tr_Xhat_XhatT));
444  *scale = gamma_t;
445 
446  Vector<BaseFloat> sqrt_c_t(c_t);
447  sqrt_c_t.ApplyPow(0.5);
448 
449  // \rho_{t+1} = 1/(D - R) (\eta/N tr(X_t X_t^T) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})).
450  BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_Xt_XtT
451  + (1-eta)*(D * rho_t + d_t.Sum())
452  - sqrt_c_t.Sum());
453  // D_{t+1} = C_t^{0.5} - \rho_{t+1} I
454  Vector<BaseFloat> d_t1(sqrt_c_t);
455  d_t1.Add(-rho_t1);
456  BaseFloat floor_val = std::max(epsilon_, delta_ * sqrt_c_t.Max());
457  if (rho_t1 < floor_val)
458  rho_t1 = floor_val;
459  d_t1.ApplyFloor(floor_val);
460 
461  CuMatrix<BaseFloat> W_t1(R, D); // W_{t+1}
462  ComputeWt1(N, d_t, d_t1, rho_t, rho_t1, U_t, sqrt_c_t, inv_sqrt_e_t,
463  W_t, &J_t, &W_t1);
464 
465  if (must_reorthogonalize) {
466  if (self_debug_) {
467  KALDI_WARN << "Reorthogonalizing.";
468  }
469  ReorthogonalizeXt1(d_t1,
470  rho_t1,
471  &W_t1,
472  &J_t,
473  &L_t);
474  }
475 
476  // Commit the new parameters.
477  read_write_mutex_.lock();
478  KALDI_ASSERT(t_ == t); // we already ensured this.
479  t_ = t + 1;
481  W_t_.Swap(&W_t1);
482  d_t_.CopyFromVec(d_t1);
483  rho_t_ = rho_t1;
484 
485  if (self_debug_)
486  SelfTest();
487 
488  read_write_mutex_.unlock();
489  update_mutex_.unlock();
490 }
491 
494  BaseFloat ans = 1.0 - exp(-N / num_samples_history_);
495  // Don't let eta approach 1 too closely, as it can lead to NaN's appearing if
496  // the input is all zero.
497  if (ans > 0.9) ans = 0.9;
498  return ans;
499 }
500 
502  const VectorBase<BaseFloat> &d_t,
503  const VectorBase<BaseFloat> &d_t1,
504  BaseFloat rho_t,
505  BaseFloat rho_t1,
506  const MatrixBase<BaseFloat> &U_t,
507  const VectorBase<BaseFloat> &sqrt_c_t,
508  const VectorBase<BaseFloat> &inv_sqrt_e_t,
509  const CuMatrixBase<BaseFloat> &W_t,
511  CuMatrixBase<BaseFloat> *W_t1) const {
512 
513  int32 R = d_t.Dim(), D = W_t.NumCols();
514  BaseFloat eta = Eta(N);
515 
516  // \beta_{t+1} = \rho_{t+1} (1+\alpha) + \alpha/D tr(D_{t+1})
517  BaseFloat beta_t1 = rho_t1 * (1.0 + alpha_) + alpha_ * d_t1.Sum() / D;
518  KALDI_ASSERT(beta_t1 > 0.0);
519  Vector<BaseFloat> e_t1(R, kUndefined), sqrt_e_t1(R, kUndefined),
520  inv_sqrt_e_t1(R, kUndefined);
521  ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);
522  Vector<BaseFloat> inv_sqrt_c_t(sqrt_c_t);
523  inv_sqrt_c_t.InvertElements();
524 
525  Vector<BaseFloat> w_t_coeff(R);
526  for (int32 i = 0; i < R; i++)
527  w_t_coeff(i) = (1.0 - eta) / (eta/N) * (d_t(i) + rho_t);
528  CuVector<BaseFloat> w_t_coeff_gpu(w_t_coeff);
529  // B_t = J_t + (1-\eta)/(\eta/N) (D_t + \rho_t I) W_t
530  J_t->AddDiagVecMat(1.0, w_t_coeff_gpu, W_t, kNoTrans, 1.0);
531 
532  // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} B_t
533  Matrix<BaseFloat> A_t(U_t, kTrans);
534  for (int32 i = 0; i < R; i++) {
535  BaseFloat i_factor = (eta / N) * sqrt_e_t1(i) * inv_sqrt_c_t(i);
536  for (int32 j = 0; j < R; j++) {
537  BaseFloat j_factor = inv_sqrt_e_t(j);
538  A_t(i, j) *= i_factor * j_factor;
539  }
540  }
541  // W_{t+1} = A_t B_t
542  CuMatrix<BaseFloat> A_t_gpu(A_t);
543  W_t1->AddMatMat(1.0, A_t_gpu, kNoTrans, *J_t, kNoTrans, 0.0);
544 }
545 
547  BaseFloat rho_t,
548  const VectorBase<BaseFloat> &d_t,
549  const VectorBase<BaseFloat> &inv_sqrt_e_t,
550  const MatrixBase<BaseFloat> &K_t,
551  const MatrixBase<BaseFloat> &L_t,
552  SpMatrix<double> *Z_t) const {
553  // Use doubles because the range of quantities in Z_t can get large (fourth
554  // power of data), and we want to avoid overflow. This routine is fast.
555  BaseFloat eta = Eta(N);
556  Vector<BaseFloat> d_t_rho_t(d_t);
557  d_t_rho_t.Add(rho_t); // now d_t_rho_t is diag(D_t + \rho_t I).
558  double etaN = eta / N, eta1 = 1.0 - eta,
559  etaN_sq = etaN * etaN, eta1_sq = eta1 * eta1,
560  etaN_eta1 = etaN * eta1;
561  int32 R = d_t.Dim();
562  for (int32 i = 0; i < R; i++) {
563  double inv_sqrt_e_t_i = inv_sqrt_e_t(i), d_t_rho_t_i = d_t_rho_t(i);
564  for (int32 j = 0; j <= i; j++) {
565  double inv_sqrt_e_t_j = inv_sqrt_e_t(j), d_t_rho_t_j = d_t_rho_t(j),
566  L_t_i_j = 0.5 * (L_t(i, j) + L_t(j, i)),
567  K_t_i_j = 0.5 * (K_t(i, j) + K_t(j, i));
568  // See (eqn:Zt) in header.
569  (*Z_t)(i, j) = etaN_sq * inv_sqrt_e_t_i * K_t_i_j * inv_sqrt_e_t_j
570  + etaN_eta1 * inv_sqrt_e_t_i * L_t_i_j * inv_sqrt_e_t_j * d_t_rho_t_j
571  + etaN_eta1 * d_t_rho_t_i * inv_sqrt_e_t_i * L_t_i_j * inv_sqrt_e_t_j
572  + (i == j ? eta1_sq * d_t_rho_t_i * d_t_rho_t_i : 0.0);
573  }
574  }
575 }
576 
578  BaseFloat beta_t,
580  VectorBase<BaseFloat> *sqrt_e_t,
581  VectorBase<BaseFloat> *inv_sqrt_e_t) const {
582  // e_{tii} = 1/(\beta_t/d_{tii} + 1)
583  int32 D = d_t.Dim();
584  const BaseFloat *d = d_t.Data();
585  BaseFloat *e = e_t->Data();
586  for (int32 i = 0; i < D; i++)
587  e[i] = 1.0 / (beta_t / d[i] + 1);
588  sqrt_e_t->CopyFromVec(*e_t);
589  sqrt_e_t->ApplyPow(0.5);
590  inv_sqrt_e_t->CopyFromVec(*sqrt_e_t);
591  inv_sqrt_e_t->InvertElements();
592 }
593 
594 
596  rank_(other.rank_), update_period_(other.update_period_),
598  alpha_(other.alpha_), epsilon_(other.epsilon_), delta_(other.delta_),
600  self_debug_(other.self_debug_), W_t_(other.W_t_),
601  rho_t_(other.rho_t_), d_t_(other.d_t_) {
602  // use default constructor for the mutexes.
603 }
604 
606  const OnlinePreconditioner &other) {
607  rank_ = other.rank_;
610  alpha_ = other.alpha_;
611  epsilon_ = other.epsilon_;
612  delta_ = other.delta_;
613  t_ = other.t_;
614  self_debug_ = other.self_debug_;
615  W_t_ = other.W_t_;
616  rho_t_ = other.rho_t_;
617  d_t_ = other.d_t_;
618  return *this;
619 }
620 
622  KALDI_ASSERT(rank > 0);
623  rank_ = rank;
624 }
626  KALDI_ASSERT(update_period > 0);
627  update_period_ = update_period;
628 }
630  KALDI_ASSERT(num_samples_history > 0.0 &&
631  num_samples_history < 1.0e+6);
632  num_samples_history_ = num_samples_history;
633 }
635  KALDI_ASSERT(alpha >= 0.0);
636  alpha_ = alpha;
637 }
638 
639 
640 }
641 }
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool IsUnit(Real cutoff=1.0e-05) const
Definition: sp-matrix.cc:480
Packed symetric matrix class.
Definition: matrix-common.h:62
void Scale(Real c)
void ComputeWt1(int32 N, const VectorBase< BaseFloat > &d_t, const VectorBase< BaseFloat > &d_t1, BaseFloat rho_t, BaseFloat rho_t1, const MatrixBase< BaseFloat > &U_t, const VectorBase< BaseFloat > &sqrt_c_t, const VectorBase< BaseFloat > &inv_sqrt_e_t, const CuMatrixBase< BaseFloat > &W_t, CuMatrixBase< BaseFloat > *J_t, CuMatrixBase< BaseFloat > *W_t1) const
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
Real Trace() const
Definition: sp-matrix.cc:171
Real Sum() const
Definition: cu-vector.cc:297
void AddElements(Real alpha, const std::vector< MatrixElement< Real > > &input)
Definition: cu-matrix.cc:3277
CuSubMatrix< Real > Range(const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
Definition: cu-matrix.h:653
void AddDiagMat2(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType trans, Real beta)
Add the diagonal of a matrix times itself: *this = diag(M M^T) + beta * *this (if trans == kNoTrans)...
Definition: cu-vector.cc:595
kaldi::int32 int32
void Eig(VectorBase< Real > *s, MatrixBase< Real > *P=NULL) const
Solves the symmetric eigenvalue problem: at end we should have (*this) = P * diag(s) * P^T...
Definition: qr.cc:433
This class represents a matrix that&#39;s stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
static void InitOrthonormalSpecial(CuMatrixBase< BaseFloat > *R)
This function creates a matrix with orthonormal rows that is like the following matrix, except with each row normalized to have unit 2-norm: [ 1.1 0 1 0 1 0 0 1.1 0 1 0 1 ] The reason why the first element in each row is 1.1 and not 1, is for symmetry-breaking...
void ComputeEt(const VectorBase< BaseFloat > &d_t, BaseFloat beta_t, VectorBase< BaseFloat > *e_t, VectorBase< BaseFloat > *sqrt_e_t, VectorBase< BaseFloat > *inv_sqrt_e_t) const
void ComputeZt(int32 N, BaseFloat rho_t, const VectorBase< BaseFloat > &d_t, const VectorBase< BaseFloat > &inv_sqrt_e_t, const MatrixBase< BaseFloat > &K_t, const MatrixBase< BaseFloat > &L_t, SpMatrix< double > *Z_t) const
bool IsUnit(Real tol=0.001) const
Definition: cu-matrix.cc:629
void PreconditionDirections(CuMatrixBase< BaseFloat > *R, CuVectorBase< BaseFloat > *row_prod, BaseFloat *scale)
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=nullptr)
Applies floor to all elements.
Definition: kaldi-vector.h:149
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
void Cholesky(const SpMatrix< Real > &orig)
Definition: tp-matrix.cc:88
float BaseFloat
Definition: kaldi-types.h:29
void SetNumSamplesHistory(BaseFloat num_samples_history)
void SetZero()
Math operations, some calling kernels.
Definition: cu-matrix.cc:509
void SymAddMat2(const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transA, Real beta)
*this = beta * *this + alpha * M M^T, for symmetric matrices.
Definition: cu-matrix.cc:1353
void CopyFromTp(const TpMatrix< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Copy given tpmatrix. (no resize is done).
#define KALDI_ERR
Definition: kaldi-error.h:147
Real Max() const
Returns the maximum value of any element, or -infinity for the empty vector.
Packed symetric matrix class.
Definition: matrix-common.h:63
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C.
Definition: cu-matrix.cc:1291
#define KALDI_WARN
Definition: kaldi-error.h:150
Real TraceMatMat(const MatrixBase< Real > &A, const MatrixBase< Real > &B, MatrixTransposeType trans)
We need to declare this here as it will be a friend function.
This class is used for a piece of a CuMatrix.
Definition: matrix-common.h:70
Real * Data()
Returns a pointer to the start of the vector&#39;s data.
Definition: kaldi-vector.h:70
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
void Scale(Real alpha)
Multiplies all elements by this constant.
Real Sum() const
Returns sum of the elements.
void ReorthogonalizeXt1(const VectorBase< BaseFloat > &d_t1, BaseFloat rho_t1, CuMatrixBase< BaseFloat > *W_t1, CuMatrixBase< BaseFloat > *temp_W, CuMatrixBase< BaseFloat > *temp_O)
Matrix for CUDA computing.
Definition: matrix-common.h:69
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216
A class representing a vector.
Definition: kaldi-vector.h:406
void InvertElements()
Invert all elements.
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
OnlinePreconditioner & operator=(const OnlinePreconditioner &other)
void ApplyPow(Real power)
Take all elements of vector to a power.
Definition: kaldi-vector.h:179
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
void Init(const CuMatrixBase< BaseFloat > &R0)
void OrthogonalizeRows()
This function orthogonalizes the rows of a matrix using the Gram-Schmidt process. ...
Keywords for search: natural gradient, naturalgradient, NG-SGD.
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
void Add(Real c)
Add a constant to each element of a vector.
void MulRowsVec(const CuVectorBase< Real > &scale)
scale i&#39;th row by scale[i]
Definition: cu-matrix.cc:792
Sub-matrix representation.
Definition: kaldi-matrix.h:988
void AddMat2(const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const Real beta)
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
Definition: kaldi-math.h:265
void PreconditionDirectionsInternal(const int32 t, const BaseFloat rho_t, const Vector< BaseFloat > &d_t, CuMatrixBase< BaseFloat > *WJKL_t, CuMatrixBase< BaseFloat > *X_t, CuVectorBase< BaseFloat > *row_prod, BaseFloat *scale)
void SortSvd(VectorBase< Real > *s, MatrixBase< Real > *U, MatrixBase< Real > *Vt, bool sort_on_absolute_value)
Function to ensure that SVD is sorted.
void CopyLowerToUpper()
Copy lower triangle to upper triangle (symmetrize)
Vector for CUDA computing.
Definition: matrix-common.h:72
void AddDiagVecMat(const Real alpha, const CuVectorBase< Real > &v, const CuMatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T].
Definition: cu-matrix.cc:1382