// matrix/optimization.cc

// Copyright 2012  Johns Hopkins University (author: Daniel Povey)


// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// (*) incorporates, with permission, FFT code from his book
// "Signal Processing with Lapped Transforms", Artech, 1992.

#include <algorithm>

#include "matrix/optimization.h"
#include "matrix/sp-matrix.h"

namespace kaldi {


// Below, N&W refers to Nocedal and Wright, "Numerical Optimization", 2nd Ed.

template<typename Real>
OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
                                   const LbfgsOptions &opts):
    opts_(opts), k_(0), computation_state_(kBeforeStep), H_was_set_(false) {
  KALDI_ASSERT(opts.m > 0); // dimension.
  MatrixIndexT dim = x.Dim();
  KALDI_ASSERT(dim > 0);
  x_ = x;      // this is the value of x_k
  new_x_ = x;  // this is where we'll evaluate the function next.
  deriv_.Resize(dim);
  temp_.Resize(dim);
  data_.Resize(2 * opts.m, dim);
  rho_.Resize(opts.m);
  // Just set f_ to some invalid value, as we haven't yet set it.
  f_ = (opts.minimize ? 1 : -1) * std::numeric_limits<Real>::infinity();
  best_f_ = f_;
  best_x_ = x_;
}


template<typename Real>
Real OptimizeLbfgs<Real>::RecentStepLength() const {
  size_t n = step_lengths_.size();
  if (n == 0) return std::numeric_limits<Real>::infinity();
  else {
    if (n >= 2 && step_lengths_[n-1] == 0.0 && step_lengths_[n-2] == 0.0)
      return 0.0; // two zeros in a row means repeated restarts, which is
                  // a loop.  Short-circuit this by returning zero.
    Real avg = 0.0;
    for (size_t i = 0; i < n; i++)
      avg += step_lengths_[i] / n;
    return avg;
  }
}

template<typename Real>
void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
  if (k_ == 0) {
    if (H_.Dim() == 0) {
      // H was never set up.  Set it up for the first time.
      Real learning_rate;
      if (opts_.first_step_length > 0.0) { // this takes
        // precedence over first_step_learning_rate, if set.
        // We are setting up H for the first time.
        Real gradient_length = gradient.Norm(2.0);
        learning_rate = (gradient_length > 0.0 ?
                         opts_.first_step_length / gradient_length :
                         1.0);
      } else if (opts_.first_step_impr > 0.0) {
        Real gradient_length = gradient.Norm(2.0);
        learning_rate = (gradient_length > 0.0 ?
                         opts_.first_step_impr / (gradient_length * gradient_length) :
                         1.0);
      } else {
        learning_rate = opts_.first_step_learning_rate;
      }
      H_.Resize(x_.Dim());
      KALDI_ASSERT(learning_rate > 0.0);
      H_.Set(opts_.minimize ? learning_rate : -learning_rate);
    }
  } else { // k_ > 0
    if (!H_was_set_) { // The user never specified an approximate
      // diagonal inverse Hessian.
      // Set it using formula 7.20: H_k^{(0)} = \gamma_k I, where
      // \gamma_k = s_{k-1}^T y_{k-1} / y_{k-1}^T y_{k-1}
      SubVector<Real> y_km1 = Y(k_-1);
      double gamma_k = VecVec(S(k_-1), y_km1) / VecVec(y_km1, y_km1);
      if (KALDI_ISNAN(gamma_k) || KALDI_ISINF(gamma_k)) {
        KALDI_WARN << "NaN encountered in L-BFGS (already converged?)";
        gamma_k = (opts_.minimize ? 1.0 : -1.0);
      }
      H_.Set(gamma_k);
    }
  }
}

// This represents the first 2 lines of Algorithm 7.5 (N&W), which
// in fact is mostly a call to Algorithm 7.4.
// Note: this is valid whether we are minimizing or maximizing.
template<typename Real>
void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
                                              const VectorBase<Real> &gradient) {
  KALDI_ASSERT(computation_state_ == kBeforeStep);
  SignedMatrixIndexT m = M(), k = k_;
  ComputeHifNeeded(gradient);
  // The rest of this is computing p_k <-- - H_k \nabla f_k using Algorithm
  // 7.4 of N&W.
  Vector<Real> &q(deriv_), &r(new_x_); // Use deriv_ as a temporary place to put
  // q, and new_x_ as a temporary place to put r.
  // The if-statement below is just to get rid of spurious warnings from
  // valgrind about memcpy source and destination overlap, since sometimes q and
  // gradient are the same variable.
  if (&q != &gradient)
    q.CopyFromVec(gradient); // q <-- \nabla f_k.
  Vector<Real> alpha(m);
  // for i = k - 1, k - 2, ... k - m
  for (SignedMatrixIndexT i = k - 1;
       i >= std::max(k - m, static_cast<SignedMatrixIndexT>(0));
       i--) {
    alpha(i % m) = rho_(i % m) * VecVec(S(i), q); // \alpha_i <-- \rho_i s_i^T q.
    q.AddVec(-alpha(i % m), Y(i)); // q <-- q - \alpha_i y_i
  }
  r.SetZero();
  r.AddVecVec(1.0, H_, q, 0.0); // r <-- H_k^{(0)} q.
  // for i = k - m, k - m + 1, ... , k - 1
  for (SignedMatrixIndexT i = std::max(k - m, static_cast<SignedMatrixIndexT>(0));
       i < k;
       i++) {
    Real beta = rho_(i % m) * VecVec(Y(i), r); // \beta <-- \rho_i y_i^T r
    r.AddVec(alpha(i % m) - beta, S(i)); // r <-- r + s_i (\alpha_i - \beta)
  }

  { // TEST.  Note, -r will be the direction.
    Real dot = VecVec(gradient, r);
    if ((opts_.minimize && dot < 0) || (!opts_.minimize && dot > 0))
      KALDI_WARN << "Step direction has the wrong sign!  Routine will fail.";
  }

  // Now we're out of Alg. 7.4 and back into Alg. 7.5.
  // Alg. 7.4 returned r (using new_x_ as the location), and with \alpha_k = 1
  // as the initial guess, we're setting x_{k+1} = x_k + \alpha_k p_k, with
  // p_k = -r [hence the statement new_x_.Scale(-1.0)], and \alpha_k = 1.
  // This is the first place we'll get the user to evaluate the function;
  // any backtracking (or acceptance of that step) occurs inside StepSizeIteration.
  // We're still within iteration k; we haven't yet finalized the step size.
  new_x_.Scale(-1.0);
  new_x_.AddVec(1.0, x_);
  if (&deriv_ != &gradient)
    deriv_.CopyFromVec(gradient);
  f_ = function_value;
  d_ = opts_.d;
  num_wolfe_i_failures_ = 0;
  num_wolfe_ii_failures_ = 0;
  last_failure_type_ = kNone;
  computation_state_ = kWithinStep;
}


template<typename Real>
bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
                                     const VectorBase<Real> &gradient) {
  // Save s_k = x_{k+1} - x_{k}, and y_k = \nabla f_{k+1} - \nabla f_k.
  SubVector<Real> s = S(k_), y = Y(k_);
  s.CopyFromVec(new_x_);
  s.AddVec(-1.0, x_); // s = new_x_ - x_.
  y.CopyFromVec(gradient);
  y.AddVec(-1.0, deriv_); // y = gradient - deriv_.

  // Warning: there is a division in the next line.  This could
  // generate inf or nan, but this wouldn't necessarily be an error
  // at this point because for zero step size or derivative we should
  // terminate the iterations.  But this is up to the calling code.
  Real prod = VecVec(y, s);
  rho_(k_ % opts_.m) = 1.0 / prod;
  Real len = s.Norm(2.0);

  if ((opts_.minimize && prod <= 1.0e-20) || (!opts_.minimize && prod >= -1.0e-20)
      || len == 0.0)
    return false; // This will force restart.

  KALDI_VLOG(3) << "Accepted step; length was " << len
                << ", prod was " << prod;
  RecordStepLength(len);

  // store x_{k+1} and the function value f_{k+1}.
  x_.CopyFromVec(new_x_);
  f_ = function_value;
  k_++;

  return true; // We successfully accepted the step.
}

template<typename Real>
void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
  step_lengths_.push_back(s);
  if (step_lengths_.size() > static_cast<size_t>(opts_.avg_step_length))
    step_lengths_.erase(step_lengths_.begin(), step_lengths_.begin() + 1);
}


template<typename Real>
void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
                                  Real f,
                                  const VectorBase<Real> &gradient) {
  // Note: we will consider restarting (the transition of x_ -> x)
  // as a step, even if it has zero step size.  This is necessary in
  // order for convergence to be detected.
  {
    Vector<Real> &diff(temp_);
    diff.CopyFromVec(x);
    diff.AddVec(-1.0, x_);
    RecordStepLength(diff.Norm(2.0));
  }
  k_ = 0; // Restart the iterations!  [But note that the Hessian,
  // whatever it was, stays as before.]
  if (&x_ != &x)
    x_.CopyFromVec(x);
  new_x_.CopyFromVec(x);
  f_ = f;
  computation_state_ = kBeforeStep;
  ComputeNewDirection(f, gradient);
}

template<typename Real>
void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
                                            const VectorBase<Real> &gradient) {
  KALDI_VLOG(3) << "In step size iteration, function value changed "
                << f_ << " to " << function_value;

  // We're in some part of the backtracking, and the user is providing
  // the objective function value and gradient.
  // We're checking two conditions: Wolfe i) [the Armijo rule] and
  // Wolfe ii).

  // The Armijo rule (when minimizing) is:
  // f(x_k + \alpha_k p_k) <= f(x_k) + c_1 \alpha_k p_k^T \nabla f(x_k), where
  // \nabla means the derivative.
  // Below, "temp" is the RHS of this equation, where (\alpha_k p_k) equals
  // (new_x_ - x_); we don't store \alpha or p_k separately, they are implicit
  // as the difference new_x_ - x_.

  // Below, pf is \alpha_k p_k^T \nabla f(x_k).
  Real pf = VecVec(new_x_, deriv_) - VecVec(x_, deriv_);
  Real temp = f_ + opts_.c1 * pf;

  bool wolfe_i_ok;
  if (opts_.minimize) wolfe_i_ok = (function_value <= temp);
  else wolfe_i_ok = (function_value >= temp);

  // Wolfe condition ii) can be written as:
  //   p_k^T \nabla f(x_k + \alpha_k p_k) >= c_2 p_k^T \nabla f(x_k)
  // p2f equals \alpha_k p_k^T \nabla f(x_k + \alpha_k p_k), where
  // (\alpha_k p_k^T) is (new_x_ - x_).
  // Note that in our version of Wolfe condition (ii) we have an extra
  // factor alpha, which doesn't affect anything.
  Real p2f = VecVec(new_x_, gradient) - VecVec(x_, gradient);
  //eps = (sizeof(Real) == 4 ? 1.0e-05 : 1.0e-10) *
  //(std::abs(p2f) + std::abs(pf));
  bool wolfe_ii_ok;
  if (opts_.minimize) wolfe_ii_ok = (p2f >= opts_.c2 * pf);
  else wolfe_ii_ok = (p2f <= opts_.c2 * pf);

  enum { kDecrease, kNoChange } d_action; // What to do with d_: leave it alone,
  // or take the square root.
  enum { kAccept, kDecreaseStep, kIncreaseStep, kRestart } iteration_action;
  // What we'll do in the overall iteration: accept this value, DecreaseStep
  // (reduce the step size), IncreaseStep (increase the step size), or kRestart
  // (set k back to zero).  Generally when we can't get both conditions to be
  // true within a reasonable number of iterations, it makes sense to restart,
  // because probably we've almost converged and got into numerical issues; from
  // here we'll just produce NaNs.  Restarting is a safe thing to do and the
  // outer code will quickly detect convergence.

  d_action = kNoChange; // the default.

  if (wolfe_i_ok && wolfe_ii_ok) {
    iteration_action = kAccept;
    d_action = kNoChange; // actually doesn't matter, it'll get reset.
  } else if (!wolfe_i_ok) {
    // If wolfe i) [the Armijo rule] failed then we went too far (or are
    // meeting numerical problems).
    if (last_failure_type_ == kWolfeII) { // Last time we failed it was Wolfe ii).
      // When we switch between them we decrease d.
      d_action = kDecrease;
    }
    iteration_action = kDecreaseStep;
    last_failure_type_ = kWolfeI;
    num_wolfe_i_failures_++;
  } else if (!wolfe_ii_ok) {
    // Curvature condition failed -> we did not go far enough.
    if (last_failure_type_ == kWolfeI) // switching between wolfe i and ii failures->
      d_action = kDecrease; // decrease value of d.
    iteration_action = kIncreaseStep;
    last_failure_type_ = kWolfeII;
    num_wolfe_ii_failures_++;
  }

  // Test whether we've been switching too many times between wolfe i) and ii)
  // failures, or overall have an excessive number of failures.  We just give up
  // and restart L-BFGS.  Probably we've almost converged.
  if (num_wolfe_i_failures_ + num_wolfe_ii_failures_ >
      opts_.max_line_search_iters) {
    KALDI_VLOG(2) << "Too many steps in line search -> restarting.";
    iteration_action = kRestart;
  }

  if (d_action == kDecrease)
    d_ = std::sqrt(d_);

  KALDI_VLOG(3) << "d = " << d_ << ", iter = " << k_ << ", action = "
                << (iteration_action == kAccept ? "accept" :
                    (iteration_action == kDecreaseStep ? "decrease" :
                     (iteration_action == kIncreaseStep ? "increase" :
                      "reject")));

  // Note: even if iteration_action != Restart at this point,
  // some code below may set it to Restart.
  if (iteration_action == kAccept) {
    if (AcceptStep(function_value, gradient)) { // If we did
      // not detect a problem while accepting the step..
      computation_state_ = kBeforeStep;
      ComputeNewDirection(function_value, gradient);
    } else {
      KALDI_VLOG(2) << "Restarting L-BFGS computation; problem found while "
                    << "accepting step.";
      iteration_action = kRestart; // We'll have to restart now.
    }
  }
  if (iteration_action == kDecreaseStep || iteration_action == kIncreaseStep) {
    Real scale = (iteration_action == kDecreaseStep ? 1.0 / d_ : d_);
    temp_.CopyFromVec(new_x_);
    new_x_.Scale(scale);
    new_x_.AddVec(1.0 - scale, x_);
    if (new_x_.ApproxEqual(temp_, 0.0)) {
      // Value of new_x_ did not change at all --> we must restart.
      KALDI_VLOG(3) << "Value of x did not change, when taking step; "
                    << "will restart computation.";
      iteration_action = kRestart;
    }
    if (new_x_.ApproxEqual(temp_, 1.0e-08) &&
        std::abs(f_ - function_value) < 1.0e-08 *
        std::abs(f_) && iteration_action == kDecreaseStep) {
      // This is common and due to roundoff.
      KALDI_VLOG(3) << "We appear to be backtracking while we are extremely "
                    << "close to the old value; restarting.";
      iteration_action = kRestart;
    }

    if (iteration_action == kDecreaseStep) {
      num_wolfe_i_failures_++;
      last_failure_type_ = kWolfeI;
    } else {
      num_wolfe_ii_failures_++;
      last_failure_type_ = kWolfeII;
    }
  }
  if (iteration_action == kRestart) {
    // We want to restart the computation.  If the objf at new_x_ is
    // better than it was at x_, we'll start at new_x_, else at x_.
    bool use_newx;
    if (opts_.minimize) use_newx = (function_value < f_);
    else use_newx = (function_value > f_);
    KALDI_VLOG(3) << "Restarting computation.";
    if (use_newx) Restart(new_x_, function_value, gradient);
    else Restart(x_, f_, deriv_);
  }
}


template<typename Real>
void OptimizeLbfgs<Real>::DoStep(Real function_value,
                                 const VectorBase<Real> &gradient) {
  if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) {
    best_f_ = function_value;
    best_x_.CopyFromVec(new_x_);
  }
  if (computation_state_ == kBeforeStep)
    ComputeNewDirection(function_value, gradient);
  else // kWithinStep{1,2,3}
    StepSizeIteration(function_value, gradient);
}

template<typename Real>
void OptimizeLbfgs<Real>::DoStep(Real function_value,
                                 const VectorBase<Real> &gradient,
                                 const VectorBase<Real> &diag_approx_2nd_deriv) {
  if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) {
    best_f_ = function_value;
    best_x_.CopyFromVec(new_x_);
  }
  if (opts_.minimize) {
    KALDI_ASSERT(diag_approx_2nd_deriv.Min() > 0.0);
  } else {
    KALDI_ASSERT(diag_approx_2nd_deriv.Max() < 0.0);
  }
  H_was_set_ = true;
  H_.CopyFromVec(diag_approx_2nd_deriv);
  H_.InvertElements();
  DoStep(function_value, gradient);
}

template<typename Real>
const VectorBase<Real>&
OptimizeLbfgs<Real>::GetValue(Real *objf_value) const {
  if (objf_value != NULL) *objf_value = best_f_;
  return best_x_;
}
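
// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original Kaldi source: a typical driver
// loop for OptimizeLbfgs.  On each iteration the caller evaluates the
// objective function and its gradient at GetProposedValue() (declared in
// optimization.h) and passes them to DoStep(); GetValue() then returns the
// best point seen so far.  The function name ExampleLbfgsUsage and the toy
// objective f(x) = 0.5 ||x - 1||^2 are hypothetical, for illustration only.
void ExampleLbfgsUsage() {
  int32 dim = 10, num_iters = 20;
  Vector<double> x(dim);                  // initial guess: all zeros.
  LbfgsOptions lbfgs_opts;
  lbfgs_opts.minimize = true;
  OptimizeLbfgs<double> lbfgs(x, lbfgs_opts);
  for (int32 i = 0; i < num_iters; i++) {
    const VectorBase<double> &cur_x = lbfgs.GetProposedValue();
    Vector<double> grad(dim);
    grad.CopyFromVec(cur_x);
    grad.Add(-1.0);                       // gradient of f at cur_x is (x - 1).
    double f = 0.5 * VecVec(grad, grad);  // f(x) = 0.5 ||x - 1||^2.
    lbfgs.DoStep(f, grad);                // provide objf and gradient at cur_x.
  }
  double best_f;
  Vector<double> best_x(dim);
  best_x.CopyFromVec(lbfgs.GetValue(&best_f));
  KALDI_VLOG(3) << "Best objective function value was " << best_f;
}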

// To compute the alpha, we are minimizing f(x) = 0.5 x^T A x - x^T b along
// direction p_k... consider alpha.
// d/dx of f(x) = A x_k - b = r.
// Notation based on Sec. 5.1 of Nocedal and Wright.
// Computation based on Alg. 5.2 of Nocedal and Wright (Pg. 112).
// Notation (replicated for convenience):
//  To solve Ax = b for x
//  k : current iteration
//  x_k : estimate of x (at iteration k)
//  r_k : residual ( r_k \eqdef A x_k - b )
//  \alpha_k : step size
//  p_k : A-conjugate direction
//  \beta_k : coefficient used in A-conjugate direction computation for next
//            iteration
//
//  Algo.  LinearCG(A, b, x_0)
//  ========================
//  r_0 = A x_0 - b
//  p_0 = -r_0
//  k = 0
//
//  while r_k != 0
//    \alpha_k = (r_k^T r_k) / (p_k^T A p_k)
//    x_{k+1} = x_k + \alpha_k p_k;
//    r_{k+1} = r_k + \alpha_k A p_k
//    \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_k}
//    p_{k+1} = -r_{k+1} + \beta_{k+1} p_k
//    k = k + 1
//  end

template<class Real>
int32 LinearCgd(const LinearCgdOptions &opts,
                const SpMatrix<Real> &A,
                const VectorBase<Real> &b,
                VectorBase<Real> *x) {
  // Initialize the variables
  //
  int32 M = A.NumCols();

  Matrix<Real> storage(4, M);
  SubVector<Real> r(storage, 0), p(storage, 1), Ap(storage, 2), x_orig(storage, 3);
  p.CopyFromVec(b);
  p.AddSpVec(-1.0, A, *x, 1.0);  // p_0 = b - A x_0
  r.AddVec(-1.0, p);  // r_0 = - p_0
  x_orig.CopyFromVec(*x);  // in case of failure.

  Real r_cur_norm_sq = VecVec(r, r),
      r_initial_norm_sq = r_cur_norm_sq,
      r_recompute_norm_sq = r_cur_norm_sq;

  KALDI_VLOG(5) << "In linear CG: initial norm-square of residual = "
                << r_initial_norm_sq;

  Real max_error_sq = std::max<Real>(opts.max_error * opts.max_error,
                                     std::numeric_limits<Real>::min()),
      residual_factor = opts.recompute_residual_factor *
                        opts.recompute_residual_factor,
      inv_residual_factor = 1.0 / residual_factor;

  // Note: although from a mathematical point of view the method should converge
  // after M iterations, in practice (due to roundoff) it does not always
  // converge to good precision after that many iterations so we let the maximum
  // be M + 5 instead.
  int32 k = 0;
  for (; k < M + 5 && k != opts.max_iters; k++) {
    // Note: we'll break from this loop if we converge sooner due to
    // max_error.
    Ap.AddSpVec(1.0, A, p, 0.0);  // Ap = A p

    // Below is how the code used to look.
    // // next line: \alpha_k = (r_k^T r_k) / (p_k^T A p_k)
    // Real alpha = r_cur_norm_sq / VecVec(p, Ap);
    //
    // We changed r_cur_norm_sq below to -VecVec(p, r).  Although this is
    // slightly less efficient, it seems to make the algorithm dramatically more
    // robust.  Note that -p^T r is the mathematically more natural quantity to
    // use here, that corresponds to minimizing along that direction... r^T r is
    // recommended in Nocedal and Wright only as a kind of optimization as it is
    // supposed to be the same as -p^T r and we already have it computed.
    Real alpha = -VecVec(p, r) / VecVec(p, Ap);

    // next line: x_{k+1} = x_k + \alpha_k p_k;
    x->AddVec(alpha, p);
    // next line: r_{k+1} = r_k + \alpha_k A p_k
    r.AddVec(alpha, Ap);
    Real r_next_norm_sq = VecVec(r, r);

    if (r_next_norm_sq < residual_factor * r_recompute_norm_sq ||
        r_next_norm_sq > inv_residual_factor * r_recompute_norm_sq) {

      // Recompute the residual from scratch if the residual norm has decreased
      // a lot; this costs an extra matrix-vector multiply, but helps keep the
      // residual accurate.
      // Also do the same if the residual norm has increased a lot since
      // the last time we recomputed... this shouldn't happen often, but
      // it can indicate bad stuff is happening.

      // r_{k+1} = A x_{k+1} - b
      r.AddSpVec(1.0, A, *x, 0.0);
      r.AddVec(-1.0, b);
      r_next_norm_sq = VecVec(r, r);
      r_recompute_norm_sq = r_next_norm_sq;

      KALDI_VLOG(5) << "In linear CG: recomputing residual.";
    }
    KALDI_VLOG(5) << "In linear CG: k = " << k
                  << ", r_next_norm_sq = " << r_next_norm_sq;
    // Check if converged.
    if (r_next_norm_sq <= max_error_sq)
      break;

    // next line: \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_k}
    Real beta_next = r_next_norm_sq / r_cur_norm_sq;
    // next lines: p_{k+1} = -r_{k+1} + \beta_{k+1} p_k
    Vector<Real> p_old(p);
    p.Scale(beta_next);
    p.AddVec(-1.0, r);
    r_cur_norm_sq = r_next_norm_sq;
  }

  // note: the first element of the && is only there to save compute.
  // the residual r is A x - b, and r_cur_norm_sq and r_initial_norm_sq are
  // of the form r * r, so it's clear that b * b has the right dimension to
  // compare with the residual.
  if (r_cur_norm_sq > r_initial_norm_sq &&
      r_cur_norm_sq > r_initial_norm_sq + 1.0e-10 * VecVec(b, b)) {
    KALDI_WARN << "Doing linear CGD in dimension " << A.NumRows() << ", after " << k
               << " iterations the squared residual has got worse, "
               << r_cur_norm_sq << " > " << r_initial_norm_sq
               << ".  Will do an exact optimization.";
    SolverOptions opts("called-from-linearCGD");
    x->CopyFromVec(x_orig);
    SolveQuadraticProblem(A, b, opts, x);
  }
  return k;
}
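
// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original Kaldi source: calling LinearCgd
// to solve A x = b for a symmetric positive-definite A.  On entry *x holds the
// initial guess; on exit it holds the approximate solution, and the return
// value is the number of iterations taken.  The function name
// ExampleLinearCgdUsage is hypothetical, for illustration only.
void ExampleLinearCgdUsage() {
  int32 dim = 50;
  SpMatrix<double> A(dim);
  A.AddToDiag(1.0);               // A = I, a trivially positive-definite matrix.
  Vector<double> b(dim), x(dim);  // x = 0 is the initial guess.
  b.SetRandn();
  LinearCgdOptions cg_opts;
  cg_opts.max_error = 1.0e-05;    // stop once ||A x - b|| <= max_error.
  int32 iters = LinearCgd(cg_opts, A, b, &x);
  KALDI_VLOG(3) << "Linear CG took " << iters << " iterations.";
}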

// Instantiate the class for float and double.
template
class OptimizeLbfgs<float>;
template
class OptimizeLbfgs<double>;


template
int32 LinearCgd<float>(const LinearCgdOptions &opts,
                       const SpMatrix<float> &A, const VectorBase<float> &b,
                       VectorBase<float> *x);

template
int32 LinearCgd<double>(const LinearCgdOptions &opts,
                        const SpMatrix<double> &A, const VectorBase<double> &b,
                        VectorBase<double> *x);

} // end namespace kaldi