doc/optimization_8cc_source.html

 // matrix/optimization.cc

 // Copyright 2012  Johns Hopkins University (author: Daniel Povey)


 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //  http://www.apache.org/licenses/LICENSE-2.0
 //
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 //
 // (*) incorporates, with permission, FFT code from his book
 // "Signal Processing with Lapped Transforms", Artech, 1992.

 #include <algorithm>

 #include "matrix/optimization.h"
 #include "matrix/sp-matrix.h"

 namespace kaldi {


 // Below, N&W refers to Nocedal and Wright, "Numerical Optimization", 2nd Ed.

 // Sets up the optimizer at the starting point x.
 //   x     initial parameter vector; must have non-zero dimension.
 //   opts  L-BFGS options; opts.m (the number of stored history pairs) must
 //         be positive.
 // Note that H_ is not sized here; that happens lazily in ComputeHifNeeded().
 template<typename Real>
 OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
                                    const LbfgsOptions &opts):
     opts_(opts), k_(0), computation_state_(kBeforeStep), H_was_set_(false) {
   const MatrixIndexT num_params = x.Dim();
   KALDI_ASSERT(opts.m > 0);  // history size.
   KALDI_ASSERT(num_params > 0);

   // x_ holds x_k; new_x_ is the point where the function is evaluated next.
   x_ = x;
   new_x_ = x;
   best_x_ = x_;

   // Work vectors, plus storage for the history: data_ has 2 * m rows and
   // rho_ has one entry per stored pair.
   deriv_.Resize(num_params);
   temp_.Resize(num_params);
   rho_.Resize(opts.m);
   data_.Resize(2 * opts.m, num_params);

   // No function value is known yet, so start from the worst possible one:
   // +infinity when minimizing, -infinity when maximizing.
   const Real inf = std::numeric_limits<Real>::infinity();
   f_ = opts.minimize ? inf : -inf;
   best_f_ = f_;
 }


 // Returns the mean of the recorded step lengths.
 //  - With nothing recorded yet, returns +infinity.
 //  - If the two most recent entries are both exactly zero, returns zero:
 //    that pattern means repeated restarts (a loop), and reporting zero lets
 //    the caller short-circuit it.
 template<typename Real>
 Real OptimizeLbfgs<Real>::RecentStepLength() const {
   const size_t count = step_lengths_.size();
   if (count == 0)
     return std::numeric_limits<Real>::infinity();

   const bool last_two_zero = (count >= 2 &&
                               step_lengths_[count - 1] == 0.0 &&
                               step_lengths_[count - 2] == 0.0);
   if (last_two_zero)
     return 0.0;

   // Each term is divided by the count before being accumulated.
   Real mean = 0.0;
   for (size_t idx = 0; idx != count; ++idx)
     mean += step_lengths_[idx] / count;
   return mean;
 }

 // Ensures the diagonal initial inverse-Hessian estimate H_ has a value
 // before a search direction is computed.
 //   gradient  gradient at the current point; only its 2-norm is used, and
 //             only when H_ is being created for the first time.
 template<typename Real>
 void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
   if (k_ != 0) {
     // Later iterations: keep a user-supplied diagonal if there is one.
     if (H_was_set_)
       return;
     // Otherwise use N&W formula 7.20: H_k^{(0)} = \gamma_k I, with
     // \gamma_k = s_{k-1}^T y_{k-1} / y_{k-1}^T y_{k-1}.
     SubVector<Real> y_prev = Y(k_-1);
     double gamma = VecVec(S(k_-1), y_prev) / VecVec(y_prev, y_prev);
     const bool unusable = KALDI_ISNAN(gamma) || KALDI_ISINF(gamma);
     if (unusable) {
       KALDI_WARN << "NaN encountered in L-BFGS (already converged?)";
       gamma = (opts_.minimize ? 1.0 : -1.0);
     }
     H_.Set(gamma);
     return;
   }

   // k_ == 0: nothing to do if H_ already exists (from an earlier run before a
   // restart, or because the caller provided it).
   if (H_.Dim() != 0)
     return;

   // First-time setup.  Precedence of the options: first_step_length, then
   // first_step_impr, then first_step_learning_rate.
   Real learning_rate;
   if (opts_.first_step_length > 0.0) {
     const Real grad_norm = gradient.Norm(2.0);
     if (grad_norm > 0.0)
       learning_rate = opts_.first_step_length / grad_norm;
     else
       learning_rate = 1.0;
   } else if (opts_.first_step_impr > 0.0) {
     const Real grad_norm = gradient.Norm(2.0);
     if (grad_norm > 0.0)
       learning_rate = opts_.first_step_impr / (grad_norm * grad_norm);
     else
       learning_rate = 1.0;
   } else {
     learning_rate = opts_.first_step_learning_rate;
   }
   H_.Resize(x_.Dim());
   KALDI_ASSERT(learning_rate > 0.0);
   // The sign of H_ follows the direction of optimization.
   H_.Set(opts_.minimize ? learning_rate : -learning_rate);
 }

 // Implements the first two lines of Algorithm 7.5 (N&W), which is mostly a
 // call to the two-loop recursion of Algorithm 7.4.  Valid for minimization
 // and maximization alike.
 //   function_value  objective at the current point x_.
 //   gradient        gradient at x_; may be the member deriv_ itself.
 // On return new_x_ holds the first trial point x_k + p_k and the object is in
 // state kWithinStep.
 template<typename Real>
 void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
                                               const VectorBase<Real> &gradient) {
   KALDI_ASSERT(computation_state_ == kBeforeStep);
   const SignedMatrixIndexT history = M(), iter = k_;
   ComputeHifNeeded(gradient);

   // Index of the oldest stored (s, y) pair still taking part.
   const SignedMatrixIndexT oldest =
       std::max(iter - history, static_cast<SignedMatrixIndexT>(0));

   // Algorithm 7.4 computes r = H_k \nabla f_k.  deriv_ is borrowed as scratch
   // space for q and new_x_ for r.
   Vector<Real> &q = deriv_, &r = new_x_;
   // Skip the copy when q and gradient are the same object; this avoids
   // spurious valgrind warnings about overlapping memcpy.
   if (&q != &gradient)
     q.CopyFromVec(gradient);  // q <-- \nabla f_k.

   Vector<Real> alpha(history);
   // First loop: i = k - 1, k - 2, ..., k - m.
   for (SignedMatrixIndexT i = iter - 1; i >= oldest; --i) {
     const SignedMatrixIndexT slot = i % history;
     alpha(slot) = rho_(slot) * VecVec(S(i), q);  // \alpha_i <-- \rho_i s_i^T q.
     q.AddVec(-alpha(slot), Y(i));                // q <-- q - \alpha_i y_i.
   }

   r.SetZero();
   r.AddVecVec(1.0, H_, q, 0.0);  // r <-- H_k^{(0)} q.

   // Second loop: i = k - m, k - m + 1, ..., k - 1.
   for (SignedMatrixIndexT i = oldest; i < iter; ++i) {
     const SignedMatrixIndexT slot = i % history;
     const Real beta = rho_(slot) * VecVec(Y(i), r);  // \beta <-- \rho_i y_i^T r.
     r.AddVec(alpha(slot) - beta, S(i));  // r <-- r + s_i (\alpha_i - \beta).
   }

   {  // Sanity check; -r will be the step direction.
     const Real dot = VecVec(gradient, r);
     const bool wrong_sign = opts_.minimize ? (dot < 0) : (dot > 0);
     if (wrong_sign)
       KALDI_WARN << "Step direction has the wrong sign!  Routine will fail.";
   }

   // Back in Algorithm 7.5.  With p_k = -r and the initial guess \alpha_k = 1,
   // the first trial point is x_k + p_k; r lives in new_x_, so negate it and
   // add x_.  Backtracking, or acceptance of this point, is handled by
   // StepSizeIteration(); we are still inside iteration k.
   new_x_.Scale(-1.0);
   new_x_.AddVec(1.0, x_);

   // deriv_ was used as scratch above; make it hold the gradient at x_ again.
   if (&deriv_ != &gradient)
     deriv_.CopyFromVec(gradient);
   f_ = function_value;

   // Reset the line-search state for the new step.
   d_ = opts_.d;
   last_failure_type_ = kNone;
   num_wolfe_i_failures_ = 0;
   num_wolfe_ii_failures_ = 0;
   computation_state_ = kWithinStep;
 }


 // Tries to accept the trial point new_x_ as x_{k+1}.
 //   function_value  objective at new_x_.
 //   gradient        gradient at new_x_.
 // Returns true if the step was recorded and k_ advanced.  Returns false,
 // leaving x_, f_ and k_ untouched, when the step is degenerate; the caller
 // is then expected to restart.
 template<typename Real>
 bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
                                      const VectorBase<Real> &gradient) {
   // Write s_k = x_{k+1} - x_k and y_k = \nabla f_{k+1} - \nabla f_k straight
   // into their history slots.
   SubVector<Real> step = S(k_), grad_change = Y(k_);
   step.CopyFromVec(new_x_);
   step.AddVec(-1.0, x_);
   grad_change.CopyFromVec(gradient);
   grad_change.AddVec(-1.0, deriv_);

   // The division below can produce inf or NaN.  That is not by itself an
   // error: with a zero step or zero gradient change the iterations should
   // stop, and that decision is left to the calling code.
   const Real curvature = VecVec(grad_change, step);
   rho_(k_ % opts_.m) = 1.0 / curvature;
   const Real step_norm = step.Norm(2.0);

   // y^T s must be safely positive when minimizing, safely negative when
   // maximizing.
   bool bad_curvature;
   if (opts_.minimize) bad_curvature = (curvature <= 1.0e-20);
   else bad_curvature = (curvature >= -1.0e-20);
   if (bad_curvature || step_norm == 0.0)
     return false;  // Forces a restart.

   KALDI_VLOG(3) << "Accepted step; length was " << step_norm
                 << ", prod was " << curvature;
   RecordStepLength(step_norm);

   // Commit x_{k+1} and f_{k+1}.
   f_ = function_value;
   x_.CopyFromVec(new_x_);
   k_++;
   return true;
 }

 // Appends s to the list of recent step lengths.  If the list then holds more
 // than opts_.avg_step_length entries, the oldest one is dropped.
 template<typename Real>
 void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
   step_lengths_.push_back(s);
   const size_t window = static_cast<size_t>(opts_.avg_step_length);
   if (step_lengths_.size() > window)
     step_lengths_.erase(step_lengths_.begin());
 }


 // Restarts L-BFGS from the point x.
 //   x         point to restart from; may be the member x_ itself.
 //   f         objective value at x.
 //   gradient  gradient at x.
 // The iteration counter goes back to zero, which discards the stored
 // history, but H_ keeps whatever value it had.
 template<typename Real>
 void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
                                   Real f,
                                   const VectorBase<Real> &gradient) {
   // The move x_ -> x is recorded as a step even when its length is zero;
   // convergence detection depends on that.
   Vector<Real> &displacement = temp_;
   displacement.CopyFromVec(x);
   displacement.AddVec(-1.0, x_);
   RecordStepLength(displacement.Norm(2.0));

   k_ = 0;

   const bool x_is_own_state = (&x_ == &x);
   if (!x_is_own_state)
     x_.CopyFromVec(x);
   new_x_.CopyFromVec(x);
   f_ = f;

   computation_state_ = kBeforeStep;
   ComputeNewDirection(f, gradient);
 }

 // One iteration of the line search for step k_.
 //   function_value  objective at the trial point new_x_.
 //   gradient        gradient at the trial point new_x_.
 // Tests the two Wolfe conditions at new_x_ and then does one of:
 //   - accept the trial point (and compute the next search direction),
 //   - shrink or grow the step, leaving a new trial point in new_x_, or
 //   - restart L-BFGS from the better of x_ and new_x_.
 template<typename Real>
 void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
                                             const VectorBase<Real> &gradient) {
   KALDI_VLOG(3) << "In step size iteration, function value changed "
                 << f_ << " to " << function_value;

   // We're in some part of the backtracking, and the user is providing
   // the objective function value and gradient.
   // We're checking two conditions: Wolfe i) [the Armijo rule] and
   // Wolfe ii).

   // The Armijo rule (when minimizing) is:
   // f(x_k + \alpha_k p_k) <= f(x_k) + c_1 \alpha_k p_k^T \nabla f(x_k), where
   //  \nabla means the derivative.
   // Below, "temp" is the RHS of this equation, where (\alpha_k p_k) equals
   // (new_x_ - x_); we don't store \alpha or p_k separately, they are implicit
   // as the difference new_x_ - x_.

   // Below, pf is \alpha_k p_k^T \nabla f(x_k).
   Real pf = VecVec(new_x_, deriv_) - VecVec(x_, deriv_);
   Real temp = f_ + opts_.c1 * pf;

   // When maximizing, the inequality is reversed.
   bool wolfe_i_ok;
   if (opts_.minimize) wolfe_i_ok = (function_value <= temp);
   else wolfe_i_ok = (function_value >= temp);

   // Wolfe condition ii) can be written as:
   //  p_k^T \nabla f(x_k + \alpha_k p_k) >= c_2 p_k^T \nabla f(x_k)
   // p2f equals \alpha_k p_k^T \nabla f(x_k + \alpha_k p_k), where
   // (\alpha_k p_k^T) is (new_x_ - x_).
   // Note that in our version of Wolfe condition (ii) we have an extra
   // factor alpha, which doesn't affect anything.
   Real p2f = VecVec(new_x_, gradient) - VecVec(x_, gradient);
   // (Disabled tolerance term, kept for reference:)
   //eps = (sizeof(Real) == 4 ? 1.0e-05 : 1.0e-10) *
   //(std::abs(p2f) + std::abs(pf));
   bool wolfe_ii_ok;
   if (opts_.minimize) wolfe_ii_ok = (p2f >= opts_.c2 * pf);
   else wolfe_ii_ok = (p2f <= opts_.c2 * pf);

   enum { kDecrease, kNoChange } d_action; // What to do with d_: leave it alone,
   // or take the square root.
   enum { kAccept, kDecreaseStep, kIncreaseStep, kRestart } iteration_action;
   // What we'll do in the overall iteration: accept this value, DecreaseStep
   // (reduce the step size), IncreaseStep (increase the step size), or kRestart
   // (set k back to zero).  Generally when we can't get both conditions to be
   // true within a reasonable period of time, it makes sense to restart,
   // because probably we've almost converged and got into numerical issues;
   // from here we'd just produce NaNs.  Restarting is a safe thing to do and
   // the outer code will quickly detect convergence.

   d_action = kNoChange; // the default.

   if (wolfe_i_ok && wolfe_ii_ok) {
     iteration_action = kAccept;
     d_action = kNoChange; // actually doesn't matter, it'll get reset.
   } else if (!wolfe_i_ok) {
     // If wolfe i) [the Armijo rule] failed then we went too far (or are
     // meeting numerical problems).
     if (last_failure_type_ == kWolfeII) { // Last time we failed it was Wolfe ii).
       // When we switch between them we decrease d.
       d_action = kDecrease;
     }
     iteration_action = kDecreaseStep;
     last_failure_type_ = kWolfeI;
     num_wolfe_i_failures_++;
   } else if (!wolfe_ii_ok) {
     // Curvature condition failed -> we did not go far enough.
     if (last_failure_type_ == kWolfeI) // switching between wolfe i and ii failures->
       d_action = kDecrease; // decrease value of d.
     iteration_action = kIncreaseStep;
     last_failure_type_ = kWolfeII;
     num_wolfe_ii_failures_++;
   }

   // Test whether we've been switching too many times between wolfe i) and ii)
   // failures, or overall have an excessive number of failures.  We just give up
   // and restart L-BFGS.  Probably we've almost converged.
   if (num_wolfe_i_failures_ + num_wolfe_ii_failures_ >
       opts_.max_line_search_iters) {
     KALDI_VLOG(2) << "Too many steps in line search -> restarting.";
     iteration_action = kRestart;
   }

   // Taking the square root of d_ makes subsequent step-size changes smaller.
   if (d_action == kDecrease)
     d_ = std::sqrt(d_);

   // Note: kRestart is logged as "reject".
   KALDI_VLOG(3) << "d = " << d_ << ", iter = " << k_ << ", action = "
                 << (iteration_action == kAccept ? "accept" :
                     (iteration_action == kDecreaseStep ? "decrease" :
                      (iteration_action == kIncreaseStep ? "increase" :
                       "reject")));

   // Note: even if iteration_action != Restart at this point,
   // some code below may set it to Restart.
   if (iteration_action == kAccept) {
     if (AcceptStep(function_value, gradient)) { // If we did
       // not detect a problem while accepting the step..
       computation_state_ = kBeforeStep;
       ComputeNewDirection(function_value, gradient);
     } else {
       KALDI_VLOG(2) << "Restarting L-BFGS computation; problem found while "
                     << "accepting step.";
       iteration_action = kRestart; // We'll have to restart now.
     }
   }
   if (iteration_action == kDecreaseStep || iteration_action == kIncreaseStep) {
     // Rescale the step about x_:
     // new_x_ <-- x_ + scale * (new_x_ - x_), with scale = 1/d_ to shrink the
     // step or d_ to grow it.  temp_ keeps the old trial point for comparison.
     Real scale = (iteration_action == kDecreaseStep ? 1.0 / d_ : d_);
     temp_.CopyFromVec(new_x_);
     new_x_.Scale(scale);
     new_x_.AddVec(1.0 - scale, x_);
     if (new_x_.ApproxEqual(temp_, 0.0)) {
       // Value of new_x_ did not change at all --> we must restart.
       KALDI_VLOG(3) << "Value of x did not change, when taking step; "
                     << "will restart computation.";
       iteration_action = kRestart;
     }
     if (new_x_.ApproxEqual(temp_, 1.0e-08) &&
         std::abs(f_ - function_value) < 1.0e-08 *
         std::abs(f_) && iteration_action == kDecreaseStep) {
       // This is common and due to roundoff.
       KALDI_VLOG(3) << "We appear to be backtracking while we are extremely "
                     << "close to the old value; restarting.";
       iteration_action = kRestart;
     }

     // NOTE(review): the matching counter was already incremented above when
     // the Wolfe test failed, so a failed trial that reaches this point is
     // counted twice against opts_.max_line_search_iters -- confirm whether
     // that is intended.  The else-branch also runs when the action has just
     // been changed to kRestart; Restart() ends up resetting both counters via
     // ComputeNewDirection().
     if (iteration_action == kDecreaseStep) {
       num_wolfe_i_failures_++;
       last_failure_type_ = kWolfeI;
     } else {
       num_wolfe_ii_failures_++;
       last_failure_type_ = kWolfeII;
     }
   }
   if (iteration_action == kRestart) {
     // We want to restart the computation.  If the objf at new_x_ is
     // better than it was at x_, we'll start at new_x_, else at x_.
     bool use_newx;
     if (opts_.minimize) use_newx = (function_value < f_);
     else use_newx = (function_value > f_);
     KALDI_VLOG(3) << "Restarting computation.";
     if (use_newx) Restart(new_x_, function_value, gradient);
     else Restart(x_, f_, deriv_);
   }
 }

 // Feeds the optimizer the objective value and gradient evaluated at the
 // current trial point new_x_.
 //   function_value  objective at new_x_.
 //   gradient        gradient at new_x_.
 // Keeps track of the best point seen so far, then either starts a new step or
 // continues the line search of the step in progress.
 template<typename Real>
 void OptimizeLbfgs<Real>::DoStep(Real function_value,
                                  const VectorBase<Real> &gradient) {
   bool is_new_best;
   if (opts_.minimize) is_new_best = (function_value < best_f_);
   else is_new_best = (function_value > best_f_);
   if (is_new_best) {
     best_x_.CopyFromVec(new_x_);
     best_f_ = function_value;
   }

   if (computation_state_ != kBeforeStep) {
     // A step is in progress (kWithinStep): continue its line search.
     StepSizeIteration(function_value, gradient);
     return;
   }
   ComputeNewDirection(function_value, gradient);
 }

 // Variant of DoStep() in which the caller also supplies a diagonal
 // approximation to the second derivative at the current trial point.
 //   function_value         objective at new_x_.
 //   gradient               gradient at new_x_.
 //   diag_approx_2nd_deriv  diagonal second-derivative approximation.  It must
 //                          have the same dimension as x, and be strictly
 //                          positive when minimizing, strictly negative when
 //                          maximizing.
 // Its element-wise inverse is stored in H_ and used as the initial inverse
 // Hessian in place of the automatically chosen scaling.
 template<typename Real>
 void OptimizeLbfgs<Real>::DoStep(Real function_value,
                                  const VectorBase<Real> &gradient,
                                  const VectorBase<Real> &diag_approx_2nd_deriv) {
   KALDI_ASSERT(diag_approx_2nd_deriv.Dim() == x_.Dim());
   if (opts_.minimize) {
     KALDI_ASSERT(diag_approx_2nd_deriv.Min() > 0.0);
   } else {
     KALDI_ASSERT(diag_approx_2nd_deriv.Max() < 0.0);
   }
   H_was_set_ = true;
   // H_ is otherwise only sized inside ComputeHifNeeded(), so on the very first
   // call it may still be empty; CopyFromVec() needs matching dimensions.
   if (H_.Dim() != diag_approx_2nd_deriv.Dim())
     H_.Resize(diag_approx_2nd_deriv.Dim());
   H_.CopyFromVec(diag_approx_2nd_deriv);
   H_.InvertElements();
   // The two-argument overload does the best-point bookkeeping and runs the
   // actual step logic.
   DoStep(function_value, gradient);
 }

 // Returns the point with the best objective value recorded so far.  If
 // objf_value is non-NULL, that best objective value is written to it.
 template<typename Real>
 const VectorBase<Real>&
 OptimizeLbfgs<Real>::GetValue(Real *objf_value) const {
   if (objf_value)
     *objf_value = best_f_;
   return best_x_;
 }

 // To compute the step size alpha we minimize the quadratic
 // f(x) = 0.5 x^T A x - x^T b along the direction p_k.  Its gradient is
 // d/dx f(x) = A x - b, which at x_k is the residual r_k defined below.

 // Notation based on Sec. 5.1 of Nocedal and Wright
 // Computation based on Alg. 5.2 of Nocedal and Wright (Pg. 112)
 // Notation (replicated for convenience):
 //  To solve Ax=b for x
 //  k : current iteration
 //  x_k : estimate of x (at iteration k)
 //  r_k : residual ( r_k \eqdef A x_k - b )
 //  \alpha_k : step size
 //  p_k : A-conjugate direction
 //  \beta_k  : coefficient used in A-conjugate direction computation for next
 //  iteration
 //
 //  Algo.  LinearCG(A,b,x_0)
 //  ========================
 //  r_0 = Ax_0 - b
 //  p_0 = -r_0
 //  k = 0
 //
 //  while r_k != 0
 //    \alpha_k = (r_k^T  r_k) / (p_k^T  A  p_k)
 //    x_{k+1} = x_k + \alpha_k  p_k;
 //    r_{k+1} = r_k + \alpha_k  A  p_k
 //    \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_k}
 //    p_{k+1} = -r_{k+1} + \beta_{k+1} p_k
 //    k = k + 1
 //  end

 // Approximately solves A x = b by linear conjugate gradient descent.
 //   opts  options; this function reads max_iters, max_error and
 //         recompute_residual_factor (which must be <= 1.0).
 //   A     the symmetric matrix of the system.
 //   b     the right-hand side.
 //   x     in: the initial guess x_0; out: the computed solution.
 // Returns the value of the iteration counter when the loop ended; this is 0
 // if the initial guess already satisfied the error tolerance.
 // If the squared residual ends up worse than it was initially, *x is reset to
 // the initial guess and SolveQuadraticProblem() is used instead.
 template<class Real>
 int32 LinearCgd(const LinearCgdOptions &opts,
                 const SpMatrix<Real> &A,
                 const VectorBase<Real> &b,
                 VectorBase<Real> *x) {
   // Initialize the variables
   //
   int32 M = A.NumCols();

   // All work vectors are rows of a single matrix.
   Matrix<Real> storage(4, M);
   SubVector<Real> r(storage, 0), p(storage, 1), Ap(storage, 2), x_orig(storage, 3);
   p.CopyFromVec(b);
   p.AddSpVec(-1.0, A, *x, 1.0);  // p_0 = b - A x_0
   // NOTE(review): this relies on `storage` starting out zeroed -- presumably
   // the Matrix constructor's default; confirm.
   r.AddVec(-1.0, p);  // r_0 = - p_0
   x_orig.CopyFromVec(*x);  // in case of failure.

   Real r_cur_norm_sq = VecVec(r, r),
       r_initial_norm_sq = r_cur_norm_sq,
       r_recompute_norm_sq = r_cur_norm_sq;

   KALDI_VLOG(5) << "In linear CG: initial norm-square of residual = "
                 << r_initial_norm_sq;

   KALDI_ASSERT(opts.recompute_residual_factor <= 1.0);
   Real max_error_sq = std::max<Real>(opts.max_error * opts.max_error,
                                      std::numeric_limits<Real>::min()),
       residual_factor = opts.recompute_residual_factor *
                         opts.recompute_residual_factor,
       inv_residual_factor = 1.0 / residual_factor;

   // If the initial guess is already within tolerance there is nothing to do.
   // In particular, with a zero residual p_0 is zero, so the step size below
   // would be 0/0 and would fill *x with NaN.
   if (r_cur_norm_sq <= max_error_sq)
     return 0;

   // Note: although from a mathematical point of view the method should converge
   // after M iterations, in practice (due to roundoff) it does not always
   // converge to good precision after that many iterations so we let the maximum
   // be M + 5 instead.  A negative opts.max_iters never equals k, so it imposes
   // no additional limit.
   int32 k = 0;
   for (; k < M + 5 && k != opts.max_iters; k++) {
     // Note: we'll break from this loop if we converge sooner due to
     // max_error.
     Ap.AddSpVec(1.0, A, p, 0.0);  // Ap = A p

     // Below is how the code used to look.
     // // next line: \alpha_k = (r_k^T r_k) / (p_k^T A p_k)
     // Real alpha = r_cur_norm_sq / VecVec(p, Ap);
     //
     // We changed r_cur_norm_sq below to -VecVec(p, r).  Although this is
     // slightly less efficient, it seems to make the algorithm dramatically more
     // robust.  Note that -p^T r is the mathematically more natural quantity to
     // use here, that corresponds to minimizing along that direction... r^T r is
     // recommended in Nocedal and Wright only as a kind of optimization as it is
     // supposed to be the same as -p^T r and we already have it computed.
     Real alpha = -VecVec(p, r) / VecVec(p, Ap);

     // next line: x_{k+1} = x_k + \alpha_k p_k;
     x->AddVec(alpha, p);
     // next line: r_{k+1} = r_k + \alpha_k A p_k
     r.AddVec(alpha, Ap);
     Real r_next_norm_sq = VecVec(r, r);

     if (r_next_norm_sq < residual_factor * r_recompute_norm_sq ||
         r_next_norm_sq > inv_residual_factor * r_recompute_norm_sq) {

       // Recompute the residual from scratch if the residual norm has decreased
       // a lot; this costs an extra matrix-vector multiply, but helps keep the
       // residual accurate.
       // Also do the same if the residual norm has increased a lot since
       // the last time we recomputed... this shouldn't happen often, but
       // it can indicate bad stuff is happening.

       // r_{k+1} = A x_{k+1} - b
       r.AddSpVec(1.0, A, *x, 0.0);
       r.AddVec(-1.0, b);
       r_next_norm_sq = VecVec(r, r);
       r_recompute_norm_sq = r_next_norm_sq;

       KALDI_VLOG(5) << "In linear CG: recomputing residual.";
     }
     KALDI_VLOG(5) << "In linear CG: k = " << k
                   << ", r_next_norm_sq = " << r_next_norm_sq;
     // Check if converged.
     if (r_next_norm_sq <= max_error_sq) {
       // Record the residual of the solution we are returning, so that the
       // "got worse" test after the loop looks at this iterate rather than
       // the previous one.
       r_cur_norm_sq = r_next_norm_sq;
       break;
     }

     // next line: \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_k}
     Real beta_next = r_next_norm_sq / r_cur_norm_sq;
     // next lines: p_{k+1} = -r_{k+1} + \beta_{k+1} p_k, computed in place.
     p.Scale(beta_next);
     p.AddVec(-1.0, r);
     r_cur_norm_sq = r_next_norm_sq;
   }

   // note: the first element of the && is only there to save compute.
   // the residual r is A x - b, and r_cur_norm_sq and r_initial_norm_sq are
   // of the form r * r, so it's clear that b * b has the right dimension to
   // compare with the residual.
   if (r_cur_norm_sq > r_initial_norm_sq &&
       r_cur_norm_sq > r_initial_norm_sq + 1.0e-10 * VecVec(b, b)) {
     KALDI_WARN << "Doing linear CGD in dimension " << A.NumRows() << ", after " << k
               << " iterations the squared residual has got worse, "
                << r_cur_norm_sq << " > " << r_initial_norm_sq
                << ".  Will do an exact optimization.";
     SolverOptions solver_opts("called-from-linearCGD");
     x->CopyFromVec(x_orig);
     SolveQuadraticProblem(A, b, solver_opts, x);
   }
   return k;
 }

 // Explicit instantiations of the templates defined in this file, for
 // Real = float and Real = double.

 // Instantiate the class for float and double.
 template
 class OptimizeLbfgs<float>;
 template
 class OptimizeLbfgs<double>;


 // Instantiate LinearCgd for float.
 template
 int32 LinearCgd<float>(const LinearCgdOptions &opts,
                       const SpMatrix<float> &A, const VectorBase<float> &b,
                       VectorBase<float> *x);

 // Instantiate LinearCgd for double.
 template
 int32 LinearCgd<double>(const LinearCgdOptions &opts,
                         const SpMatrix<double> &A, const VectorBase<double> &b,
                         VectorBase<double> *x);

 } // end namespace kaldi
kaldi::OptimizeLbfgs::H_was_set_
bool H_was_set_
Definition: optimization.h:206

kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20

kaldi::OptimizeLbfgs::x_
Vector< Real > x_
Definition: optimization.h:210

kaldi::OptimizeLbfgs::DoStep
void DoStep(Real function_value, const VectorBase< Real > &gradient)
The user calls this function to provide the class with the function and gradient info at the point Ge...
Definition: optimization.cc:383

kaldi::SpMatrix
Packed symmetric matrix class.
Definition: matrix-common.h:62

kaldi::OptimizeLbfgs::kWolfeI
Definition: optimization.h:223

kaldi::SolverOptions
This class describes the options for maximizing various quadratic objective functions.
Definition: sp-matrix.h:443

kaldi::OptimizeLbfgs::step_lengths_
std::vector< Real > step_lengths_
Definition: optimization.h:232

sp-matrix.h

kaldi::SolveQuadraticProblem
double SolveQuadraticProblem(const SpMatrix< double > &H, const VectorBase< double > &g, const SolverOptions &opts, VectorBase< double > *x)
Definition: sp-matrix.cc:635

kaldi::OptimizeLbfgs::computation_state_
ComputationState computation_state_
Definition: optimization.h:205

kaldi::OptimizeLbfgs::kBeforeStep
Definition: optimization.h:174

kaldi::OptimizeLbfgs::kWithinStep
Definition: optimization.h:175

kaldi::OptimizeLbfgs::Restart
void Restart(const VectorBase< Real > &x, Real function_value, const VectorBase< Real > &gradient)
Definition: optimization.cc:215

kaldi::OptimizeLbfgs::d_
Real d_
Definition: optimization.h:218

KALDI_ISINF
#define KALDI_ISINF
Definition: kaldi-math.h:73

kaldi::LbfgsOptions::max_line_search_iters
int max_line_search_iters
Definition: optimization.h:103

kaldi::LbfgsOptions::first_step_learning_rate
float first_step_learning_rate
Definition: optimization.h:87

kaldi::int32
kaldi::int32 int32
Definition: online-tcp-source.cc:27

kaldi::OptimizeLbfgs::GetValue
const VectorBase< Real > & GetValue(Real *objf_value=NULL) const
This returns the value of the variable x that has the best objective function so far, and the corresponding objective function value if requested.
Definition: optimization.cc:416

kaldi::Matrix
A class for storing matrices.
Definition: kaldi-matrix.h:823

kaldi::LinearCgdOptions::recompute_residual_factor
BaseFloat recompute_residual_factor
Definition: optimization.h:45

kaldi::VectorBase::Min
Real Min() const
Returns the minimum value of any element, or +infinity for the empty vector.
Definition: kaldi-vector.cc:614

kaldi::PackedMatrix::NumRows
MatrixIndexT NumRows() const
Definition: packed-matrix.h:104

kaldi::OptimizeLbfgs::k_
SignedMatrixIndexT k_
Definition: optimization.h:202

kaldi::VectorBase::Norm
Real Norm(Real p) const
Compute the p-th norm of the vector.
Definition: kaldi-vector.cc:512

optimization.h

kaldi::OptimizeLbfgs::temp_
Vector< Real > temp_
Definition: optimization.h:215

kaldi::LinearCgd< float >
template int32 LinearCgd< float >(const LinearCgdOptions &opts, const SpMatrix< float > &A, const VectorBase< float > &b, VectorBase< float > *x)

kaldi::LbfgsOptions::d
float d
Definition: optimization.h:101

kaldi::VectorBase::AddVecVec
void AddVecVec(Real alpha, const VectorBase< Real > &v, const VectorBase< Real > &r, Real beta)
Add element-by-element product of vectors:
Definition: kaldi-vector.cc:1000

kaldi::OptimizeLbfgs::OptimizeLbfgs
OptimizeLbfgs(const VectorBase< Real > &x, const LbfgsOptions &opts)
Initializer takes the starting value of x.
Definition: optimization.cc:35

kaldi::OptimizeLbfgs::last_failure_type_
enum kaldi::OptimizeLbfgs::@0 last_failure_type_

kaldi::OptimizeLbfgs::f_
Real f_
Definition: optimization.h:216

kaldi::VectorBase::CopyFromVec
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
Definition: kaldi-vector.cc:228

kaldi::PackedMatrix::NumCols
MatrixIndexT NumCols() const
Definition: packed-matrix.h:105

kaldi::MatrixIndexT
int32 MatrixIndexT
Definition: matrix-common.h:98

kaldi::OptimizeLbfgs::S
SubVector< Real > S(MatrixIndexT i)
Definition: optimization.h:184

kaldi::OptimizeLbfgs::kWolfeII
Definition: optimization.h:223

kaldi::OptimizeLbfgs::best_f_
Real best_f_
Definition: optimization.h:217

kaldi::OptimizeLbfgs::deriv_
Vector< Real > deriv_
Definition: optimization.h:214

kaldi::OptimizeLbfgs::new_x_
Vector< Real > new_x_
Definition: optimization.h:211

kaldi::LbfgsOptions::first_step_length
float first_step_length
Definition: optimization.h:90

rnnlm::n
struct rnnlm::@11::@12 n

kaldi::OptimizeLbfgs::H_
Vector< Real > H_
Definition: optimization.h:226

kaldi::VectorBase::Max
Real Max() const
Returns the maximum value of any element, or -infinity for the empty vector.
Definition: kaldi-vector.cc:574

kaldi::OptimizeLbfgs::num_wolfe_ii_failures_
int num_wolfe_ii_failures_
Definition: optimization.h:222

kaldi::OptimizeLbfgs::StepSizeIteration
void StepSizeIteration(Real function_value, const VectorBase< Real > &gradient)
Definition: optimization.cc:238

kaldi::OptimizeLbfgs
Definition: optimization.h:121

KALDI_WARN
#define KALDI_WARN
Definition: kaldi-error.h:150

kaldi::LinearCgdOptions
Definition: optimization.h:37

kaldi::OptimizeLbfgs::opts_
LbfgsOptions opts_
Definition: optimization.h:201

kaldi::VectorBase::Dim
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64

kaldi::LbfgsOptions::minimize
bool minimize
Definition: optimization.h:85

kaldi::OptimizeLbfgs::M
MatrixIndexT M()
Definition: optimization.h:180

kaldi::OptimizeLbfgs::RecordStepLength
void RecordStepLength(Real s)
Definition: optimization.cc:207

kaldi::LinearCgdOptions::max_iters
int32 max_iters
Definition: optimization.h:38

kaldi::OptimizeLbfgs::RecentStepLength
Real RecentStepLength() const
Returns the average magnitude of the last n steps (but not more than the number we have stored)...
Definition: optimization.cc:55

kaldi::OptimizeLbfgs::num_wolfe_i_failures_
int num_wolfe_i_failures_
Definition: optimization.h:221

kaldi::OptimizeLbfgs::kNone
Definition: optimization.h:223

kaldi::LinearCgd< double >
template int32 LinearCgd< double >(const LinearCgdOptions &opts, const SpMatrix< double > &A, const VectorBase< double > &b, VectorBase< double > *x)

kaldi::LbfgsOptions::m
int m
Definition: optimization.h:86

kaldi::OptimizeLbfgs::ComputeHifNeeded
void ComputeHifNeeded(const VectorBase< Real > &gradient)
Definition: optimization.cc:70

kaldi::LinearCgdOptions::max_error
BaseFloat max_error
Definition: optimization.h:39

rnnlm::i
int i
Definition: mikolov-rnnlm-lib.cc:66

kaldi::LbfgsOptions::first_step_impr
float first_step_impr
Definition: optimization.h:94

kaldi::OptimizeLbfgs::best_x_
Vector< Real > best_x_
Definition: optimization.h:212

kaldi::OptimizeLbfgs::Y
SubVector< Real > Y(MatrixIndexT i)
Definition: optimization.h:181

kaldi::Vector
A class representing a vector.
Definition: kaldi-vector.h:406

KALDI_ISNAN
#define KALDI_ISNAN
Definition: kaldi-math.h:72

kaldi::SignedMatrixIndexT
int32 SignedMatrixIndexT
Definition: matrix-common.h:99

KALDI_ASSERT
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

kaldi::OptimizeLbfgs::ComputeNewDirection
void ComputeNewDirection(Real function_value, const VectorBase< Real > &gradient)
Definition: optimization.cc:114

kaldi::LbfgsOptions::avg_step_length
int avg_step_length
Definition: optimization.h:104

kaldi::LinearCgd
int32 LinearCgd(const LinearCgdOptions &opts, const SpMatrix< Real > &A, const VectorBase< Real > &b, VectorBase< Real > *x)
Definition: optimization.cc:453

KALDI_VLOG
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156

kaldi::LbfgsOptions::c2
float c2
Definition: optimization.h:100

kaldi::LbfgsOptions::c1
float c1
Definition: optimization.h:99

kaldi::LbfgsOptions
This is an implementation of L-BFGS.
Definition: optimization.h:84

kaldi::VectorBase
Provides a vector abstraction class.
Definition: kaldi-vector.h:41

kaldi::VectorBase::SetZero
void SetZero()
Set vector to all zeros.
Definition: kaldi-vector.cc:288

kaldi::OptimizeLbfgs::AcceptStep
bool AcceptStep(Real function_value, const VectorBase< Real > &gradient)
Definition: optimization.cc:173

kaldi::VecVec
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37

kaldi::VectorBase::AddVec
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
Definition: kaldi-vector.cc:1044

kaldi::SubVector
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501

kaldi::OptimizeLbfgs::rho_
Vector< Real > rho_
Definition: optimization.h:230

kaldi::OptimizeLbfgs::data_
Matrix< Real > data_
Definition: optimization.h:228