optimization.h
1 // matrix/optimization.h
2 
3 // Copyright 2012 Johns Hopkins University (author: Daniel Povey)
4 //
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABILITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 //
20 // (*) incorporates, with permission, FFT code from his book
21 // "Signal Processing with Lapped Transforms", Artech, 1992.
22 
23 
24 
25 #ifndef KALDI_MATRIX_OPTIMIZATION_H_
26 #define KALDI_MATRIX_OPTIMIZATION_H_
27 
28 #include "matrix/kaldi-vector.h"
29 #include "matrix/kaldi-matrix.h"
30 
31 namespace kaldi {
32 
33 
36 
37 struct LinearCgdOptions {
38  int32 max_iters; // Maximum number of iters (if >= 0).
39  BaseFloat max_error; // Maximum 2-norm of the residual A x - b (convergence
40  // test)
41  // Every time the residual 2-norm decreases by this recompute_residual_factor
42  // since the last time it was computed from scratch, recompute it from
43  // scratch. This helps to keep the computed residual accurate even in the
44  // presence of roundoff.
45  BaseFloat recompute_residual_factor;
46 
47  LinearCgdOptions(): max_iters(-1),
48  max_error(0.0),
49  recompute_residual_factor(0.01) { }
50 };
51 
52 /*
53  This function uses linear conjugate gradient descent to approximately solve
54  the system A x = b. The value of x at entry corresponds to the initial guess
55  of x. The algorithm continues until the number of iterations equals b.Dim(),
56  or until the 2-norm of (A x - b) is <= max_error, or until the number of
57  iterations equals max_iters, whichever happens sooner. It is a requirement
58  that A be positive definite.
59  It returns the number of iterations that were actually executed (this is
60  useful for testing purposes).
61 */
62 template<typename Real>
63 int32 LinearCgd(const LinearCgdOptions &opts,
64  const SpMatrix<Real> &A, const VectorBase<Real> &b,
65  VectorBase<Real> *x);
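A minimal sketch of how this solver might be invoked; the 3-dimensional system, the diagonal values and the tolerance below are invented purely for illustration, while LinearCgdOptions, SpMatrix, Vector and LinearCgd are the entities declared above.

#include "matrix/optimization.h"

// Solve A x = b for a small symmetric positive-definite A (illustrative values only).
void SolveSmallSpdSystem() {
  using namespace kaldi;
  int32 dim = 3;
  SpMatrix<double> A(dim);        // symmetric packed matrix, zero-initialized
  for (int32 i = 0; i < dim; i++)
    A(i, i) = 2.0 + i;            // strictly positive diagonal, so A is positive definite
  Vector<double> b(dim);
  b.Set(1.0);                     // right-hand side
  Vector<double> x(dim);          // value at entry is the initial guess (zero here)

  LinearCgdOptions opts;
  opts.max_error = 1.0e-06;       // stop once ||A x - b||_2 <= 1.0e-06
  int32 iters = LinearCgd(opts, A, b, &x);
  (void) iters;                   // number of CG iterations actually executed
}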
66 
67 
68 
69 
70 
71 
84 struct LbfgsOptions {
85  bool minimize; // if true, we're minimizing, else maximizing.
86  int m; // m is the number of stored vectors L-BFGS keeps.
87  float first_step_learning_rate; // The very first step of L-BFGS is
88  // like gradient descent. If you want to configure the size of that step,
89  // you can do it using this variable.
90  float first_step_length; // If this variable is >0.0, it overrides
91  // first_step_learning_rate; on the first step we choose an approximate
92  // Hessian that is the multiple of the identity that would generate this
93  // step-length, or 1.0 if the gradient is zero.
94  float first_step_impr; // If this variable is >0.0, it overrides
95  // first_step_learning_rate; on the first step we choose an approximate
96  // Hessian that is the multiple of the identity that would generate this
97  // amount of objective function improvement (assuming the "real" objf
98  // was linear).
99  float c1; // A constant in Armijo rule = Wolfe condition i)
100  float c2; // A constant in Wolfe condition ii)
101  float d; // An amount > 1.0 (default 2.0) that we initially multiply or
102  // divide the step length by, in the line search.
103  int max_line_search_iters; // after this many iters we restart L-BFGS.
104  int avg_step_length; // number of iters to avg step length over, in
105  // RecentStepLength().
106 
107  LbfgsOptions (bool minimize = true):
108  minimize(minimize),
109  m(10),
110  first_step_learning_rate(1.0),
111  first_step_length(0.0),
112  first_step_impr(0.0),
113  c1(1.0e-04),
114  c2(0.9),
115  d(2.0),
116  max_line_search_iters(50),
117  avg_step_length(4) { }
118 };
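As a concrete illustration of how these fields interact (the values below are arbitrary, chosen only for the example):

#include "matrix/optimization.h"

kaldi::LbfgsOptions MakeExampleLbfgsOptions() {
  kaldi::LbfgsOptions opts(true);  // minimizing
  opts.m = 7;                      // keep the last 7 (s_i, y_i) pairs
  opts.first_step_length = 0.1;    // overrides first_step_learning_rate on the first step
  opts.c1 = 1.0e-04;               // Wolfe condition i) constant (the default)
  opts.c2 = 0.9;                   // Wolfe condition ii) constant (the default)
  return opts;
}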
119 
120 template<typename Real>
121 class OptimizeLbfgs {
122  public:
123  /// Initializer takes the starting value of x.
124  OptimizeLbfgs(const VectorBase<Real> &x,
125  const LbfgsOptions &opts);
126 
130  const VectorBase<Real>& GetValue(Real *objf_value = NULL) const;
131 
132  /// This returns the value at which the function wants us to compute
133  /// the objective function and gradient.
134  const VectorBase<Real>& GetProposedValue() const { return new_x_; }
135 
142  Real RecentStepLength() const;
143 
150  void DoStep(Real function_value,
151  const VectorBase<Real> &gradient);
152 
157  void DoStep(Real function_value,
158  const VectorBase<Real> &gradient,
159  const VectorBase<Real> &diag_approx_2nd_deriv);
160 
161  private:
162  KALDI_DISALLOW_COPY_AND_ASSIGN(OptimizeLbfgs);
163 
164 
165  // The following variable says what stage of the computation we're at.
166  // Refer to Algorithm 7.5 (L-BFGS) of Nocedal & Wright, "Numerical
167  // Optimization", 2nd edition.
168  // kBeforeStep means we're about to do
169  // "compute p_k <-- - H_k \delta f_k" (i.e. Algorithm 7.4).
170  // kWithinStep means we're at some point within line search; note
171  // that line search is iterative so we can stay in this state more
172  // than one time on each iteration.
173  enum ComputationState {
174  kBeforeStep,
175  kWithinStep, // This means we're within the step-size computation, and
176  // have not yet done the 1st function evaluation.
177  };
178 
179  inline MatrixIndexT Dim() { return x_.Dim(); }
180  inline MatrixIndexT M() { return opts_.m; }
181  SubVector<Real> Y(MatrixIndexT i) {
182  return SubVector<Real>(data_, (i % M()) * 2); // vector y_i
183  }
184  SubVector<Real> S(MatrixIndexT i) {
185  return SubVector<Real>(data_, (i % M()) * 2 + 1); // vector s_i
186  }
187  // The following are subroutines within DoStep():
188  bool AcceptStep(Real function_value,
189  const VectorBase<Real> &gradient);
190  void Restart(const VectorBase<Real> &x,
191  Real function_value,
192  const VectorBase<Real> &gradient);
193  void ComputeNewDirection(Real function_value,
194  const VectorBase<Real> &gradient);
195  void ComputeHifNeeded(const VectorBase<Real> &gradient);
196  void StepSizeIteration(Real function_value,
197  const VectorBase<Real> &gradient);
198  void RecordStepLength(Real s);
199 
200 
201  LbfgsOptions opts_;
202  SignedMatrixIndexT k_; // Iteration number, starts from zero. Gets set back to zero
203  // when we restart.
204 
205  ComputationState computation_state_;
206  bool H_was_set_; // True if the user specified H_; if false,
207  // we'll use a heuristic to estimate it.
208 
209 
210  Vector<Real> x_; // current x.
211  Vector<Real> new_x_; // the x proposed in the line search.
212  Vector<Real> best_x_; // the x with the best objective function so far
213  // (either the same as x_ or something in the current line search.)
214  Vector<Real> deriv_; // The most recently evaluated derivative-- at x_k.
215  Vector<Real> temp_;
216  Real f_; // The function evaluated at x_k.
217  Real best_f_; // the best objective function so far.
218  Real d_; // a number d > 1.0, but during an iteration we may decrease this, when
219  // we switch between Armijo and Wolfe failures.
220 
221  int num_wolfe_i_failures_; // the num times we decreased step size.
222  int num_wolfe_ii_failures_; // the num times we increased step size.
223  enum { kWolfeI, kWolfeII, kNone } last_failure_type_; // last type of step-search
224  // failure on this iter.
225 
226  Vector<Real> H_; // Current inverse-Hessian estimate. May be computed by this class itself,
227  // or provided by the user via the second form of DoStep().
228  Matrix<Real> data_; // dimension (m*2) x dim. Even rows store
229  // gradients y_i, odd rows store steps s_i.
230  Vector<Real> rho_; // dimension m; rho_(m) = 1/(y_m^T s_m), Eq. 7.17.
231 
232  std::vector<Real> step_lengths_; // The step sizes we took on the last
233  // (up to m) iterations; these are not stored in a rotating buffer but
234  // are shifted by one each time (this is more convenient when we
235  // restart, as we keep this info past restarting).
236 
237 
238 };
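The class is typically driven in a loop by the caller: evaluate the objective function and its gradient at GetProposedValue(), report them through DoStep(), and stop once RecentStepLength() becomes small; GetValue() then returns the best point seen. A sketch along those lines, using a simple quadratic objective f(x) = 0.5 x^T x invented for illustration (the dimension, iteration cap and threshold are likewise arbitrary):

#include "matrix/optimization.h"

void MinimizeSimpleQuadratic() {
  using namespace kaldi;
  int32 dim = 5;
  Vector<double> x(dim);
  x.Set(1.0);                            // starting point (1, 1, ..., 1)

  LbfgsOptions opts(true);               // minimizing
  OptimizeLbfgs<double> lbfgs(x, opts);

  for (int32 iter = 0; iter < 100; iter++) {
    const VectorBase<double> &cur = lbfgs.GetProposedValue();
    double f = 0.5 * VecVec(cur, cur);   // f(x) = 0.5 x^T x
    Vector<double> grad(cur);            // gradient of f is x itself
    lbfgs.DoStep(f, grad);
    if (lbfgs.RecentStepLength() < 1.0e-05)
      break;                             // small recent steps: treat as converged
  }
  double best_f;
  const VectorBase<double> &best_x = lbfgs.GetValue(&best_f);
  (void) best_x;                         // best point found; best_f is its objective value
}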
239 
241 
242 
243 } // end namespace kaldi
244 
245 
246 
247 #endif
248 