optimization.h
1 // matrix/optimization.h
2 
3 // Copyright 2012 Johns Hopkins University (author: Daniel Povey)
4 //
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABILITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 //
20 // (*) incorporates, with permission, FFT code from his book
21 // "Signal Processing with Lapped Transforms", Artech, 1992.
22 
23 
24 
25 #ifndef KALDI_MATRIX_OPTIMIZATION_H_
26 #define KALDI_MATRIX_OPTIMIZATION_H_
27 
28 #include "matrix/kaldi-vector.h"
29 #include "matrix/kaldi-matrix.h"
30 
31 namespace kaldi {
32 
33 
36 
37 struct LinearCgdOptions {
38  int32 max_iters; // Maximum number of iters (if >= 0).
39  BaseFloat max_error; // Maximum 2-norm of the residual A x - b (convergence
40  // test)
41  // Every time the residual 2-norm decreases by this recompute_residual_factor
42  // since the last time it was computed from scratch, recompute it from
43  // scratch. This helps to keep the computed residual accurate even in the
44  // presence of roundoff.
45  BaseFloat recompute_residual_factor;
46 
47  LinearCgdOptions(): max_iters(-1),
48  max_error(0.0),
49  recompute_residual_factor(0.01) { }
50 };
51 
52 /*
53  This function uses linear conjugate gradient descent to approximately solve
54  the system A x = b. The value of x at entry corresponds to the initial guess
55  of x. The algorithm continues until the number of iterations equals b.Dim(),
56  or until the 2-norm of (A x - b) is <= max_error, or until the number of
57  iterations equals max_iters, whichever happens sooner. It is a requirement
58  that A be positive definite.
59  It returns the number of iterations that were actually executed (this is
60  useful for testing purposes).
61 */
62 template<typename Real>
63 int32 LinearCgd(const LinearCgdOptions &opts,
64  const SpMatrix<Real> &A, const VectorBase<Real> &b,
65  VectorBase<Real> *x);
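A minimal sketch of how this solver might be invoked; the 3-dimensional system, the diagonal values and the tolerance below are invented purely for illustration, while LinearCgdOptions, SpMatrix, Vector and LinearCgd are the entities declared above.

#include "matrix/optimization.h"

// Solve A x = b for a small symmetric positive-definite A (illustrative values only).
void SolveSmallSpdSystem() {
  using namespace kaldi;
  int32 dim = 3;
  SpMatrix<double> A(dim);        // symmetric packed matrix, zero-initialized
  for (int32 i = 0; i < dim; i++)
    A(i, i) = 2.0 + i;            // strictly positive diagonal, so A is positive definite
  Vector<double> b(dim);
  b.Set(1.0);                     // right-hand side
  Vector<double> x(dim);          // value at entry is the initial guess (zero here)

  LinearCgdOptions opts;
  opts.max_error = 1.0e-06;       // stop once ||A x - b||_2 <= 1.0e-06
  int32 iters = LinearCgd(opts, A, b, &x);
  (void) iters;                   // number of CG iterations actually executed
}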
66 
67 
68 
69 
70 
71 
84 struct LbfgsOptions {
85  bool minimize; // if true, we're minimizing, else maximizing.
86  int m; // m is the number of stored vectors L-BFGS keeps.
87  float first_step_learning_rate; // The very first step of L-BFGS is
88  // like gradient descent. If you want to configure the size of that step,
89  // you can do it using this variable.
90  float first_step_length; // If this variable is >0.0, it overrides
91  // first_step_learning_rate; on the first step we choose an approximate
92  // Hessian that is the multiple of the identity that would generate this
93  // step-length, or 1.0 if the gradient is zero.
94  float first_step_impr; // If this variable is >0.0, it overrides
95  // first_step_learning_rate; on the first step we choose an approximate
96  // Hessian that is the multiple of the identity that would generate this
97  // amount of objective function improvement (assuming the "real" objf
98  // was linear).
99  float c1; // A constant in Armijo rule = Wolfe condition i)
100  float c2; // A constant in Wolfe condition ii)
101  float d; // An amount > 1.0 (default 2.0) that we initially multiply or
102  // divide the step length by, in the line search.
103  int max_line_search_iters; // after this many iters we restart L-BFGS.
104  int avg_step_length; // number of iters to avg step length over, in
105  // RecentStepLength().
106 
107  LbfgsOptions (bool minimize = true):
108  minimize(minimize),
109  m(10),
110  first_step_learning_rate(1.0),
111  first_step_length(0.0),
112  first_step_impr(0.0),
113  c1(1.0e-04),
114  c2(0.9),
115  d(2.0),
116  max_line_search_iters(50),
117  avg_step_length(4) { }
118 };
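As a concrete illustration of how these fields interact (the values below are arbitrary, chosen only for the example):

#include "matrix/optimization.h"

kaldi::LbfgsOptions MakeExampleLbfgsOptions() {
  kaldi::LbfgsOptions opts(true);  // minimizing
  opts.m = 7;                      // keep the last 7 (s_i, y_i) pairs
  opts.first_step_length = 0.1;    // overrides first_step_learning_rate on the first step
  opts.c1 = 1.0e-04;               // Wolfe condition i) constant (the default)
  opts.c2 = 0.9;                   // Wolfe condition ii) constant (the default)
  return opts;
}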
119 
120 template<typename Real>
121 class OptimizeLbfgs {
122  public:
123  /// Initializer takes the starting value of x.
124  OptimizeLbfgs(const VectorBase<Real> &x,
125  const LbfgsOptions &opts);
126 
130  const VectorBase<Real>& GetValue(Real *objf_value = NULL) const;
131 
132  /// This returns the value at which the function wants us to compute
133  /// the objective function and gradient.
134  const VectorBase<Real>& GetProposedValue() const { return new_x_; }
135 
142  Real RecentStepLength() const;
143 
150  void DoStep(Real function_value,
151  const VectorBase<Real> &gradient);
152 
157  void DoStep(Real function_value,
158  const VectorBase<Real> &gradient,
159  const VectorBase<Real> &diag_approx_2nd_deriv);
160 
161  private:
162  KALDI_DISALLOW_COPY_AND_ASSIGN(OptimizeLbfgs);
163 
164 
165  // The following variable says what stage of the computation we're at.
166  // Refer to Algorithm 7.5 (L-BFGS) of Nocedal & Wright, "Numerical
167  // Optimization", 2nd edition.
168  // kBeforeStep means we're about to do
169  // "compute p_k <-- - H_k \delta f_k" (i.e. Algorithm 7.4).
170  // kWithinStep means we're at some point within line search; note
171  // that line search is iterative so we can stay in this state more
172  // than one time on each iteration.
173  enum ComputationState {
174  kBeforeStep,
175  kWithinStep, // This means we're within the step-size computation, and
176  // have not yet done the 1st function evaluation.
177  };
178 
179  inline MatrixIndexT Dim() { return x_.Dim(); }
180  inline MatrixIndexT M() { return opts_.m; }
181  SubVector<Real> Y(MatrixIndexT i) {
182  return SubVector<Real>(data_, (i % M()) * 2); // vector y_i
183  }
184  SubVector<Real> S(MatrixIndexT i) {
185  return SubVector<Real>(data_, (i % M()) * 2 + 1); // vector s_i
186  }
187  // The following are subroutines within DoStep():
188  bool AcceptStep(Real function_value,
189  const VectorBase<Real> &gradient);
190  void Restart(const VectorBase<Real> &x,
191  Real function_value,
192  const VectorBase<Real> &gradient);
193  void ComputeNewDirection(Real function_value,
194  const VectorBase<Real> &gradient);
195  void ComputeHifNeeded(const VectorBase<Real> &gradient);
196  void StepSizeIteration(Real function_value,
197  const VectorBase<Real> &gradient);
198  void RecordStepLength(Real s);
199 
200 
201  LbfgsOptions opts_;
202  SignedMatrixIndexT k_; // Iteration number, starts from zero. Gets set back to zero
203  // when we restart.
204 
205  ComputationState computation_state_;
206  bool H_was_set_; // True if the user specified H_; if false,
207  // we'll use a heuristic to estimate it.
208 
209 
210  Vector<Real> x_; // current x.
211  Vector<Real> new_x_; // the x proposed in the line search.
212  Vector<Real> best_x_; // the x with the best objective function so far
213  // (either the same as x_ or something in the current line search.)
214  Vector<Real> deriv_; // The most recently evaluated derivative-- at x_k.
215  Vector<Real> temp_;
216  Real f_; // The function evaluated at x_k.
217  Real best_f_; // the best objective function so far.
218  Real d_; // a number d > 1.0, but during an iteration we may decrease this, when
219  // we switch between Armijo and Wolfe failures.
220 
221  int num_wolfe_i_failures_; // the num times we decreased step size.
222  int num_wolfe_ii_failures_; // the num times we increased step size.
223  enum { kWolfeI, kWolfeII, kNone } last_failure_type_; // last type of step-search
224  // failure on this iter.
225 
226  Vector<Real> H_; // Current inverse-Hessian estimate. May be computed by this class itself,
227  // or provided by the user via the second form of DoStep().
228  Matrix<Real> data_; // dimension (m*2) x dim. Even rows store
229  // gradients y_i, odd rows store steps s_i.
230  Vector<Real> rho_; // dimension m; rho_(m) = 1/(y_m^T s_m), Eq. 7.17.
231 
232  std::vector<Real> step_lengths_; // The step sizes we took on the last
233  // (up to m) iterations; these are not stored in a rotating buffer but
234  // are shifted by one each time (this is more convenient when we
235  // restart, as we keep this info past restarting).
236 
237 
238 };
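The class is typically driven in a loop by the caller: evaluate the objective function and its gradient at GetProposedValue(), report them through DoStep(), and stop once RecentStepLength() becomes small; GetValue() then returns the best point seen. A sketch along those lines, using a simple quadratic objective f(x) = 0.5 x^T x invented for illustration (the dimension, iteration cap and threshold are likewise arbitrary):

#include "matrix/optimization.h"

void MinimizeSimpleQuadratic() {
  using namespace kaldi;
  int32 dim = 5;
  Vector<double> x(dim);
  x.Set(1.0);                            // starting point (1, 1, ..., 1)

  LbfgsOptions opts(true);               // minimizing
  OptimizeLbfgs<double> lbfgs(x, opts);

  for (int32 iter = 0; iter < 100; iter++) {
    const VectorBase<double> &cur = lbfgs.GetProposedValue();
    double f = 0.5 * VecVec(cur, cur);   // f(x) = 0.5 x^T x
    Vector<double> grad(cur);            // gradient of f is x itself
    lbfgs.DoStep(f, grad);
    if (lbfgs.RecentStepLength() < 1.0e-05)
      break;                             // small recent steps: treat as converged
  }
  double best_f;
  const VectorBase<double> &best_x = lbfgs.GetValue(&best_f);
  (void) best_x;                         // best point found; best_f is its objective value
}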
239 
241 
242 
243 } // end namespace kaldi
244 
245 
246 
247 #endif
248 