// nnet2/nnet-precondition.cc

// Copyright 2012 Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "nnet2/nnet-precondition.h"

namespace kaldi {
namespace nnet2 {

void PreconditionDirections(const CuMatrixBase<BaseFloat> &R,
                            double lambda,
                            CuMatrixBase<BaseFloat> *P) {

  int32 N = R.NumRows(), D = R.NumCols();
  KALDI_ASSERT(SameDim(R, *P) && N > 0);
  if (N == 1) {
    KALDI_WARN << "Trying to precondition a set of only one frame: returning "
               << "unchanged.  Ignore this warning if infrequent.";
    P->CopyFromMat(R);
    return;
  }
  CuMatrixBase<BaseFloat> &Q = *P;  // Note: Q shares memory with P.

  if (N >= D) {
    // Compute G = (\lambda I + 1/(N-1) R^T R)^{-1} by direct inversion.
    // G <-- lambda I.
    CuMatrix<BaseFloat> G(D, D);
    G.AddToDiag(lambda);
    // G += 1.0/(N-1) * R^T R.
    G.SymAddMat2(1.0 / (N-1), R, kTrans, 1.0);
    G.CopyLowerToUpper();
    if (GetVerboseLevel() >= 5 && Rand() % 20 == 0) {
      CuSpMatrix<BaseFloat> tmp(G, kTakeLower);
      SpMatrix<BaseFloat> G_cpu(tmp);
      G_cpu.PrintEigs("G");
    }
    G.SymInvertPosDef();
    // Q <-- R G^T (we just make it transposed as we think
    // it will be slightly faster; G is symmetric).
    Q.AddMatMat(1.0, R, kNoTrans, G, kTrans, 0.0);
  } else {
    // Through a lot of rearrangements, it turns out that
    // if we let S = (\lambda I + 1/(N-1) R R^T)^{-1},
    // then what we need is
    //   Q <-- S R.
    // It is curious and (to me) unexpected that the actual code is basically
    // the same when transposed.
    CuMatrix<BaseFloat> S(N, N);
    // S <-- lambda I.
    S.AddToDiag(lambda);
    // S += 1/(N-1) R R^T.
    // The following function only updates the lower triangle.
    S.SymAddMat2(1.0 / (N-1), R, kNoTrans, 1.0);
    S.CopyLowerToUpper();
    if (GetVerboseLevel() >= 5 && Rand() % 20 == 0) {
      CuSpMatrix<BaseFloat> tmp(S, kTakeLower);
      SpMatrix<BaseFloat> S_cpu(tmp);
      S_cpu.PrintEigs("S");
    }
    // Invert S, so now S = (\lambda I + 1/(N-1) R R^T)^{-1}.
    S.SymInvertPosDef();
    Q.AddMatMat(1.0, S, kNoTrans, R, kNoTrans, 0.0);
  }

#if 0  // Old code before it was optimized for CUDA:
  for (int32 n = 0; n < N; n++) {
    CuSubVector<BaseFloat> r(R, n), q(Q, n);
    BaseFloat gamma = VecVec(r, q),  // gamma_n = r_n^T q_n.
        beta = 1 + gamma / (N - 1 - gamma);
    if (!(gamma >= 0.0 && beta > 0.0)) {
      KALDI_ERR << "Bad values encountered in preconditioning: gamma = " << gamma
                << ", beta = " << beta;
    }
    // Q and P share the same memory.  The result of the
    // scaling below will be output as P.
    q.Scale(beta);
  }
#else
  CuVector<BaseFloat> gamma(N);
  gamma.AddDiagMatMat(1.0, R, kNoTrans, Q, kTrans, 0.0);
  // At this point, gamma(i) equals the i'th row of R dotted with
  // the i'th row of Q.
  Vector<BaseFloat> cpu_gamma(gamma), cpu_beta(N, kUndefined);
  for (int32 n = 0; n < N; n++) {
    BaseFloat this_gamma = cpu_gamma(n),
        this_beta = 1.0 + this_gamma / (N - 1 - this_gamma);
    if (!(this_gamma >= 0.0 && this_beta > 0.0))
      KALDI_ERR << "Bad values encountered in preconditioning: gamma = "
                << this_gamma << ", beta = " << this_beta;
    cpu_beta(n) = this_beta;
  }
  CuVector<BaseFloat> beta(cpu_beta);
  P->MulRowsVec(beta);
#endif
}
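
/*
  Example of how one might call this function (a minimal sketch; the sizes,
  the alpha value and the random data here are purely illustrative and are not
  taken from any actual calling code):

    int32 N = 4, D = 8;                  // minibatch size and input dimension.
    CuMatrix<BaseFloat> R(N, D), P(N, D);
    R.SetRandn();                        // stand-in for the input rows r_n.
    BaseFloat alpha = 0.1;
    double lambda = alpha * TraceMatMat(R, R, kTrans) / (N * D);
    PreconditionDirections(R, lambda, &P);
    // Each row p_n of P now equals beta_n * G * r_n, as derived in the long
    // comment at the end of this file.
*/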


void PreconditionDirectionsAlpha(const CuMatrixBase<BaseFloat> &R,
                                 double alpha,
                                 CuMatrixBase<BaseFloat> *P) {
  KALDI_ASSERT(alpha > 0.0);  // alpha > 1.0 probably does not really make sense.
  double t = TraceMatMat(R, R, kTrans), floor = 1.0e-20;
  if (t < floor) {
    KALDI_WARN << "Flooring trace from " << t
               << " to " << floor;
    t = floor;
  }
  double lambda = t * alpha / R.NumRows() / R.NumCols();
  // See the extended comment below for an explanation of this formula.
  if (lambda <= 0.0) {
    // This should never really happen; it would probably indicate a bug
    // in the calling code.
    KALDI_WARN << "Zero or negative lambda in PreconditionDirectionsAlpha.";
    lambda = 1.0e-10;
  }
  PreconditionDirections(R, lambda, P);
}
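
/*
  Worked example of the formula above (the numbers are purely illustrative):
  with N = R.NumRows() = 250, D = R.NumCols() = 1000, alpha = 0.1 and
  trace(R^T R) = 5.0e+05,

    lambda = t * alpha / N / D = 5.0e+05 * 0.1 / 250 / 1000 = 0.2.
*/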


void PreconditionDirectionsAlphaRescaled(const CuMatrixBase<BaseFloat> &R,
                                         double alpha,
                                         CuMatrixBase<BaseFloat> *P) {
  KALDI_ASSERT(alpha > 0.0);  // alpha > 1.0 probably does not really make sense.
  double t = TraceMatMat(R, R, kTrans), floor = 1.0e-20;
  if (t == 0.0) {
    P->CopyFromMat(R);
    return;
  }
  if (t < floor) {
    KALDI_WARN << "Flooring trace from " << t
               << " to " << floor;
    t = floor;
  }
  double lambda = t * alpha / R.NumRows() / R.NumCols();
  // See the extended comment below for an explanation of this formula.
  KALDI_ASSERT(lambda != 0.0);
  PreconditionDirections(R, lambda, P);
  double p_trace = TraceMatMat(*P, *P, kTrans);
  KALDI_ASSERT(p_trace != 0.0);
  double rescale = sqrt(t / p_trace);
  P->Scale(rescale);
}
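
/*
  Note on the rescaling above: t = trace(R^T R) = ||R||_F^2 and
  p_trace = trace(P^T P) = ||P||_F^2, so after P->Scale(sqrt(t / p_trace))
  the output has the same Frobenius norm as the input, ||P||_F = ||R||_F.
  For example (illustrative numbers only), t = 400.0 and p_trace = 100.0
  would give rescale = sqrt(4.0) = 2.0.
*/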


}  // namespace nnet2
}  // namespace kaldi

/*
  Notes for an idea on preconditioning.
  The update is of the form:
    params += learning_rate * input_row * output_deriv'
  We want to precondition by a Fisher-like matrix in each of (the input dim and
  the output dim).
  [note: in this method we'll pretend the chunk-weights are all one.
  It shouldn't really matter, it's only preconditioning.]

  The first observation is, if we do:

    params += learning_rate * S * input_row * output_deriv' * T

  for any positive definite S and T that we choose (well, perhaps we have
  to ensure their eigenvalues are bounded in some way, but we'll bother with
  that later), then we'll still get convergence.  But S and T cannot be
  functions of the current sample, the one that creates "input_row" and
  "output_deriv", or this introduces a bias.

  We can view it as a preconditioning of the vectorized form of the
  transformation matrix.

  For a Fisher-like preconditioning, we can precondition using
  the inverse of the scatter of the other features in the batch.
  For the input rows, call these r_n.

  Let the total scatter be

    S = \sum_n r_n r_n^T

  where the sum is taken over the minibatch, and

    S_n = S - r_n r_n^T

  i.e. the scatter with this sample removed.
  Let F_n be the normalized version of this, dividing by the number of samples:

    F_n = 1/(N-1) S_n

  where N is the minibatch size (so N-1 excludes the current sample).
  We're going to want to invert F_n, so we need to make it positive definite.

  We're going to define G_n as a smoothed form of the estimated Fisher matrix
  for this batch:

    G_n = F_n + \lambda_n I

  where I is the identity.  A suitable formula for \lambda_n is to define
  a small constant \alpha (say, \alpha = 0.1), and let

    \lambda_n = (\alpha / dim(F)) trace(F_n).

  In practice (although we lose the strict convergence guarantees) it will be
  easier to set a global \lambda, to:

    \lambda = (\alpha / dim(F)) trace(F)
           \simeq (\alpha / (R.NumRows() * R.NumCols())) * trace(R^T R).

  This is an easy way to set it.  Let's define P_n as the inverse of G_n.  This
  is what we'll be multiplying the input values by:

    P_n = G_n^{-1} = (F_n + \lambda_n I)^{-1}

  First, let's define an uncorrected "global" Fisher matrix

    F = (1/(N-1)) S,

  and let G be its smoothed inverse.  If we let R be the matrix each of whose
  rows is one of the r_n, then

    S = R^T R, and
    F = 1/(N-1) R^T R,

  so

    G = (F + \lambda I)^{-1}
      = (1/(N-1) R^T R + \lambda I)^{-1}.

  Using the Woodbury formula,

    G = (1/\lambda) I - (1/\lambda^2) R^T M R

  where

    M = ((N-1) I + 1/\lambda R R^T)^{-1}

  (this inversion for M is done in the lower dimension N, e.g. 250, rather than
  the data dimension D, which might be e.g. 1000).

  Let's assume \lambda is a constant, i.e. there is no \lambda_n.
  We can get it from the previous minibatch.

  We want to compute

    G_n = (F_n + \lambda I)^{-1} = (F + \lambda I - 1/(N-1) r_n r_n^T)^{-1}

  and using the Sherman-Morrison formula, this may be written as:

    G_n = G + \alpha_n q_n q_n^T    # Caution: \alpha_n has nothing to do with \alpha.

  where q_n = G r_n, and

    \alpha_n = 1 / ((N-1) (1 - 1/(N-1) r_n^T q_n))
             = 1 / (N - 1 - r_n^T q_n).

  We'll want to compute this efficiently.  For each r_n we'll want to compute

    p_n = G_n r_n,

  which will correspond to the direction we update in.
  We'll use

    p_n = G r_n + \alpha_n q_n q_n^T r_n

  and since q_n = G r_n, both terms in this equation point in
  the same direction, so we can write this as:

    p_n = \beta_n q_n,

  where, defining \gamma_n = r_n^T q_n, we have

    \beta_n = 1 + \gamma_n \alpha_n
            = 1 + \gamma_n / ((N-1) (1 - \gamma_n/(N-1)))
            = 1 + \gamma_n / (N - 1 - \gamma_n).
*/
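
/*
  The shortcut derived above (p_n = \beta_n q_n equals the leave-one-out
  inverse applied to r_n) can be checked numerically with a small standalone
  program.  This is a sketch in plain C++, independent of the Kaldi matrix
  library; the helper Invert2x2() and the data values are made up for this
  check, with D = 2 and N = 3.

    #include <cmath>
    #include <cstdio>

    // Invert a 2x2 matrix analytically.
    static void Invert2x2(const double a[2][2], double inv[2][2]) {
      double det = a[0][0] * a[1][1] - a[0][1] * a[1][0];
      inv[0][0] =  a[1][1] / det;  inv[0][1] = -a[0][1] / det;
      inv[1][0] = -a[1][0] / det;  inv[1][1] =  a[0][0] / det;
    }

    int main() {
      const int N = 3, D = 2;
      const double lambda = 0.1;
      const double R[3][2] = { {1.0, 0.5}, {-0.3, 2.0}, {0.7, -1.0} };

      // F = 1/(N-1) R^T R; A = F + lambda I; G = A^{-1}.
      double F[2][2] = {{0, 0}, {0, 0}}, A[2][2], G[2][2];
      for (int n = 0; n < N; n++)
        for (int i = 0; i < D; i++)
          for (int j = 0; j < D; j++)
            F[i][j] += R[n][i] * R[n][j] / (N - 1);
      for (int i = 0; i < D; i++)
        for (int j = 0; j < D; j++)
          A[i][j] = F[i][j] + (i == j ? lambda : 0.0);
      Invert2x2(A, G);

      double max_diff = 0.0;
      for (int n = 0; n < N; n++) {
        // Fast path: q_n = G r_n, gamma_n = r_n^T q_n,
        // beta_n = 1 + gamma_n / (N - 1 - gamma_n), p_n = beta_n q_n.
        double q[2] = { G[0][0] * R[n][0] + G[0][1] * R[n][1],
                        G[1][0] * R[n][0] + G[1][1] * R[n][1] };
        double gamma = R[n][0] * q[0] + R[n][1] * q[1];
        double beta = 1.0 + gamma / (N - 1 - gamma);
        double p_fast[2] = { beta * q[0], beta * q[1] };

        // Exact path: G_n = (A - 1/(N-1) r_n r_n^T)^{-1}, p_n = G_n r_n.
        double An[2][2], Gn[2][2];
        for (int i = 0; i < D; i++)
          for (int j = 0; j < D; j++)
            An[i][j] = A[i][j] - R[n][i] * R[n][j] / (N - 1);
        Invert2x2(An, Gn);
        double p_exact[2] = { Gn[0][0] * R[n][0] + Gn[0][1] * R[n][1],
                              Gn[1][0] * R[n][0] + Gn[1][1] * R[n][1] };

        for (int i = 0; i < D; i++)
          max_diff = std::fmax(max_diff, std::fabs(p_fast[i] - p_exact[i]));
      }
      // The difference should be on the order of machine epsilon.
      std::printf("max |p_fast - p_exact| = %g\n", max_diff);
      return 0;
    }
*/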

/*

  SUMMARY:
  Let the input features (we can extend these with a 1 for the bias term) be
  a matrix R, each row of which corresponds to a training example r_n.

  The dimension of R is N x D, where N is the minibatch size and D is the
  dimension of the input to this layer of the network.

  We'll be computing a matrix P, each row p_n of which will be the corresponding
  row r_n of R, multiplied by a positive definite preconditioning matrix G_n.
  [we can check that for each n, p_n^T r_n >= 0].
  The following computation obtains P:

  \lambda <-- (\alpha/(N D)) trace(R R^T).  # 0 < \alpha <= 1 is a global constant,
                                            # e.g. \alpha = 0.1, but one should try
                                            # different values; this will be important.
                                            # (Note: if the minibatch size is >= the
                                            # dimension, i.e. N >= D, then we can let
                                            # \alpha be quite small, e.g. 0.001.)

  if N >= D, then
    # compute G by direct inversion.
    G <-- (\lambda I + 1/(N-1) R^T R)^{-1}
    Q <-- R G
  else  # The number of samples is less than the dimension; use the
        # Sherman-Morrison-Woodbury formula, it's more efficient.
    # We'd first compute:
    #   L <-- ((N-1) I + 1/\lambda R R^T)
    # (note: L is something that appears in the Woodbury expansion of G)
    #   M <-- L^{-1}
    # Note: G is 1/\lambda I - (1/\lambda^2) R^T M R.
    # We're doing Q <-- R G, which is:
    #   Q <-- 1/\lambda R - (1/\lambda^2) R (R^T M R)
    # It's more efficient in this case to left-multiply R
    # by something, i.e. bracket as:
    #   Q <-- 1/\lambda R - (1/\lambda^2) (R R^T M) R
    # so let's write it as
    #   Q <-- S R, with
    #   S = 1/\lambda I - 1/\lambda^2 R R^T M
    #     = 1/\lambda (I - 1/\lambda R R^T M)
    # Now, -1/\lambda R R^T = (N-1) I - L, and L M = I, so
    #   S = 1/\lambda (I + ((N-1) I - L) M)
    #     = (N-1)/\lambda M
    # and we can get rid of that scalar earlier on:
    # if we let L' = \lambda/(N-1) L, so that
    #   L' = (\lambda I + 1/(N-1) R R^T),
    # then
    #   S = (\lambda I + 1/(N-1) R R^T)^{-1}.

    S <-- (\lambda I + 1/(N-1) R R^T)^{-1}
    Q <-- S R
  fi

  Here, we're right-multiplying each row r_n of R by the symmetric matrix G, to get
  the corresponding row q_n of Q.  Note: in practice Q will be the same memory as P.
  Next we work out for each n:
    \gamma_n = r_n^T q_n                         # This should be nonnegative!  Check this.
    \beta_n = 1 + \gamma_n / (N - 1 - \gamma_n)  # This should be positive; check this.
  For each n, we'll do (for the corresponding rows of P and Q):
    p_n <-- \beta_n q_n.
  In practice, we'd do this computation in-place, with P and Q using the
  same memory.

  If we're being paranoid, we should verify that

    p_n = (\lambda I + 1/(N-1) \sum_{m != n} r_m r_m^T)^{-1} r_n.

  This is exact mathematically, but there could be differences due to roundoff,
  and if \alpha is quite small, these differences could be substantial.
*/
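
/*
  The two branches above rely on the identity

     R (\lambda I + 1/(N-1) R^T R)^{-1}  ==  (\lambda I + 1/(N-1) R R^T)^{-1} R,

  which is what makes the N < D case "the same code transposed".  Below is a
  small standalone check of this identity in plain C++, independent of the
  Kaldi matrix library; the helper Invert2x2() and the data values are made up
  for this sketch, with N = D = 2 so that both sides involve only 2x2 inverses.

    #include <cmath>
    #include <cstdio>

    // Invert a 2x2 matrix analytically.
    static void Invert2x2(const double a[2][2], double inv[2][2]) {
      double det = a[0][0] * a[1][1] - a[0][1] * a[1][0];
      inv[0][0] =  a[1][1] / det;  inv[0][1] = -a[0][1] / det;
      inv[1][0] = -a[1][0] / det;  inv[1][1] =  a[0][0] / det;
    }

    int main() {
      const int N = 2, D = 2;
      const double lambda = 0.25;
      const double R[2][2] = { {1.0, -0.5}, {0.3, 2.0} };

      // G = (lambda I + 1/(N-1) R^T R)^{-1}; S = (lambda I + 1/(N-1) R R^T)^{-1}.
      double A[2][2], B[2][2], G[2][2], S[2][2];
      for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
          double rtr = 0.0, rrt = 0.0;
          for (int n = 0; n < N; n++) rtr += R[n][i] * R[n][j];  // (R^T R)(i,j)
          for (int d = 0; d < D; d++) rrt += R[i][d] * R[j][d];  // (R R^T)(i,j)
          A[i][j] = rtr / (N - 1) + (i == j ? lambda : 0.0);
          B[i][j] = rrt / (N - 1) + (i == j ? lambda : 0.0);
        }
      }
      Invert2x2(A, G);
      Invert2x2(B, S);

      // Compare Q1 = R G with Q2 = S R, elementwise.
      double max_diff = 0.0;
      for (int n = 0; n < N; n++) {
        for (int j = 0; j < D; j++) {
          double q1 = R[n][0] * G[0][j] + R[n][1] * G[1][j];
          double q2 = S[n][0] * R[0][j] + S[n][1] * R[1][j];
          max_diff = std::fmax(max_diff, std::fabs(q1 - q2));
        }
      }
      // The difference should be on the order of machine epsilon.
      std::printf("max |RG - SR| = %g\n", max_diff);
      return 0;
    }
*/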