natural-gradient-online-test.cc
Go to the documentation of this file.
1 // nnet3/natural-gradient-online-test.cc
2 
3 // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
21 #include "util/common-utils.h"
22 
23 namespace kaldi {
24 namespace nnet3 {
25 
26 // Simple version of OnlineNaturalGradient that we use to make
27 // sure it is behaving as advertised.
29  public:
31  epsilon_(1.0e-10), delta_(5.0e-04) { }
32 
33  void SetRank(int32 rank) { rank_ = rank; }
34 
37  CuVectorBase<BaseFloat> *row_prod,
38  BaseFloat *scale);
39 
40 
41  private:
42  BaseFloat Eta(int32 N) const;
43 
46  VectorBase<double> *row_prod,
47  BaseFloat *scale);
48 
49 
50  void Init(const MatrixBase<double> &R0);
51 
52  void InitDefault(int32 D);
53 
56  double alpha_;
57  double epsilon_;
58  double delta_;
59 
60  // Fisher matrix defined as F_t = R_t^T diag(d_t) R_t + rho_t I.
63  double rho_t_;
64 };
65 
66 
69  CuVectorBase<BaseFloat> *row_prod,
70  BaseFloat *scale) {
71  Matrix<BaseFloat> R_cpu(*R);
72  Vector<BaseFloat> row_prod_cpu(*row_prod);
73  Matrix<double> R_cpu_dbl(R_cpu);
74  Vector<double> row_prod_cpu_dbl(row_prod_cpu);
75  PreconditionDirectionsCpu(&R_cpu_dbl,
76  &row_prod_cpu_dbl,
77  scale);
78  row_prod_cpu.CopyFromVec(row_prod_cpu_dbl);
79  R_cpu.CopyFromMat(R_cpu_dbl);
80  R->CopyFromMat(R_cpu);
81  row_prod->CopyFromVec(row_prod_cpu);
82 }
83 
85  if (rank_ >= D) {
86  KALDI_WARN << "Rank " << rank_ << " of online preconditioner is >= dim " << D
87  << ", setting it to "
88  << (D - 1) << " (but this is probably still too high)";
89  rank_ = D - 1;
90  }
91  int32 R = rank_;
92  R_t_.Resize(R, D);
93  for (int32 r = 0; r < R; r++) {
94  std::vector<int32> cols;
95  for (int32 c = r; c < D; c += R)
96  cols.push_back(c);
97  for (int32 i = 0; i < cols.size(); i++) {
98  int32 c = cols[i];
99  R_t_(r, c) = (i == 0 ? 1.1 : 1.0) /
100  sqrt(1.1 * 1.1 + cols.size() - 1);
101  }
102  }
103  d_t_.Resize(R);
104  d_t_.Set(epsilon_);
105  rho_t_ = epsilon_;
106 }
107 
109  int32 D = R0.NumCols(), N = R0.NumRows();
110  InitDefault(D);
111  int32 num_init_iters = 3;
112  for (int32 i = 0; i < num_init_iters; i++) {
113  CuMatrix<BaseFloat> R0_copy(R0);
114  CuVector<BaseFloat> row_products(N);
115  BaseFloat scale;
116  PreconditionDirections(&R0_copy, &row_products, &scale);
117  }
118 }
119 
122  BaseFloat ans = 1.0 - exp(-N / num_samples_history_);
123  if (ans > 0.9) ans = 0.9;
124  return ans;
125 }
126 
127 
129  MatrixBase<double> *X_t,
130  VectorBase<double> *row_prod,
131  BaseFloat *scale) {
132  if (R_t_.NumRows() == 0)
133  Init(*X_t);
134  int32 R = R_t_.NumRows(), D = R_t_.NumCols(), N = X_t->NumRows();
135  BaseFloat eta = Eta(N);
136 
137  SpMatrix<double> F_t(D);
138  // F_t =(def) R_t^T D_t R_t + \rho_t I
139  F_t.AddToDiag(rho_t_);
140  F_t.AddMat2Vec(1.0, R_t_, kTrans, d_t_, 1.0);
141 
142  // Make sure F_t is +ve definite.
143  {
144  KALDI_ASSERT(d_t_.Min() > 0);
145  Vector<double> eigs(D);
146  F_t.Eig(&eigs, NULL);
147  KALDI_ASSERT(eigs.Min() > 0);
148  }
149 
150  // S_t =(def) 1/N X_t^T X_t.
151  SpMatrix<double> S_t(D);
152  S_t.AddMat2(1.0 / N, *X_t, kTrans, 0.0);
153 
154  // T_t =(def) \eta S_t + (1-\eta) F_t
155  SpMatrix<double> T_t(D);
156  T_t.AddSp(eta, S_t);
157  T_t.AddSp(1.0 - eta, F_t);
158 
159  // Y_t =(def) R_t T_t
160  Matrix<double> Y_t(R, D);
161  Y_t.AddMatSp(1.0, R_t_, kNoTrans, T_t, 0.0);
162 
163  // Z_t =(def) Y_t Y_t^T
164  SpMatrix<double> Z_t(R);
165  Z_t.AddMat2(1.0, Y_t, kNoTrans, 0.0);
166 
167  Matrix<double> U_t(R, R);
168  Vector<double> c_t(R);
169  // decompose Z_t = U_t C_t U_t^T
170  Z_t.Eig(&c_t, &U_t);
171  SortSvd(&c_t, &U_t);
172  double c_t_floor = pow(rho_t_ * (1.0 - eta), 2);
173  int32 nf;
174  c_t.ApplyFloor(c_t_floor, &nf);
175  if (nf > 0) {
176  KALDI_WARN << "Floored " << nf << " elements of c_t.";
177  }
178  // KALDI_LOG << "c_t is " << c_t;
179  // KALDI_LOG << "U_t is " << U_t;
180  // KALDI_LOG << "Z_t is " << Z_t;
181 
182  Vector<double> sqrt_c_t(c_t);
183  sqrt_c_t.ApplyPow(0.5);
184  Vector<double> inv_sqrt_c_t(sqrt_c_t);
185  inv_sqrt_c_t.InvertElements();
186  Matrix<double> R_t1(R, D);
187  // R_{t+1} = C_t^{-0.5} U_t^T Y_t
188  R_t1.AddMatMat(1.0, U_t, kTrans, Y_t, kNoTrans, 0.0);
189  R_t1.MulRowsVec(inv_sqrt_c_t);
190 
191  double rho_t1 = (1.0 / (D - R)) *
192  (eta * S_t.Trace() + (1.0 - eta) * (D * rho_t_ + d_t_.Sum()) - sqrt_c_t.Sum());
193 
194  Vector<double> d_t1(sqrt_c_t);
195  d_t1.Add(-rho_t1);
196 
197  double floor_val = std::max(epsilon_, delta_ * sqrt_c_t.Max());
198  if (rho_t1 < floor_val) {
199  KALDI_WARN << "flooring rho_{t+1} to " << floor_val << ", was " << rho_t1;
200  rho_t1 = floor_val;
201  }
202  d_t1.ApplyFloor(floor_val, &nf);
203  if (nf > 0) {
204  KALDI_VLOG(3) << "d_t1 was " << d_t1;
205  KALDI_WARN << "Floored " << nf << " elements of d_{t+1}.";
206  }
207  // a check.
208  if (nf == 0 && rho_t1 > floor_val) {
209  double tr_F_t1 = D * rho_t1 + d_t1.Sum(), tr_T_t = T_t.Trace();
210  AssertEqual(tr_F_t1, tr_T_t);
211  }
212 
213  // G_t = F_t + alpha/D tr(F_t)
214  SpMatrix<double> G_t(F_t);
215  G_t.AddToDiag(alpha_ / D * F_t.Trace());
216  SpMatrix<double> G_t_inv(G_t);
217  G_t_inv.Invert();
218 
219  double beta_t = rho_t_ + alpha_/D * F_t.Trace();
220  // X_hat_t = beta_t X_t G_t^{-1}.
221  Matrix<double> X_hat_t(N, D);
222  X_hat_t.AddMatSp(beta_t, *X_t, kNoTrans, G_t_inv, 0.0);
223 
224  double tr_x_x = TraceMatMat(*X_t, *X_t, kTrans),
225  tr_Xhat_Xhat = TraceMatMat(X_hat_t, X_hat_t, kTrans);
226  double gamma = (tr_Xhat_Xhat == 0 ? 1.0 : sqrt(tr_x_x / tr_Xhat_Xhat));
227 
228  X_t->CopyFromMat(X_hat_t);
229  row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
230  *scale = gamma;
231 
232  // Update the parameters
233  rho_t_ = rho_t1;
234  d_t_.CopyFromVec(d_t1);
235  R_t_.CopyFromMat(R_t1);
236 
237  KALDI_VLOG(3) << "rho_t_ = " << rho_t_;
238  KALDI_VLOG(3) << "d_t_ = " << d_t_;
239  KALDI_VLOG(3) << "R_t_ = " << R_t_;
240 
241 
242  { // check that R_t_ R_t_^T = I.
243  SpMatrix<double> unit(R);
244  unit.AddMat2(1.0, R_t_, kNoTrans, 0.0);
245  if (!unit.IsUnit(1.0e-03)) {
246  KALDI_WARN << "R is not orthogonal, reorthogonalizing.";
247  for (int32 i = 0; i < R; i++) {
248  SubVector<double> row(R_t_, i);
249  for (int32 j = 0; j < i; j++) {
250  SubVector<double> row_j(R_t_, j);
251  row.AddVec(-VecVec(row_j, row), row_j);
252  }
253  row.Scale(1.0 / row.Norm(2.0));
254  }
255  }
256  unit.AddMat2(1.0, R_t_, kNoTrans, 0.0);
257  KALDI_ASSERT(unit.IsUnit(1.0e-03));
258  }
259 }
260 
261 
263  MatrixIndexT R = 1 + Rand() % 30, // rank of correction
264  N = (2 * R) + Rand() % 30, // batch size
265  D = R + 1 + Rand() % 20; // problem dimension. Must be > R.
266 
267  // Test sometimes with features that are all-zero or all-one; this will
268  // help to make sure low-rank or zero input doesn't crash the code.
269  bool zero = false;
270  bool one = false;
271  if (Rand() % 3 == 0) zero = true;
272  //else if (Rand() % 2 == 0) one = true;
273 
274  CuVector<BaseFloat> row_prod1(N);
275  BaseFloat gamma1, gamma2;
276  BaseFloat big_eig_factor = RandInt(1, 20);
277  big_eig_factor = big_eig_factor * big_eig_factor;
278  Vector<BaseFloat> big_eig_vector(D);
279  big_eig_vector.SetRandn();
280  big_eig_vector.Scale(big_eig_factor);
281 
282  OnlineNaturalGradientSimple preconditioner1;
283  OnlineNaturalGradient preconditioner2;
284  preconditioner1.SetRank(R);
285  preconditioner2.SetRank(R);
286  preconditioner2.TurnOnDebug();
287 
288  int32 num_iters = 100;
289  for (int32 iter = 0; iter < num_iters; iter++) {
290  Matrix<BaseFloat> M_cpu(N, D);
291  if (one) M_cpu.Set(1.0);
292  else if (!zero) {
293  M_cpu.SetRandn();
294  Vector<BaseFloat> rand_vec(N);
295  rand_vec.SetRandn();
296  M_cpu.AddVecVec(1.0, rand_vec, big_eig_vector);
297  }
298  CuMatrix<BaseFloat> M(M_cpu);
299 
300  CuMatrix<BaseFloat> Mcopy1(M), Mcopy2(M);
301 
302  preconditioner1.PreconditionDirections(&Mcopy1, &row_prod1, &gamma1);
303 
304  preconditioner2.PreconditionDirections(&Mcopy2, &gamma2);
305 
306  BaseFloat trace1 = TraceMatMat(M, M, kTrans),
307  trace2 = TraceMatMat(Mcopy1, Mcopy1, kTrans);
308  AssertEqual(trace1, trace2 * gamma2 * gamma2, 1.0e-02);
309 
310  AssertEqual(Mcopy1, Mcopy2);
311  AssertEqual(gamma1, gamma2, 1.0e-02);
312 
313  // make sure positive definite
314  CuVector<BaseFloat> inner_prods(M.NumRows());
315  inner_prods.AddDiagMatMat(1.0, M, kNoTrans, Mcopy1, kTrans, 0.0);
316  KALDI_ASSERT(inner_prods.Min() >= 0.0);
317  }
318  return;
319 }
320 
321 
322 } // namespace nnet3
323 } // namespace kaldi
324 
325 
326 int main() {
327  using namespace kaldi;
328  using namespace kaldi::nnet3;
329  for (int32 loop = 0; loop < 2; loop++) {
330 #if HAVE_CUDA == 1
331  CuDevice::Instantiate().SetDebugStrideMode(true);
332  if (loop == 0)
333  CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
334  else
335  CuDevice::Instantiate().SelectGpuId("optional"); // -2 .. automatic selection
336 #endif
337  for (int32 i = 0; i < 5; i++) {
339  }
340  }
341 }
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344
void AddMat2(const Real alpha, const MatrixBase< Real > &M, MatrixTransposeType transM, const Real beta)
rank-N update: if (transM == kNoTrans) (*this) = beta*(*this) + alpha * M * M^T, or (if transM == kTr...
Definition: sp-matrix.cc:1110
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool IsUnit(Real cutoff=1.0e-05) const
Definition: sp-matrix.cc:480
void AddMat2Vec(const Real alpha, const MatrixBase< Real > &M, MatrixTransposeType transM, const VectorBase< Real > &v, const Real beta=0.0)
Extension of rank-N update: this <– beta*this + alpha * M * diag(v) * M^T.
Definition: sp-matrix.cc:1081
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
Real Trace() const
Definition: sp-matrix.cc:171
void AddDiagMat2(Real alpha, const MatrixBase< Real > &M, MatrixTransposeType trans=kNoTrans, Real beta=1.0)
Add the diagonal of a matrix times itself: *this = diag(M M^T) + beta * *this (if trans == kNoTrans)...
kaldi::int32 int32
void Eig(VectorBase< Real > *s, MatrixBase< Real > *P=NULL) const
Solves the symmetric eigenvalue problem: at end we should have (*this) = P * diag(s) * P^T...
Definition: qr.cc:433
Keywords for search: natural gradient, naturalgradient, NG-SGD.
This class represents a matrix that&#39;s stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
void CopyFromMat(const MatrixBase< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Copy given matrix. (no resize is done).
Real Min() const
Returns the minimum value of any element, or +infinity for the empty vector.
Real Norm(Real p) const
Compute the p-th norm of the vector.
void AddDiagMatMat(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const CuMatrixBase< Real > &N, MatrixTransposeType transN, Real beta=1.0)
Add the diagonal of a matrix product: *this = diag(M N), assuming the "trans" arguments are both kNoT...
Definition: cu-vector.cc:611
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=nullptr)
Applies floor to all elements.
Definition: kaldi-vector.h:149
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
void AddToDiag(const Real r)
float BaseFloat
Definition: kaldi-types.h:29
int32 MatrixIndexT
Definition: matrix-common.h:98
void CopyFromVec(const CuVectorBase< Real > &src)
Copy functions; these will crash if the dimension do not match.
Definition: cu-vector.cc:1078
void AddSp(const Real alpha, const SpMatrix< Real > &Ma)
Definition: sp-matrix.h:211
void AddMatSp(const Real alpha, const MatrixBase< Real > &A, MatrixTransposeType transA, const SpMatrix< Real > &B, const Real beta)
this <– beta*this + alpha*A*B.
Definition: kaldi-matrix.h:708
void SetRandn()
Sets to random values of a normal distribution.
void AddMatMat(const Real alpha, const MatrixBase< Real > &A, MatrixTransposeType transA, const MatrixBase< Real > &B, MatrixTransposeType transB, const Real beta)
void PreconditionDirections(CuMatrixBase< BaseFloat > *R, CuVectorBase< BaseFloat > *row_prod, BaseFloat *scale)
Real Max() const
Returns the maximum value of any element, or -infinity for the empty vector.
void PreconditionDirections(CuMatrixBase< BaseFloat > *X, BaseFloat *scale)
This call implements the main functionality of this class.
#define KALDI_WARN
Definition: kaldi-error.h:150
Real TraceMatMat(const MatrixBase< Real > &A, const MatrixBase< Real > &B, MatrixTransposeType trans)
We need to declare this here as it will be a friend function.
void Scale(Real alpha)
Multiplies all elements by this constant.
int Rand(struct RandomState *state)
Definition: kaldi-math.cc:45
Real Sum() const
Returns sum of the elements.
void SetRandn()
Set vector to random normally-distributed noise.
void MulRowsVec(const VectorBase< Real > &scale)
Equivalent to (*this) = diag(scale) * (*this).
Matrix for CUDA computing.
Definition: matrix-common.h:69
void InvertElements()
Invert all elements.
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
void Set(Real f)
Set all members of a vector to a specified value.
void ApplyPow(Real power)
Take all elements of vector to a power.
Definition: kaldi-vector.h:179
void AddVecVec(const Real alpha, const VectorBase< OtherReal > &a, const VectorBase< OtherReal > &b)
*this += alpha * a * b^T
static void AssertEqual(float a, float b, float relative_tolerance=0.001)
assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))
Definition: kaldi-math.h:276
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
void Add(Real c)
Add a constant to each element of a vector.
void Invert(Real *logdet=NULL, Real *det_sign=NULL, bool inverse_needed=true)
matrix inverse.
Definition: sp-matrix.cc:219
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
void PreconditionDirectionsCpu(MatrixBase< double > *R, VectorBase< double > *row_prod, BaseFloat *scale)
void SortSvd(VectorBase< Real > *s, MatrixBase< Real > *U, MatrixBase< Real > *Vt, bool sort_on_absolute_value)
Function to ensure that SVD is sorted.
void Set(Real)
Sets all elements to a specific value.
int32 RandInt(int32 min_val, int32 max_val, struct RandomState *state)
Definition: kaldi-math.cc:95
Vector for CUDA computing.
Definition: matrix-common.h:72