combine-nnet-fast.cc
// nnet2/combine-nnet-fast.cc

// Copyright 2012  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "nnet2/combine-nnet-fast.h"
#include "nnet2/nnet-update-parallel.h"
#include "util/kaldi-thread.h"

namespace kaldi {
namespace nnet2 {

/*
 This class is responsible for computing a Fisher matrix which is a kind of
 scatter of gradients on subsets; it's used for preconditioning the update in
 class FastNnetCombiner. */
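/*
 Concretely: for each minibatch b, operator() below forms a gradient vector
 g_b whose elements are the dot-products of the minibatch's parameter
 gradient with the parameters of each (source nnet, updatable component)
 pair, and accumulates F = \sum_b g_b g_b^T via AddVec2().  F thus acts as
 an empirical, Fisher-like scatter matrix over the combination weights. */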
class FisherComputationClass: public MultiThreadable {
 public:
  FisherComputationClass(const Nnet &nnet,
                         const std::vector<Nnet> &nnets,
                         const std::vector<NnetExample> &egs,
                         int32 minibatch_size,
                         SpMatrix<double> *scatter):
      nnet_(nnet), nnets_(nnets), egs_(egs), minibatch_size_(minibatch_size),
      scatter_ptr_(scatter) { }  // This constructor is only used to create a
  // temporary version of the object; the copy constructor below is used to
  // create the separate versions for the parallel jobs.

  FisherComputationClass(const FisherComputationClass &other):
      MultiThreadable(other),
      nnet_(other.nnet_), nnets_(other.nnets_), egs_(other.egs_),
      minibatch_size_(other.minibatch_size_), scatter_ptr_(other.scatter_ptr_) {
    scatter_.Resize(nnets_.size() * nnet_.NumUpdatableComponents());
  }

  void operator () () {
    // b is the "minibatch id."
    int32 num_egs = static_cast<int32>(egs_.size());
    Nnet nnet_gradient(nnet_);
    for (int32 b = 0; b * minibatch_size_ < num_egs; b++) {
      if (b % num_threads_ != thread_id_)
        continue;  // We're not responsible for this minibatch.
      int32 offset = b * minibatch_size_,
          length = std::min(minibatch_size_,
                            num_egs - offset);
      bool is_gradient = true;
      nnet_gradient.SetZero(is_gradient);
      std::vector<NnetExample> minibatch(egs_.begin() + offset,
                                         egs_.begin() + offset + length);
      DoBackprop(nnet_, minibatch, &nnet_gradient);
      Vector<double> gradient(scatter_.NumRows());
      int32 i = 0;
      for (int32 n = 0; n < static_cast<int32>(nnets_.size()); n++) {
        for (int32 c = 0; c < nnet_.NumComponents(); c++) {
          const UpdatableComponent *uc = dynamic_cast<const UpdatableComponent*>(
              &(nnet_gradient.GetComponent(c))),
              *uc_other = dynamic_cast<const UpdatableComponent*>(
                  &(nnets_[n].GetComponent(c)));
          if (uc != NULL) {
            gradient(i) = uc->DotProduct(*uc_other);
            i++;
          }
        }
      }
      KALDI_ASSERT(i == gradient.Dim());
      scatter_.AddVec2(1.0, gradient);
    }
  }

  ~FisherComputationClass() {
    if (scatter_.NumRows() != 0) {
      if (scatter_ptr_->NumRows() == 0)
        scatter_ptr_->Resize(scatter_.NumRows());
      scatter_ptr_->AddSp(1.0, scatter_);
    }
  }

 private:
  const Nnet &nnet_;  // point at which we compute the parameter gradients.
  const std::vector<Nnet> &nnets_;  // The dot-products of each of these with
  // the parameter gradients are the actual gradients that go into "scatter".
  const std::vector<NnetExample> &egs_;
  int32 minibatch_size_;  // equals config --fisher-minibatch-size, e.g. 64
  // (smaller than the regular minibatch size).
  SpMatrix<double> *scatter_ptr_;  // Where to accumulate the result.
  SpMatrix<double> scatter_;  // Local accumulation of the scatter.
};


class FastNnetCombiner {
 public:
  FastNnetCombiner(const NnetCombineFastConfig &combine_config,
                   const std::vector<NnetExample> &validation_set,
                   const std::vector<Nnet> &nnets_in,
                   Nnet *nnet_out):
      config_(combine_config), egs_(validation_set),
      nnets_(nnets_in), nnet_out_(nnet_out) {

    GetInitialParams();
    ComputePreconditioner();

    int32 dim = params_.Dim();
    KALDI_ASSERT(dim > 0);
    Vector<double> gradient(dim);

    double regularizer_objf, initial_regularizer_objf;  // for diagnostics
    double objf, initial_objf;

    LbfgsOptions lbfgs_options;
    lbfgs_options.minimize = false;  // We're maximizing.
    lbfgs_options.m = std::min(dim, config_.max_lbfgs_dim);
    lbfgs_options.first_step_impr = config_.initial_impr;

    OptimizeLbfgs<double> lbfgs(params_,
                                lbfgs_options);

    for (int32 i = 0; i < config_.num_lbfgs_iters; i++) {
      params_.CopyFromVec(lbfgs.GetProposedValue());
      objf = ComputeObjfAndGradient(&gradient, &regularizer_objf);
      // Note: there is debug printout in ComputeObjfAndGradient
      // (at verbose-level 2).
      if (i == 0) {
        initial_objf = objf;
        initial_regularizer_objf = regularizer_objf;
      }
      lbfgs.DoStep(objf, gradient);
    }
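    // The loop above follows the OptimizeLbfgs calling pattern: ask the
    // optimizer where to evaluate next (GetProposedValue), compute the
    // objective and gradient at that point, then report them back (DoStep),
    // which updates the L-BFGS state.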
    params_ = lbfgs.GetValue(&objf);

    ComputeCurrentNnet(nnet_out_, true);  // create the output neural net, and
    // print out the scaling factors.
    if (config_.regularizer != 0.0) {
      double initial_part = initial_objf - initial_regularizer_objf,
          part = objf - regularizer_objf;
      KALDI_LOG << "Combining nnets, objf/frame + regularizer changed from "
                << initial_part << " + " << initial_regularizer_objf
                << " = " << initial_objf << " to " << part << " + "
                << regularizer_objf << " = " << objf;
    } else {
      KALDI_LOG << "Combining nnets, objf per frame changed from "
                << initial_objf << " to " << objf;
    }
  }

 private:
  int32 GetInitialModel(
      const std::vector<NnetExample> &validation_set,
      const std::vector<Nnet> &nnets) const;

  void GetInitialParams();

  void ComputePreconditioner();

  // Computes and returns the objective function per frame, including the
  // regularizer term if applicable.  Also puts just the regularizer
  // term in *regularizer_objf.
  double ComputeObjfAndGradient(
      Vector<double> *gradient,
      double *regularizer_objf);

  void ComputeCurrentNnet(
      Nnet *dest, bool debug = false);

  static void CombineNnets(const Vector<double> &scale_params,
                           const std::vector<Nnet> &nnets,
                           Nnet *dest);


  // C_ is the Cholesky factor of the smoothed Fisher matrix F.
  // Let F = C C^T.
  // The preconditioned gradient is \hat{g} = C^{-1} g.
  // Note: the preconditioned parameter is \hat{p} = C^T p,
  // so p = C^{-T} \hat{p}.
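  // To see why C^{-1} is the right transform for the gradient: with
  // \hat{p} = C^T p, the objective viewed as a function of \hat{p} is
  // f(C^{-T} \hat{p}), so by the chain rule its gradient w.r.t. \hat{p} is
  // C^{-1} g.  In the preconditioned space the smoothed Fisher matrix
  // becomes C^{-1} F C^{-T} = I, which is the point of the preconditioning.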
  TpMatrix<double> C_;
  TpMatrix<double> C_inv_;
  Vector<double> params_;  // the parameters we're optimizing-- in the
  // preconditioned space.  The dimension is the number of nnets we're
  // combining times the number of updatable layers.

  const NnetCombineFastConfig &config_;
  const std::vector<NnetExample> &egs_;
  const std::vector<Nnet> &nnets_;
  Nnet *nnet_out_;
};


// static
void FastNnetCombiner::CombineNnets(const Vector<double> &scale_params,
                                    const std::vector<Nnet> &nnets,
                                    Nnet *dest) {
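  // Per updatable component c, the combined network is
  //   dest_c = \sum_n scale_params(n * num_uc + c) * (nnets[n])_c,
  // realized below by scaling nnets[0] and then adding scaled copies of
  // the remaining nnets.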
  int32 num_nnets = nnets.size();
  KALDI_ASSERT(num_nnets >= 1);
  int32 num_uc = nnets[0].NumUpdatableComponents();
  KALDI_ASSERT(nnets[0].NumUpdatableComponents() >= 1);


  *dest = nnets[0];
  SubVector<double> scale_params0(scale_params, 0, num_uc);
  dest->ScaleComponents(Vector<BaseFloat>(scale_params0));
  for (int32 n = 1; n < num_nnets; n++) {
    SubVector<double> scale_params_n(scale_params, n * num_uc, num_uc);
    dest->AddNnet(Vector<BaseFloat>(scale_params_n), nnets[n]);
  }
}


void FastNnetCombiner::ComputePreconditioner() {
  SpMatrix<double> F;  // Fisher matrix.
  Nnet nnet;
  ComputeCurrentNnet(&nnet);  // will be at the initial value of the neural net.

  { // This block does the multi-threaded computation.
    // The next line just initializes an "example" object.
    FisherComputationClass fc(nnet, nnets_, egs_,
                              config_.fisher_minibatch_size,
                              &F);

    // Setting num_threads to zero if config_.num_threads == 1
    // is a signal to the MultiThreader class to run without creating
    // any extra threads in this case; it helps support GPUs.
    int32 num_threads = config_.num_threads == 1 ? 0 : config_.num_threads;
    // The work gets done in the initializer and destructor of
    // the class below.
    MultiThreader<FisherComputationClass> m(num_threads, fc);
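    // Each worker thread operates on a copy of "fc" made with the copy
    // constructor above; as the copies are destroyed, each one adds its
    // local scatter_ into F (via AddSp in ~FisherComputationClass).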
  }

  // The scale of F is irrelevant but it might be quite
  // large at this point, so we just normalize it.
  KALDI_ASSERT(F.Trace() > 0);
  F.Scale(F.NumRows() / F.Trace());  // same scale as the unit matrix.
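  // After this, F has trace equal to its dimension, i.e. an average
  // diagonal element of 1.0, so the absolute floor applied below is on a
  // meaningful scale.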
  // Make zero diagonal elements of F non-zero.  This relates to updatable
  // components that have no effect, e.g. MixtureProbComponents that have
  // no real free parameters.
  KALDI_ASSERT(config_.fisher_floor > 0.0);
  for (int32 i = 0; i < F.NumRows(); i++)
    F(i, i) = std::max<BaseFloat>(F(i, i), config_.fisher_floor);
  // We next smooth the diagonal elements of F by a small amount.
  // This is mainly necessary in case the number of minibatches is
  // smaller than the dimension of F; we want to ensure F is full rank.
  for (int32 i = 0; i < F.NumRows(); i++)
    F(i, i) *= (1.0 + config_.alpha);

  C_.Resize(F.NumRows());
  C_.Cholesky(F);
  C_inv_ = C_;
  C_inv_.Invert();

  // Transform the params_ data-member to be in the preconditioned space.
  Vector<double> raw_params(params_);
  params_.AddTpVec(1.0, C_, kTrans, raw_params, 0.0);
}

// Note: we ignore the regularizer in selecting the best one.  It shouldn't
// really matter.
void FastNnetCombiner::GetInitialParams() {
  int32 initial_model = config_.initial_model,
      num_nnets = static_cast<int32>(nnets_.size());
  if (initial_model > num_nnets)
    initial_model = num_nnets;
  if (initial_model < 0)
    initial_model = GetInitialModel(egs_, nnets_);

  KALDI_ASSERT(initial_model >= 0 && initial_model <= num_nnets);
  int32 num_uc = nnets_[0].NumUpdatableComponents();

  Vector<double> raw_params(num_uc * num_nnets);  // parameters in
  // non-preconditioned space.
  if (initial_model < num_nnets) {
    KALDI_LOG << "Initializing with neural net with index " << initial_model;
    // At this point we're using the best of the individual neural nets.
    raw_params.Set(0.0);

    // Set the block of parameters corresponding to the "best" of the
    // source neural nets to 1.0.
    SubVector<double> best_block(raw_params, num_uc * initial_model, num_uc);
    best_block.Set(1.0);
  } else {  // initial_model == num_nnets
    KALDI_LOG << "Initializing with all neural nets averaged.";
    raw_params.Set(1.0 / num_nnets);
  }
  KALDI_ASSERT(C_.NumRows() == 0);  // Assume this is not set up yet.
  params_ = raw_params;  // this is in non-preconditioned space.
}


double FastNnetCombiner::ComputeObjfAndGradient(
    Vector<double> *gradient,
    double *regularizer_objf_ptr) {
  Nnet nnet;
  ComputeCurrentNnet(&nnet);  // compute it at the value "params_".

  Nnet nnet_gradient(nnet);
  bool is_gradient = true;
  nnet_gradient.SetZero(is_gradient);
  double tot_weight = 0.0;
  double objf = DoBackpropParallel(nnet, config_.minibatch_size,
                                   config_.num_threads, egs_, &tot_weight,
                                   &nnet_gradient) / egs_.size();
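  // Note: the objective is averaged over the number of examples here,
  // while the per-weight gradients below are normalized by tot_weight (the
  // total frame weighting returned by DoBackpropParallel).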

  // raw_gradient is the gradient in non-preconditioned space.
  Vector<double> raw_gradient(params_.Dim());

  double regularizer_objf = 0.0;  // sum of -0.5 * config_.regularizer * params-squared.
  int32 i = 0;  // index into raw_gradient
  int32 num_nnets = nnets_.size();
  for (int32 n = 0; n < num_nnets; n++) {
    for (int32 j = 0; j < nnet.NumComponents(); j++) {
      const UpdatableComponent *uc =
          dynamic_cast<const UpdatableComponent*>(&(nnets_[n].GetComponent(j))),
          *uc_gradient =
          dynamic_cast<const UpdatableComponent*>(&(nnet_gradient.GetComponent(j))),
          *uc_params =
          dynamic_cast<const UpdatableComponent*>(&(nnet.GetComponent(j)));
      if (uc != NULL) {
        double gradient = uc->DotProduct(*uc_gradient) / tot_weight;
        // "gradient" is the derivative of the objective function w.r.t. this
        // element of the parameters (i.e. this weight, which gets applied to
        // the j'th component of the n'th source neural net).
        if (config_.regularizer != 0.0) {
          gradient -= config_.regularizer * uc->DotProduct(*uc_params);
          if (n == 0)  // only add this once...
            regularizer_objf +=
                -0.5 * config_.regularizer * uc_params->DotProduct(*uc_params);
        }
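        // The extra term follows from regularizer_objf =
        // -0.5 * regularizer * ||params||^2: the combined parameters are
        // linear in this weight, so its derivative contributes
        // -regularizer * (params . component-params), the DotProduct term
        // subtracted above.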
        raw_gradient(i) = gradient;
        i++;
      }
    }
  }
  if (config_.regularizer != 0.0) {
    KALDI_VLOG(2) << "Objf is " << objf << " + regularizer " << regularizer_objf
                  << " = " << (objf + regularizer_objf) << ", raw gradient is "
                  << raw_gradient;
  } else {
    KALDI_VLOG(2) << "Objf is " << objf << ", raw gradient is " << raw_gradient;
  }
  KALDI_ASSERT(i == raw_gradient.Dim());
  // \hat{g} = C^{-1} g.
  gradient->AddTpVec(1.0, C_inv_, kNoTrans, raw_gradient, 0.0);
  *regularizer_objf_ptr = regularizer_objf;
  return objf + regularizer_objf;
}

void FastNnetCombiner::ComputeCurrentNnet(
    Nnet *dest, bool debug) {
  int32 num_nnets = nnets_.size();
  KALDI_ASSERT(num_nnets >= 1);
  KALDI_ASSERT(params_.Dim() == num_nnets * nnets_[0].NumUpdatableComponents());
  Vector<double> raw_params(params_.Dim());  // Weights in non-preconditioned
  // space: p = C^{-T} \hat{p}.  Here, raw_params is p and params_ is \hat{p}.

  if (C_inv_.NumRows() > 0)
    raw_params.AddTpVec(1.0, C_inv_, kTrans, params_, 0.0);
  else
    raw_params = params_;  // C is not set up yet: interpret params_ as raw parameters.

  if (debug) {
    Matrix<double> params_mat(num_nnets,
                              nnets_[0].NumUpdatableComponents());
    params_mat.CopyRowsFromVec(raw_params);
    KALDI_LOG << "Scale parameters are " << params_mat;
  }
  CombineNnets(raw_params, nnets_, dest);
}

// Returns an integer saying which model to use: either 0 ... num-models - 1
// for the best individual model, or (#models) for the average of all of them.
int32 FastNnetCombiner::GetInitialModel(
    const std::vector<NnetExample> &validation_set,
    const std::vector<Nnet> &nnets) const {
  int32 num_nnets = static_cast<int32>(nnets.size());
  KALDI_ASSERT(!nnets.empty());
  int32 best_n = -1;
  double best_objf = -std::numeric_limits<double>::infinity();
  Vector<double> objfs(nnets.size());
  for (int32 n = 0; n < num_nnets; n++) {
    double num_frames;
    double objf = ComputeNnetObjfParallel(nnets[n], config_.minibatch_size,
                                          config_.num_threads, validation_set,
                                          &num_frames);
    KALDI_ASSERT(num_frames != 0);
    objf /= num_frames;

    if (n == 0 || objf > best_objf) {
      best_objf = objf;
      best_n = n;
    }
    objfs(n) = objf;
  }
  KALDI_LOG << "Objective functions for the source neural nets are " << objfs;

  int32 num_uc = nnets[0].NumUpdatableComponents();

  if (num_nnets > 1) {  // Now try a version where all the neural nets have the
    // same weight.  Don't do this if num_nnets == 1 as
    // it would be a waste of time (identical to n == 0).
    Vector<double> scale_params(num_uc * num_nnets);
    scale_params.Set(1.0 / num_nnets);
    Nnet average_nnet;
    CombineNnets(scale_params, nnets, &average_nnet);
    double num_frames;
    double objf = ComputeNnetObjfParallel(average_nnet, config_.minibatch_size,
                                          config_.num_threads, validation_set,
                                          &num_frames);
    objf /= num_frames;
    KALDI_LOG << "Objf with all neural nets averaged is " << objf;
    if (objf > best_objf) {
      return num_nnets;
    } else {
      return best_n;
    }
  } else {
    return best_n;
  }
}

void CombineNnetsFast(const NnetCombineFastConfig &combine_config,
                      const std::vector<NnetExample> &validation_set,
                      const std::vector<Nnet> &nnets_in,
                      Nnet *nnet_out) {
  // Everything happens in the initializer.
  FastNnetCombiner combiner(combine_config,
                            validation_set,
                            nnets_in,
                            nnet_out);
}


} // namespace nnet2
} // namespace kaldi