combine-nnet-fast.h
// nnet2/combine-nnet-fast.h

// Copyright 2012 Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_NNET2_COMBINE_NNET_FAST_H_
#define KALDI_NNET2_COMBINE_NNET_FAST_H_

#include "nnet2/nnet-update.h"
#include "nnet2/nnet-compute.h"
#include "util/parse-options.h"
#include "itf/options-itf.h"

// Compare with combine-nnet.h. What we're doing here is taking a set of
// neural nets and combining them with combination weights (a separate
// weight for each updatable layer of each net), then optimizing these
// weights on a validation set.
//
// This is a faster implementation than the one in combine-nnet.h, with
// multi-threading and more careful preconditioning. To get the
// preconditioning, we divide the validation subset into smallish batches
// (e.g. 100 frames) and compute the neural-net gradient for each one. We
// then compute the parameter gradient (i.e. the gradient w.r.t. the
// combination weights we're optimizing) for each batch, and use the
// scatter of these per-batch gradients as a kind of Fisher matrix for
// preconditioning.

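// To make the above concrete (a sketch, not code from this file): with input
// nets m = 0..M-1, updatable layers l = 0..L-1, and combination weights
// w(m, l), the combined net's parameters for layer l are
//
//   params_out[l] = sum_m w(m, l) * params_in[m][l],
//
// and the preconditioner is built from the per-batch gradients g_b of the
// validation objective w.r.t. w, roughly F = (1/B) sum_b g_b g_b',
// floored and smoothed as controlled by the options below.
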
namespace kaldi {
namespace nnet2 {

/** Configuration class that controls neural net combination, where we combine
    a number of neural nets by optimizing per-layer combination weights on a
    validation set. */
struct NnetCombineFastConfig {
  int32 initial_model; // If provided, the index of the initial model to start
                       // the optimization from.
  int32 num_lbfgs_iters; // Maximum number of function evaluations for L-BFGS.
  int32 num_threads; // Number of threads to use in multi-core computation.
  BaseFloat initial_impr; // Objective-function change we aim for on iteration 1.
  BaseFloat fisher_floor; // Flooring value we use for the Fisher matrix (mainly
                          // makes a difference in pnorm systems, where there
                          // are don't-care directions in parameter space).
  BaseFloat alpha; // A smoothing value we use in getting the Fisher matrix.
  int32 fisher_minibatch_size; // e.g. 64; a relatively small minibatch size we
  // use in the Fisher matrix computation (smaller will generally mean more
  // accurate preconditioning but will slow down the computation).
  int32 minibatch_size; // e.g. 1024; a larger minibatch size we use in
                        // the gradient computation.
  int32 max_lbfgs_dim; // Maximum dimension to use in L-BFGS.
  BaseFloat regularizer; // Coefficient of the -0.5 * regularizer * ||params||^2
                         // term added to the objective function.

  NnetCombineFastConfig(): initial_model(-1), num_lbfgs_iters(10),
                           num_threads(1), initial_impr(0.01), fisher_floor(1.0e-20),
                           alpha(0.01), fisher_minibatch_size(64), minibatch_size(1024),
                           max_lbfgs_dim(10), regularizer(0.0) {}

  void Register(OptionsItf *opts) {
    opts->Register("initial-model", &initial_model, "Specifies where to start the "
                   "optimization from. If 0 ... #models-1, then specifies the model; "
                   "if >= #models, then the average of all inputs; if <0, chosen "
                   "automatically from the previous options.");
    opts->Register("num-lbfgs-iters", &num_lbfgs_iters, "Maximum number of function "
                   "evaluations for L-BFGS to use when optimizing combination weights");
    opts->Register("initial-impr", &initial_impr, "Amount of objective-function change "
                   "we aim for on the first iteration.");
    opts->Register("num-threads", &num_threads, "Number of threads to use in "
                   "multi-core computation");
    opts->Register("fisher-floor", &fisher_floor,
                   "Floor for diagonal of Fisher matrix (used in preconditioning)");
    opts->Register("alpha", &alpha, "Value we use in smoothing the Fisher matrix "
                   "with its diagonal, in preconditioning the update.");
    opts->Register("fisher-minibatch-size", &fisher_minibatch_size, "Size of minibatch "
                   "used in computation of Fisher matrix (smaller -> better "
                   "preconditioning)");
    opts->Register("minibatch-size", &minibatch_size, "Minibatch size used in computing "
                   "gradients (only affects speed)");
    opts->Register("max-lbfgs-dim", &max_lbfgs_dim, "Maximum dimension to use in "
                   "L-BFGS (will not get higher than this even if the dimension "
                   "of the space gets higher).");
    opts->Register("regularizer", &regularizer, "Add to the objective "
                   "function (which is average log-likelihood per frame), -0.5 * "
                   "regularizer * (squared norm of the parameters).");
  }
};
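
// These options surface as command-line flags in the usual Kaldi style; a
// hypothetical invocation of the corresponding binary (the real wrapper
// lives in nnet2bin/nnet-combine-fast.cc) might look like:
//
//   nnet-combine-fast --num-threads=4 --num-lbfgs-iters=10 \
//     1.nnet 2.nnet 3.nnet ark:valid.egs combined.nnet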

void CombineNnetsFast(const NnetCombineFastConfig &combine_config,
                      const std::vector<NnetExample> &validation_set,
                      const std::vector<Nnet> &nnets_in,
                      Nnet *nnet_out);


} // namespace nnet2
} // namespace kaldi

#endif  // KALDI_NNET2_COMBINE_NNET_FAST_H_
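
A minimal usage sketch (the driver function below is hypothetical; only the
CombineNnetsFast call and the config fields are taken from the header above):

#include <vector>
#include "nnet2/combine-nnet-fast.h"

// Combine several nnet2 models into one, optimizing per-layer combination
// weights on the given validation examples.
void CombineModels(const std::vector<kaldi::nnet2::Nnet> &nnets_in,
                   const std::vector<kaldi::nnet2::NnetExample> &validation_set,
                   kaldi::nnet2::Nnet *nnet_out) {
  kaldi::nnet2::NnetCombineFastConfig config;
  config.num_threads = 4;     // parallelize the gradient computation
  config.initial_model = -1;  // < 0 means: starting point chosen automatically
  kaldi::nnet2::CombineNnetsFast(config, validation_set, nnets_in, nnet_out);
}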