fmpe.h
// transform/fmpe.h

// Copyright 2011-2012  Yanmin Qian  Johns Hopkins University (Author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_TRANSFORM_FMPE_H_
#define KALDI_TRANSFORM_FMPE_H_ 1

#include <vector>

#include "gmm/am-diag-gmm.h"
#include "gmm/mle-am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "hmm/posterior.h"

namespace kaldi {

struct FmpeOptions {
  // Probably the easiest place to start, to understand fMPE, is the paper
  // "Improvements to fMPE for discriminative training of features".
  // We simplify a few things here: we get rid of the "indirect
  // differential"; we add a linear transform after the high->low dimension
  // projection whose function is to "un-whiten" the transformed features
  // (i.e. to project from a nominally Gaussian-distributed space into our
  // actual feature space), which makes it unnecessary to take the per-dim
  // variance into account during the update phase of fMPE; and the update
  // equations are rather simpler than those described in the paper: we take
  // away some stuff, but add the capability to do l2 regularization during
  // the update phase.

  std::string context_expansion;  // This string describes the various
  // contexts... the easiest way to think of it is: we first generate the
  // high-dimensional features without context expansion, and we then append
  // the left and right frames, and also weighted averages of further-out
  // frames, as specified by this string. Suppose there are 1024 Gaussians
  // and the feature dimension is 40. In the simple way to describe it,
  // supposing there are 9 contexts (the central frame, the left and right
  // frames, and 6 averages of more distant frames), we generate the "offset
  // features" of dimension (1024 * 41), then add left and right temporal
  // context to the high-dim features so the dimension is (1024 * 41 * 9),
  // and then project down to 40, so we train a matrix of 40 x (1024 * 41 * 9).
  // As described in the paper, though, we reorganize the computation for
  // efficiency (it has to do with preserving sparsity), and we train a
  // matrix of dimension (40 * 9) x (1024 * 41). The (40 * 9) -> 40
  // transformation, which involves time as well, is dictated by these
  // contexts. A worked example follows.
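  // Worked example (an illustration, not code from this file): with 1024
  // Gaussians, feature dimension 40 and the 9 contexts above:
  //   offset-feature dim:      1024 * (40 + 1)  = 41984
  //   naively spliced dim:     41984 * 9        = 377856  (a 40 x 377856 projection)
  //   reorganized projection:  (40 * 9) x 41984 = 360 x 41984,
  // which is stored transposed here as projT_, i.e. 41984 x 360; compare
  // ProjectionTNumRows() and ProjectionTNumCols() in class Fmpe below.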

  // You probably won't want to mess with this "context_expansion" string.
  // The most important parameter to tune is the number of Gaussians in
  // the UBM. Typically this will be in the range 300 to 1000.

  BaseFloat post_scale;  // Scale on the posterior component of the high-dim
  // features (there is 1 of these for every [feat-dim] of the offset
  // features). Typically 5.0 -- this just gives a bit more emphasis to the
  // posteriors during training, like a faster learning rate.

  FmpeOptions(): context_expansion("0,1.0:-1,1.0:1,1.0:-2,0.5;-3,0.5:2,0.5;3,0.5:-4,0.5;-5,0.5:4,0.5;5,0.5:-6,0.333;-7,0.333;-8,0.333:6,0.333;7,0.333;8,0.333"),
                 post_scale(5.0) { }

  void Register(OptionsItf *opts) {
    opts->Register("post-scale", &post_scale, "Scaling constant on posterior "
                   "element of offset features, to give it a faster learning "
                   "rate.");
    opts->Register("context-expansion", &context_expansion, "Specifies the "
                   "temporal context-splicing of high-dimensional features.");
  }
  // We include Write and Read functions, since this
  // object is included as a member of the fMPE object.
  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary);
};

struct FmpeUpdateOptions {
  BaseFloat learning_rate;  // Learning rate constant: like the inverse of E
  // in the papers.
  BaseFloat l2_weight;  // Weight on the l2 regularization term.

  FmpeUpdateOptions(): learning_rate(0.1), l2_weight(100.0) { }

  void Register(OptionsItf *opts) {
    opts->Register("learning-rate", &learning_rate,
                   "Learning rate constant (like inverse of E in fMPE papers)");
    opts->Register("l2-weight", &l2_weight,
                   "Weight on l2 regularization term in objective function.");
  }
};
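
// A sketch of how these options enter the update, in the style of the fMPE
// papers (the exact formula is in fmpe.cc and may differ in detail): for
// each element of the projection, with accumulated positive part p and
// negative part n of the derivative, the step is roughly
//   delta = learning_rate * (p - n) / (p + n),
// with the l2_weight term additionally pulling the parameters towards zero.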

class Fmpe;

struct FmpeStats {
  FmpeStats() { }
  void Init(const Fmpe &fmpe);
  FmpeStats(const Fmpe &fmpe) { Init(fmpe); }

  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary, bool add = false);

  SubMatrix<BaseFloat> DerivPlus() const;
  SubMatrix<BaseFloat> DerivMinus() const;

  // Accumulates the quantities (stored in "checks") that DoChecks() later
  // uses to verify that the direct and indirect derivatives cancel as they
  // should.
  void AccumulateChecks(const MatrixBase<BaseFloat> &feats,
                        const MatrixBase<BaseFloat> &direct_deriv,
                        const MatrixBase<BaseFloat> &indirect_deriv);
  void DoChecks();  // Will check that stuff cancels. Just prints
  // messages for now.
 private:
  Matrix<BaseFloat> deriv;  // Contains the positive and negative parts of the
  // derivative separately, as sub-parts of the matrix, to ensure memory
  // locality.

  // "checks" is an 8 x fmpe.FeatDim() matrix that stores:
  //   (0-1) summed deriv from direct differential, +ve and -ve parts.
  //   (2-3) summed deriv from indirect differential, +ve and -ve parts.
  //   (4-5) (summed deriv from direct differential * features), +ve and -ve parts.
  //   (6-7) (summed deriv from indirect differential * features), +ve and -ve parts.
  Matrix<double> checks;  // Contains quantities we use to check that the
  // indirect and direct derivatives are canceling as they should.
};

class Fmpe {
 public:
  Fmpe() {}
  Fmpe(const DiagGmm &gmm, const FmpeOptions &config);

  int32 FeatDim() const { return gmm_.Dim(); }
  int32 NumGauss() const { return gmm_.NumGauss(); }
  int32 NumContexts() const { return static_cast<int32>(contexts_.size()); }

  // Note: these return the number of rows and columns in projT_,
  // which is the transpose of the high->intermediate dimensional
  // projection matrix. This is the dimension we want for the stats.
  int32 ProjectionTNumRows() const { return (FeatDim()+1) * NumGauss(); }
  int32 ProjectionTNumCols() const { return FeatDim() * NumContexts(); }

  // Computes the fMPE feature offsets and outputs them.
  // You can add feat_in to this afterwards, if you want.
  // Requires the Gaussian-selection info, which would normally
  // be computed by a separate program -- this consists of
  // lists of the top-scoring Gaussians for these features.
  void ComputeFeatures(const MatrixBase<BaseFloat> &feat_in,
                       const std::vector<std::vector<int32> > &gselect,
                       Matrix<BaseFloat> *feat_out) const;
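  // Example usage (a sketch; "fmpe", "feats" and "gselect" are illustrative
  // names, with gselect as produced by a Gaussian-selection program):
  //   Matrix<BaseFloat> offset;
  //   fmpe.ComputeFeatures(feats, gselect, &offset);
  //   offset.AddMat(1.0, feats);  // final fMPE features = original + offset.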

  // For training: computes the derivative w.r.t. the projection matrix
  // (we keep the positive and negative parts separately to help
  // set the learning rates).
  void AccStats(const MatrixBase<BaseFloat> &feat_in,
                const std::vector<std::vector<int32> > &gselect,
                const MatrixBase<BaseFloat> &direct_feat_deriv,
                const MatrixBase<BaseFloat> *indirect_feat_deriv,  // may be NULL
                FmpeStats *stats) const;

  // Note: the form on disk starts with the GMM; that way,
  // the gselect program can treat the fMPE object as if it
  // is a GMM.
  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary);

  // Returns the total objective-function improvement, based on a linear
  // assumption.
  BaseFloat Update(const FmpeUpdateOptions &config,
                   const FmpeStats &stats);

 private:
  void SetContexts(std::string context_str);
  void ComputeC();  // Computes the Cholesky factor C from the GMM.
  void ComputeStddevs();

  // Constructs the high-dim features and applies the main projection matrix
  // (stored transposed as projT_).
  void ApplyProjection(const MatrixBase<BaseFloat> &feat_in,
                       const std::vector<std::vector<int32> > &gselect,
                       MatrixBase<BaseFloat> *intermed_feat) const;

  // The same in reverse, for computing derivatives.
  void ApplyProjectionReverse(const MatrixBase<BaseFloat> &feat_in,
                              const std::vector<std::vector<int32> > &gselect,
                              const MatrixBase<BaseFloat> &intermed_feat_deriv,
                              MatrixBase<BaseFloat> *proj_deriv_plus,
                              MatrixBase<BaseFloat> *proj_deriv_minus) const;

  // Applies the temporal context splicing to the intermediate
  // features -- adds the result to feat_out, which at this point
  // will typically be zero.
  void ApplyContext(const MatrixBase<BaseFloat> &intermed_feat,
                    MatrixBase<BaseFloat> *feat_out) const;

  // As ApplyContext, but for back-propagating the derivative.
  // The result is added to intermed_feat_deriv, which at this point will
  // typically be zero.
  void ApplyContextReverse(const MatrixBase<BaseFloat> &feat_deriv,
                           MatrixBase<BaseFloat> *intermed_feat_deriv) const;

  // Multiplies the feature offsets by the Cholesky matrix C.
  void ApplyC(MatrixBase<BaseFloat> *feat_out, bool reverse = false) const;

  // For computing derivatives -- multiplies the derivatives by C^T,
  // which is the "reverse" of the forward pass of multiplying
  // by C (this is how derivatives behave).
  void ApplyCReverse(MatrixBase<BaseFloat> *deriv) const { ApplyC(deriv, true); }
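
  // Taken together, the forward computation in ComputeFeatures() can be read
  // in outline as: ApplyProjection() to get the intermediate features of
  // dimension FeatDim() * NumContexts(), ApplyContext() to collapse the
  // temporal contexts down to FeatDim(), then ApplyC() to map the offsets
  // into the actual feature space. The *Reverse() functions back-propagate
  // derivatives through the same pipeline in the opposite order.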

  DiagGmm gmm_;  // The GMM used to get posteriors.
  FmpeOptions config_;
  Matrix<BaseFloat> stddevs_;  // The standard deviations (square roots of the
  // variances) of the GMM -- computed to avoid taking a square root
  // in the fMPE computation. Derived variable -- not stored on disk.
  Matrix<BaseFloat> projT_;  // The transpose of the projection matrix;
  // this is of dimension
  // (NumGauss() * (FeatDim()+1)) x (FeatDim() * NumContexts()).

  TpMatrix<BaseFloat> C_;  // Cholesky factor of the variance Sigma of the
  // features around their mean (as estimated from the GMM)... applied
  // to the fMPE offset just before we add it to the features. This allows
  // us to simplify the fMPE update and not have to worry about
  // the features having non-unit variance, and what effect this should
  // have on the learning rate.

  // The following variable dictates how we use temporal context.
  // e.g. contexts_ = { { (0, 1.0) }, { (-1, 1.0) }, { (1, 1.0) },
  //                    { (-2, 0.5), (-3, 0.5) }, ... }
  std::vector<std::vector<std::pair<int32, BaseFloat> > > contexts_;

};

/// Computes derivatives of the likelihood of these states (weighted),
/// w.r.t. the feature values; used in fMPE training.
BaseFloat ComputeAmGmmFeatureDeriv(const AmDiagGmm &am_gmm,
                                   const TransitionModel &trans_model,
                                   const Posterior &posterior,
                                   const MatrixBase<BaseFloat> &features,
                                   Matrix<BaseFloat> *direct_deriv,
                                   const AccumAmDiagGmm *model_diff = NULL,
                                   Matrix<BaseFloat> *indirect_deriv = NULL);
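
// Example of how the pieces above fit together in one iteration of fMPE
// training (a sketch only; variable names are illustrative, and the real
// sequence lives in the fMPE command-line programs). Here "fmpe_feats" are
// the fMPE-transformed features the derivatives are taken against, while
// AccStats consumes the original features and the same gselect info:
//   FmpeStats stats(fmpe);
//   Matrix<BaseFloat> direct_deriv, indirect_deriv;
//   ComputeAmGmmFeatureDeriv(am_gmm, trans_model, post, fmpe_feats,
//                            &direct_deriv, &gmm_accs, &indirect_deriv);
//   fmpe.AccStats(feats, gselect, direct_deriv, &indirect_deriv, &stats);
//   // ... after accumulating over all utterances:
//   BaseFloat objf_impr = fmpe.Update(update_opts, stats);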


}  // End namespace kaldi


#endif  // KALDI_TRANSFORM_FMPE_H_