online-gmm-decoding.h
// online2/online-gmm-decoding.h

// Copyright 2014  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_ONLINE2_ONLINE_GMM_DECODING_H_
#define KALDI_ONLINE2_ONLINE_GMM_DECODING_H_

#include <string>
#include <vector>
#include <deque>

#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
#include "transform/basis-fmllr-diag-gmm.h"
#include "online2/online-feature-pipeline.h"
#include "online2/online-gmm-decodable.h"
#include "online2/online-endpoint.h"
#include "decoder/lattice-faster-online-decoder.h"
#include "hmm/transition-model.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/posterior.h"

namespace kaldi {
/// @addtogroup  onlinedecoding OnlineDecoding
/// @{


// This configuration class controls when to re-estimate the basis-fMLLR
// transform during online decoding; the schedule it defines is queried via
// DoAdapt() below.
struct OnlineGmmDecodingAdaptationPolicyConfig {
  // Delay (in seconds) before the first adaptation, and the ratio controlling
  // how often we re-adapt, for the first utterance of each speaker:
  BaseFloat adaptation_first_utt_delay;
  BaseFloat adaptation_first_utt_ratio;
  // The same two quantities for later utterances of each speaker:
  BaseFloat adaptation_delay;
  BaseFloat adaptation_ratio;

  OnlineGmmDecodingAdaptationPolicyConfig():
      adaptation_first_utt_delay(2.0),
      adaptation_first_utt_ratio(1.5),
      adaptation_delay(5.0),
      adaptation_ratio(2.0) { }

  void Register(OptionsItf *opts) {
    opts->Register("adaptation-first-utt-delay", &adaptation_first_utt_delay,
                   "Delay before first basis-fMLLR adaptation for first utterance "
                   "of each speaker");
    opts->Register("adaptation-first-utt-ratio", &adaptation_first_utt_ratio,
                   "Ratio that controls frequency of fMLLR adaptation for first "
                   "utterance of each speaker");
    opts->Register("adaptation-delay", &adaptation_delay,
                   "Delay before first basis-fMLLR adaptation for not-first "
                   "utterances of each speaker");
    opts->Register("adaptation-ratio", &adaptation_ratio,
                   "Ratio that controls frequency of fMLLR adaptation for "
                   "not-first utterances of each speaker");
  }

  /// Check that configuration values make sense.
  void Check() const;

  /// Returns true if we are scheduled to re-estimate fMLLR somewhere in the
  /// interval [ chunk_begin_secs, chunk_end_secs ).
  bool DoAdapt(BaseFloat chunk_begin_secs,
               BaseFloat chunk_end_secs,
               bool is_first_utterance) const;
};
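
// What follows is an illustrative sketch, not part of the original header: it
// shows how the adaptation policy above might be consulted from a
// chunk-by-chunk decoding loop.  The 0.5-second chunk size, the 20-second
// utterance length and the function name are assumptions made for the example.
inline void ExampleAdaptationSchedule(
    const OnlineGmmDecodingAdaptationPolicyConfig &policy) {
  policy.Check();  // Verify the configuration values make sense.
  const BaseFloat chunk_secs = 0.5;
  bool is_first_utterance = true;
  for (BaseFloat t = 0.0; t < 20.0; t += chunk_secs) {
    // DoAdapt() returns true if an adaptation is scheduled anywhere in the
    // half-open interval [t, t + chunk_secs).
    if (policy.DoAdapt(t, t + chunk_secs, is_first_utterance)) {
      // ... this is the point where a decoder would re-estimate the
      // basis-fMLLR transform (see SingleUtteranceGmmDecoder below) ...
    }
  }
}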


struct OnlineGmmDecodingConfig {
  BaseFloat fmllr_lattice_beam;

  BasisFmllrOptions basis_opts;  // options for basis-fMLLR adaptation.

  LatticeFasterDecoderConfig faster_decoder_opts;

  OnlineGmmDecodingAdaptationPolicyConfig adaptation_policy_opts;

  // rxfilename for model trained with online-CMN features
  // (only needed if different from model_rxfilename)
  std::string online_alimdl_rxfilename;
  // rxfilename for model used for estimating fMLLR transforms
  std::string model_rxfilename;
  // rxfilename for possible discriminatively trained model
  // (only needed if different from model_rxfilename)
  std::string rescore_model_rxfilename;
  // rxfilename for the BasisFmllrEstimate object containing the basis
  // used for basis-fMLLR.
  std::string fmllr_basis_rxfilename;

  BaseFloat acoustic_scale;

  std::string silence_phones;
  BaseFloat silence_weight;

  OnlineGmmDecodingConfig(): fmllr_lattice_beam(3.0), acoustic_scale(0.1),
                             silence_weight(0.1) { }

  void Register(OptionsItf *opts) {
    {  // Register basis_opts with a prefix; there are getting to be too many
       // options.
      ParseOptions basis_po("basis", opts);
      basis_opts.Register(&basis_po);
    }
    adaptation_policy_opts.Register(opts);
    faster_decoder_opts.Register(opts);
    opts->Register("acoustic-scale", &acoustic_scale,
                   "Scaling factor for acoustic likelihoods");
    opts->Register("silence-phones", &silence_phones,
                   "Colon-separated list of integer ids of silence phones, e.g. "
                   "1:2:3 (affects adaptation).");
    opts->Register("silence-weight", &silence_weight,
                   "Weight applied to silence frames for fMLLR estimation (if "
                   "--silence-phones option is supplied)");
    opts->Register("fmllr-lattice-beam", &fmllr_lattice_beam, "Beam used in "
                   "pruning lattices for fMLLR estimation");
    opts->Register("online-alignment-model", &online_alimdl_rxfilename,
                   "(Extended) filename for model trained with online CMN "
                   "features, e.g. from apply-cmvn-online.");
    opts->Register("model", &model_rxfilename, "(Extended) filename for model, "
                   "typically the one used for fMLLR computation. Required option.");
    opts->Register("rescore-model", &rescore_model_rxfilename, "(Extended) filename "
                   "for model to rescore lattices with, e.g. discriminatively trained "
                   "model, if it differs from that supplied to the --model option. "
                   "Must have the same tree.");
    opts->Register("fmllr-basis", &fmllr_basis_rxfilename, "(Extended) filename "
                   "of fMLLR basis object, as output by gmm-basis-fmllr-training");
  }
};
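
// Illustrative sketch, not part of the original header: registering this
// config with Kaldi's command-line parser, roughly the way the online2
// example programs do.  The usage string and the function name are
// assumptions made for the example.
inline OnlineGmmDecodingConfig ExampleParseConfig(int argc, char *argv[]) {
  const char *usage = "Example program.\nUsage: example [options]\n";
  ParseOptions po(usage);
  OnlineGmmDecodingConfig decode_config;
  // Note: the basis-fMLLR options appear with a "basis." prefix on the command
  // line, because of the prefixed ParseOptions used in Register() above.
  decode_config.Register(&po);
  po.Read(argc, argv);
  if (decode_config.model_rxfilename.empty())
    KALDI_ERR << "You must supply the --model option.";
  return decode_config;
}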


// This class is used to read, store and give access to the models used for the
// three phases of decoding: first-pass decoding with online-CMN features,
// estimation of fMLLR transforms, and lattice rescoring with a possibly
// discriminatively trained model.  If the later models are not supplied in the
// config, the earlier ones are used in their place; the rxfilenames are taken
// from OnlineGmmDecodingConfig.
class OnlineGmmDecodingModels {
 public:
  OnlineGmmDecodingModels(const OnlineGmmDecodingConfig &config);

  const TransitionModel &GetTransitionModel() const;

  const AmDiagGmm &GetOnlineAlignmentModel() const;

  const AmDiagGmm &GetModel() const;

  const AmDiagGmm &GetFinalModel() const;

  const BasisFmllrEstimate &GetFmllrBasis() const;

 private:
  // The transition-model is only needed for its integer ids, and these need to
  // be identical for all 3 models, so we only store one (it doesn't matter
  // which one).
  TransitionModel tmodel_;
  // The model trained with online-CMVN features
  // (if supplied, otherwise use model_)
  AmDiagGmm online_alignment_model_;
  // The ML-trained model used to get transforms (required)
  AmDiagGmm model_;
  // The discriminatively trained model
  // (if supplied, otherwise use model_)
  AmDiagGmm rescore_model_;
  // The following object contains the basis elements for
  // "Basis fMLLR".
  BasisFmllrEstimate fmllr_basis_;
};
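
// Illustrative sketch, not part of the original header: constructing the model
// bundle from a parsed config and doing a basic sanity check.  The assertion
// and the function name are assumptions made for the example.
inline void ExampleLoadModels(const OnlineGmmDecodingConfig &config) {
  OnlineGmmDecodingModels models(config);
  const TransitionModel &tmodel = models.GetTransitionModel();
  const AmDiagGmm &am = models.GetModel();
  // All the models are expected to share the same tree, and hence the same
  // number of pdfs as the transition model.
  KALDI_ASSERT(am.NumPdfs() == tmodel.NumPdfs());
}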


// Holds the adaptation state that is carried between utterances of the same
// speaker: the online-CMVN state, the speaker's accumulated fMLLR statistics,
// and the current fMLLR transform (empty if none has been estimated yet).
struct OnlineGmmAdaptationState {
  OnlineCmvnState cmvn_state;
  FmllrDiagGmmAccs spk_stats;
  Matrix<BaseFloat> transform;

  // Writing and reading of the state of the object
  void Write(std::ostream &out_stream, bool binary) const;
  void Read(std::istream &in_stream, bool binary);
};
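
// Illustrative sketch, not part of the original header: persisting the
// adaptation state between utterances with the Write()/Read() methods above,
// using Kaldi's Output/Input wrappers from util/kaldi-io.h.  The function
// names are assumptions made for the example.
inline void ExampleSaveAdaptationState(const OnlineGmmAdaptationState &state,
                                       const std::string &wxfilename) {
  bool binary = true;
  Output ko(wxfilename, binary);
  state.Write(ko.Stream(), binary);
}

inline void ExampleLoadAdaptationState(const std::string &rxfilename,
                                       OnlineGmmAdaptationState *state) {
  bool binary_in;
  Input ki(rxfilename, &binary_in);
  state->Read(ki.Stream(), binary_in);
}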


// You will instantiate this class when you want to decode a single utterance
// using the online GMM-based decoding setup.
class SingleUtteranceGmmDecoder {
 public:
  SingleUtteranceGmmDecoder(const OnlineGmmDecodingConfig &config,
                            const OnlineGmmDecodingModels &models,
                            const OnlineFeaturePipeline &feature_prototype,
                            const fst::Fst<fst::StdArc> &fst,
                            const OnlineGmmAdaptationState &adaptation_state);

  OnlineFeaturePipeline &FeaturePipeline() { return *feature_pipeline_; }

  /// Advance the decoding as far as we can.  May also estimate fMLLR after
  /// advancing the decoding, depending on the configuration values in
  /// config_.adaptation_policy_opts.  (We expect the user will also call
  /// EstimateFmllr() at utterance end, which should generally improve the
  /// quality of the estimated transforms, although we don't rely on this.)
  void AdvanceDecoding();

  /// Finalize the decoding: clean up and prune remaining tokens, so the
  /// final lattice is faster to obtain.
  void FinalizeDecoding();

  /// Returns true if we already have an fMLLR transform.  The user will
  /// already know this; the call is for convenience.
  bool HaveTransform() const;

  /// Estimate the [basis-]fMLLR transform and apply it to the features.
  /// The transform affects any decoding done after this call, and the lattice
  /// rescoring done inside GetLattice() if rescoring turns out to be needed.
  /// "end_of_utterance" affects how we interpret the final-probs in the
  /// lattice used to estimate the transform; it should generally be true if
  /// you think you have reached the end of the grammar, and false otherwise.
  void EstimateFmllr(bool end_of_utterance);

  void GetAdaptationState(OnlineGmmAdaptationState *adaptation_state) const;

  /// Gets the lattice.  If "rescore_if_needed" is true and there is any point
  /// in rescoring the state-level lattice (see RescoringIsNeeded()), it will
  /// rescore the lattice.  The output lattice has any acoustic scaling in it
  /// (which will typically be desirable in an online-decoding context); if you
  /// want an un-scaled lattice, scale it using ScaleLattice() with the inverse
  /// of the acoustic weight.  "end_of_utterance" should be true if you want
  /// the final-probs to be included.
  void GetLattice(bool rescore_if_needed,
                  bool end_of_utterance,
                  CompactLattice *clat) const;

  /// Outputs an FST corresponding to the single best path through the current
  /// lattice.  If "end_of_utterance" is true and we reached a final state of
  /// the graph, the final-probs are included; otherwise all final-probs are
  /// treated as one.
  void GetBestPath(bool end_of_utterance,
                   Lattice *best_path) const;

  /// Returns a number >= 0 that will be close to zero if the final-probs were
  /// close to the best path's costs, and infinity if there were no final-probs
  /// on the most recently decoded frame; it is a thin wrapper around the
  /// decoder's FinalRelativeCost().
  BaseFloat FinalRelativeCost() { return decoder_.FinalRelativeCost(); }


  /// This function calls EndpointDetected() from online-endpoint.h,
  /// with the required arguments.
  bool EndpointDetected(const OnlineEndpointConfig &config);

  ~SingleUtteranceGmmDecoder();
 private:
  bool GetGaussianPosteriors(bool end_of_utterance, GaussPost *gpost);

  /// Returns true if a lattice-rescoring pass would make a difference, i.e. if
  /// we have a rescoring model that differs from the model used for decoding,
  /// or if the fMLLR transform has changed since decoding started (so the
  /// features, and hence the likelihoods, would differ).
  bool RescoringIsNeeded() const;

  OnlineGmmDecodingConfig config_;
  std::vector<int32> silence_phones_;  // sorted, unique list of silence phones,
                                       // derived from config_
  const OnlineGmmDecodingModels &models_;
  OnlineFeaturePipeline *feature_pipeline_;  // owned here.
  const OnlineGmmAdaptationState &orig_adaptation_state_;
  // adaptation_state_ generally reflects the "current" state of the
  // adaptation.  Note: adaptation_state_.cmvn_state is just copied from
  // orig_adaptation_state_; the function GetAdaptationState() gets the CMVN
  // state from the feature pipeline.
  OnlineGmmAdaptationState adaptation_state_;
  LatticeFasterOnlineDecoder decoder_;
};
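
// Illustrative sketch, not part of the original header: decoding one utterance
// with the class above, roughly following the pattern of the online2 example
// programs.  The 0.5-second chunk size, the variable names and the function
// name are assumptions made for the example; error handling is omitted.
inline void ExampleDecodeUtterance(
    const OnlineGmmDecodingConfig &config,
    const OnlineGmmDecodingModels &models,
    const OnlineFeaturePipeline &pipeline_prototype,
    const fst::Fst<fst::StdArc> &decode_fst,
    const Vector<BaseFloat> &waveform,           // samples of the utterance
    BaseFloat samp_freq,
    OnlineGmmAdaptationState *adaptation_state,  // carried across utterances
    CompactLattice *clat) {
  SingleUtteranceGmmDecoder decoder(config, models, pipeline_prototype,
                                    decode_fst, *adaptation_state);
  // Feed the audio in chunks, the way an online application would.
  int32 chunk_length = static_cast<int32>(samp_freq * 0.5);
  for (int32 offset = 0; offset < waveform.Dim(); offset += chunk_length) {
    int32 num_samp = waveform.Dim() - offset;
    if (num_samp > chunk_length) num_samp = chunk_length;
    SubVector<BaseFloat> chunk(waveform, offset, num_samp);
    decoder.FeaturePipeline().AcceptWaveform(samp_freq, chunk);
    decoder.AdvanceDecoding();
  }
  decoder.FeaturePipeline().InputFinished();  // no more audio is coming.
  decoder.AdvanceDecoding();                  // decode the remaining frames.
  decoder.FinalizeDecoding();
  // Estimate the fMLLR transform on the whole utterance, then get the lattice,
  // rescoring it with the final model / new features if that would matter.
  bool end_of_utterance = true;
  decoder.EstimateFmllr(end_of_utterance);
  bool rescore_if_needed = true;
  decoder.GetLattice(rescore_if_needed, end_of_utterance, clat);
  // Export the adaptation state so the speaker's next utterance starts from
  // the updated CMVN statistics and fMLLR transform.
  decoder.GetAdaptationState(adaptation_state);
}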


/// @} End of "addtogroup onlinedecoding"

}  // namespace kaldi



#endif  // KALDI_ONLINE2_ONLINE_GMM_DECODING_H_