online-faster-decoder.h
Go to the documentation of this file.
1 // online/online-faster-decoder.h
2 
3 // Copyright 2012 Cisco Systems (author: Matthias Paulik)
4 
5 // Modifications to the original contribution by Cisco Systems made by:
6 // Vassil Panayotov
7 
8 // See ../../COPYING for clarification regarding multiple authors
9 //
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 //
14 // http://www.apache.org/licenses/LICENSE-2.0
15 //
16 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
18 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
19 // MERCHANTABLITY OR NON-INFRINGEMENT.
20 // See the Apache 2 License for the specific language governing permissions and
21 // limitations under the License.
22 
23 #ifndef KALDI_ONLINE_ONLINE_FASTER_DECODER_H_
24 #define KALDI_ONLINE_ONLINE_FASTER_DECODER_H_
25 
26 #include "util/stl-utils.h"
27 #include "decoder/faster-decoder.h"
28 #include "hmm/transition-model.h"
29 
30 namespace kaldi {
31 
32 // Extends the definition of FasterDecoder's options to include additional
33 // parameters. The meaning of the "beam" option is also redefined as
34 // the _maximum_ beam value allowed.
// Option fields added on top of the base decoder options. The value each one
// is initialized to is taken from the initializer list further down.
36  BaseFloat rt_min; // minimum decoding runtime factor (initialized to 0.7)
37  BaseFloat rt_max; // maximum decoding runtime factor (initialized to 0.75)
// NOTE(review): no opts->Register() call for batch_size is visible in the
// Register() lines shown in this listing - confirm whether that is intended.
38  int32 batch_size; // number of features decoded in one go (initialized to 27)
39  int32 inter_utt_sil; // minimum silence (#frames) to trigger end of utterance (initialized to 50)
// NOTE(review): max_utt_len_ carries a trailing underscore unlike the other
// option fields here; renaming it would break callers, so it is only flagged.
40  int32 max_utt_len_; // if the utterance is longer than this many frames, shorter silence is accepted as an utterance separator (initialized to 1500)
41  int32 update_interval; // beam update period in # of frames (initialized to 3)
42  BaseFloat beam_update; // rate of adjustment of the beam (initialized to 0.01)
43  BaseFloat max_beam_update; // maximum rate of beam adjustment (initialized to 0.05)
44 
// Initial values for the fields declared above.
46  rt_min(.7), rt_max(.75), batch_size(27),
47  inter_utt_sil(50), max_utt_len_(1500),
48  update_interval(3), beam_update(.01),
49  max_beam_update(0.05) {}
50 
// Registers the option fields of this struct with "opts": each call passes
// the option name, a pointer to the field that receives the value, and a
// help string.
// NOTE(review): "full" is not used by any statement visible in this listing,
// and the embedded numbering skips line 52 - check the real header for what
// sits there before relying on this listing.
51  void Register(OptionsItf *opts, bool full) {
53  opts->Register("rt-min", &rt_min,
54  "Approximate minimum decoding run time factor");
55  opts->Register("rt-max", &rt_max,
56  "Approximate maximum decoding run time factor");
57  opts->Register("update-interval", &update_interval,
58  "Beam update interval in frames");
59  opts->Register("beam-update", &beam_update, "Beam update rate");
60  opts->Register("max-beam-update", &max_beam_update, "Max beam update rate");
// NOTE(review): the help text below says "Maximum", whereas the comment on the
// inter_utt_sil field says "minimum silence (#frames) to trigger end of
// utterance" - the two wordings should be reconciled.
61  opts->Register("inter-utt-sil", &inter_utt_sil,
62  "Maximum # of silence frames to trigger new utterance");
63  opts->Register("max-utt-length", &max_utt_len_,
64  "If the utterance becomes longer than this number of frames, "
65  "shorter silence is acceptable as an utterance separator");
66  }
67 };
68 
70  public:
71  // Codes returned by Decode() to show the current state of the decoder.
// The three values are distinct powers of two (1, 2, 4).
72  enum DecodeState {
73  kEndFeats = 1, // No more scores are available from the Decodable
74  kEndUtt = 2, // End of utterance, caused by e.g. a sufficiently long silence
75  kEndBatch = 4 // End of batch - end of utterance not reached yet
76  };
77 
// Constructor.
// "fst" - passed on to the FasterDecoder base class together with "opts"
// "opts" - decoder options; also stored in opts_, and opts.beam becomes max_beam_
78  // "sil_phones" - the IDs of all silence phones
// "trans_model" - kept as a reference (trans_model_), so the referenced object
//                 must outlive this decoder
// effective_beam_ is a reference bound to the base class's config_.beam.
// The decoder starts in state kEndFeats with frame_ and utt_frames_ at 0.
// NOTE(review): immortal_tok_ and prev_immortal_tok_ do not appear in this
// initializer list, so they are not set by the constructor itself - confirm
// they are assigned before first use (presumably in ResetDecoder()).
79  OnlineFasterDecoder(const fst::Fst<fst::StdArc> &fst,
80  const OnlineFasterDecoderOpts &opts,
81  const std::vector<int32> &sil_phones,
82  const TransitionModel &trans_model)
83  : FasterDecoder(fst, opts), opts_(opts),
84  silence_set_(sil_phones), trans_model_(trans_model),
85  max_beam_(opts.beam), effective_beam_(FasterDecoder::config_.beam),
86  state_(kEndFeats), frame_(0), utt_frames_(0) {}
87 
// Runs decoding against "decodable" and returns one of the DecodeState codes
// describing the state the decoder is left in.
88  DecodeState Decode(DecodableInterface *decodable);
89 
90  // Makes a linear graph, by tracing back from the last "immortal" token
91  // to the previous one.
// NOTE(review): the meaning of the bool result is not stated in this header.
92  bool PartialTraceback(fst::MutableFst<LatticeArc> *out_fst);
93 
94  // Makes a linear graph, by tracing back from the best currently active token
95  // to the last immortal token. This method is meant to be invoked at the end
96  // of an utterance in order to get the last chunk of the hypothesis.
// NOTE(review): spelled "TraceBack" here but "Traceback" in PartialTraceback
// and TracebackNFrames; renaming would break callers, so it is only flagged.
97  void FinishTraceBack(fst::MutableFst<LatticeArc> *fst_out);
98 
99  // Returns "true" if the best current hypothesis ends with long enough silence
100  bool EndOfUtterance();
101 
// Returns frame_, i.e. the next frame to be processed.
// NOTE(review): this accessor only reads a member and could be declared const.
102  int32 frame() { return frame_; }
103 
104  private:
// NOTE(review): undocumented in this header; presumably re-initializes the
// decoder state, with "full" selecting how much is reset - confirm against
// the implementation file.
105  void ResetDecoder(bool full);
106 
107  // Returns a linear fst by tracing back the last N frames, beginning
108  // from the best current token. The result is written to "out_fst".
109  void TracebackNFrames(int32 nframes, fst::MutableFst<LatticeArc> *out_fst);
110 
111  // Makes a linear "lattice", by tracing back a path delimited by two tokens.
// The result is written to "out_fst"; the method is const.
112  void MakeLattice(const Token *start,
113  const Token *end,
114  fst::MutableFst<LatticeArc> *out_fst) const;
115 
116  // Searches for the last token, ancestor of all currently active tokens
117  void UpdateImmortalToken();
118 
120  const ConstIntegerSet<int32> silence_set_; // silence phones IDs (built from the constructor's "sil_phones")
121  const TransitionModel &trans_model_; // needed for trans-id -> phone conversion; reference, not owned
122  const BaseFloat max_beam_; // the maximum allowed beam (opts.beam at construction)
123  BaseFloat &effective_beam_; // the currently used beam; reference to FasterDecoder::config_.beam
124  DecodeState state_; // the current state of the decoder
125  int32 frame_; // the next frame to be processed
126  int32 utt_frames_; // # frames processed from the current utterance
127  Token *immortal_tok_; // latest "immortal" token, i.e. an ancestor of all currently active tokens
128  Token *prev_immortal_tok_; // presumably the immortal token found before immortal_tok_ - confirm in the implementation
130 };
131 
132 } // namespace kaldi
133 #endif // KALDI_ONLINE_ONLINE_FASTER_DECODER_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
DecodableInterface provides a link between the (acoustic-modeling and feature-processing) code and th...
Definition: decodable-itf.h:82
void Register(OptionsItf *opts, bool full)
For an extended explanation of the framework of which grammar-fsts are a part, please see Support for...
Definition: graph.dox:21
kaldi::int32 int32
#define KALDI_DISALLOW_COPY_AND_ASSIGN(type)
Definition: kaldi-utils.h:121
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
OnlineFasterDecoder(const fst::Fst< fst::StdArc > &fst, const OnlineFasterDecoderOpts &opts, const std::vector< int32 > &sil_phones, const TransitionModel &trans_model)
void Register(OptionsItf *opts, bool full)
const ConstIntegerSet< int32 > silence_set_
const TransitionModel & trans_model_
const OnlineFasterDecoderOpts opts_