online-endpoint.cc
Go to the documentation of this file.
1 // online2/online-endpoint.cc
2 
3 // Copyright 2014 Johns Hopkins University (author: Daniel Povey)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABILITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
22 #include "decoder/grammar-fst.h"
23 
24 namespace kaldi {
25 
26 static bool RuleActivated(const OnlineEndpointRule &rule,
27  const std::string &rule_name,
28  BaseFloat trailing_silence,
29  BaseFloat relative_cost,
30  BaseFloat utterance_length) {
31  bool contains_nonsilence = (utterance_length > trailing_silence);
32 
33  bool ans = (contains_nonsilence || !rule.must_contain_nonsilence) &&
34  trailing_silence >= rule.min_trailing_silence &&
35  relative_cost <= rule.max_relative_cost &&
36  utterance_length >= rule.min_utterance_length;
37  if (ans) {
38  KALDI_VLOG(2) << "Endpointing rule " << rule_name << " activated: "
39  << (contains_nonsilence ? "true" : "false" ) << ','
40  << trailing_silence << ',' << relative_cost << ','
41  << utterance_length;
42  }
43  return ans;
44 }
45 
47  int32 num_frames_decoded,
48  int32 trailing_silence_frames,
49  BaseFloat frame_shift_in_seconds,
50  BaseFloat final_relative_cost) {
51  KALDI_ASSERT(num_frames_decoded >= trailing_silence_frames);
52 
53  BaseFloat utterance_length = num_frames_decoded * frame_shift_in_seconds,
54  trailing_silence = trailing_silence_frames * frame_shift_in_seconds;
55 
56  if (RuleActivated(config.rule1, "rule1",
57  trailing_silence, final_relative_cost, utterance_length))
58  return true;
59  if (RuleActivated(config.rule2, "rule2",
60  trailing_silence, final_relative_cost, utterance_length))
61  return true;
62  if (RuleActivated(config.rule3, "rule3",
63  trailing_silence, final_relative_cost, utterance_length))
64  return true;
65  if (RuleActivated(config.rule4, "rule4",
66  trailing_silence, final_relative_cost, utterance_length))
67  return true;
68  if (RuleActivated(config.rule5, "rule5",
69  trailing_silence, final_relative_cost, utterance_length))
70  return true;
71  return false;
72 }
73 
74 template <typename FST, typename DEC>
76  const std::string &silence_phones_str,
77  const DEC &decoder) {
78  std::vector<int32> silence_phones;
79  if (!SplitStringToIntegers(silence_phones_str, ":", false, &silence_phones))
80  KALDI_ERR << "Bad --silence-phones option in endpointing config: "
81  << silence_phones_str;
82  std::sort(silence_phones.begin(), silence_phones.end());
83  KALDI_ASSERT(IsSortedAndUniq(silence_phones) &&
84  "Duplicates in --silence-phones option in endpointing config");
85  KALDI_ASSERT(!silence_phones.empty() &&
86  "Endpointing requires nonempty --endpoint.silence-phones option");
87  ConstIntegerSet<int32> silence_set(silence_phones);
88 
89  bool use_final_probs = false;
90  typename DEC::BestPathIterator iter =
91  decoder.BestPathEnd(use_final_probs, NULL);
92  int32 num_silence_frames = 0;
93  while (!iter.Done()) { // we're going backwards in time from the most
94  // recently decoded frame...
95  LatticeArc arc;
96  iter = decoder.TraceBackBestPath(iter, &arc);
97  if (arc.ilabel != 0) {
98  int32 phone = tmodel.TransitionIdToPhone(arc.ilabel);
99  if (silence_set.count(phone) != 0) {
100  num_silence_frames++;
101  } else {
102  break; // stop counting as soon as we hit non-silence.
103  }
104  }
105  }
106  return num_silence_frames;
107 }
108 
109 template <typename FST>
111  const OnlineEndpointConfig &config,
112  const TransitionModel &tmodel,
113  BaseFloat frame_shift_in_seconds,
114  const LatticeFasterOnlineDecoderTpl<FST> &decoder) {
115  if (decoder.NumFramesDecoded() == 0) return false;
116 
117  BaseFloat final_relative_cost = decoder.FinalRelativeCost();
118 
119  int32 num_frames_decoded = decoder.NumFramesDecoded(),
120  trailing_silence_frames = TrailingSilenceLength<FST, LatticeFasterOnlineDecoderTpl<FST>>(tmodel,
121  config.silence_phones,
122  decoder);
123 
124  return EndpointDetected(config, num_frames_decoded, trailing_silence_frames,
125  frame_shift_in_seconds, final_relative_cost);
126 }
127 
128 template <typename FST>
130  const OnlineEndpointConfig &config,
131  const TransitionModel &tmodel,
132  BaseFloat frame_shift_in_seconds,
134  if (decoder.NumFramesDecoded() == 0) return false;
135 
136  BaseFloat final_relative_cost = decoder.FinalRelativeCost();
137 
138  int32 num_frames_decoded = decoder.NumFramesDecoded(),
139  trailing_silence_frames = TrailingSilenceLength<FST, LatticeIncrementalOnlineDecoderTpl<FST>>(tmodel,
140  config.silence_phones,
141  decoder);
142 
143  return EndpointDetected(config, num_frames_decoded, trailing_silence_frames,
144  frame_shift_in_seconds, final_relative_cost);
145 }
146 
147 
148 
149 // Instantiate EndpointDetected for the types we need.
150 // It will require TrailingSilenceLength so we don't have to instantiate that.
151 template
152 bool EndpointDetected<fst::Fst<fst::StdArc> >(
153  const OnlineEndpointConfig &config,
154  const TransitionModel &tmodel,
155  BaseFloat frame_shift_in_seconds,
157 
158 
159 template
160 bool EndpointDetected<fst::GrammarFst>(
161  const OnlineEndpointConfig &config,
162  const TransitionModel &tmodel,
163  BaseFloat frame_shift_in_seconds,
165 
166 template
167 bool EndpointDetected<fst::Fst<fst::StdArc> >(
168  const OnlineEndpointConfig &config,
169  const TransitionModel &tmodel,
170  BaseFloat frame_shift_in_seconds,
172 
173 
174 template
175 bool EndpointDetected<fst::GrammarFst>(
176  const OnlineEndpointConfig &config,
177  const TransitionModel &tmodel,
178  BaseFloat frame_shift_in_seconds,
180 
181 
182 
183 } // namespace kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
fst::ArcTpl< LatticeWeight > LatticeArc
Definition: kaldi-lattice.h:40
OnlineEndpointRule rule1
e.g.
LatticeIncrementalOnlineDecoderTpl is as LatticeIncrementalDecoderTpl but also supports an efficient ...
OnlineEndpointRule rule2
rule2 times out after 0.5 seconds of silence if we reached the final-state with good probability (rel...
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g.
Definition: text-utils.h:68
kaldi::int32 int32
bool EndpointDetected(const OnlineEndpointConfig &config, int32 num_frames_decoded, int32 trailing_silence_frames, BaseFloat frame_shift_in_seconds, BaseFloat final_relative_cost)
This function returns true if this set of endpointing rules thinks we should terminate decoding...
OnlineEndpointRule rule5
rule5 times out after the utterance is 20 seconds long, regardless of anything else.
OnlineEndpointRule rule4
rule4 times out after 2.0 seconds of silence after decoding something, even if we did not reach a fin...
static bool RuleActivated(const OnlineEndpointRule &rule, const std::string &rule_name, BaseFloat trailing_silence, BaseFloat relative_cost, BaseFloat utterance_length)
int32 TrailingSilenceLength(const TransitionModel &tmodel, const std::string &silence_phones_str, const DEC &decoder)
returns the number of frames of trailing silence in the best-path traceback (not using final-probs)...
#define KALDI_ERR
Definition: kaldi-error.h:147
LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also supports an efficient way to get...
BaseFloat FinalRelativeCost() const
FinalRelativeCost() serves the same purpose as ReachedFinal(), but gives more information.
int32 NumFramesDecoded() const
Returns the number of frames decoded so far.
BaseFloat FinalRelativeCost() const
FinalRelativeCost() serves the same purpose as ReachedFinal(), but gives more information.
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
This header contains a simple facility for endpointing, that should be used in conjunction with the "...
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
OnlineEndpointRule rule3
rule3 times out after 1.0 seconds of silence if we reached the final-state with OK probability (relat...
bool IsSortedAndUniq(const std::vector< T > &vec)
Returns true if the vector is sorted and contains each element only once.
Definition: stl-utils.h:63
int32 TransitionIdToPhone(int32 trans_id) const