// NOTE(review): This is a garbled fragment of the interior of main() from a
// Kaldi online2 nnet3 incremental-lattice decoding binary.  The enclosing
// "int main(int argc, char *argv[]) { try {" header is not visible here, and
// many original lines are missing (the bare integers fused into each line —
// 79, 83, 86, ... — appear to be the original file's line numbers left over
// from text extraction; gaps between them mark missing code).  All tokens
// below are preserved verbatim; only comments are added.

// Bring Kaldi names into scope and alias the project 64-bit integer type.
79 using namespace kaldi;
83 typedef kaldi::int64 int64;
// Usage/help text (presumably passed to a ParseOptions constructor on a
// missing line): the tool reads wav files and simulates online nnet3
// decoding, with optional iVector adaptation and endpointing.
86 "Reads in wav file(s) and simulates online decoding with neural nets\n" 87 "(nnet3 setup), with optional iVector-based speaker adaptation and\n" 88 "optional endpointing. Note: some configuration values and inputs are\n" 89 "set via config files whose filenames are passed as options\n" 90 "The lattice determinization algorithm here can operate\n" 93 "Usage: online2-wav-nnet3-latgen-incremental [options] <nnet3-in> <fst-in> " 94 "<spk2utt-rspecifier> <wav-rspecifier> <lattice-wspecifier>\n" 95 "The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n" 96 "you want to decode utterance by utterance.\n";
// Command-line option variables (others, e.g. chunk_length_secs and
// online, are declared on lines not visible in this fragment).
100 std::string word_syms_rxfilename;
110 bool do_endpointing =
false;
// Register the command-line options with ParseOptions ("po").
113 po.Register(
"chunk-length", &chunk_length_secs,
114 "Length of chunk size in seconds, that we process. Set to <= 0 " 115 "to use all input in one chunk.");
116 po.Register(
"word-symbol-table", &word_syms_rxfilename,
117 "Symbol table for words [for debug output]");
118 po.Register(
"do-endpointing", &do_endpointing,
119 "If true, apply endpoint detection");
120 po.Register(
"online", &online,
121 "You can set this to false to disable online iVector estimation " 122 "and have all the data for each utterance used, even at " 123 "utterance start. This is useful where you just want the best " 124 "results and don't care about online operation. Setting this to " 125 "false has the same effect as setting " 126 "--use-most-recent-ivector=true and --greedy-ivector-extractor=true " 127 "in the file given to --ivector-extraction-config, and " 128 "--chunk-length=-1.");
// Trailing argument of another po.Register(...) call whose opening is on a
// missing line (original lines 131-139 are not visible).
130 "Number of threads used when initializing iVector extractor.");
// Exactly 5 positional args are required (usage is presumably printed and
// the program exits inside this if — the body is on missing lines).
140 if (po.NumArgs() != 5) {
145 std::string nnet3_rxfilename = po.GetArg(1),
146 fst_rxfilename = po.GetArg(2),
147 spk2utt_rspecifier = po.GetArg(3),
148 wav_rspecifier = po.GetArg(4),
149 clat_wspecifier = po.GetArg(5);
// When --online=false (the guarding condition is on a missing line): use the
// most-recent/greedy iVector and process all input as one chunk, matching
// the behavior described in the --online help string above.
154 feature_info.ivector_extractor_info.use_most_recent_ivector =
true;
155 feature_info.ivector_extractor_info.greedy_ivector_extractor =
true;
156 chunk_length_secs = -1.0;
// Read the transition model and the nnet3 acoustic model from one stream.
163 Input ki(nnet3_rxfilename, &binary);
164 trans_model.
Read(ki.Stream(), binary);
165 am_nnet.
Read(ki.Stream(), binary);
// Optionally load a word symbol table for debug output; fatal if the file
// was named but cannot be read.
180 fst::SymbolTable *word_syms = NULL;
181 if (word_syms_rxfilename !=
"")
182 if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename)))
183 KALDI_ERR <<
"Could not read symbol table from file " 184 << word_syms_rxfilename;
// Running totals across all utterances.
186 int32 num_done = 0, num_err = 0;
187 double tot_like = 0.0;
188 int64 num_frames = 0;
// Outer loop: one iteration per speaker; each speaker maps to a list of
// utterance ids (spk2utt table).
196 for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
197 std::string spk = spk2utt_reader.Key();
198 const std::vector<std::string> &uttlist = spk2utt_reader.Value();
// Trailing argument of a per-speaker construction (presumably the
// adaptation state, whose declaration is on a missing line).
200 feature_info.ivector_extractor_info);
// Inner loop: decode each utterance of this speaker in order, so iVector
// adaptation state can carry over between them.
201 for (
size_t i = 0;
i < uttlist.size();
i++) {
202 std::string utt = uttlist[
i];
// Skip utterances whose audio is absent (counted as errors on a
// missing line, presumably).
203 if (!wav_reader.HasKey(utt)) {
204 KALDI_WARN <<
"Did not find audio for utterance " << utt;
208 const WaveData &wave_data = wav_reader.Value(utt);
// Start this utterance's feature pipeline from the speaker's current
// adaptation state.
214 feature_pipeline.SetAdaptationState(adaptation_state);
// Fragments of the silence-weighting and decoder constructions (their
// opening lines are missing from this view).
218 feature_info.silence_weighting_config,
223 *decode_fst, &feature_pipeline);
// Convert the chunk length from seconds to samples; <= 0 means "all
// input in one chunk" per the --chunk-length help string.
228 if (chunk_length_secs > 0) {
229 chunk_length =
int32(samp_freq * chunk_length_secs);
230 if (chunk_length == 0) chunk_length = 1;
// else-branch (the "else" itself is on a missing line): one huge chunk.
232 chunk_length = std::numeric_limits<int32>::max();
235 int32 samp_offset = 0;
236 std::vector<std::pair<int32, BaseFloat> > delta_weights;
// Feed the waveform to the pipeline chunk by chunk, simulating
// real-time arrival of audio.
238 while (samp_offset < data.Dim()) {
239 int32 samp_remaining = data.Dim() - samp_offset;
240 int32 num_samp = chunk_length < samp_remaining ? chunk_length
244 feature_pipeline.AcceptWaveform(samp_freq, wave_part);
246 samp_offset += num_samp;
// Simulate real-time pacing: wait until this much audio "would have"
// been captured.
247 decoding_timer.WaitUntil(samp_offset / samp_freq);
248 if (samp_offset == data.Dim()) {
// No more input for this utterance; flush the feature pipeline.
250 feature_pipeline.InputFinished();
// Re-weight iVector stats using the decoder traceback so silence
// frames contribute less to speaker adaptation.
253 if (silence_weighting.Active() &&
254 feature_pipeline.IvectorFeature() != NULL) {
255 silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
256 silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(),
258 feature_pipeline.IvectorFeature()->UpdateFrameWeights(delta_weights);
// Advance decoding over whatever frames are now ready.
261 decoder.AdvanceDecoding();
// Optionally stop early when an endpoint is detected.
263 if (do_endpointing && decoder.EndpointDetected(endpoint_opts)) {
267 decoder.FinalizeDecoding();
269 bool use_final_probs =
true;
// Retrieve the (incrementally determinized) compact lattice over all
// decoded frames.
270 CompactLattice clat = decoder.GetLattice(decoder.NumFramesDecoded(),
// Trailing arguments of the diagnostics call (its opening is missing);
// accumulates frame count and total likelihood.
275 &num_frames, &tot_like);
277 decoding_timer.OutputStats(&timing_stats);
// Save this utterance's final adaptation state so the next utterance of
// the same speaker starts adapted.
281 feature_pipeline.GetAdaptationState(&adaptation_state);
288 clat_writer.Write(utt, clat);
289 KALDI_LOG <<
"Decoded utterance " << utt;
// Print timing statistics; "online" controls how they are interpreted.
293 timing_stats.
Print(online);
// Summary logging over all utterances.
295 KALDI_LOG <<
"Decoded " << num_done <<
" utterances, " 296 << num_err <<
" with errors.";
297 KALDI_LOG <<
"Overall likelihood per frame was " << (tot_like / num_frames)
298 <<
" per frame over " << num_frames <<
" frames.";
// Exit status 0 iff at least one utterance succeeded.
301 return (num_done != 0 ? 0 : 1);
302 }
// Catch-all for the try block opened in the (not visible) main() header.
catch(
const std::exception& e) {
303 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for...
void CollapseModel(const CollapseModelConfig &config, Nnet *nnet)
This function modifies the neural net for efficiency, in a way that is suitable to be done in test time...
class OnlineTimer is used to test real-time decoding algorithms and evaluate how long the decoding of...
This configuration class is to set up OnlineNnet2FeaturePipelineInfo, which in turn is the configurat...
int32 frame_subsampling_factor
Fst< StdArc > * ReadFstKaldiGeneric(std::string rxfilename, bool throw_on_err)
For an extended explanation of the framework of which grammar-fsts are a part, please see Support for...
void SetBatchnormTestMode(bool test_mode, Nnet *nnet)
This function affects only components of type BatchNormComponent.
A templated class for writing objects to an archive or script file; see The Table concept...
BaseFloat SampFreq() const
const Matrix< BaseFloat > & Data() const
const Nnet & GetNnet() const
void Register(OptionsItf *opts)
void GetDiagnosticsAndPrintOutput(const std::string &utt, const fst::SymbolTable *word_syms, const CompactLattice &clat, int64 *tot_num_frames, double *tot_like)
This class is responsible for storing configuration variables, objects and options for OnlineNnet2Fea...
void Read(std::istream &is, bool binary)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
void SetDropoutTestMode(bool test_mode, Nnet *nnet)
This function affects components of child-classes of RandomComponent.
std::vector< std::vector< double > > AcousticLatticeScale(double acwt)
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
void Print(bool online=true)
Here, if "online == false" we take into account that the setup was used in not-really-online mode whe...
void ScaleLattice(const std::vector< std::vector< ScaleFloat > > &scale, MutableFst< ArcTpl< Weight > > *fst)
Scales the pairs of weights in LatticeWeight or CompactLatticeWeight by viewing the pair (a...
void Read(std::istream &is, bool binary)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
void Register(OptionsItf *opts)
fst::VectorFst< CompactLatticeArc > CompactLattice
This class's purpose is to read in Wave files.
The normal decoder, lattice-faster-decoder.h, sometimes has an issue when doing real-time application...
void Register(OptionsItf *opts)
You will instantiate this class when you want to decode a single utterance using the online-decoding ...
OnlineNnet2FeaturePipeline is a class that's responsible for putting together the various parts of th...
class OnlineTimingStats stores statistics from timing of online decoding, which will enable the Print...
When you instantiate class DecodableNnetSimpleLooped, you should give it a const reference to this cl...
void Register(OptionsItf *opts)
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Config class for the CollapseModel function.