33 using namespace kaldi;
40 const int32 kDeltaOrder = 2;
43 "Reads in wav file(s) and simulates online decoding.\n" 44 "Writes integerized-text and .ali files for WER computation. Utterance " 45 "segmentation is done on-the-fly.\n" 46 "Feature splicing/LDA transform is used, if the optional(last) argument " 48 "Otherwise delta/delta-delta(i.e. 2-nd order) features are produced.\n" 49 "Caution: the last few frames of the wav file may not be decoded properly.\n" 50 "Hence, don't use one wav file per utterance, but " 51 "rather use one wav file per show.\n\n" 52 "Usage: online-wav-gmm-decode-faster [options] wav-rspecifier model-in" 53 "fst-in word-symbol-table silence-phones transcript-wspecifier " 54 "alignments-wspecifier [lda-matrix-in]\n\n" 55 "Example: ./online-wav-gmm-decode-faster --rt-min=0.3 --rt-max=0.5 " 56 "--max-active=4000 --beam=12.0 --acoustic-scale=0.0769 " 57 "scp:wav.scp model HCLG.fst words.txt '1:2:3:4:5' ark,t:trans.txt ark,t:ali.txt";
60 int32 cmn_window = 600,
63 int32 right_context = 4, left_context = 4;
70 po.Register(
"left-context", &left_context,
"Number of frames of left context");
71 po.Register(
"right-context", &right_context,
"Number of frames of right context");
72 po.Register(
"acoustic-scale", &acoustic_scale,
73 "Scaling factor for acoustic likelihoods");
74 po.Register(
"cmn-window", &cmn_window,
75 "Number of feat. vectors used in the running average CMN calculation");
76 po.Register(
"min-cmn-window", &min_cmn_window,
77 "Minumum CMN window used at start of decoding (adds " 78 "latency only at start)");
79 po.Register(
"channel", &channel,
80 "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)");
82 if (po.NumArgs() != 7 && po.NumArgs() != 8) {
87 std::string wav_rspecifier = po.GetArg(1),
88 model_rspecifier = po.GetArg(2),
89 fst_rspecifier = po.GetArg(3),
90 word_syms_filename = po.GetArg(4),
91 silence_phones_str = po.GetArg(5),
92 words_wspecifier = po.GetArg(6),
93 alignment_wspecifier = po.GetArg(7),
94 lda_mat_rspecifier = po.GetOptArg(8);
96 std::vector<int32> silence_phones;
98 KALDI_ERR <<
"Invalid silence-phones string " << silence_phones_str;
99 if (silence_phones.empty())
106 if (lda_mat_rspecifier !=
"") {
108 Input ki(lda_mat_rspecifier, &binary_in);
109 lda_transform.
Read(ki.Stream(), binary_in);
116 Input ki(model_rspecifier, &binary);
117 trans_model.
Read(ki.Stream(), binary);
118 am_gmm.
Read(ki.Stream(), binary);
121 fst::SymbolTable *word_syms = NULL;
122 if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
123 KALDI_ERR <<
"Could not read symbol table from file " 124 << word_syms_filename;
136 int32 window_size = right_context + left_context + 1;
140 silence_phones, trans_model);
142 VectorFst<LatticeArc> out_fst;
143 for (; !reader.Done(); reader.Next()) {
144 std::string wav_key = reader.Key();
145 std::cerr <<
"File: " << wav_key << std::endl;
146 const WaveData &wav_data = reader.Value();
148 KALDI_ERR <<
"Sampling rates other than 16kHz are not supported!";
149 int32 num_chan = wav_data.
Data().
NumRows(), this_chan = channel;
156 KALDI_WARN <<
"Channel not specified but you have data with " 157 << num_chan <<
" channels; defaulting to zero";
159 if (this_chan >= num_chan) {
160 KALDI_WARN <<
"File with id " << wav_key <<
" has " 161 << num_chan <<
" channels but you specified channel " 162 << channel <<
", producing no output.";
168 Mfcc mfcc(mfcc_opts);
169 FeInput fe_input(&au_src, &mfcc,
170 frame_length*(wav_data.
SampFreq()/1000),
171 frame_shift*(wav_data.
SampFreq()/1000));
174 if (lda_mat_rspecifier !=
"") {
176 &cmn_input, lda_transform,
177 left_context, right_context);
180 opts.
order = kDeltaOrder;
190 int32 start_frame = 0;
191 bool partial_res =
false;
192 decoder.InitDecoding();
195 if (dstate & (decoder.kEndFeats | decoder.kEndUtt)) {
196 std::vector<int32> word_ids;
197 decoder.FinishTraceBack(&out_fst);
199 static_cast<vector<int32> *
>(0),
201 static_cast<LatticeArc::Weight*>(0));
205 decoder.GetBestPath(&out_fst);
206 std::vector<int32> tids;
210 static_cast<LatticeArc::Weight*>(0));
211 std::stringstream res_key;
212 res_key << wav_key <<
'_' << start_frame <<
'-' << decoder.frame();
213 if (!word_ids.empty())
214 words_writer.Write(res_key.str(), word_ids);
215 alignment_writer.Write(res_key.str(), tids);
216 if (dstate == decoder.kEndFeats)
218 start_frame = decoder.frame();
220 std::vector<int32> word_ids;
221 if (decoder.PartialTraceback(&out_fst)) {
223 static_cast<vector<int32> *
>(0),
225 static_cast<LatticeArc::Weight*>(0));
228 partial_res = (word_ids.size() > 0);
232 delete feat_transform;
237 }
catch(
const std::exception& e) {
238 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
void Register(OptionsItf *opts, bool full)
MfccOptions contains basic options for computing MFCC features.
For an extended explanation of the framework of which grammar-fsts are a part, please see Support for...
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g. 1:2:3) into a vector of integers.
A templated class for writing objects to an archive or script file; see The Table concept...
BaseFloat SampFreq() const
const Matrix< BaseFloat > & Data() const
bool GetLinearSymbolSequence(const Fst< Arc > &fst, std::vector< I > *isymbols_out, std::vector< I > *osymbols_out, typename Arc::Weight *tot_weight_out)
GetLinearSymbolSequence gets the symbol sequence from a linear FST.
void PrintPartialResult(const std::vector< int32 > &words, const fst::SymbolTable *word_syms, bool line_break)
void Read(std::istream &in, bool binary, bool add=false)
read from stream.
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
FrameExtractionOptions frame_opts
void Read(std::istream &is, bool binary)
A templated class for reading objects sequentially from an archive or script file; see The Table concept.
This class's purpose is to read in Wave files.
fst::Fst< fst::StdArc > * ReadDecodeGraph(const std::string &filename)
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
void Register(OptionsItf *opts)
This templated class is intended for offline feature extraction, i.e.
void Read(std::istream &in_stream, bool binary)