69 using namespace kaldi;
76 signal(SIGPIPE, SIG_IGN);
79 const int32 kDeltaOrder = 2;
82 "Starts a TCP server that receives RAW audio and outputs aligned words.\n" 83 "A sample client can be found in: onlinebin/online-audio-client\n\n" 84 "Usage: online-audio-server-decode-faster [options] model-in " 85 "fst-in word-symbol-table silence-phones word_boundary_file tcp-port [lda-matrix-in]\n\n" 86 "example: online-audio-server-decode-faster --verbose=1 --rt-min=0.5 --rt-max=3.0 --max-active=6000\n" 87 "--beam=72.0 --acoustic-scale=0.0769 final.mdl graph/HCLG.fst graph/words.txt '1:2:3:4:5'\n" 88 "graph/word_boundary.int 5000 final.mat\n\n";
92 int32 cmn_window = 600, min_cmn_window = 100;
93 int32 right_context = 4, left_context = 4;
101 po.Register(
"left-context", &left_context,
102 "Number of frames of left context");
103 po.Register(
"right-context", &right_context,
104 "Number of frames of right context");
105 po.Register(
"acoustic-scale", &acoustic_scale,
106 "Scaling factor for acoustic likelihoods");
108 "cmn-window", &cmn_window,
109 "Number of feat. vectors used in the running average CMN calculation");
110 po.Register(
"min-cmn-window", &min_cmn_window,
111 "Minumum CMN window used at start of decoding (adds " 112 "latency only at start)");
113 po.Register(
"frame-shift", &frame_shift,
114 "Time in seconds between frames.\n");
120 if (po.NumArgs() < 6 || po.NumArgs() > 7) {
125 std::string model_rspecifier = po.GetArg(1), fst_rspecifier = po.GetArg(2),
126 word_syms_filename = po.GetArg(3), silence_phones_str = po.GetArg(4),
127 word_boundary_file = po.GetArg(5), lda_mat_rspecifier =
"";
129 if (po.NumArgs() == 7)
130 lda_mat_rspecifier = po.GetOptArg(7);
132 int32 port = strtol(po.GetArg(6).c_str(), 0, 10);
134 std::vector<int32> silence_phones;
136 KALDI_ERR <<
"Invalid silence-phones string " << silence_phones_str;
137 if (silence_phones.empty())
140 if (!tcp_server.
Listen(port))
143 std::cout <<
"Reading LDA matrix: " << lda_mat_rspecifier <<
"..." 146 if (lda_mat_rspecifier !=
"") {
148 Input ki(lda_mat_rspecifier, &binary_in);
149 lda_transform.
Read(ki.Stream(), binary_in);
152 std::cout <<
"Reading acoustic model: " << model_rspecifier <<
"..." 158 Input ki(model_rspecifier, &binary);
159 trans_model.
Read(ki.Stream(), binary);
160 am_gmm.
Read(ki.Stream(), binary);
163 std::cout <<
"Reading word list: " << word_syms_filename <<
"..." 165 fst::SymbolTable *word_syms = NULL;
166 if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
167 KALDI_ERR <<
"Could not read symbol table from file " 168 << word_syms_filename;
170 std::cout <<
"Reading word boundary file: " << word_boundary_file <<
"..." 174 std::cout <<
"Reading FST: " << fst_rspecifier <<
"..." << std::endl;
175 fst::Fst < fst::StdArc > *decode_fst =
ReadDecodeGraph(fst_rspecifier);
185 int32 window_size = right_context + left_context + 1;
192 VectorFst < LatticeArc > out_fst;
196 int32 client_socket = -1;
201 std::cout <<
"Client disconnected!" << std::endl;
204 client_socket = tcp_server.
Accept();
212 Mfcc mfcc(mfcc_opts);
213 FeInput fe_input(au_src, &mfcc, frame_length * (16000 / 1000),
214 mfcc_frame_shift * (16000 / 1000));
217 if (lda_mat_rspecifier !=
"") {
219 left_context, right_context);
222 opts.
order = kDeltaOrder;
230 acoustic_scale, &feature_matrix);
232 clock_t start = clock();
233 int32 decoder_offset = 0;
245 if (dstate & (decoder.kEndFeats | decoder.kEndUtt)) {
246 std::vector<int32> word_ids, times, lengths;
248 decoder.FinishTraceBack(&out_fst);
249 decoder.GetBestPath(&out_fst);
266 for (
size_t i = 0;
i < word_ids.size();
i++)
267 if (word_ids[
i] != 0)
272 float dur = (clock() - start) / (
float) CLOCKS_PER_SEC;
278 std::stringstream sstr;
279 sstr <<
"RESULT:NUM=" << words_num <<
",FORMAT=WSE,RECO-DUR=" << dur
280 <<
",INPUT-DUR=" << input_dur;
284 for (
size_t i = 0;
i < word_ids.size();
i++) {
285 if (word_ids[
i] == 0)
288 std::string word = word_syms->Find(word_ids[
i]);
295 std::stringstream wstr;
296 wstr << word <<
"," << start <<
"," << (start + len);
302 if (dstate == decoder.kEndFeats) {
307 decoder_offset = decoder.frame();
309 std::vector<int32> word_ids;
310 if (decoder.PartialTraceback(&out_fst)) {
313 static_cast<LatticeArc::Weight*>(0));
314 for (
size_t i = 0; i < word_ids.size(); i++) {
315 if (word_ids[i] != 0) {
317 "PARTIAL:" + word_syms->Find(word_ids[i]));
323 delete feat_transform;
326 std::cout <<
"Deinitizalizing..." << std::endl;
332 }
catch (
const std::exception& e) {
333 std::cerr << e.what();
size_t SamplesProcessed()
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
void Register(OptionsItf *opts, bool full)
MfccOptions contains basic options for computing MFCC features.
bool DeterminizeLatticePruned(const ExpandedFst< ArcTpl< Weight > > &ifst, double beam, MutableFst< ArcTpl< CompactLatticeWeightTpl< Weight, IntType > > > *ofst, DeterminizeLatticePrunedOptions opts)
For an extended explanation of the framework of which grammar-fsts are a part, please see Support for...
const float kFramesPerSecond
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g.
bool WordAlignLattice(const CompactLattice &lat, const TransitionModel &tmodel, const WordBoundaryInfo &info, int32 max_states, CompactLattice *lat_out)
Align lattice so that each arc has the transition-ids on it that correspond to the word that is on th...
bool GetLinearSymbolSequence(const Fst< Arc > &fst, std::vector< I > *isymbols_out, std::vector< I > *osymbols_out, typename Arc::Weight *tot_weight_out)
GetLinearSymbolSequence gets the symbol sequence from a linear FST.
void Read(std::istream &in, bool binary, bool add=false)
read from stream.
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
FrameExtractionOptions frame_opts
void Read(std::istream &is, bool binary)
void ConvertLattice(const ExpandedFst< ArcTpl< Weight > > &ifst, MutableFst< ArcTpl< CompactLatticeWeightTpl< Weight, Int > > > *ofst, bool invert)
Convert lattice from a normal FST to a CompactLattice FST.
void Register(OptionsItf *opts)
fst::VectorFst< LatticeArc > Lattice
fst::VectorFst< CompactLatticeArc > CompactLattice
bool WriteLine(int32 socket, std::string line)
fst::Fst< fst::StdArc > * ReadDecodeGraph(const std::string &filename)
void Register(OptionsItf *opts)
This templated class is intended for offline feature extraction, i.e.
void Read(std::istream &in_stream, bool binary)
bool CompactLatticeToWordAlignment(const CompactLattice &clat, std::vector< int32 > *words, std::vector< int32 > *begin_times, std::vector< int32 > *lengths)
This function takes a CompactLattice that should only contain a single linear sequence (e...