doc/online2-wav-nnet2-latgen-faster_8cc_source.html

 // online2bin/online2-wav-nnet2-latgen-faster.cc

 // Copyright 2014  Johns Hopkins University (author: Daniel Povey)

 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //  http://www.apache.org/licenses/LICENSE-2.0
 //
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.

 #include "feat/wave-reader.h"
 #include "online2/online-nnet2-decoding.h"
 #include "online2/online-nnet2-feature-pipeline.h"
 #include "online2/onlinebin-util.h"
 #include "online2/online-timing.h"
 #include "online2/online-endpoint.h"
 #include "fstext/fstext-lib.h"
 #include "lat/lattice-functions.h"
 #include "util/kaldi-thread.h"

 namespace kaldi {

 // Accumulates best-path statistics for one decoded utterance and, when a
 // word symbol table is supplied, prints the best-path transcript to stderr.
 //
 //  utt             utterance id, used in log messages and as the transcript
 //                  prefix.
 //  word_syms       symbol table used to map word ids to strings; may be NULL,
 //                  in which case no transcript is printed.
 //  clat            lattice produced by the decoder for this utterance.
 //  tot_num_frames  running total; incremented by the best-path alignment
 //                  length.
 //  tot_like        running total; incremented by the negated sum of the two
 //                  components of the best-path weight.
 //
 // A lattice with no states only triggers a warning; the totals are left
 // untouched.  A word id missing from the symbol table is reported through
 // KALDI_ERR.
 void GetDiagnosticsAndPrintOutput(const std::string &utt,
                                   const fst::SymbolTable *word_syms,
                                   const CompactLattice &clat,
                                   int64 *tot_num_frames,
                                   double *tot_like) {
   if (clat.NumStates() == 0) {
     KALDI_WARN << "Empty lattice.";
     return;
   }

   // Reduce the lattice to its single best path, then convert that path to a
   // regular Lattice so the symbol sequences can be read off it.
   CompactLattice shortest_clat;
   CompactLatticeShortestPath(clat, &shortest_clat);
   Lattice shortest_lat;
   ConvertLattice(shortest_clat, &shortest_lat);

   std::vector<int32> ali;
   std::vector<int32> word_ids;
   LatticeWeight path_weight;
   GetLinearSymbolSequence(shortest_lat, &ali, &word_ids, &path_weight);

   // One alignment entry per frame; the weight components are costs, so the
   // likelihood is their negated sum.
   const int32 frame_count = ali.size();
   const double path_like = -(path_weight.Value1() + path_weight.Value2());
   *tot_num_frames += frame_count;
   *tot_like += path_like;
   KALDI_VLOG(2) << "Likelihood per frame for utterance " << utt << " is "
                 << (path_like / frame_count) << " over " << frame_count
                 << " frames.";

   // Without a symbol table there is nothing more to print.
   if (word_syms == NULL)
     return;

   std::cerr << utt << ' ';
   for (std::vector<int32>::const_iterator it = word_ids.begin();
        it != word_ids.end(); ++it) {
     const std::string word = word_syms->Find(*it);
     if (word.empty())
       KALDI_ERR << "Word-id " << *it << " not in symbol table.";
     std::cerr << word << ' ';
   }
   std::cerr << std::endl;
 }

 }

 // Entry point.  Reads an nnet2 acoustic model and a decoding graph, then for
 // every speaker in the spk2utt table decodes each of that speaker's
 // utterances chunk by chunk and writes one compact lattice per utterance.
 //
 // Expects exactly five positional arguments (see the usage string below):
 //   <nnet2-in> <fst-in> <spk2utt-rspecifier> <wav-rspecifier>
 //   <lattice-wspecifier>
 //
 // Returns 0 if at least one utterance was decoded, 1 if none were decoded or
 // the number of positional arguments is wrong, and -1 if a std::exception
 // was caught.
 int main(int argc, char *argv[]) {
   try {
     using namespace kaldi;
     using namespace fst;

     typedef kaldi::int32 int32;
     typedef kaldi::int64 int64;

     const char *usage =
         "Reads in wav file(s) and simulates online decoding with neural nets\n"
         "(nnet2 setup), with optional iVector-based speaker adaptation and\n"
         "optional endpointing.  Note: some configuration values and inputs are\n"
         "set via config files whose filenames are passed as options\n"
         "\n"
         "Usage: online2-wav-nnet2-latgen-faster [options] <nnet2-in> <fst-in> "
         "<spk2utt-rspecifier> <wav-rspecifier> <lattice-wspecifier>\n"
         "The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n"
         "you want to decode utterance by utterance.\n"
         "See egs/rm/s5/local/run_online_decoding_nnet2.sh for example\n"
         "See also online2-wav-nnet2-latgen-threaded\n";

     ParseOptions po(usage);

     // Optional; when left empty no transcript is printed to stderr.
     std::string word_syms_rxfilename;

     OnlineEndpointConfig endpoint_config;

     // feature_config includes configuration for the iVector adaptation,
     // as well as the basic features.
     OnlineNnet2FeaturePipelineConfig feature_config;
     OnlineNnet2DecodingConfig nnet2_decoding_config;

     // Defaults for the options registered directly below.
     BaseFloat chunk_length_secs = 0.05;
     bool do_endpointing = false;
     bool online = true;

     po.Register("chunk-length", &chunk_length_secs,
                 "Length of chunk size in seconds, that we process.  Set to <= 0 "
                 "to use all input in one chunk.");
     po.Register("word-symbol-table", &word_syms_rxfilename,
                 "Symbol table for words [for debug output]");
     po.Register("do-endpointing", &do_endpointing,
                 "If true, apply endpoint detection");
     po.Register("online", &online,
                 "You can set this to false to disable online iVector estimation "
                 "and have all the data for each utterance used, even at "
                 "utterance start.  This is useful where you just want the best "
                 "results and don't care about online operation.  Setting this to "
                 "false has the same effect as setting "
                 "--use-most-recent-ivector=true and --greedy-ivector-extractor=true "
                 "in the file given to --ivector-extraction-config, and "
                 "--chunk-length=-1.");
     po.Register("num-threads-startup", &g_num_threads,
                 "Number of threads used when initializing iVector extractor.");

     // Each config object registers its own options on the same parser.
     feature_config.Register(&po);
     nnet2_decoding_config.Register(&po);
     endpoint_config.Register(&po);

     po.Read(argc, argv);

     if (po.NumArgs() != 5) {
       po.PrintUsage();
       return 1;
     }

     std::string nnet2_rxfilename = po.GetArg(1),
         fst_rxfilename = po.GetArg(2),
         spk2utt_rspecifier = po.GetArg(3),
         wav_rspecifier = po.GetArg(4),
         clat_wspecifier = po.GetArg(5);

     OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
     // --online=false overrides the iVector extraction settings and forces
     // each utterance to be processed as a single chunk (see the option's
     // help text above).
     if (!online) {
       feature_info.ivector_extractor_info.use_most_recent_ivector = true;
       feature_info.ivector_extractor_info.greedy_ivector_extractor = true;
       chunk_length_secs = -1.0;
     }

     // Global CMVN stats are only read when a filename was configured;
     // otherwise the matrix is left as default-constructed.
     Matrix<double> global_cmvn_stats;
     if (feature_info.global_cmvn_stats_rxfilename != "")
       ReadKaldiObject(feature_info.global_cmvn_stats_rxfilename,
                       &global_cmvn_stats);

     // The transition model and the neural net are read, in that order, from
     // the same input stream.
     TransitionModel trans_model;
     nnet2::AmNnet nnet;
     {
       bool binary;
       Input ki(nnet2_rxfilename, &binary);
       trans_model.Read(ki.Stream(), binary);
       nnet.Read(ki.Stream(), binary);
     }

     // Raw owning pointer; released by the delete at the end of the try block.
     fst::Fst<fst::StdArc> *decode_fst = ReadFstKaldiGeneric(fst_rxfilename);

     // Stays NULL when no --word-symbol-table was given.
     fst::SymbolTable *word_syms = NULL;
     if (word_syms_rxfilename != "")
       if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename)))
         KALDI_ERR << "Could not read symbol table from file "
                   << word_syms_rxfilename;

     // Totals accumulated over all utterances for the final summary.
     int32 num_done = 0, num_err = 0;
     double tot_like = 0.0;
     int64 num_frames = 0;

     SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
     RandomAccessTableReader<WaveHolder> wav_reader(wav_rspecifier);
     CompactLatticeWriter clat_writer(clat_wspecifier);

     OnlineTimingStats timing_stats;

     // Outer loop: one iteration per speaker.
     for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
       std::string spk = spk2utt_reader.Key();
       const std::vector<std::string> &uttlist = spk2utt_reader.Value();

       // Created fresh for each speaker and carried from one utterance of
       // that speaker to the next (they are updated after every utterance
       // further down).
       OnlineIvectorExtractorAdaptationState adaptation_state(
           feature_info.ivector_extractor_info);
       OnlineCmvnState cmvn_state(global_cmvn_stats);

       // Inner loop: one iteration per utterance of the current speaker.
       for (size_t i = 0; i < uttlist.size(); i++) {
         std::string utt = uttlist[i];
         // Missing audio counts as an error but does not stop the run.
         if (!wav_reader.HasKey(utt)) {
           KALDI_WARN << "Did not find audio for utterance " << utt;
           num_err++;
           continue;
         }
         const WaveData &wave_data = wav_reader.Value(utt);
         // get the data for channel zero (if the signal is not mono, we only
         // take the first channel).
         SubVector<BaseFloat> data(wave_data.Data(), 0);

         // Per-utterance pipeline, seeded with the state accumulated from the
         // speaker's earlier utterances.
         OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
         feature_pipeline.SetAdaptationState(adaptation_state);
         feature_pipeline.SetCmvnState(cmvn_state);

         OnlineSilenceWeighting silence_weighting(
             trans_model,
             feature_info.silence_weighting_config);

         SingleUtteranceNnet2Decoder decoder(nnet2_decoding_config,
                                             trans_model,
                                             nnet,
                                             *decode_fst,
                                             &feature_pipeline);
         OnlineTimer decoding_timer(utt);

         // Convert the chunk length from seconds to samples.  A positive
         // setting always yields at least one sample so the loop below makes
         // progress; a non-positive setting selects a chunk so large that the
         // whole utterance is consumed in one pass.
         BaseFloat samp_freq = wave_data.SampFreq();
         int32 chunk_length;
         if (chunk_length_secs > 0) {
           chunk_length = int32(samp_freq * chunk_length_secs);
           if (chunk_length == 0) chunk_length = 1;
         } else {
           chunk_length = std::numeric_limits<int32>::max();
         }

         int32 samp_offset = 0;
         std::vector<std::pair<int32, BaseFloat> > delta_weights;

         // Feed the waveform to the pipeline one chunk at a time.
         while (samp_offset < data.Dim()) {
           // The last chunk may be shorter than chunk_length.
           int32 samp_remaining = data.Dim() - samp_offset;
           int32 num_samp = chunk_length < samp_remaining ? chunk_length
                                                          : samp_remaining;

           SubVector<BaseFloat> wave_part(data, samp_offset, num_samp);
           feature_pipeline.AcceptWaveform(samp_freq, wave_part);

           samp_offset += num_samp;
           // Tell the timer how many seconds of audio have been consumed so
           // far.
           decoding_timer.WaitUntil(samp_offset / samp_freq);
           if (samp_offset == data.Dim()) {
             // no more input. flush out last frames
             feature_pipeline.InputFinished();
           }

           // When silence weighting is active and the pipeline has an iVector
           // feature, derive frame-weight updates from the decoder's current
           // traceback and hand them to the iVector feature before decoding
           // advances.
           if (silence_weighting.Active() &&
               feature_pipeline.IvectorFeature() != NULL) {
             silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
             silence_weighting.GetDeltaWeights(
                 feature_pipeline.IvectorFeature()->NumFramesReady(),
                 &delta_weights);
             feature_pipeline.IvectorFeature()->UpdateFrameWeights(
                 delta_weights);
           }

           decoder.AdvanceDecoding();

           // With endpointing enabled, stop at the first detected endpoint;
           // any remaining samples of this utterance are not fed in.
           if (do_endpointing && decoder.EndpointDetected(endpoint_config))
             break;
         }
         decoder.FinalizeDecoding();

         CompactLattice clat;
         bool end_of_utterance = true;
         decoder.GetLattice(end_of_utterance, &clat);

         // Adds this utterance's best-path frame count and likelihood to the
         // running totals.
         GetDiagnosticsAndPrintOutput(utt, word_syms, clat,
                                      &num_frames, &tot_like);

         decoding_timer.OutputStats(&timing_stats);

         // In an application you might avoid updating the adaptation state if
         // you felt the utterance had low confidence.  See lat/confidence.h
         feature_pipeline.GetAdaptationState(&adaptation_state);
         feature_pipeline.GetCmvnState(&cmvn_state);

         // we want to output the lattice with un-scaled acoustics.
         // NOTE(review): assumes acoustic_scale is non-zero -- confirm that
         // the option is validated elsewhere.
         BaseFloat inv_acoustic_scale =
             1.0 / nnet2_decoding_config.decodable_opts.acoustic_scale;
         ScaleLattice(AcousticLatticeScale(inv_acoustic_scale), &clat);

         clat_writer.Write(utt, clat);
         KALDI_LOG << "Decoded utterance " << utt;
         num_done++;
       }
     }
     timing_stats.Print(online);

     KALDI_LOG << "Decoded " << num_done << " utterances, "
               << num_err << " with errors.";
     // NOTE(review): when no frames were decoded num_frames is 0, so this is
     // a floating-point division by zero and the logged value is not
     // meaningful.
     KALDI_LOG << "Overall likelihood per frame was " << (tot_like / num_frames)
               << " per frame over " << num_frames << " frames.";
     // NOTE(review): these deletes are skipped if an exception is thrown
     // above; the process returns from main right afterwards in that case.
     delete decode_fst;
     delete word_syms; // will delete if non-NULL.
     return (num_done != 0 ? 0 : 1);
   } catch(const std::exception& e) {
     // Any std::exception escaping the body is reported on stderr and mapped
     // to a -1 return value.
     std::cerr << e.what();
     return -1;
   }
 } // main()
words
int32 words[kMaxOrder]
Definition: arpa-file-parser-test.cc:43

kaldi::OnlineNnet2DecodingConfig::decodable_opts
nnet2::DecodableNnet2OnlineOptions decodable_opts
Definition: online-nnet2-decoding.h:53

fstext-lib.h

kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20

kaldi::OnlineTimer
class OnlineTimer is used to test real-time decoding algorithms and evaluate how long the decoding of...
Definition: online-timing.h:88

kaldi::OnlineNnet2FeaturePipelineConfig
This configuration class is to set up OnlineNnet2FeaturePipelineInfo, which in turn is the configurat...
Definition: online-nnet2-feature-pipeline.h:69

main
int main(int argc, char *argv[])
Definition: online2-wav-nnet2-latgen-faster.cc:75

kaldi::SingleUtteranceNnet2Decoder::FinalizeDecoding
void FinalizeDecoding()
Finalizes the decoding.
Definition: online-nnet2-decoding.cc:44

kaldi::Input
Definition: kaldi-io.h:190

kaldi-thread.h

fst::ReadFstKaldiGeneric
Fst< StdArc > * ReadFstKaldiGeneric(std::string rxfilename, bool throw_on_err)
Definition: kaldi-fst-io.cc:45

kaldi::OnlineSilenceWeighting
Definition: online-ivector-feature.h:465

kaldi::nnet2::AmNnet
Definition: am-nnet.h:38

fst
For an extended explanation of the framework of which grammar-fsts are a part, please see Support for...
Definition: graph.dox:21

kaldi::ParseOptions::PrintUsage
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
Definition: parse-options.cc:393

kaldi::SequentialTableReader::Key
std::string Key()
Definition: kaldi-table-inl.h:918

online-nnet2-decoding.h

kaldi::g_num_threads
int32 g_num_threads
Definition: kaldi-thread.cc:25

kaldi::OnlineIvectorExtractorAdaptationState
This class stores the adaptation state from the online iVector extractor, which can help you to initi...
Definition: online-ivector-feature.h:211

kaldi::OnlineTimer::OutputStats
void OutputStats(OnlineTimingStats *stats)
This call, which should be made after decoding is done, writes the stats to the object that accumulat...
Definition: online-timing.cc:96

kaldi::nnet2::AmNnet::Read
void Read(std::istream &is, bool binary)
Definition: am-nnet.cc:39

kaldi::SingleUtteranceNnet2Decoder::Decoder
const LatticeFasterOnlineDecoder & Decoder() const
Definition: online-nnet2-decoding.h:107

kaldi::OnlineEndpointConfig
Definition: online-endpoint.h:127

kaldi::TableWriter
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368

kaldi::int32
kaldi::int32 int32
Definition: online-tcp-source.cc:27

kaldi::Matrix< double >

kaldi::WaveData::SampFreq
BaseFloat SampFreq() const
Definition: wave-reader.h:126

wave-reader.h

kaldi::WaveData::Data
const Matrix< BaseFloat > & Data() const
Definition: wave-reader.h:124

kaldi::OnlineEndpointConfig::Register
void Register(OptionsItf *opts)
Definition: online-endpoint.h:159

kaldi::GetDiagnosticsAndPrintOutput
void GetDiagnosticsAndPrintOutput(const std::string &utt, const fst::SymbolTable *word_syms, const CompactLattice &clat, int64 *tot_num_frames, double *tot_like)
Definition: online2-wav-gmm-latgen-faster.cc:31

fst::GetLinearSymbolSequence
bool GetLinearSymbolSequence(const Fst< Arc > &fst, std::vector< I > *isymbols_out, std::vector< I > *osymbols_out, typename Arc::Weight *tot_weight_out)
GetLinearSymbolSequence gets the symbol sequence from a linear FST.
Definition: fstext-utils-inl.h:178

online-nnet2-feature-pipeline.h
This file contains a different version of the feature-extraction pipeline in online-feature-pipeline...

kaldi::TableWriter::Write
void Write(const std::string &key, const T &value) const
Definition: kaldi-table-inl.h:1511

kaldi::OnlineNnet2FeaturePipelineInfo
This class is responsible for storing configuration variables, objects and options for OnlineNnet2Fea...
Definition: online-nnet2-feature-pipeline.h:138

kaldi::ParseOptions::Register
void Register(const std::string &name, bool *ptr, const std::string &doc)
Definition: parse-options.cc:56

kaldi::TransitionModel
Definition: transition-model.h:123

kaldi::ReadKaldiObject
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
Definition: kaldi-io.cc:832

lattice-functions.h

kaldi::RandomAccessTableReader
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233

kaldi::LatticeWeight
fst::LatticeWeightTpl< BaseFloat > LatticeWeight
Definition: kaldi-lattice.h:32

kaldi::CompactLatticeShortestPath
void CompactLatticeShortestPath(const CompactLattice &clat, CompactLattice *shortest_path)
A form of the shortest-path/best-path algorithm that's specially coded for CompactLattice.
Definition: lattice-functions.cc:1097

fst::AcousticLatticeScale
std::vector< std::vector< double > > AcousticLatticeScale(double acwt)
Definition: lattice-utils.h:138

kaldi::SingleUtteranceNnet2Decoder::AdvanceDecoding
void AdvanceDecoding()
advance the decoding as far as we can.
Definition: online-nnet2-decoding.cc:40

kaldi::Input::Stream
std::istream & Stream()
Definition: kaldi-io.cc:826

kaldi::BaseFloat
float BaseFloat
Definition: kaldi-types.h:29

kaldi::ParseOptions
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36

kaldi::OnlineTimingStats::Print
void Print(bool online=true)
Here, if "online == false" we take into account that the setup was used in not-really-online mode whe...
Definition: online-timing.cc:29

kaldi::OnlineSilenceWeighting::ComputeCurrentTraceback
void ComputeCurrentTraceback(const LatticeFasterOnlineDecoderTpl< FST > &decoder)
Definition: online-ivector-feature.cc:482

kaldi::OnlineNnet2DecodingConfig::Register
void Register(OptionsItf *opts)
Definition: online-nnet2-decoding.h:57

kaldi::RandomAccessTableReader::Value
const T & Value(const std::string &key)
Definition: kaldi-table-inl.h:2561

kaldi::OnlineNnet2DecodingConfig
Definition: online-nnet2-decoding.h:50

kaldi::SingleUtteranceNnet2Decoder
You will instantiate this class when you want to decode a single utterance using the online-decoding ...
Definition: online-nnet2-decoding.h:67

kaldi::OnlineSilenceWeighting::Active
bool Active() const
Definition: online-ivector-feature.h:478

fst::ScaleLattice
void ScaleLattice(const std::vector< std::vector< ScaleFloat > > &scale, MutableFst< ArcTpl< Weight > > *fst)
Scales the pairs of weights in LatticeWeight or CompactLatticeWeight by viewing the pair (a...
Definition: lattice-utils-inl.h:197

kaldi::TransitionModel::Read
void Read(std::istream &is, bool binary)
Definition: transition-model.cc:394

kaldi::OnlineCmvnState
Struct OnlineCmvnState stores the state of CMVN adaptation between utterances (but not the state of t...
Definition: online-feature.h:266

fst::ConvertLattice
void ConvertLattice(const ExpandedFst< ArcTpl< Weight > > &ifst, MutableFst< ArcTpl< CompactLatticeWeightTpl< Weight, Int > > > *ofst, bool invert)
Convert lattice from a normal FST to a CompactLattice FST.
Definition: lattice-utils-inl.h:33

kaldi::SingleUtteranceNnet2Decoder::GetLattice
void GetLattice(bool end_of_utterance, CompactLattice *clat) const
Gets the lattice.
Definition: online-nnet2-decoding.cc:52

kaldi::SequentialTableReader
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287

kaldi::Lattice
fst::VectorFst< LatticeArc > Lattice
Definition: kaldi-lattice.h:44

kaldi::ParseOptions::Read
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
Definition: parse-options.cc:311

kaldi::SequentialTableReader::Done
bool Done()
Definition: kaldi-table-inl.h:948

KALDI_ERR
#define KALDI_ERR
Definition: kaldi-error.h:147

kaldi::OnlineNnet2FeaturePipelineConfig::Register
void Register(OptionsItf *opts)
Definition: online-nnet2-feature-pipeline.h:101

kaldi::ParseOptions::GetArg
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
Definition: parse-options.cc:202

KALDI_WARN
#define KALDI_WARN
Definition: kaldi-error.h:150

kaldi::SequentialTableReader::Next
void Next()
Definition: kaldi-table-inl.h:942

online-timing.h

kaldi::RandomAccessTableReader::HasKey
bool HasKey(const std::string &key)
Definition: kaldi-table-inl.h:2551

kaldi::CompactLattice
fst::VectorFst< CompactLatticeArc > CompactLattice
Definition: kaldi-lattice.h:46

kaldi::WaveData
This class's purpose is to read in Wave files.
Definition: wave-reader.h:106

rnnlm::i
int i
Definition: mikolov-rnnlm-lib.cc:66

kaldi::ParseOptions::NumArgs
int NumArgs() const
Number of positional parameters (c.f. argc-1).
Definition: parse-options.cc:198

kaldi::OnlineIvectorExtractionInfo::use_most_recent_ivector
bool use_most_recent_ivector
Definition: online-ivector-feature.h:189

kaldi::OnlineNnet2FeaturePipelineInfo::global_cmvn_stats_rxfilename
std::string global_cmvn_stats_rxfilename
Options for online cmvn, read from config file.
Definition: online-nnet2-feature-pipeline.h:163

kaldi::SequentialTableReader::Value
T & Value()
Definition: kaldi-table-inl.h:934

kaldi::OnlineNnet2FeaturePipeline
OnlineNnet2FeaturePipeline is a class that's responsible for putting together the various parts of th...
Definition: online-nnet2-feature-pipeline.h:198

onlinebin-util.h

kaldi::OnlineNnet2FeaturePipelineInfo::silence_weighting_config
OnlineSilenceWeightingConfig silence_weighting_config
Config for weighting silence in iVector adaptation.
Definition: online-nnet2-feature-pipeline.h:177

KALDI_VLOG
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156

kaldi::OnlineIvectorExtractionInfo::greedy_ivector_extractor
bool greedy_ivector_extractor
Definition: online-ivector-feature.h:190

kaldi::OnlineTimingStats
class OnlineTimingStats stores statistics from timing of online decoding, which will enable the Print...
Definition: online-timing.h:41

kaldi::nnet2::DecodableNnet2OnlineOptions::acoustic_scale
BaseFloat acoustic_scale
Definition: online-nnet2-decodable.h:38

online-endpoint.h

kaldi::SingleUtteranceNnet2Decoder::EndpointDetected
bool EndpointDetected(const OnlineEndpointConfig &config)
This function calls EndpointDetected from online-endpoint.h, with the required arguments.
Definition: online-nnet2-decoding.cc:72

KALDI_LOG
#define KALDI_LOG
Definition: kaldi-error.h:153

kaldi::SubVector
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501

kaldi::OnlineNnet2FeaturePipelineInfo::ivector_extractor_info
OnlineIvectorExtractionInfo ivector_extractor_info
Definition: online-nnet2-feature-pipeline.h:170

kaldi::OnlineTimer::WaitUntil
void WaitUntil(double cur_utterance_length)
The call to WaitUntil(t) simulates the effect of sleeping until cur_utterance_length seconds after th...
Definition: online-timing.cc:65

kaldi::OnlineSilenceWeighting::GetDeltaWeights
void GetDeltaWeights(int32 num_frames_ready, int32 first_decoder_frame, std::vector< std::pair< int32, BaseFloat > > *delta_weights)
Definition: online-ivector-feature.cc:597