#include "feat/wave-reader.h"
#include "online2/online-nnet2-decoding.h"
#include "online2/online-nnet2-feature-pipeline.h"
#include "online2/onlinebin-util.h"
#include "online2/online-timing.h"
#include "online2/online-endpoint.h"
#include "fstext/fstext-lib.h"
#include "lat/lattice-functions.h"
#include "util/kaldi-thread.h"

Include dependency graph for online2-wav-nnet2-latgen-faster.cc:

Namespaces
	kaldi
	This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for mispronunciations detection tasks, the reference:

Functions
void	GetDiagnosticsAndPrintOutput (const std::string &utt, const fst::SymbolTable word_syms, const CompactLattice &clat, int64 tot_num_frames, double *tot_like)

int	main (int argc, char *argv[])

Function Documentation

◆ main()

int main	(	int	argc,
		char *	argv[]
	)

Definition at line 75 of file online2-wav-nnet2-latgen-faster.cc.

References DecodableNnet2OnlineOptions::acoustic_scale, fst::AcousticLatticeScale(), OnlineSilenceWeighting::Active(), SingleUtteranceNnet2Decoder::AdvanceDecoding(), OnlineSilenceWeighting::ComputeCurrentTraceback(), WaveData::Data(), OnlineNnet2DecodingConfig::decodable_opts, SingleUtteranceNnet2Decoder::Decoder(), SequentialTableReader< Holder >::Done(), SingleUtteranceNnet2Decoder::EndpointDetected(), SingleUtteranceNnet2Decoder::FinalizeDecoding(), kaldi::g_num_threads, ParseOptions::GetArg(), OnlineSilenceWeighting::GetDeltaWeights(), kaldi::GetDiagnosticsAndPrintOutput(), SingleUtteranceNnet2Decoder::GetLattice(), OnlineNnet2FeaturePipelineInfo::global_cmvn_stats_rxfilename, OnlineIvectorExtractionInfo::greedy_ivector_extractor, RandomAccessTableReader< Holder >::HasKey(), rnnlm::i, OnlineNnet2FeaturePipelineInfo::ivector_extractor_info, KALDI_ERR, KALDI_LOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), OnlineTimer::OutputStats(), OnlineTimingStats::Print(), ParseOptions::PrintUsage(), AmNnet::Read(), ParseOptions::Read(), TransitionModel::Read(), fst::ReadFstKaldiGeneric(), kaldi::ReadKaldiObject(), OnlineNnet2DecodingConfig::Register(), ParseOptions::Register(), OnlineNnet2FeaturePipelineConfig::Register(), OnlineEndpointConfig::Register(), WaveData::SampFreq(), fst::ScaleLattice(), OnlineNnet2FeaturePipelineInfo::silence_weighting_config, Input::Stream(), OnlineIvectorExtractionInfo::use_most_recent_ivector, RandomAccessTableReader< Holder >::Value(), SequentialTableReader< Holder >::Value(), OnlineTimer::WaitUntil(), and TableWriter< Holder >::Write().

                                  {
   try {
     using namespace kaldi;
     using namespace fst;
 
     typedef kaldi::int32 int32;
     typedef kaldi::int64 int64;
 
     const char *usage =
         "Reads in wav file(s) and simulates online decoding with neural nets\n"
         "(nnet2 setup), with optional iVector-based speaker adaptation and\n"
         "optional endpointing.  Note: some configuration values and inputs are\n"
         "set via config files whose filenames are passed as options\n"
         "\n"
         "Usage: online2-wav-nnet2-latgen-faster [options] <nnet2-in> <fst-in> "
         "<spk2utt-rspecifier> <wav-rspecifier> <lattice-wspecifier>\n"
         "The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n"
         "you want to decode utterance by utterance.\n"
         "See egs/rm/s5/local/run_online_decoding_nnet2.sh for example\n"
         "See also online2-wav-nnet2-latgen-threaded\n";
 
     ParseOptions po(usage);
 
     std::string word_syms_rxfilename;
 
     OnlineEndpointConfig endpoint_config;
 
     // feature_config includes configuration for the iVector adaptation,
     // as well as the basic features.
     OnlineNnet2FeaturePipelineConfig feature_config;
     OnlineNnet2DecodingConfig nnet2_decoding_config;
 
     BaseFloat chunk_length_secs = 0.05;
     bool do_endpointing = false;
     bool online = true;
 
     po.Register("chunk-length", &chunk_length_secs,
                 "Length of chunk size in seconds, that we process.  Set to <= 0 "
                 "to use all input in one chunk.");
     po.Register("word-symbol-table", &word_syms_rxfilename,
                 "Symbol table for words [for debug output]");
     po.Register("do-endpointing", &do_endpointing,
                 "If true, apply endpoint detection");
     po.Register("online", &online,
                 "You can set this to false to disable online iVector estimation "
                 "and have all the data for each utterance used, even at "
                 "utterance start.  This is useful where you just want the best "
                 "results and don't care about online operation.  Setting this to "
                 "false has the same effect as setting "
                 "--use-most-recent-ivector=true and --greedy-ivector-extractor=true "
                 "in the file given to --ivector-extraction-config, and "
                 "--chunk-length=-1.");
     po.Register("num-threads-startup", &g_num_threads,
                 "Number of threads used when initializing iVector extractor.");
 
     feature_config.Register(&po);
     nnet2_decoding_config.Register(&po);
     endpoint_config.Register(&po);
 
     po.Read(argc, argv);
 
     if (po.NumArgs() != 5) {
       po.PrintUsage();
       return 1;
     }
 
     std::string nnet2_rxfilename = po.GetArg(1),
         fst_rxfilename = po.GetArg(2),
         spk2utt_rspecifier = po.GetArg(3),
         wav_rspecifier = po.GetArg(4),
         clat_wspecifier = po.GetArg(5);
 
     OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
     if (!online) {
       feature_info.ivector_extractor_info.use_most_recent_ivector = true;
       feature_info.ivector_extractor_info.greedy_ivector_extractor = true;
       chunk_length_secs = -1.0;
     }
 
     Matrix<double> global_cmvn_stats;
     if (feature_info.global_cmvn_stats_rxfilename != "")
       ReadKaldiObject(feature_info.global_cmvn_stats_rxfilename,
                       &global_cmvn_stats);
 
     TransitionModel trans_model;
     nnet2::AmNnet nnet;
     {
       bool binary;
       Input ki(nnet2_rxfilename, &binary);
       trans_model.Read(ki.Stream(), binary);
       nnet.Read(ki.Stream(), binary);
     }
 
     fst::Fst<fst::StdArc> *decode_fst = ReadFstKaldiGeneric(fst_rxfilename);
 
     fst::SymbolTable *word_syms = NULL;
     if (word_syms_rxfilename != "")
       if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename)))
         KALDI_ERR << "Could not read symbol table from file "
                   << word_syms_rxfilename;
 
     int32 num_done = 0, num_err = 0;
     double tot_like = 0.0;
     int64 num_frames = 0;
 
     SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
     RandomAccessTableReader<WaveHolder> wav_reader(wav_rspecifier);
     CompactLatticeWriter clat_writer(clat_wspecifier);
 
     OnlineTimingStats timing_stats;
 
     for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
       std::string spk = spk2utt_reader.Key();
       const std::vector<std::string> &uttlist = spk2utt_reader.Value();
 
       OnlineIvectorExtractorAdaptationState adaptation_state(
           feature_info.ivector_extractor_info);
       OnlineCmvnState cmvn_state(global_cmvn_stats);
 
       for (size_t i = 0; i < uttlist.size(); i++) {
         std::string utt = uttlist[i];
         if (!wav_reader.HasKey(utt)) {
           KALDI_WARN << "Did not find audio for utterance " << utt;
           num_err++;
           continue;
         }
         const WaveData &wave_data = wav_reader.Value(utt);
         // get the data for channel zero (if the signal is not mono, we only
         // take the first channel).
         SubVector<BaseFloat> data(wave_data.Data(), 0);
 
         OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
         feature_pipeline.SetAdaptationState(adaptation_state);
         feature_pipeline.SetCmvnState(cmvn_state);
 
         OnlineSilenceWeighting silence_weighting(
             trans_model,
             feature_info.silence_weighting_config);
 
         SingleUtteranceNnet2Decoder decoder(nnet2_decoding_config,
                                             trans_model,
                                             nnet,
                                             *decode_fst,
                                             &feature_pipeline);
         OnlineTimer decoding_timer(utt);
 
         BaseFloat samp_freq = wave_data.SampFreq();
         int32 chunk_length;
         if (chunk_length_secs > 0) {
           chunk_length = int32(samp_freq * chunk_length_secs);
           if (chunk_length == 0) chunk_length = 1;
         } else {
           chunk_length = std::numeric_limits<int32>::max();
         }
 
         int32 samp_offset = 0;
         std::vector<std::pair<int32, BaseFloat> > delta_weights;
 
         while (samp_offset < data.Dim()) {
           int32 samp_remaining = data.Dim() - samp_offset;
           int32 num_samp = chunk_length < samp_remaining ? chunk_length
                                                          : samp_remaining;
 
           SubVector<BaseFloat> wave_part(data, samp_offset, num_samp);
           feature_pipeline.AcceptWaveform(samp_freq, wave_part);
 
           samp_offset += num_samp;
           decoding_timer.WaitUntil(samp_offset / samp_freq);
           if (samp_offset == data.Dim()) {
             // no more input. flush out last frames
             feature_pipeline.InputFinished();
           }
 
           if (silence_weighting.Active() &&
               feature_pipeline.IvectorFeature() != NULL) {
             silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
             silence_weighting.GetDeltaWeights(
                 feature_pipeline.IvectorFeature()->NumFramesReady(),
                 &delta_weights);
             feature_pipeline.IvectorFeature()->UpdateFrameWeights(
                 delta_weights);
           }
 
           decoder.AdvanceDecoding();
 
           if (do_endpointing && decoder.EndpointDetected(endpoint_config))
             break;
         }
         decoder.FinalizeDecoding();
 
         CompactLattice clat;
         bool end_of_utterance = true;
         decoder.GetLattice(end_of_utterance, &clat);
 
         GetDiagnosticsAndPrintOutput(utt, word_syms, clat,
                                      &num_frames, &tot_like);
 
         decoding_timer.OutputStats(&timing_stats);
 
         // In an application you might avoid updating the adaptation state if
         // you felt the utterance had low confidence.  See lat/confidence.h
         feature_pipeline.GetAdaptationState(&adaptation_state);
         feature_pipeline.GetCmvnState(&cmvn_state);
 
         // we want to output the lattice with un-scaled acoustics.
         BaseFloat inv_acoustic_scale =
             1.0 / nnet2_decoding_config.decodable_opts.acoustic_scale;
         ScaleLattice(AcousticLatticeScale(inv_acoustic_scale), &clat);
 
         clat_writer.Write(utt, clat);
         KALDI_LOG << "Decoded utterance " << utt;
         num_done++;
       }
     }
     timing_stats.Print(online);
 
     KALDI_LOG << "Decoded " << num_done << " utterances, "
               << num_err << " with errors.";
     KALDI_LOG << "Overall likelihood per frame was " << (tot_like / num_frames)
               << " per frame over " << num_frames << " frames.";
     delete decode_fst;
     delete word_syms; // will delete if non-NULL.
     return (num_done != 0 ? 0 : 1);
   } catch(const std::exception& e) {
     std::cerr << e.what();
     return -1;
   }
 } // main()

Namespaces

Functions

Function Documentation

◆ main()