#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "online2/online-ivector-feature.h"
#include "util/kaldi-thread.h"
#include "base/timer.h"

Include dependency graph for ivector-extract-online2.cc:

Functions
int	main (int argc, char *argv[])

Function Documentation

◆ main()

int main(int argc, char *argv[])

Definition at line 28 of file ivector-extract-online2.cc.

References MatrixBase< Real >::ColRange(), VectorBase< Real >::Dim(), OnlineIvectorFeature::Dim(), SequentialTableReader< Holder >::Done(), OnlineIvectorExtractionInfo::ExpectedFeatureDim(), OnlineIvectorExtractionInfo::extractor, kaldi::g_num_threads, OnlineIvectorFeature::GetAdaptationState(), ParseOptions::GetArg(), OnlineIvectorFeature::GetFrame(), RandomAccessTableReader< Holder >::HasKey(), rnnlm::i, OnlineIvectorExtractionConfig::ivector_period, IvectorExtractor::IvectorDim(), KALDI_LOG, KALDI_VLOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), rnnlm::n, SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), OnlineIvectorFeature::ObjfImprPerFrame(), ParseOptions::PrintUsage(), ParseOptions::Read(), ParseOptions::Register(), OnlineIvectorExtractionConfig::Register(), OnlineIvectorFeature::SetAdaptationState(), OnlineIvectorFeature::UbmLogLikePerFrame(), OnlineIvectorFeature::UpdateFrameWeights(), OnlineIvectorExtractionConfig::use_most_recent_ivector, RandomAccessTableReader< Holder >::Value(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

                                  {
   using namespace kaldi;
   typedef kaldi::int32 int32;
   typedef kaldi::int64 int64;
   try {
     const char *usage =
         "Extract iVectors for utterances every --ivector-period frames, using a trained\n"
         "iVector extractor and features and Gaussian-level posteriors.  Similar to\n"
         "ivector-extract-online but uses the actual online decoder code to do it,\n"
         "and does everything in-memory instead of using multiple processes.\n"
         "Note: the value of the --use-most-recent-ivector config variable is ignored\n"
         "it's set to false.  The <spk2utt-rspecifier> is mandatory, to simplify the code;\n"
         "if you want to do it separately per utterance, just make it of the form\n"
         "<utterance-id> <utterance-id>.\n"
         "The iVectors are output as an archive of matrices, indexed by utterance-id;\n"
         "each row corresponds to an iVector.  If --repeat=true, outputs the whole matrix\n"
         "of iVectors, not just every (ivector-period)'th frame\n"
         "The input features are the raw, non-cepstral-mean-normalized features, e.g. MFCC.\n"
         "\n"
         "Usage:  ivector-extract-online2 [options] <spk2utt-rspecifier> <feature-rspecifier> <ivector-wspecifier>\n"
         "e.g.: \n"
         "  ivector-extract-online2 --config=exp/nnet2_online/nnet_online/conf/ivector_extractor.conf \\\n"
         "    ark:data/train/spk2utt scp:data/train/feats.scp ark,t:ivectors.1.ark\n";
 
     ParseOptions po(usage);
 
     OnlineIvectorExtractionConfig ivector_config;
     ivector_config.Register(&po);
 
     g_num_threads = 8;
     bool repeat = false;
     int32 length_tolerance = 0;
     std::string frame_weights_rspecifier;
 
     po.Register("num-threads", &g_num_threads,
                 "Number of threads to use for computing derived variables "
                 "of iVector extractor, at process start-up.");
     po.Register("repeat", &repeat,
                 "If true, output the same number of iVectors as input frames "
                 "(including repeated data).");
     po.Register("frame-weights-rspecifier", &frame_weights_rspecifier,
                 "Archive of frame weights to scale stats");
     po.Register("length-tolerance", &length_tolerance,
                 "Tolerance on the difference in number of frames "
                 "for feats and frame weights");
 
     po.Read(argc, argv);
 
     if (po.NumArgs() != 3) {
       po.PrintUsage();
       exit(1);
     }
 
     std::string spk2utt_rspecifier = po.GetArg(1),
         feature_rspecifier = po.GetArg(2),
         ivectors_wspecifier = po.GetArg(3);
 
     double tot_ubm_loglike = 0.0, tot_objf_impr = 0.0, tot_t = 0.0,
         tot_length = 0.0, tot_length_utt_end = 0.0;
     int32 num_done = 0, num_err = 0;
 
     ivector_config.use_most_recent_ivector = false;
     OnlineIvectorExtractionInfo ivector_info(ivector_config);
 
     SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
     RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
     RandomAccessBaseFloatVectorReader frame_weights_reader(frame_weights_rspecifier);
     BaseFloatMatrixWriter ivector_writer(ivectors_wspecifier);
 
     bool warned_dim = false;
     for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
       std::string spk = spk2utt_reader.Key();
       const std::vector<std::string> &uttlist = spk2utt_reader.Value();
       OnlineIvectorExtractorAdaptationState adaptation_state(
           ivector_info);
       for (size_t i = 0; i < uttlist.size(); i++) {
         std::string utt = uttlist[i];
         if (!feature_reader.HasKey(utt)) {
           KALDI_WARN << "Did not find audio for utterance " << utt;
           num_err++;
           continue;
         }
         const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
 
         int32 feat_dim = feats.NumCols();
         if (feat_dim == ivector_info.ExpectedFeatureDim() + 3) {
           if (!warned_dim) {
             KALDI_WARN << "Feature dimension is too large by 3, assuming there are "
                 "pitch features and removing the last 3 dims.";
             warned_dim = true;
           }
           feat_dim -= 3;
         }
 
         SubMatrix<BaseFloat> range = feats.ColRange(0, feat_dim);
         OnlineMatrixFeature matrix_feature(range);
 
         OnlineIvectorFeature ivector_feature(ivector_info,
                                              &matrix_feature);
 
         ivector_feature.SetAdaptationState(adaptation_state);
 
         if (!frame_weights_rspecifier.empty()) {
           if (!frame_weights_reader.HasKey(utt)) {
             KALDI_WARN << "Did not find weights for utterance " << utt;
             num_err++;
             continue;
           }
           const Vector<BaseFloat> &weights = frame_weights_reader.Value(utt);
 
           if (std::abs(weights.Dim() - feats.NumRows()) > length_tolerance) {
             num_err++;
             continue;
           }
 
           std::vector<std::pair<int32, BaseFloat> > frame_weights;
           for (int32 i = 0; i < feats.NumRows(); i++) {
             if (i < weights.Dim())
               frame_weights.push_back(std::make_pair(i, weights(i)));
             else
               frame_weights.push_back(std::make_pair(i, 0.0));
           }
 
 
           ivector_feature.UpdateFrameWeights(frame_weights);
         }
 
         int32 T = feats.NumRows(),
             n = (repeat ? 1 : ivector_config.ivector_period),
             num_ivectors = (T + n - 1) / n;
 
         Matrix<BaseFloat> ivectors(num_ivectors,
                                    ivector_feature.Dim());
 
         for (int32 i = 0; i < num_ivectors; i++) {
           int32 t = i * n;
           SubVector<BaseFloat> ivector(ivectors, i);
           ivector_feature.GetFrame(t, &ivector);
         }
         // Update diagnostics.
 
         tot_ubm_loglike += T * ivector_feature.UbmLogLikePerFrame();
         tot_objf_impr += T * ivector_feature.ObjfImprPerFrame();
         tot_length_utt_end += T * ivectors.Row(num_ivectors - 1).Norm(2.0);
         for (int32 i = 0; i < num_ivectors; i++)
           tot_length += T * ivectors.Row(i).Norm(2.0) / num_ivectors;
         tot_t += T;
         KALDI_VLOG(2) << "For utterance " << utt << " of speaker " << spk
                       << ", UBM loglike/frame was "
                       << ivector_feature.UbmLogLikePerFrame()
                       << ", iVector length (at utterance end) was "
                       << ivectors.Row(num_ivectors-1).Norm(2.0)
                       << ", objf improvement/frame from iVector estimation was "
                       << ivector_feature.ObjfImprPerFrame();
 
         ivector_feature.GetAdaptationState(&adaptation_state);
         ivector_writer.Write(utt, ivectors);
         num_done++;
       }
     }
 
     KALDI_LOG << "Estimated iVectors for " << num_done << " files, " << num_err
               << " with errors.";
     KALDI_LOG << "Average objective-function improvement was "
               << (tot_objf_impr / tot_t) << " per frame, over "
               << tot_t << " frames (weighted).";
     KALDI_LOG << "Average iVector length was " << (tot_length / tot_t)
               << " and at utterance-end was " << (tot_length_utt_end / tot_t)
               << ", over " << tot_t << " frames (weighted); "
               << " expected length is "
               << sqrt(ivector_info.extractor.IvectorDim());
 
     return (num_done != 0 ? 0 : 1);
   } catch(const std::exception &e) {
     std::cerr << e.what();
     return -1;
   }
 }

Functions

Function Documentation

◆ main()