doc/ivector-extract_8cc_source.html

 // ivectorbin/ivector-extract.cc

 // Copyright 2013  Daniel Povey

 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //  http://www.apache.org/licenses/LICENSE-2.0
 //
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABILITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.


 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "gmm/am-diag-gmm.h"
 #include "ivector/ivector-extractor.h"
 #include "util/kaldi-thread.h"

 namespace kaldi {

 // Per-utterance iVector extraction job, used to parallelize over multiple
 // threads the work that this program does.  The computation happens in
 // operator (); the output (writing the iVector and adding this utterance's
 // auxf change to the running total) happens in the destructor.
 class IvectorExtractTask {
  public:
   // extractor:        the iVector extractor; held by reference, so it must
   //                   outlive this task.
   // utt:              utterance id, used as the key of the written iVector
   //                   and in log messages.
   // feats, posterior: features and Gaussian-level posteriors of the
   //                   utterance; both are copied into the task.
   // writer:           table writer that the destructor writes the iVector to.
   // tot_auxf_change:  if non-NULL, the auxf change is computed in
   //                   operator () and added to *tot_auxf_change by the
   //                   destructor; if NULL, the auxf change is not computed.
   IvectorExtractTask(const IvectorExtractor &extractor,
                      std::string utt,
                      const Matrix<BaseFloat> &feats,
                      const Posterior &posterior,
                      BaseFloatVectorWriter *writer,
                      double *tot_auxf_change):
       extractor_(extractor), utt_(utt), feats_(feats), posterior_(posterior),
       writer_(writer), tot_auxf_change_(tot_auxf_change),
       // Initialized explicitly because the destructor reads it whenever
       // tot_auxf_change_ is non-NULL, even if operator () never assigned it.
       auxf_change_(0.0) { }

   // Accumulates the utterance stats and estimates the iVector into ivector_.
   // If tot_auxf_change_ is non-NULL, also stores in auxf_change_ the
   // difference in auxf between the estimated iVector and the starting point.
   void operator () () {
     bool need_2nd_order_stats = false;

     IvectorExtractorUtteranceStats utt_stats(extractor_.NumGauss(),
                                              extractor_.FeatDim(),
                                              need_2nd_order_stats);

     utt_stats.AccStats(feats_, posterior_);

     // Starting point: all zeros except for the prior offset in dimension 0.
     ivector_.Resize(extractor_.IvectorDim());
     ivector_(0) = extractor_.PriorOffset();

     if (tot_auxf_change_ != NULL) {
       double old_auxf = extractor_.GetAuxf(utt_stats, ivector_);
       extractor_.GetIvectorDistribution(utt_stats, &ivector_, NULL);
       double new_auxf = extractor_.GetAuxf(utt_stats, ivector_);
       auxf_change_ = new_auxf - old_auxf;
     } else {
       extractor_.GetIvectorDistribution(utt_stats, &ivector_, NULL);
     }
   }

   // Adds the auxf change to the shared total (if requested), then writes the
   // iVector for this utterance to the writer.
   ~IvectorExtractTask() {
     if (tot_auxf_change_ != NULL) {
       double T = TotalPosterior(posterior_);
       *tot_auxf_change_ += auxf_change_;
       KALDI_VLOG(2) << "Auxf change for utterance " << utt_ << " was "
                     << (auxf_change_ / T) << " per frame over " << T
                     << " frames (weighted)";
     }
     // We actually write out the offset of the iVectors from the mean of the
     // prior distribution; this is the form we'll need it in for scoring.  (most
     // formulations of iVectors have zero-mean priors so this is not normally an
     // issue).
     ivector_(0) -= extractor_.PriorOffset();
     KALDI_VLOG(2) << "Ivector norm for utterance " << utt_
                   << " was " << ivector_.Norm(2.0);
     writer_->Write(utt_, Vector<BaseFloat>(ivector_));
   }
  private:
   const IvectorExtractor &extractor_;
   std::string utt_;
   Matrix<BaseFloat> feats_;
   Posterior posterior_;
   BaseFloatVectorWriter *writer_;
   double *tot_auxf_change_; // if non-NULL we need the auxf change.
   Vector<double> ivector_;  // estimated in operator (), written in destructor.
   double auxf_change_;      // auxf change for this utterance; 0.0 until set.
 };

 // Extracts one iVector per speaker instead of one per utterance.
 //
 // For each speaker in the spk2utt table, stats are accumulated over all of
 // that speaker's utterances that have both features and posteriors of
 // matching length (posteriors are first scaled by opts.acoustic_weight).
 // If opts.max_count is positive and the accumulated frame count exceeds it,
 // the stats are scaled down to max_count.  The iVector is then estimated and
 // written, keyed by speaker id, with the prior offset subtracted from
 // dimension 0.  If compute_objf_change is true, the auxf change is logged per
 // speaker and as an overall weighted average.
 //
 // Returns 0 if at least one speaker was processed successfully, 1 otherwise.
 int32 RunPerSpeaker(const std::string &ivector_extractor_rxfilename,
                     const IvectorEstimationOptions &opts,
                     bool compute_objf_change,
                     const std::string &spk2utt_rspecifier,
                     const std::string &feature_rspecifier,
                     const std::string &posterior_rspecifier,
                     const std::string &ivector_wspecifier) {
   IvectorExtractor extractor;
   ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
   SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
   RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
   RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
   BaseFloatVectorWriter ivector_writer(ivector_wspecifier);

   double tot_auxf_change = 0.0, tot_post = 0.0, tot_norm = 0.0;
   int32 num_utt_done = 0, num_utt_err = 0,
       num_spk_done = 0, num_spk_err = 0;

   for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
     std::string spk = spk2utt_reader.Key();
     const std::vector<std::string> &utts = spk2utt_reader.Value();

     bool need_2nd_order_stats = false;

     // Despite the name, this accumulates over all utterances of the speaker.
     IvectorExtractorUtteranceStats utt_stats(extractor.NumGauss(),
                                              extractor.FeatDim(),
                                              need_2nd_order_stats);

     for (size_t i = 0; i < utts.size(); i++) {
       const std::string &utt = utts[i];
       if (!feature_reader.HasKey(utt)) {
         KALDI_WARN << "No features present for utterance " << utt;
         num_utt_err++;
         continue;
       }
       const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
       if (!posterior_reader.HasKey(utt)) {
         KALDI_WARN << "No posteriors present for utterance " << utt;
         num_utt_err++;
         continue;
       }
       // Copied (not referenced) because it is scaled in place below.
       Posterior posterior = posterior_reader.Value(utt);
       // Explicit cast avoids a signed/unsigned comparison, matching the
       // equivalent check in main().
       if (feats.NumRows() != static_cast<int32>(posterior.size())) {
         KALDI_WARN << "Posterior has wrong size " << posterior.size()
                    << " vs. feats " << feats.NumRows() << " for "
                    << utt;
         num_utt_err++;
         continue;
       }
       ScalePosterior(opts.acoustic_weight, &posterior);
       num_utt_done++;
       utt_stats.AccStats(feats, posterior);
     }

     if (utt_stats.NumFrames() == 0.0) {
       KALDI_WARN << "No stats accumulated for speaker " << spk;
       num_spk_err++;
       continue;
     } else {
       if (opts.max_count > 0 && utt_stats.NumFrames() > opts.max_count) {
         double scale = opts.max_count / utt_stats.NumFrames();
         utt_stats.Scale(scale);
         KALDI_LOG << "Scaling stats for speaker " << spk << " by scale "
                   << scale << " due to --max-count=" << opts.max_count;
       }

       // Starting point: all zeros except for the prior offset in dimension 0.
       Vector<double> ivector(extractor.IvectorDim());
       ivector(0) = extractor.PriorOffset();

       if (compute_objf_change) {
         double old_auxf = extractor.GetAuxf(utt_stats, ivector);
         extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
         double new_auxf = extractor.GetAuxf(utt_stats, ivector);
         double auxf_change = new_auxf - old_auxf;

         KALDI_LOG << "Auxf change for speaker " << spk << " was "
                   << (auxf_change / utt_stats.NumFrames()) << " per frame, over "
                   << utt_stats.NumFrames() << " frames (weighted).";
         tot_auxf_change += auxf_change;
       } else {
         extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
       }
       // We actually write out the offset of the iVectors from the mean of the
       // prior distribution; this is the form we'll need it in for scoring and
       // as a feature for neural nets.  (most formulations of iVectors have
       // zero-mean priors so this is not normally an issue).
       ivector(0) -= extractor.PriorOffset();
       // Computed once; used for both the log message and the running total.
       const double ivector_norm = ivector.Norm(2.0);
       KALDI_LOG << "Ivector norm for speaker " << spk
                 << " was " << ivector_norm;

       tot_norm += ivector_norm * utt_stats.NumFrames();
       tot_post += utt_stats.NumFrames();
       num_spk_done++;
       Vector<BaseFloat> ivector_flt(ivector);
       ivector_writer.Write(spk, ivector_flt);
     }
   }

   KALDI_LOG << "Done " << num_spk_done << " speakers; " << num_spk_err
             << " with errors.  " << num_utt_done << " utterances "
             << "were processed, " << num_utt_err << " with errors.";
   // Skip the averages when nothing was accumulated, to avoid dividing by zero.
   if (tot_post != 0.0) {
     if (compute_objf_change) {
       KALDI_LOG << "Overall weighted-average objective function improvement was "
                 << (tot_auxf_change / tot_post) << " over " << tot_post
                 << " frames (weighted)";
     }
     KALDI_LOG << "Average iVector norm (weighted by frames) was "
               << (tot_norm / tot_post) << " over " << tot_post
               << " frames (weighted)";
   }
   return (num_spk_done != 0 ? 0 : 1);
 }

 }


 // Entry point.  Parses the command line, then either extracts one iVector per
 // utterance (the default, run through a TaskSequencer) or, if --spk2utt is
 // supplied, delegates to RunPerSpeaker() to extract one iVector per speaker.
 // Returns 0 if at least one iVector was produced, 1 if none was, and -1 if
 // an exception was caught.
 int main(int argc, char *argv[]) {
   using namespace kaldi;
   typedef kaldi::int32 int32;
   typedef kaldi::int64 int64;
   try {
     const char *usage =
         "Extract iVectors for utterances, using a trained iVector extractor,\n"
         "and features and Gaussian-level posteriors\n"
         "Usage:  ivector-extract [options] <model-in> <feature-rspecifier> "
         "<posteriors-rspecifier> <ivector-wspecifier>\n"
         "e.g.: \n"
         " fgmm-global-gselect-to-post 1.ubm '$feats' 'ark:gunzip -c gselect.1.gz|' ark:- | \\\n"
         "  ivector-extract final.ie '$feats' ark,s,cs:- ark,t:ivectors.1.ark\n";

     ParseOptions po(usage);
     bool compute_objf_change = true;
     IvectorEstimationOptions opts;
     std::string spk2utt_rspecifier;
     TaskSequencerConfig sequencer_config;
     po.Register("compute-objf-change", &compute_objf_change,
                 "If true, compute the change in objective function from using "
                 "nonzero iVector (a potentially useful diagnostic).  Combine "
                 "with --verbose=2 for per-utterance information");
     po.Register("spk2utt", &spk2utt_rspecifier, "Supply this option if you "
                 "want iVectors to be output at the per-speaker level, estimated "
                 "using stats accumulated from multiple utterances.  Note: this "
                 "is not the normal way iVectors are obtained for speaker-id. "
                 "This option will cause the program to ignore the --num-threads "
                 "option.");

     opts.Register(&po);
     sequencer_config.Register(&po);

     po.Read(argc, argv);

     if (po.NumArgs() != 4) {
       po.PrintUsage();
       exit(1);
     }

     std::string ivector_extractor_rxfilename = po.GetArg(1),
         feature_rspecifier = po.GetArg(2),
         posterior_rspecifier = po.GetArg(3),
         ivectors_wspecifier = po.GetArg(4);


     if (spk2utt_rspecifier.empty()) {
       // Per-utterance mode.
       // g_num_threads affects how ComputeDerivedVars is called when we read the
       // extractor.
       g_num_threads = sequencer_config.num_threads;
       IvectorExtractor extractor;
       ReadKaldiObject(ivector_extractor_rxfilename, &extractor);

       double tot_auxf_change = 0.0, tot_t = 0.0;
       int32 num_done = 0, num_err = 0;

       SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
       RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
       BaseFloatVectorWriter ivector_writer(ivectors_wspecifier);

       {
         // Scoped so that all tasks have finished (and hence written their
         // output and updated tot_auxf_change) before the summary is logged.
         TaskSequencer<IvectorExtractTask> sequencer(sequencer_config);
         for (; !feature_reader.Done(); feature_reader.Next()) {
           std::string utt = feature_reader.Key();
           if (!posterior_reader.HasKey(utt)) {
             KALDI_WARN << "No posteriors for utterance " << utt;
             num_err++;
             continue;
           }
           const Matrix<BaseFloat> &mat = feature_reader.Value();
           // Copied (not referenced) because it is scaled in place below.
           Posterior posterior = posterior_reader.Value(utt);

           if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
             KALDI_WARN << "Size mismatch between posterior " << posterior.size()
                        << " and features " << mat.NumRows() << " for utterance "
                        << utt;
             num_err++;
             continue;
           }

           // NULL tells the task not to compute the auxf change at all.
           double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL );

           double this_t = opts.acoustic_weight * TotalPosterior(posterior),
               max_count_scale = 1.0;
           if (opts.max_count > 0 && this_t > opts.max_count) {
             max_count_scale = opts.max_count / this_t;
             KALDI_LOG << "Scaling stats for utterance " << utt << " by scale "
                       << max_count_scale << " due to --max-count="
                       << opts.max_count;
             this_t = opts.max_count;
           }
           ScalePosterior(opts.acoustic_weight * max_count_scale,
                          &posterior);
           // note: now, this_t == sum of posteriors.

           // The sequencer takes ownership of the task and deletes it.
           sequencer.Run(new IvectorExtractTask(extractor, utt, mat, posterior,
                                                &ivector_writer, auxf_ptr));

           tot_t += this_t;
           num_done++;
         }
         // Destructor of "sequencer" will wait for any remaining tasks.
       }

       KALDI_LOG << "Done " << num_done << " files, " << num_err
                 << " with errors.  Total (weighted) frames " << tot_t;
       // Skip the average when no frames were accumulated, to avoid dividing
       // by zero (same guard as the summary in RunPerSpeaker()).
       if (compute_objf_change && tot_t != 0.0) {
         KALDI_LOG << "Overall average objective-function change from estimating "
                   << "ivector was " << (tot_auxf_change / tot_t) << " per frame "
                   << " over " << tot_t << " (weighted) frames.";
       }

       return (num_done != 0 ? 0 : 1);
     } else {
       // Per-speaker mode.
       // NOTE(review): the --spk2utt help text says --num-threads is ignored,
       // but this assertion rejects any value other than 1 -- confirm which
       // behavior is intended.
       KALDI_ASSERT(sequencer_config.num_threads == 1 &&
                    "--spk2utt option is incompatible with --num-threads option");
       return RunPerSpeaker(ivector_extractor_rxfilename,
                            opts,
                            compute_objf_change,
                            spk2utt_rspecifier,
                            feature_rspecifier,
                            posterior_rspecifier,
                            ivectors_wspecifier);
     }
   } catch(const std::exception &e) {
     std::cerr << e.what();
     return -1;
   }
 }
am-diag-gmm.h

kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20

kaldi::IvectorExtractor::FeatDim
int32 FeatDim() const
Definition: ivector-extractor.cc:28

kaldi::IvectorExtractTask::auxf_change_
double auxf_change_
Definition: ivector-extract.cc:89

kaldi::IvectorExtractor::PriorOffset
double PriorOffset() const
The distribution over iVectors, in our formulation, is not centered at zero; its first dimension has ...
Definition: ivector-extractor.h:159

kaldi::IvectorExtractTask::ivector_
Vector< double > ivector_
Definition: ivector-extract.cc:88

kaldi::IvectorEstimationOptions::max_count
double max_count
Definition: ivector-extractor.h:55

kaldi::IvectorExtractTask::IvectorExtractTask
IvectorExtractTask(const IvectorExtractor &extractor, std::string utt, const Matrix< BaseFloat > &feats, const Posterior &posterior, BaseFloatVectorWriter *writer, double *tot_auxf_change)
Definition: ivector-extract.cc:34

kaldi-thread.h

kaldi::TaskSequencer::Run
void Run(C *c)
This function takes ownership of the pointer "c", and will delete it in the same sequence as Run was ...
Definition: kaldi-thread.h:190

kaldi::IvectorExtractTask::tot_auxf_change_
double * tot_auxf_change_
Definition: ivector-extract.cc:87

kaldi::IvectorEstimationOptions::Register
void Register(OptionsItf *opts)
Definition: ivector-extractor.h:57

kaldi::ParseOptions::PrintUsage
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
Definition: parse-options.cc:393

kaldi::SequentialTableReader::Key
std::string Key()
Definition: kaldi-table-inl.h:918

kaldi::g_num_threads
int32 g_num_threads
Definition: kaldi-thread.cc:25

kaldi::IvectorExtractor::GetAuxf
double GetAuxf(const IvectorExtractorUtteranceStats &utt_stats, const VectorBase< double > &mean, const SpMatrix< double > *var=NULL) const
Returns the log-likelihood objective function, summed over frames, for this distribution of iVectors ...
Definition: ivector-extractor.cc:331

kaldi::IvectorExtractTask::posterior_
Posterior posterior_
Definition: ivector-extract.cc:85

kaldi::TableWriter
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368

kaldi::int32
kaldi::int32 int32
Definition: online-tcp-source.cc:27

common-utils.h

kaldi::Matrix< BaseFloat >

kaldi::Vector::Resize
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
Definition: kaldi-vector.cc:190

kaldi::IvectorExtractor
Definition: ivector-extractor.h:136

kaldi::VectorBase::Norm
Real Norm(Real p) const
Compute the p-th norm of the vector.
Definition: kaldi-vector.cc:512

kaldi::TableWriter::Write
void Write(const std::string &key, const T &value) const
Definition: kaldi-table-inl.h:1511

kaldi::ParseOptions::Register
void Register(const std::string &name, bool *ptr, const std::string &doc)
Definition: parse-options.cc:56

kaldi::ReadKaldiObject
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
Definition: kaldi-io.cc:832

kaldi::TotalPosterior
BaseFloat TotalPosterior(const Posterior &post)
Returns the total of all the weights in "post".
Definition: posterior.cc:230

kaldi::RandomAccessTableReader
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233

main
int main(int argc, char *argv[])
Definition: ivector-extract.cc:210

kaldi::IvectorExtractor::IvectorDim
int32 IvectorDim() const
Definition: ivector-extractor.cc:33

kaldi::Posterior
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
Definition: posterior.h:42

kaldi::TaskSequencerConfig::num_threads
int32 num_threads
Definition: kaldi-thread.h:157

kaldi::ParseOptions
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36

kaldi::RandomAccessTableReader::Value
const T & Value(const std::string &key)
Definition: kaldi-table-inl.h:2561

kaldi::IvectorExtractTask::writer_
BaseFloatVectorWriter * writer_
Definition: ivector-extract.cc:86

kaldi::SequentialTableReader
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287

kaldi::ParseOptions::Read
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
Definition: parse-options.cc:311

kaldi::SequentialTableReader::Done
bool Done()
Definition: kaldi-table-inl.h:948

kaldi::ParseOptions::GetArg
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
Definition: parse-options.cc:202

KALDI_WARN
#define KALDI_WARN
Definition: kaldi-error.h:150

kaldi::SequentialTableReader::Next
void Next()
Definition: kaldi-table-inl.h:942

kaldi::IvectorExtractTask::feats_
Matrix< BaseFloat > feats_
Definition: ivector-extract.cc:84

kaldi::TaskSequencer
Definition: kaldi-thread.h:175

kaldi::RandomAccessTableReader::HasKey
bool HasKey(const std::string &key)
Definition: kaldi-table-inl.h:2551

ivector-extractor.h

kaldi::ScalePosterior
void ScalePosterior(BaseFloat scale, Posterior *post)
Scales the BaseFloat (weight) element in the posterior entries.
Definition: posterior.cc:218

rnnlm::i
int i
Definition: mikolov-rnnlm-lib.cc:66

kaldi::ParseOptions::NumArgs
int NumArgs() const
Number of positional parameters (c.f. argc-1).
Definition: parse-options.cc:198

kaldi::IvectorExtractTask
Definition: ivector-extract.cc:32

kaldi::IvectorExtractorUtteranceStats::AccStats
void AccStats(const MatrixBase< BaseFloat > &feats, const Posterior &post)
Definition: ivector-extractor.cc:852

kaldi::IvectorExtractor::GetIvectorDistribution
void GetIvectorDistribution(const IvectorExtractorUtteranceStats &utt_stats, VectorBase< double > *mean, SpMatrix< double > *var) const
Gets the distribution over ivectors (or at least, a Gaussian approximation to it).
Definition: ivector-extractor.cc:63

kaldi::Vector
A class representing a vector.
Definition: kaldi-vector.h:406

kaldi::SequentialTableReader::Value
T & Value()
Definition: kaldi-table-inl.h:934

KALDI_ASSERT
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

kaldi::MatrixBase::NumRows
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64

KALDI_VLOG
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156

kaldi::IvectorEstimationOptions::acoustic_weight
double acoustic_weight
Definition: ivector-extractor.h:54

kaldi::TaskSequencerConfig
Definition: kaldi-thread.h:156

kaldi::IvectorExtractorUtteranceStats
These are the stats for a particular utterance, i.e.
Definition: ivector-extractor.h:79

kaldi::IvectorExtractTask::operator()
void operator()()
Definition: ivector-extract.cc:43

kaldi::IvectorExtractor::NumGauss
int32 NumGauss() const
Definition: ivector-extractor.cc:38

KALDI_LOG
#define KALDI_LOG
Definition: kaldi-error.h:153

kaldi::IvectorExtractTask::~IvectorExtractTask
~IvectorExtractTask()
Definition: ivector-extract.cc:64

kaldi::IvectorExtractTask::extractor_
const IvectorExtractor & extractor_
Definition: ivector-extract.cc:82

kaldi-common.h

kaldi::RunPerSpeaker
int32 RunPerSpeaker(const std::string &ivector_extractor_rxfilename, const IvectorEstimationOptions &opts, bool compute_objf_change, const std::string &spk2utt_rspecifier, const std::string &feature_rspecifier, const std::string &posterior_rspecifier, const std::string &ivector_wspecifier)
Definition: ivector-extract.cc:92

kaldi::IvectorEstimationOptions
Definition: ivector-extractor.h:53

kaldi::TaskSequencerConfig::Register
void Register(OptionsItf *opts)
Definition: kaldi-thread.h:160

kaldi::IvectorExtractTask::utt_
std::string utt_
Definition: ivector-extract.cc:83