ivector-extractor.h
// ivector/ivector-extractor.h

// Copyright 2013-2014    Daniel Povey
//           2015         David Snyder


// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_IVECTOR_IVECTOR_EXTRACTOR_H_
#define KALDI_IVECTOR_IVECTOR_EXTRACTOR_H_

#include <vector>
#include <mutex>
#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
#include "gmm/model-common.h"
#include "gmm/diag-gmm.h"
#include "gmm/full-gmm.h"
#include "itf/options-itf.h"
#include "util/common-utils.h"
#include "hmm/posterior.h"

namespace kaldi {

// Note, throughout this file we use SGMM-type notation because
// that's what I'm comfortable with.
// Dimensions:
//  D is the feature dim (e.g. D = 60)
//  I is the number of Gaussians (e.g. I = 2048)
//  S is the ivector dim (e.g. S = 400)


// Options for estimating iVectors, during both training and test.  Note: the
// "acoustic_weight" is not read by any class declared in this header; it has to
// be applied by calling IvectorExtractorUtteranceStats::Scale() before
// obtaining the iVector.
// The same is true of max_count: it has to be applied by the programs
// themselves, e.g. see ../ivectorbin/ivector-extract.cc.
struct IvectorEstimationOptions {
  double acoustic_weight;
  double max_count;
  IvectorEstimationOptions(): acoustic_weight(1.0), max_count(0.0) {}
  void Register(OptionsItf *opts) {
    opts->Register("acoustic-weight", &acoustic_weight,
                   "Weight on part of auxf that involves the data (e.g. 0.2); "
                   "if this weight is small, the prior will have more effect.");
    opts->Register("max-count", &max_count,
                   "Maximum frame count (affects prior scaling): if >0, the prior "
                   "term will be scaled up after the frame count exceeds this "
                   "value.  Note that this count is considered after posterior "
                   "scaling (e.g. --acoustic-weight option, or scale argument to "
                   "scale-post), so you would normally use a cutoff 10 times "
                   "smaller than the corresponding number of frames.");
  }
};
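
// Example sketch of how these options are typically wired up and applied by a
// program such as ../ivectorbin/ivector-extract.cc.  The variables "po",
// "usage" and "utt_stats" are hypothetical; the point is that the options are
// only declared here, and the caller applies them itself:
//
//   ParseOptions po(usage);
//   IvectorEstimationOptions opts;
//   opts.Register(&po);
//   po.Read(argc, argv);
//   ...
//   utt_stats.Scale(opts.acoustic_weight);  // apply the acoustic weight to the
//                                           // stats before estimating the iVector.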


class IvectorExtractor;

/// These are the stats for a particular utterance, i.e. the sufficient
/// statistics for estimating an iVector.
class IvectorExtractorUtteranceStats {
 public:
  IvectorExtractorUtteranceStats(int32 num_gauss, int32 feat_dim,
                                 bool need_2nd_order_stats):
      gamma_(num_gauss), X_(num_gauss, feat_dim) {
    if (need_2nd_order_stats) {
      S_.resize(num_gauss);
      for (int32 i = 0; i < num_gauss; i++)
        S_[i].Resize(feat_dim);
    }
  }

  void AccStats(const MatrixBase<BaseFloat> &feats,
                const Posterior &post);

  void Scale(double scale);  // Used to apply acoustic scale.

  double NumFrames() { return gamma_.Sum(); }

 protected:
  friend class IvectorExtractor;
  friend class IvectorExtractorStats;
  Vector<double> gamma_;  // zeroth-order stats (summed posteriors), dimension [I]
  Matrix<double> X_;      // first-order stats, dimension [I][D]
  std::vector<SpMatrix<double> > S_;  // 2nd-order stats, dimension [I][D][D],
                                      // if required.
};


struct IvectorExtractorOptions {
  int32 ivector_dim;
  int32 num_iters;
  bool use_weights;
  IvectorExtractorOptions(): ivector_dim(400), num_iters(2),
                             use_weights(true) { }
  void Register(OptionsItf *opts) {
    opts->Register("num-iters", &num_iters, "Number of iterations in "
                   "iVector estimation (>1 needed due to weights)");
    opts->Register("ivector-dim", &ivector_dim, "Dimension of iVector");
    opts->Register("use-weights", &use_weights, "If true, regress the "
                   "log-weights on the iVector");
  }
};
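
// Example sketch of constructing an IvectorExtractor (declared below) from
// these options and a full-covariance UBM; "final.ubm" is a placeholder path:
//
//   FullGmm fgmm;
//   ReadKaldiObject("final.ubm", &fgmm);
//   IvectorExtractorOptions extractor_opts;
//   extractor_opts.ivector_dim = 400;
//   IvectorExtractor extractor(extractor_opts, fgmm);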


// Forward declaration.  This class is used together with IvectorExtractor to
// compute iVectors in an online way, so we can update the estimate efficiently
// as we add frames.
class OnlineIvectorEstimationStats;

// Caution: the IvectorExtractor is not the only thing required to get an
// ivector.  We also need to get posteriors from a GMM, typically a FullGmm.
// Typically these will be obtained in a process that involves using a DiagGmm
// for Gaussian selection, followed by getting posteriors from the FullGmm.  To
// keep track of these, we keep them all in the same directory,
// e.g. final.{ubm,dubm,ie}.

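// Example sketch (no Gaussian selection or pruning, so only suitable for small
// tests): computing per-frame posteriors from the FullGmm in the Posterior
// format this header uses.  "feats" and "fgmm" are hypothetical variables:
//
//   Posterior post(feats.NumRows());
//   for (int32 t = 0; t < feats.NumRows(); t++) {
//     Vector<BaseFloat> frame_post(fgmm.NumGauss());
//     fgmm.ComponentPosteriors(feats.Row(t), &frame_post);
//     for (int32 i = 0; i < frame_post.Dim(); i++)
//       if (frame_post(i) > 1.0e-05)  // drop negligible posteriors.
//         post[t].push_back(std::make_pair(i, frame_post(i)));
//   }
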
class IvectorExtractor {
 public:
  friend class IvectorExtractorStats;
  friend class OnlineIvectorEstimationStats;

  IvectorExtractor(): prior_offset_(0.0) { }

  IvectorExtractor(
      const IvectorExtractorOptions &opts,
      const FullGmm &fgmm);

  /// Gets the distribution over iVectors (a Gaussian approximation to the
  /// posterior).  "var" may be NULL if you only need the mean; "mean" and
  /// "var", if supplied, must have dimension IvectorDim().
  void GetIvectorDistribution(
      const IvectorExtractorUtteranceStats &utt_stats,
      VectorBase<double> *mean,
      SpMatrix<double> *var) const;

  /// The distribution over iVectors, in our formulation, is not centered at
  /// zero; its first dimension has a nonzero offset.  This function returns
  /// that offset.
  double PriorOffset() const { return prior_offset_; }

  double GetAuxf(const IvectorExtractorUtteranceStats &utt_stats,
                 const VectorBase<double> &mean,
                 const SpMatrix<double> *var = NULL) const;

  double GetAcousticAuxf(const IvectorExtractorUtteranceStats &utt_stats,
                         const VectorBase<double> &mean,
                         const SpMatrix<double> *var = NULL) const;

  double GetPriorAuxf(const VectorBase<double> &mean,
                      const SpMatrix<double> *var = NULL) const;

  double GetAcousticAuxfVariance(
      const IvectorExtractorUtteranceStats &utt_stats) const;

  double GetAcousticAuxfMean(
      const IvectorExtractorUtteranceStats &utt_stats,
      const VectorBase<double> &mean,
      const SpMatrix<double> *var = NULL) const;

  double GetAcousticAuxfGconst(
      const IvectorExtractorUtteranceStats &utt_stats) const;

  double GetAcousticAuxfWeight(
      const IvectorExtractorUtteranceStats &utt_stats,
      const VectorBase<double> &mean,
      const SpMatrix<double> *var = NULL) const;


  void GetIvectorDistMean(
      const IvectorExtractorUtteranceStats &utt_stats,
      VectorBase<double> *linear,
      SpMatrix<double> *quadratic) const;

  void GetIvectorDistPrior(
      const IvectorExtractorUtteranceStats &utt_stats,
      VectorBase<double> *linear,
      SpMatrix<double> *quadratic) const;

  void GetIvectorDistWeight(
      const IvectorExtractorUtteranceStats &utt_stats,
      const VectorBase<double> &mean,
      VectorBase<double> *linear,
      SpMatrix<double> *quadratic) const;

  // Note: the function GetStats no longer exists due to code refactoring.
  // Instead of this->GetStats(feats, posterior, &utt_stats), call
  // utt_stats.AccStats(feats, posterior).  (See also the usage sketch just
  // after this class.)

  int32 FeatDim() const;
  int32 IvectorDim() const;
  int32 NumGauss() const;
  bool IvectorDependentWeights() const { return w_.NumRows() != 0; }
  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary);

  // Note: we allow the default assignment and copy operators
  // because they do what we want.
 protected:
  void ComputeDerivedVars();
  void ComputeDerivedVars(int32 i);

  // Imagine we'll project the iVectors with transformation T, so apply T^{-1}
  // where necessary to keep the model equivalent.  Used to keep unit variance
  // (like prior re-estimation).
  void TransformIvectors(const MatrixBase<double> &T,
                         double new_prior_offset);


  /// Weight projection vectors, if used.  Dimension is [I][S].
  Matrix<double> w_;

  /// If we are not using weight-projection vectors, stores the Gaussian
  /// mixture weights from the UBM.
  Vector<double> w_vec_;

  /// Ivector-subspace projection matrices, dimension is [I][D][S].
  std::vector<Matrix<double> > M_;

  /// Inverse variances of speaker-adapted model, dimension [I][D][D].
  std::vector<SpMatrix<double> > Sigma_inv_;

  /// 1st dim of the prior over the ivector has an offset, so it is not zero.
  double prior_offset_;

  // Below are *derived variables* that can be computed from the
  // variables above.

  /// The constant term in the log-likelihood of each Gaussian (not counting
  /// any weight).
  Vector<double> gconsts_;

  /// U_i = M_i^T Sigma_i^{-1} M_i is a quantity that comes up in iVector
  /// estimation.
  Matrix<double> U_;

  /// The product of Sigma_inv_[i] with M_[i].
  std::vector<Matrix<double> > Sigma_inv_M_;

 private:
  // var <-- quadratic_term^{-1}, but done carefully, first flooring eigenvalues
  // of quadratic_term to 1.0, which mathematically is the least they can be,
  // due to the prior term.
  static void InvertWithFlooring(const SpMatrix<double> &quadratic_term,
                                 SpMatrix<double> *var);
};
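
// Example sketch of extracting a single utterance-level iVector with the class
// above, assuming features "feats", posteriors "post" (see the posterior sketch
// earlier) and a trained "extractor" (hypothetical names):
//
//   IvectorExtractorUtteranceStats utt_stats(extractor.NumGauss(),
//                                            extractor.FeatDim(),
//                                            false /* no 2nd-order stats */);
//   utt_stats.AccStats(feats, post);
//   utt_stats.Scale(0.2);  // optional: --acoustic-weight, see IvectorEstimationOptions.
//   Vector<double> ivector(extractor.IvectorDim());
//   extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
//   ivector(0) -= extractor.PriorOffset();  // if a zero-centered iVector is wanted.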


/// This class helps us to efficiently estimate iVectors in situations where
/// the data is coming in frame by frame.
class OnlineIvectorEstimationStats {
 public:
  // Search above for max_count to see an explanation; if nonzero, it will
  // put a higher weight on the prior (vs. the stats) once the count passes
  // that value.
  OnlineIvectorEstimationStats(int32 ivector_dim,
                               BaseFloat prior_offset,
                               BaseFloat max_count);

  OnlineIvectorEstimationStats(const OnlineIvectorEstimationStats &other);


  // Accumulate stats for one frame.
  void AccStats(const IvectorExtractor &extractor,
                const VectorBase<BaseFloat> &feature,
                const std::vector<std::pair<int32, BaseFloat> > &gauss_post);

  // Accumulate stats for a sequence (or collection) of frames.
  void AccStats(const IvectorExtractor &extractor,
                const MatrixBase<BaseFloat> &features,
                const std::vector<std::vector<std::pair<int32, BaseFloat> > > &gauss_post);


  int32 IvectorDim() const { return linear_term_.Dim(); }

  // Gets the current estimate of the iVector; *ivector must have dimension
  // IvectorDim() at entry.  num_cg_iters limits the conjugate-gradient
  // iterations used (see the comment above EstimateIvectorsOnline, below); a
  // previous estimate left in *ivector gives the optimization a better
  // starting point.
  void GetIvector(int32 num_cg_iters,
                  VectorBase<double> *ivector) const;

  double NumFrames() const { return num_frames_; }

  double PriorOffset() const { return prior_offset_; }

  // Returns the objective-function change per frame from the "default" iVector
  // to the given value.
  double ObjfChange(const VectorBase<double> &ivector) const;

  double Count() const { return num_frames_; }

  // Scales the stats, as if we had seen "scale" times as many frames.
  void Scale(double scale);

  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary);

  // Override the default assignment operator.
  OnlineIvectorEstimationStats &operator = (const OnlineIvectorEstimationStats &other) {
    this->prior_offset_ = other.prior_offset_;
    this->max_count_ = other.max_count_;
    this->num_frames_ = other.num_frames_;
    this->quadratic_term_ = other.quadratic_term_;
    this->linear_term_ = other.linear_term_;
    return *this;
  }

 protected:
  // Returns the objective function (per frame) evaluated at this iVector value.
  double Objf(const VectorBase<double> &ivector) const;

  // Returns the objective function evaluated at the "default" iVector,
  // i.e. [ prior_offset_, 0, 0, ... ].
  double DefaultObjf() const;

  friend class IvectorExtractor;
  double prior_offset_;
  double max_count_;
  double num_frames_;  // num frames (weighted, if applicable).
  SpMatrix<double> quadratic_term_;
  Vector<double> linear_term_;
};
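
// Example sketch of frame-by-frame (online) estimation with the class above;
// "extractor", "feats" and "post" are hypothetical, and max_count may be 0.0
// (meaning no limit):
//
//   OnlineIvectorEstimationStats online_stats(extractor.IvectorDim(),
//                                             extractor.PriorOffset(),
//                                             max_count);
//   Vector<double> ivector(extractor.IvectorDim());
//   ivector(0) = extractor.PriorOffset();  // a sensible starting point.
//   for (int32 t = 0; t < feats.NumRows(); t++) {
//     online_stats.AccStats(extractor, feats.Row(t), post[t]);
//     online_stats.GetIvector(15, &ivector);  // current estimate, raw (uncentered).
//   }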


// This function obtains, every "ivector_period" frames (e.g. every 10 frames),
// an estimate of the iVector including all frames up to that point.
// This emulates what you could do in an online/streaming algorithm; its use is
// for neural network training in a way that's matched to online decoding.
// [note: I don't believe we are currently using the program,
// ivector-extract-online.cc, that calls this function, in any of the scripts.]
// Caution: this function outputs the raw iVectors, where the first component
// will generally be very positive.  You probably want to subtract PriorOffset()
// from the first element of each row of the output before writing it out.
// For num_cg_iters, we suggest 15.  It can be a positive number (more -> more
// exact, less -> faster), or if it's negative it will do the optimization
// exactly each time, which is slower.
// It returns the objective-function improvement per frame from the "default"
// iVector to the last iVector estimated.
double EstimateIvectorsOnline(
    const Matrix<BaseFloat> &feats,
    const Posterior &post,
    const IvectorExtractor &extractor,
    int32 ivector_period,
    int32 num_cg_iters,
    BaseFloat max_count,
    Matrix<BaseFloat> *ivectors);
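
// Example sketch of calling the function above and centering the output, with
// hypothetical "feats", "post" and "extractor":
//
//   Matrix<BaseFloat> ivectors;
//   EstimateIvectorsOnline(feats, post, extractor,
//                          10,   // ivector_period: one output row per 10 frames.
//                          15,   // num_cg_iters, as suggested above.
//                          0.0,  // max_count: 0 means no limit.
//                          &ivectors);
//   for (int32 r = 0; r < ivectors.NumRows(); r++)
//     ivectors(r, 0) -= extractor.PriorOffset();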


/// Options for IvectorExtractorStats, which is used to update the parameters
/// of IvectorExtractor.
struct IvectorExtractorStatsOptions {
  bool update_variances;
  bool compute_auxf;
  int32 num_samples_for_weights;
  int32 cache_size;

  IvectorExtractorStatsOptions(): update_variances(true),
                                  compute_auxf(true),
                                  num_samples_for_weights(10),
                                  cache_size(100) { }
  void Register(OptionsItf *opts) {
    opts->Register("update-variances", &update_variances, "If true, update the "
                   "Gaussian variances");
    opts->Register("compute-auxf", &compute_auxf, "If true, compute the "
                   "auxiliary functions on training data; can be used to "
                   "debug and check convergence.");
    opts->Register("num-samples-for-weights", &num_samples_for_weights,
                   "Number of samples from iVector distribution to use "
                   "for accumulating stats for weight update.  Must be >1");
    opts->Register("cache-size", &cache_size, "Size of an internal "
                   "cache (not critical, only affects speed/memory)");
  }
};


/// Options for training the IvectorExtractor, e.g. variance flooring.
struct IvectorExtractorEstimationOptions {
  double variance_floor_factor;
  double gaussian_min_count;
  bool diagonalize;
  IvectorExtractorEstimationOptions(): variance_floor_factor(0.1),
                                       gaussian_min_count(100.0),
                                       diagonalize(true) { }
  void Register(OptionsItf *opts) {
    opts->Register("variance-floor-factor", &variance_floor_factor,
                   "Factor that determines variance flooring (we floor each covar "
                   "to this times the global average covariance).");
    opts->Register("gaussian-min-count", &gaussian_min_count,
                   "Minimum total count per Gaussian, below which we refuse to "
                   "update any associated parameters.");
    opts->Register("diagonalize", &diagonalize,
                   "If true, diagonalize the quadratic term in the "
                   "objective function.  This reorders the ivector dimensions "
                   "from most to least important.");
  }
};


/// IvectorExtractorStats is a class used to update the parameters of the
/// ivector extractor.
class IvectorExtractorStats {
 public:
  friend class IvectorExtractor;

  IvectorExtractorStats(): tot_auxf_(0.0), R_num_cached_(0), num_ivectors_(0) { }

  IvectorExtractorStats(const IvectorExtractor &extractor,
                        const IvectorExtractorStatsOptions &stats_opts);

  void Add(const IvectorExtractorStats &other);

  void AccStatsForUtterance(const IvectorExtractor &extractor,
                            const MatrixBase<BaseFloat> &feats,
                            const Posterior &post);

  // This version (intended mainly for testing) works out the Gaussian
  // posteriors from the model.  Returns the total log-like for feats, given
  // the unadapted fgmm.  You'd want to add Gaussian pruning and preselection
  // using the diagonal GMM, for speed, if you used this outside testing code.
  double AccStatsForUtterance(const IvectorExtractor &extractor,
                              const MatrixBase<BaseFloat> &feats,
                              const FullGmm &fgmm);

  void Read(std::istream &is, bool binary, bool add = false);

  void Write(std::ostream &os, bool binary);  // non-const version; relates to cache.

  // const version of Write; may use extra memory if we have stuff cached.
  void Write(std::ostream &os, bool binary) const;

  // Returns the objf improvement per frame.
  double Update(const IvectorExtractorEstimationOptions &opts,
                IvectorExtractor *extractor) const;

  double AuxfPerFrame() { return tot_auxf_ / gamma_.Sum(); }

  void IvectorVarianceDiagnostic(const IvectorExtractor &extractor);

  // Copy constructor.
  explicit IvectorExtractorStats (const IvectorExtractorStats &other);

 protected:


  // This is called by AccStatsForUtterance
  void CommitStatsForUtterance(const IvectorExtractor &extractor,
                               const IvectorExtractorUtteranceStats &utt_stats);

  void CommitStatsForM(const IvectorExtractor &extractor,
                       const IvectorExtractorUtteranceStats &utt_stats,
                       const VectorBase<double> &ivec_mean,
                       const SpMatrix<double> &ivec_var);

  void FlushCache();

  void CommitStatsForSigma(const IvectorExtractor &extractor,
                           const IvectorExtractorUtteranceStats &utt_stats);

  void CommitStatsForWPoint(const IvectorExtractor &extractor,
                            const IvectorExtractorUtteranceStats &utt_stats,
                            const VectorBase<double> &ivector,
                            double weight);


  void CommitStatsForW(const IvectorExtractor &extractor,
                       const IvectorExtractorUtteranceStats &utt_stats,
                       const VectorBase<double> &ivec_mean,
                       const SpMatrix<double> &ivec_var);

  void CommitStatsForPrior(const VectorBase<double> &ivec_mean,
                           const SpMatrix<double> &ivec_var);

  // Updates M.  Returns the objf improvement per frame.
  double UpdateProjections(const IvectorExtractorEstimationOptions &opts,
                           IvectorExtractor *extractor) const;

  // This internally called function returns the objf improvement
  // for this Gaussian index.  Updates one M.
  double UpdateProjection(const IvectorExtractorEstimationOptions &opts,
                          int32 gaussian,
                          IvectorExtractor *extractor) const;

  // Updates the weight projections.  Returns the objf improvement per
  // frame.
  double UpdateWeights(const IvectorExtractorEstimationOptions &opts,
                       IvectorExtractor *extractor) const;

  // Updates the weight projection for one Gaussian index.  Returns the objf
  // improvement for this index.
  double UpdateWeight(const IvectorExtractorEstimationOptions &opts,
                      int32 gaussian,
                      IvectorExtractor *extractor) const;

  // Returns the objf improvement per frame.
  double UpdateVariances(const IvectorExtractorEstimationOptions &opts,
                         IvectorExtractor *extractor) const;


  // Updates the prior; returns obj improvement per frame.
  double UpdatePrior(const IvectorExtractorEstimationOptions &opts,
                     IvectorExtractor *extractor) const;

  // Called from UpdatePrior, separating out some code that
  // computes likelihood changes.
  double PriorDiagnostics(double old_prior_offset) const;


  void CheckDims(const IvectorExtractor &extractor) const;

  IvectorExtractorStatsOptions config_;

  double tot_auxf_;

  /// This mutex guards gamma_ and Y_ (for multi-threaded update).
  std::mutex gamma_Y_lock_;

  /// Total occupation count for each Gaussian index (zeroth-order stats).
  Vector<double> gamma_;

  /// Stats Y_i for estimating projections M.
  std::vector<Matrix<double> > Y_;

  /// This mutex guards R_ (for multi-threaded update).
  std::mutex R_lock_;

  /// R_i, quadratic term for ivector subspace (M matrix) estimation.
  Matrix<double> R_;

  /// This mutex guards R_num_cached_, R_gamma_cache_ and R_ivec_scatter_cache_
  /// (for multi-threaded update).
  std::mutex R_cache_lock_;

  /// To avoid too-frequent rank-1 updates of R_, which are slow, we cache some
  /// quantities here.
  int32 R_num_cached_;
  /// dimension: [num-to-cache][I]
  Matrix<double> R_gamma_cache_;
  /// dimension: [num-to-cache][S*(S+1)/2]
  Matrix<double> R_ivec_scatter_cache_;

  /// This mutex guards Q_ and G_ (for multi-threaded update).
  std::mutex weight_stats_lock_;

  /// Q_ is like R_ (with the same dimensions), except used for weight
  /// estimation.
  Matrix<double> Q_;

  /// G_ is the linear term in the weight projection matrix w_.
  Matrix<double> G_;

  /// This mutex guards S_ (for multi-threaded update).
  std::mutex variance_stats_lock_;

  /// S_{i}, raw second-order stats per Gaussian which we will use to update
  /// the variances Sigma_inv_.
  std::vector< SpMatrix<double> > S_;


  /// This mutex guards num_ivectors_, ivector_sum_ and ivector_scatter_ (for
  /// multi-threaded update).
  std::mutex prior_stats_lock_;

  /// Count of the number of iVectors we trained on.
  double num_ivectors_;

  /// Sum of all the iVector means.  Needed for prior re-estimation.
  Vector<double> ivector_sum_;

  /// Second-order stats for the iVectors.  Needed for prior re-estimation.
  SpMatrix<double> ivector_scatter_;

 private:
  // Computes an orthogonal matrix A from the iVector transform T such that
  // T' = A*T is an alternative transform which diagonalizes the quadratic
  // term in the iVector estimation objective function (see the "diagonalize"
  // option above); this reorders the iVector dimensions from most to least
  // important.
  void GetOrthogonalIvectorTransform(const SubMatrix<double> &T,
                                     IvectorExtractor *extractor,
                                     Matrix<double> *A) const;

  IvectorExtractorStats &operator = (const IvectorExtractorStats &other);  // Disallow.
};
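
// Example sketch of one training iteration using the classes above; the
// rspecifiers and the starting "extractor" are hypothetical, and this is
// roughly what binaries like ivector-extractor-acc-stats and
// ivector-extractor-est do, repeated over several iterations:
//
//   IvectorExtractorStatsOptions stats_opts;
//   IvectorExtractorStats stats(extractor, stats_opts);
//   SequentialBaseFloatMatrixReader feats_reader(feats_rspecifier);
//   RandomAccessPosteriorReader post_reader(post_rspecifier);
//   for (; !feats_reader.Done(); feats_reader.Next()) {
//     const Matrix<BaseFloat> &feats = feats_reader.Value();
//     const Posterior &post = post_reader.Value(feats_reader.Key());
//     stats.AccStatsForUtterance(extractor, feats, post);
//   }
//   IvectorExtractorEstimationOptions update_opts;
//   stats.Update(update_opts, &extractor);  // returns the auxf improvement per frame.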



}  // namespace kaldi


#endif