ivector-extract.cc
Go to the documentation of this file.
1 // ivectorbin/ivector-extract.cc
2 
3 // Copyright 2013 Daniel Povey
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABILITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 
21 #include "base/kaldi-common.h"
22 #include "util/common-utils.h"
23 #include "gmm/am-diag-gmm.h"
25 #include "util/kaldi-thread.h"
26 
27 namespace kaldi {
28 
29 // This class will be used to parallelize over multiple threads the job
30 // that this program does. The work happens in the operator (), the
31 // output happens in the destructor.
33  public:
35  std::string utt,
36  const Matrix<BaseFloat> &feats,
37  const Posterior &posterior,
38  BaseFloatVectorWriter *writer,
39  double *tot_auxf_change):
40  extractor_(extractor), utt_(utt), feats_(feats), posterior_(posterior),
41  writer_(writer), tot_auxf_change_(tot_auxf_change) { }
42 
43  void operator () () {
44  bool need_2nd_order_stats = false;
45 
48  need_2nd_order_stats);
49 
50  utt_stats.AccStats(feats_, posterior_);
51 
54 
55  if (tot_auxf_change_ != NULL) {
56  double old_auxf = extractor_.GetAuxf(utt_stats, ivector_);
57  extractor_.GetIvectorDistribution(utt_stats, &ivector_, NULL);
58  double new_auxf = extractor_.GetAuxf(utt_stats, ivector_);
59  auxf_change_ = new_auxf - old_auxf;
60  } else {
61  extractor_.GetIvectorDistribution(utt_stats, &ivector_, NULL);
62  }
63  }
65  if (tot_auxf_change_ != NULL) {
66  double T = TotalPosterior(posterior_);
68  KALDI_VLOG(2) << "Auxf change for utterance " << utt_ << " was "
69  << (auxf_change_ / T) << " per frame over " << T
70  << " frames (weighted)";
71  }
72  // We actually write out the offset of the iVectors from the mean of the
73  // prior distribution; this is the form we'll need it in for scoring. (most
74  // formulations of iVectors have zero-mean priors so this is not normally an
75  // issue).
77  KALDI_VLOG(2) << "Ivector norm for utterance " << utt_
78  << " was " << ivector_.Norm(2.0);
80  }
81  private:
83  std::string utt_;
87  double *tot_auxf_change_; // if non-NULL we need the auxf change.
89  double auxf_change_;
90 };
91 
92 int32 RunPerSpeaker(const std::string &ivector_extractor_rxfilename,
93  const IvectorEstimationOptions &opts,
94  bool compute_objf_change,
95  const std::string &spk2utt_rspecifier,
96  const std::string &feature_rspecifier,
97  const std::string &posterior_rspecifier,
98  const std::string &ivector_wspecifier) {
99  IvectorExtractor extractor;
100  ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
101  SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
102  RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
103  RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
104  BaseFloatVectorWriter ivector_writer(ivector_wspecifier);
105 
106  double tot_auxf_change = 0.0, tot_post = 0.0, tot_norm = 0.0;
107  int32 num_utt_done = 0, num_utt_err = 0,
108  num_spk_done = 0, num_spk_err = 0;
109 
110  for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
111  std::string spk = spk2utt_reader.Key();
112  const std::vector<std::string> &utts = spk2utt_reader.Value();
113 
114  bool need_2nd_order_stats = false;
115 
116  IvectorExtractorUtteranceStats utt_stats(extractor.NumGauss(),
117  extractor.FeatDim(),
118  need_2nd_order_stats);
119 
120  for (size_t i = 0; i < utts.size(); i++) {
121  const std::string &utt = utts[i];
122  if (!feature_reader.HasKey(utt)) {
123  KALDI_WARN << "No features present for utterance " << utt;
124  num_utt_err++;
125  continue;
126  }
127  const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
128  if (!posterior_reader.HasKey(utt)) {
129  KALDI_WARN << "No posteriors present for utterance " << utt;
130  num_utt_err++;
131  continue;
132  }
133  Posterior posterior = posterior_reader.Value(utt);
134  if (feats.NumRows() != posterior.size()) {
135  KALDI_WARN << "Posterior has wrong size " << posterior.size()
136  << " vs. feats " << feats.NumRows() << " for "
137  << utt;
138  num_utt_err++;
139  continue;
140  }
141  ScalePosterior(opts.acoustic_weight, &posterior);
142  num_utt_done++;
143  utt_stats.AccStats(feats, posterior);
144  }
145 
146  if (utt_stats.NumFrames() == 0.0) {
147  KALDI_WARN << "No stats accumulated for speaker " << spk;
148  num_spk_err++;
149  continue;
150  } else {
151  if (opts.max_count > 0 && utt_stats.NumFrames() > opts.max_count) {
152  double scale = opts.max_count / utt_stats.NumFrames();
153  utt_stats.Scale(scale);
154  KALDI_LOG << "Scaling stats for speaker " << spk << " by scale "
155  << scale << " due to --max-count=" << opts.max_count;
156  }
157 
158  Vector<double> ivector(extractor.IvectorDim());
159  ivector(0) = extractor.PriorOffset();
160 
161  if (compute_objf_change) {
162  double old_auxf = extractor.GetAuxf(utt_stats, ivector);
163  extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
164  double new_auxf = extractor.GetAuxf(utt_stats, ivector);
165  double auxf_change = new_auxf - old_auxf;
166 
167  KALDI_LOG << "Auxf change for speaker " << spk << " was "
168  << (auxf_change / utt_stats.NumFrames()) << " per frame, over "
169  << utt_stats.NumFrames() << " frames (weighted).";
170  tot_auxf_change += auxf_change;
171  } else {
172  extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
173  }
174  // We actually write out the offset of the iVectors from the mean of the
175  // prior distribution; this is the form we'll need it in for scoring and
176  // as a feature for neural nets. (most formulations of iVectors have
177  // zero-mean priors so this is not normally an issue).
178  ivector(0) -= extractor.PriorOffset();
179  KALDI_LOG << "Ivector norm for speaker " << spk
180  << " was " << ivector.Norm(2.0);
181 
182  tot_norm += ivector.Norm(2.0) * utt_stats.NumFrames();
183  tot_post += utt_stats.NumFrames();
184  num_spk_done++;
185  Vector<BaseFloat> ivector_flt(ivector);
186  ivector_writer.Write(spk, ivector_flt);
187  }
188  }
189 
190  KALDI_LOG << "Done " << num_spk_done << " speakers; " << num_spk_err
191  << " with errors. " << num_utt_done << " utterances "
192  << "were processed, " << num_utt_err << " with errors.";
193  if (tot_post != 0.0) {
194  if (compute_objf_change) {
195  KALDI_LOG << "Overall weighted-average objective function improvement was "
196  << (tot_auxf_change / tot_post) << " over " << tot_post
197  << " frames (weighted)";
198  }
199  KALDI_LOG << "Average iVector norm (weighted by frames) was "
200  << (tot_norm / tot_post) << " over " << tot_post
201  << " frames (weighted)";
202  }
203  return (num_spk_done != 0 ? 0 : 1);
204 }
205 
206 }
207 
208 
209 
210 int main(int argc, char *argv[]) {
211  using namespace kaldi;
212  typedef kaldi::int32 int32;
213  typedef kaldi::int64 int64;
214  try {
215  const char *usage =
216  "Extract iVectors for utterances, using a trained iVector extractor,\n"
217  "and features and Gaussian-level posteriors\n"
218  "Usage: ivector-extract [options] <model-in> <feature-rspecifier> "
219  "<posteriors-rspecifier> <ivector-wspecifier>\n"
220  "e.g.: \n"
221  " fgmm-global-gselect-to-post 1.ubm '$feats' 'ark:gunzip -c gselect.1.gz|' ark:- | \\\n"
222  " ivector-extract final.ie '$feats' ark,s,cs:- ark,t:ivectors.1.ark\n";
223 
224  ParseOptions po(usage);
225  bool compute_objf_change = true;
227  std::string spk2utt_rspecifier;
228  TaskSequencerConfig sequencer_config;
229  po.Register("compute-objf-change", &compute_objf_change,
230  "If true, compute the change in objective function from using "
231  "nonzero iVector (a potentially useful diagnostic). Combine "
232  "with --verbose=2 for per-utterance information");
233  po.Register("spk2utt", &spk2utt_rspecifier, "Supply this option if you "
234  "want iVectors to be output at the per-speaker level, estimated "
235  "using stats accumulated from multiple utterances. Note: this "
236  "is not the normal way iVectors are obtained for speaker-id. "
237  "This option will cause the program to ignore the --num-threads "
238  "option.");
239 
240  opts.Register(&po);
241  sequencer_config.Register(&po);
242 
243  po.Read(argc, argv);
244 
245  if (po.NumArgs() != 4) {
246  po.PrintUsage();
247  exit(1);
248  }
249 
250  std::string ivector_extractor_rxfilename = po.GetArg(1),
251  feature_rspecifier = po.GetArg(2),
252  posterior_rspecifier = po.GetArg(3),
253  ivectors_wspecifier = po.GetArg(4);
254 
255 
256  if (spk2utt_rspecifier.empty()) {
257  // g_num_threads affects how ComputeDerivedVars is called when we read the
258  // extractor.
259  g_num_threads = sequencer_config.num_threads;
260  IvectorExtractor extractor;
261  ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
262 
263  double tot_auxf_change = 0.0, tot_t = 0.0;
264  int32 num_done = 0, num_err = 0;
265 
266  SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
267  RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
268  BaseFloatVectorWriter ivector_writer(ivectors_wspecifier);
269 
270  {
271  TaskSequencer<IvectorExtractTask> sequencer(sequencer_config);
272  for (; !feature_reader.Done(); feature_reader.Next()) {
273  std::string utt = feature_reader.Key();
274  if (!posterior_reader.HasKey(utt)) {
275  KALDI_WARN << "No posteriors for utterance " << utt;
276  num_err++;
277  continue;
278  }
279  const Matrix<BaseFloat> &mat = feature_reader.Value();
280  Posterior posterior = posterior_reader.Value(utt);
281 
282  if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
283  KALDI_WARN << "Size mismatch between posterior " << posterior.size()
284  << " and features " << mat.NumRows() << " for utterance "
285  << utt;
286  num_err++;
287  continue;
288  }
289 
290  double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL );
291 
292  double this_t = opts.acoustic_weight * TotalPosterior(posterior),
293  max_count_scale = 1.0;
294  if (opts.max_count > 0 && this_t > opts.max_count) {
295  max_count_scale = opts.max_count / this_t;
296  KALDI_LOG << "Scaling stats for utterance " << utt << " by scale "
297  << max_count_scale << " due to --max-count="
298  << opts.max_count;
299  this_t = opts.max_count;
300  }
301  ScalePosterior(opts.acoustic_weight * max_count_scale,
302  &posterior);
303  // note: now, this_t == sum of posteriors.
304 
305  sequencer.Run(new IvectorExtractTask(extractor, utt, mat, posterior,
306  &ivector_writer, auxf_ptr));
307 
308  tot_t += this_t;
309  num_done++;
310  }
311  // Destructor of "sequencer" will wait for any remaining tasks.
312  }
313 
314  KALDI_LOG << "Done " << num_done << " files, " << num_err
315  << " with errors. Total (weighted) frames " << tot_t;
316  if (compute_objf_change)
317  KALDI_LOG << "Overall average objective-function change from estimating "
318  << "ivector was " << (tot_auxf_change / tot_t) << " per frame "
319  << " over " << tot_t << " (weighted) frames.";
320 
321  return (num_done != 0 ? 0 : 1);
322  } else {
323  KALDI_ASSERT(sequencer_config.num_threads == 1 &&
324  "--spk2utt option is incompatible with --num-threads option");
325  return RunPerSpeaker(ivector_extractor_rxfilename,
326  opts,
327  compute_objf_change,
328  spk2utt_rspecifier,
329  feature_rspecifier,
330  posterior_rspecifier,
331  ivectors_wspecifier);
332  }
333  } catch(const std::exception &e) {
334  std::cerr << e.what();
335  return -1;
336  }
337 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
double PriorOffset() const
The distribution over iVectors, in our formulation, is not centered at zero; its first dimension has ...
IvectorExtractTask(const IvectorExtractor &extractor, std::string utt, const Matrix< BaseFloat > &feats, const Posterior &posterior, BaseFloatVectorWriter *writer, double *tot_auxf_change)
void Run(C *c)
This function takes ownership of the pointer "c", and will delete it in the same sequence as Run was ...
Definition: kaldi-thread.h:190
void Register(OptionsItf *opts)
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
int32 g_num_threads
Definition: kaldi-thread.cc:25
double GetAuxf(const IvectorExtractorUtteranceStats &utt_stats, const VectorBase< double > &mean, const SpMatrix< double > *var=NULL) const
Returns the log-likelihood objective function, summed over frames, for this distribution of iVectors ...
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
Real Norm(Real p) const
Compute the p-th norm of the vector.
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
Definition: kaldi-io.cc:832
BaseFloat TotalPosterior(const Posterior &post)
Returns the total of all the weights in "post".
Definition: posterior.cc:230
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
int main(int argc, char *argv[])
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
Definition: posterior.h:42
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const T & Value(const std::string &key)
BaseFloatVectorWriter * writer_
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
#define KALDI_WARN
Definition: kaldi-error.h:150
Matrix< BaseFloat > feats_
bool HasKey(const std::string &key)
void ScalePosterior(BaseFloat scale, Posterior *post)
Scales the BaseFloat (weight) element in the posterior entries.
Definition: posterior.cc:218
int NumArgs() const
Number of positional parameters (c.f. argc-1).
void AccStats(const MatrixBase< BaseFloat > &feats, const Posterior &post)
void GetIvectorDistribution(const IvectorExtractorUtteranceStats &utt_stats, VectorBase< double > *mean, SpMatrix< double > *var) const
Gets the distribution over ivectors (or at least, a Gaussian approximation to it).
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
These are the stats for a particular utterance, i.e.
#define KALDI_LOG
Definition: kaldi-error.h:153
const IvectorExtractor & extractor_
int32 RunPerSpeaker(const std::string &ivector_extractor_rxfilename, const IvectorEstimationOptions &opts, bool compute_objf_change, const std::string &spk2utt_rspecifier, const std::string &feature_rspecifier, const std::string &posterior_rspecifier, const std::string &ivector_wspecifier)
void Register(OptionsItf *opts)
Definition: kaldi-thread.h:160