online-ivector-feature.cc
Go to the documentation of this file.
1 // online2/online-ivector-feature.cc
2 
3 // Copyright 2014 Daniel Povey
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
21 
22 namespace kaldi {
23 
25  const OnlineIvectorExtractionConfig &config) {
26  Init(config);
27 }
28 
30  const OnlineIvectorExtractionConfig &config) {
33  num_gselect = config.num_gselect;
34  min_post = config.min_post;
36  max_count = config.max_count;
37  num_cg_iters = config.num_cg_iters;
41  KALDI_WARN << "--greedy-ivector-extractor=true implies "
42  << "--use-most-recent-ivector=true";
44  }
46 
47  std::string note = "(note: this may be needed "
48  "in the file supplied to --ivector-extractor-config)";
49  if (config.lda_mat_rxfilename == "")
50  KALDI_ERR << "--lda-matrix option must be set " << note;
52  if (config.global_cmvn_stats_rxfilename == "")
53  KALDI_ERR << "--global-cmvn-stats option must be set " << note;
55  if (config.cmvn_config_rxfilename == "")
56  KALDI_ERR << "--cmvn-config option must be set " << note;
58  if (config.splice_config_rxfilename == "")
59  KALDI_ERR << "--splice-config option must be set " << note;
61  if (config.diag_ubm_rxfilename == "")
62  KALDI_ERR << "--diag-ubm option must be set " << note;
64  if (config.ivector_extractor_rxfilename == "")
65  KALDI_ERR << "--ivector-extractor option must be set " << note;
67  this->Check();
68 }
69 
72  full_dim = lda_mat.NumCols();
73  if (!(full_dim % num_splice == 0 || full_dim % num_splice == 1)){
74  KALDI_WARN << "Error getting expected feature dimension: full-dim = "
75  << full_dim << ", num-splice = " << num_splice;
76  }
77  return full_dim / num_splice;
78 }
79 
82  int32 base_feat_dim = global_cmvn_stats.NumCols() - 1,
84  spliced_input_dim = base_feat_dim * num_splice;
85 
86  KALDI_ASSERT(lda_mat.NumCols() == spliced_input_dim ||
87  lda_mat.NumCols() == spliced_input_dim + 1);
92  KALDI_ASSERT(min_post < 0.5);
93  // posterior scale more than one does not really make sense.
96 }
97 
98 // The class constructed in this way should never be used.
102  max_remembered_frames(0) { }
103 
106  cmvn_state(other.cmvn_state), ivector_stats(other.ivector_stats) { }
107 
108 
110  BaseFloat max_remembered_frames, BaseFloat posterior_scale) {
111  KALDI_ASSERT(max_remembered_frames >= 0);
114  int32 feat_dim = cmvn_state.speaker_cmvn_stats.NumCols() - 1;
116  if (count > max_remembered_frames)
117  cmvn_state.speaker_cmvn_stats.Scale(max_remembered_frames / count);
118  }
119  // the stats for the iVector have been scaled by info_.posterior_scale,
120  // so we need to take this into account when setting the target count.
121  BaseFloat max_remembered_frames_scaled =
122  max_remembered_frames * posterior_scale;
123  if (ivector_stats.Count() > max_remembered_frames_scaled) {
124  ivector_stats.Scale(max_remembered_frames_scaled /
125  ivector_stats.Count());
126  }
127 }
128 
// Serializes this adaptation state to the stream 'os'.
//
// The output is, in order: an opening
// "<OnlineIvectorExtractorAdaptationState>" token, a "<CmvnState>" token
// followed by cmvn_state, an "<IvectorStats>" token followed by ivector_stats,
// and finally the closing "</OnlineIvectorExtractorAdaptationState>" token.
//
//  os      stream to write to.
//  binary  forwarded unchanged to every WriteToken() / Write() call below;
//          it selects binary vs. text mode for those calls.
129 void OnlineIvectorExtractorAdaptationState::Write(std::ostream &os, bool binary) const {
130  WriteToken(os, binary, "<OnlineIvectorExtractorAdaptationState>"); // magic string.
  // Each member is preceded by its own tag and then writes itself.
131  WriteToken(os, binary, "<CmvnState>");
132  cmvn_state.Write(os, binary);
133  WriteToken(os, binary, "<IvectorStats>");
134  ivector_stats.Write(os, binary);
  // Closing tag marks the end of the object.
135  WriteToken(os, binary, "</OnlineIvectorExtractorAdaptationState>");
136 }
137 
// Deserializes this adaptation state from the stream 'is'.
//
// Tokens and members are consumed in exactly the order that Write() emits
// them: opening token, "<CmvnState>" + cmvn_state, "<IvectorStats>" +
// ivector_stats, closing token.  Each tag is checked with ExpectToken(),
// which (per its documentation) throws an exception if the expected token is
// not the next thing in the stream.
//
//  is      stream to read from.
//  binary  forwarded unchanged to every ExpectToken() / Read() call below;
//          it must match the mode the data was written in.
138 void OnlineIvectorExtractorAdaptationState::Read(std::istream &is, bool binary) {
139  ExpectToken(is, binary, "<OnlineIvectorExtractorAdaptationState>"); // magic string.
  // Each member is preceded by its own tag and then reads itself.
140  ExpectToken(is, binary, "<CmvnState>");
141  cmvn_state.Read(is, binary);
142  ExpectToken(is, binary, "<IvectorStats>");
143  ivector_stats.Read(is, binary);
  // Closing tag must follow immediately after the iVector stats.
144  ExpectToken(is, binary, "</OnlineIvectorExtractorAdaptationState>");
145 }
146 
148  return info_.extractor.IvectorDim();
149 }
150 
152  // Note: it might be more logical to return, say, lda_->IsLastFrame()
153  // since this is the feature the iVector extractor directly consumes,
154  // but it will give the same answer as base_->IsLastFrame() anyway.
155  // [note: the splicing component pads at begin and end so it always
156  // returns the same number of frames as its input.]
157  return base_->IsLastFrame(frame);
158 }
159 
161  KALDI_ASSERT(lda_ != NULL);
162  return lda_->NumFramesReady();
163 }
164 
166  return lda_->FrameShiftInSeconds();
167 }
168 
170  const std::vector<std::pair<int32, BaseFloat> > &delta_weights) {
171  // add the elements to delta_weights_, which is a priority queue. The top
172  // element of the priority queue is the lowest numbered frame (we ensured this
173  // by making the comparison object std::greater instead of std::less). Adding
174  // elements from top (lower-numbered frames) to bottom (higher-numbered
175  // frames) should be most efficient, assuming it's a heap internally. So we
176  // go forward not backward in delta_weights while adding.
177  for (size_t i = 0; i < delta_weights.size(); i++) {
178  delta_weights_.push(delta_weights[i]);
179  int32 frame = delta_weights[i].first;
180  KALDI_ASSERT(frame >= 0);
181  if (frame > most_recent_frame_with_weight_)
182  most_recent_frame_with_weight_ = frame;
183  }
184  delta_weights_provided_ = true;
185 }
186 
187 
189  BaseFloat min_post = info_.min_post;
190  BaseFloat abs_weight = fabs(weight);
191  // If we return 0.99, it will have the same effect as just picking the
192  // most probable Gaussian on that frame.
193  if (abs_weight == 0.0)
194  return 0.99; // I don't anticipate reaching here.
195  min_post /= abs_weight;
196  if (min_post > 0.99)
197  min_post = 0.99;
198  return min_post;
199 }
200 
202  const std::vector<std::pair<int32, BaseFloat> > &frame_weights_in) {
203 
204  std::vector<std::pair<int32, BaseFloat> > frame_weights(frame_weights_in);
205  // Remove duplicates of frames.
206  MergePairVectorSumming(&frame_weights);
207 
208  if (frame_weights.empty())
209  return;
210 
211  int32 num_frames = static_cast<int32>(frame_weights.size());
212  int32 feat_dim = lda_normalized_->Dim();
213  Matrix<BaseFloat> feats(num_frames, feat_dim, kUndefined),
214  log_likes;
215 
216  std::vector<int32> frames;
217  frames.reserve(frame_weights.size());
218  for (int32 i = 0; i < num_frames; i++)
219  frames.push_back(frame_weights[i].first);
220  lda_normalized_->GetFrames(frames, &feats);
221 
222  info_.diag_ubm.LogLikelihoods(feats, &log_likes);
223 
224  // "posteriors" stores, for each frame index in the range of frames, the
225  // pruned posteriors for the Gaussians in the UBM.
226  std::vector<std::vector<std::pair<int32, BaseFloat> > > posteriors(num_frames);
227  for (int32 i = 0; i < num_frames; i++) {
228  std::vector<std::pair<int32, BaseFloat> > &posterior = posteriors[i];
229  BaseFloat weight = frame_weights[i].second;
230  if (weight != 0.0) {
231  tot_ubm_loglike_ += weight *
232  VectorToPosteriorEntry(log_likes.Row(i), info_.num_gselect,
233  GetMinPost(weight), &posterior);
234  for (size_t j = 0; j < posterior.size(); j++)
235  posterior[j].second *= info_.posterior_scale * weight;
236  }
237  }
238 
239  if (! info_.online_cmvn_iextractor) {
240  lda_->GetFrames(frames, &feats); // default, get features without OnlineCmvn
241  } else {
242  lda_normalized_->GetFrames(frames, &feats); // get features with OnlineCmvn
243  }
244  ivector_stats_.AccStats(info_.extractor, feats, posteriors);
245 }
246 
247 
249  KALDI_ASSERT(frame >= 0 && frame < this->NumFramesReady() &&
250  !delta_weights_provided_);
251  updated_with_no_delta_weights_ = true;
252 
253  int32 ivector_period = info_.ivector_period;
254  int32 num_cg_iters = info_.num_cg_iters;
255 
256  std::vector<std::pair<int32, BaseFloat> > frame_weights;
257 
258  for (; num_frames_stats_ <= frame; num_frames_stats_++) {
259  int32 t = num_frames_stats_;
260  BaseFloat frame_weight = 1.0;
261  frame_weights.push_back(std::pair<int32, BaseFloat>(t, frame_weight));
262  if ((!info_.use_most_recent_ivector && t % ivector_period == 0) ||
263  (info_.use_most_recent_ivector && t == frame)) {
264  // The call below to UpdateStatsForFrames() is equivalent to doing, for
265  // all valid indexes i:
266  // UpdateStatsForFrame(cur_start_frame + i, frame_weights[i])
267  UpdateStatsForFrames(frame_weights);
268  frame_weights.clear();
269  ivector_stats_.GetIvector(num_cg_iters, &current_ivector_);
270  if (!info_.use_most_recent_ivector) { // need to cache iVectors.
271  int32 ivec_index = t / ivector_period;
272  KALDI_ASSERT(ivec_index == static_cast<int32>(ivectors_history_.size()));
273  ivectors_history_.push_back(new Vector<BaseFloat>(current_ivector_));
274  }
275  }
276  }
277  if (!frame_weights.empty())
278  UpdateStatsForFrames(frame_weights);
279 }
280 
282  KALDI_ASSERT(frame >= 0 && frame < this->NumFramesReady() &&
283  delta_weights_provided_ &&
284  ! updated_with_no_delta_weights_ &&
285  frame <= most_recent_frame_with_weight_);
286  bool debug_weights = false;
287 
288  int32 ivector_period = info_.ivector_period;
289  int32 num_cg_iters = info_.num_cg_iters;
290 
291  std::vector<std::pair<int32, BaseFloat> > frame_weights;
292  frame_weights.reserve(delta_weights_.size());
293 
294  for (; num_frames_stats_ <= frame; num_frames_stats_++) {
295  int32 t = num_frames_stats_;
296  // Instead of just updating frame t, we update all frames that need updating
297  // with index <= t, in case old frames were reclassified as silence/nonsilence.
298  while (!delta_weights_.empty() &&
299  delta_weights_.top().first <= t) {
300  int32 frame = delta_weights_.top().first;
301  BaseFloat weight = delta_weights_.top().second;
302  frame_weights.push_back(delta_weights_.top());
303  delta_weights_.pop();
304  if (debug_weights) {
305  if (current_frame_weight_debug_.size() <= frame)
306  current_frame_weight_debug_.resize(frame + 1, 0.0);
307  current_frame_weight_debug_[frame] += weight;
308  }
309  }
310  if ((!info_.use_most_recent_ivector && t % ivector_period == 0) ||
311  (info_.use_most_recent_ivector && t == frame)) {
312  UpdateStatsForFrames(frame_weights);
313  frame_weights.clear();
314  ivector_stats_.GetIvector(num_cg_iters, &current_ivector_);
315  if (!info_.use_most_recent_ivector) { // need to cache iVectors.
316  int32 ivec_index = t / ivector_period;
317  KALDI_ASSERT(ivec_index == static_cast<int32>(ivectors_history_.size()));
318  ivectors_history_.push_back(new Vector<BaseFloat>(current_ivector_));
319  }
320  }
321  }
322  if (!frame_weights.empty())
323  UpdateStatsForFrames(frame_weights);
324 }
325 
326 
328  VectorBase<BaseFloat> *feat) {
329  int32 frame_to_update_until = (info_.greedy_ivector_extractor ?
330  lda_->NumFramesReady() - 1 : frame);
331  if (!delta_weights_provided_) // No silence weighting.
332  UpdateStatsUntilFrame(frame_to_update_until);
333  else
334  UpdateStatsUntilFrameWeighted(frame_to_update_until);
335 
336  KALDI_ASSERT(feat->Dim() == this->Dim());
337 
338  if (info_.use_most_recent_ivector) {
339  KALDI_VLOG(5) << "due to --use-most-recent-ivector=true, using iVector "
340  << "from frame " << num_frames_stats_ << " for frame "
341  << frame;
342  // use the most recent iVector we have, even if 'frame' is significantly in
343  // the past.
344  feat->CopyFromVec(current_ivector_);
345  // Subtract the prior-mean from the first dimension of the output feature so
346  // it's approximately zero-mean.
347  (*feat)(0) -= info_.extractor.PriorOffset();
348  } else {
349  int32 i = frame / info_.ivector_period; // rounds down.
350  // if the following fails, UpdateStatsUntilFrame would have a bug.
351  KALDI_ASSERT(static_cast<size_t>(i) < ivectors_history_.size());
352  feat->CopyFromVec(*(ivectors_history_[i]));
353  (*feat)(0) -= info_.extractor.PriorOffset();
354  }
355 }
356 
358  if (num_frames_stats_ == 0) {
359  KALDI_VLOG(3) << "Processed no data.";
360  } else {
361  KALDI_VLOG(3) << "UBM log-likelihood was "
362  << (tot_ubm_loglike_ / NumFrames())
363  << " per frame, over " << NumFrames()
364  << " frames.";
365 
366  Vector<BaseFloat> temp_ivector(current_ivector_);
367  temp_ivector(0) -= info_.extractor.PriorOffset();
368 
369  KALDI_VLOG(2) << "By the end of the utterance, objf change/frame "
370  << "from estimating iVector (vs. default) was "
371  << ivector_stats_.ObjfChange(current_ivector_)
372  << " and iVector length was "
373  << temp_ivector.Norm(2.0);
374  }
375 }
376 
378  PrintDiagnostics();
379  // Delete objects owned here.
380  for (size_t i = 0; i < to_delete_.size(); i++)
381  delete to_delete_[i];
382  for (size_t i = 0; i < ivectors_history_.size(); i++)
383  delete ivectors_history_[i];
384 }
385 
387  OnlineIvectorExtractorAdaptationState *adaptation_state) const {
388  // Note: the following call will work even if cmvn_->NumFramesReady() == 0; in
389  // that case it will return the unmodified adaptation state that cmvn_ was
390  // initialized with.
391  cmvn_->GetState(cmvn_->NumFramesReady() - 1,
392  &(adaptation_state->cmvn_state));
393  adaptation_state->ivector_stats = ivector_stats_;
394  adaptation_state->LimitFrames(info_.max_remembered_frames,
395  info_.posterior_scale);
396 }
397 
398 
400  const OnlineIvectorExtractionInfo &info,
401  OnlineFeatureInterface *base_feature):
402  info_(info),
403  base_(base_feature),
404  ivector_stats_(info_.extractor.IvectorDim(),
405  info_.extractor.PriorOffset(),
406  info_.max_count),
407  num_frames_stats_(0), delta_weights_provided_(false),
408  updated_with_no_delta_weights_(false),
409  most_recent_frame_with_weight_(-1), tot_ubm_loglike_(0.0) {
410  info.Check();
411  KALDI_ASSERT(base_feature != NULL);
412  OnlineFeatureInterface *splice_feature = new OnlineSpliceFrames(info_.splice_opts, base_feature);
413  to_delete_.push_back(splice_feature);
414  OnlineFeatureInterface *lda_feature = new OnlineTransform(info.lda_mat, splice_feature);
415  to_delete_.push_back(lda_feature);
416  OnlineFeatureInterface *lda_cache_feature = new OnlineCacheFeature(lda_feature);
417  lda_ = lda_cache_feature;
418  to_delete_.push_back(lda_cache_feature);
419 
420 
421  OnlineCmvnState naive_cmvn_state(info.global_cmvn_stats);
422  // Note: when you call this constructor the CMVN state knows nothing
423  // about the speaker. If you want to inform this class about more specific
424  // adaptation state, call this->SetAdaptationState(), most likely derived
425  // from a call to GetAdaptationState() from a previous object of this type.
426  cmvn_ = new OnlineCmvn(info.cmvn_opts, naive_cmvn_state, base_feature);
427  to_delete_.push_back(cmvn_);
428 
429  OnlineFeatureInterface *splice_normalized =
431  *lda_normalized =
432  new OnlineTransform(info.lda_mat, splice_normalized),
433  *cache_normalized = new OnlineCacheFeature(lda_normalized);
434  lda_normalized_ = cache_normalized;
435 
436  to_delete_.push_back(splice_normalized);
437  to_delete_.push_back(lda_normalized);
438  to_delete_.push_back(cache_normalized);
439 
440  // Set the iVector to its default value, [ prior_offset, 0, 0, ... ].
443 }
444 
446  const OnlineIvectorExtractorAdaptationState &adaptation_state) {
448  "SetAdaptationState called after frames were processed.");
450  adaptation_state.ivector_stats.IvectorDim());
451  ivector_stats_ = adaptation_state.ivector_stats;
452  cmvn_->SetState(adaptation_state.cmvn_state);
453 }
454 
456  if (NumFrames() == 0) return 0;
457  else return tot_ubm_loglike_ / NumFrames();
458 }
459 
462 }
463 
464 
466  const TransitionModel &trans_model,
467  const OnlineSilenceWeightingConfig &config,
468  int32 frame_subsampling_factor):
469  trans_model_(trans_model), config_(config),
470  frame_subsampling_factor_(frame_subsampling_factor),
471  num_frames_output_and_correct_(0) {
473  std::vector<int32> silence_phones;
474  SplitStringToIntegers(config.silence_phones_str, ":,", false,
475  &silence_phones);
476  for (size_t i = 0; i < silence_phones.size(); i++)
477  silence_phones_.insert(silence_phones[i]);
478 }
479 
480 
481 template <typename FST>
483  const LatticeFasterOnlineDecoderTpl<FST> &decoder) {
484  int32 num_frames_decoded = decoder.NumFramesDecoded(),
485  num_frames_prev = frame_info_.size();
486  // note, num_frames_prev is not the number of frames previously decoded,
487  // it's the generally-larger number of frames that we were requested to
488  // provide weights for.
489  if (num_frames_prev < num_frames_decoded)
490  frame_info_.resize(num_frames_decoded);
491  if (num_frames_prev > num_frames_decoded &&
492  frame_info_[num_frames_decoded].transition_id != -1)
493  KALDI_ERR << "Number of frames decoded decreased"; // Likely bug
494 
495  if (num_frames_decoded == 0)
496  return;
497  int32 frame = num_frames_decoded - 1;
498  bool use_final_probs = false;
500  decoder.BestPathEnd(use_final_probs, NULL);
501  while (frame >= 0) {
502  LatticeArc arc;
503  arc.ilabel = 0;
504  while (arc.ilabel == 0) // the while loop skips over input-epsilons
505  iter = decoder.TraceBackBestPath(iter, &arc);
506  // note, the iter.frame values are slightly unintuitively defined,
507  // they are one less than you might expect.
508  KALDI_ASSERT(iter.frame == frame - 1);
509 
510  if (frame_info_[frame].token == iter.tok) {
511  // we know that the traceback from this point back will be identical, so
512  // no point tracing back further. Note: we are comparing memory addresses
513  // of tokens of the decoder; this guarantees it's the same exact token
514  // because tokens, once allocated on a frame, are only deleted, never
515  // reallocated for that frame.
516  break;
517  }
518 
519  if (num_frames_output_and_correct_ > frame)
521 
522  frame_info_[frame].token = iter.tok;
523  frame_info_[frame].transition_id = arc.ilabel;
524  frame--;
525  // leave frame_info_.current_weight at zero for now (as set in the
526  // constructor), reflecting that we haven't already output a weight for that
527  // frame.
528  }
529 }
530 
531 template <typename FST>
534  int32 num_frames_decoded = decoder.NumFramesDecoded(),
535  num_frames_prev = frame_info_.size();
536  // note, num_frames_prev is not the number of frames previously decoded,
537  // it's the generally-larger number of frames that we were requested to
538  // provide weights for.
539  if (num_frames_prev < num_frames_decoded)
540  frame_info_.resize(num_frames_decoded);
541  if (num_frames_prev > num_frames_decoded &&
542  frame_info_[num_frames_decoded].transition_id != -1)
543  KALDI_ERR << "Number of frames decoded decreased"; // Likely bug
544 
545  if (num_frames_decoded == 0)
546  return;
547  int32 frame = num_frames_decoded - 1;
548  bool use_final_probs = false;
550  decoder.BestPathEnd(use_final_probs, NULL);
551  while (frame >= 0) {
552  LatticeArc arc;
553  arc.ilabel = 0;
554  while (arc.ilabel == 0) // the while loop skips over input-epsilons
555  iter = decoder.TraceBackBestPath(iter, &arc);
556  // note, the iter.frame values are slightly unintuitively defined,
557  // they are one less than you might expect.
558  KALDI_ASSERT(iter.frame == frame - 1);
559 
560  if (frame_info_[frame].token == iter.tok) {
561  // we know that the traceback from this point back will be identical, so
562  // no point tracing back further. Note: we are comparing memory addresses
563  // of tokens of the decoder; this guarantees it's the same exact token,
564  // because tokens, once allocated on a frame, are only deleted, never
565  // reallocated for that frame.
566  break;
567  }
568 
569  if (num_frames_output_and_correct_ > frame)
571 
572  frame_info_[frame].token = iter.tok;
573  frame_info_[frame].transition_id = arc.ilabel;
574  frame--;
575  // leave frame_info_.current_weight at zero for now (as set in the
576  // constructor), reflecting that we haven't already output a weight for that
577  // frame.
578  }
579 }
580 
581 
582 // Instantiate the template OnlineSilenceWeighting::ComputeCurrentTraceback().
583 template
584 void OnlineSilenceWeighting::ComputeCurrentTraceback<fst::Fst<fst::StdArc> >(
586 template
587 void OnlineSilenceWeighting::ComputeCurrentTraceback<fst::GrammarFst>(
589 template
590 void OnlineSilenceWeighting::ComputeCurrentTraceback<fst::Fst<fst::StdArc> >(
592 template
593 void OnlineSilenceWeighting::ComputeCurrentTraceback<fst::GrammarFst>(
595 
596 
598  int32 num_frames_ready, int32 first_decoder_frame,
599  std::vector<std::pair<int32, BaseFloat> > *delta_weights) {
600  // num_frames_ready is at the feature frame-rate, most of the code
601  // in this function is at the decoder frame-rate.
602  // round up, so we are sure to get weights for at least the frame
603  // 'num_frames_ready - 1', and maybe one or two frames afterward.
604  KALDI_ASSERT(num_frames_ready > first_decoder_frame || num_frames_ready == 0);
606  num_decoder_frames_ready = (num_frames_ready - first_decoder_frame + fs - 1) / fs;
607 
608  const int32 max_state_duration = config_.max_state_duration;
609  const BaseFloat silence_weight = config_.silence_weight;
610 
611  delta_weights->clear();
612 
613  int32 prev_num_frames_processed = frame_info_.size();
614  if (frame_info_.size() < static_cast<size_t>(num_decoder_frames_ready))
615  frame_info_.resize(num_decoder_frames_ready);
616 
617  // Don't go further back into the past than 100 frames before the most
618  // recent previously processed frame when modifying the traceback.
619  // C.f. the value 200 in template
620  // OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature in online-feature.cc,
621  // which needs to be more than this value of 100 plus the amount of context
622  // that LDA might use plus the chunk size we're likely to decode in one time.
623  // The user can always increase the value of --max-feature-vectors in case one
624  // of these conditions is broken. Search for ONLINE_IVECTOR_LIMIT in
625  // online-feature.cc
626  int32 begin_frame = std::max<int32>(0, prev_num_frames_processed - 100),
627  frames_out = static_cast<int32>(frame_info_.size()) - begin_frame;
628  // frames_out is the number of frames we will output.
629  KALDI_ASSERT(frames_out >= 0);
630  std::vector<BaseFloat> frame_weight(frames_out, 1.0);
631  // we will set frame_weight to the value silence_weight for silence frames and
632  // for transition-ids that repeat with duration > max_state_duration. Frames
633  // newer than the most recent traceback will get a weight equal to the weight
634  // for the most recent frame in the traceback; or the silence weight, if there
635  // is no traceback at all available yet.
636 
637  // First treat some special cases.
638  if (frames_out == 0) // Nothing to output.
639  return;
640  if (frame_info_[begin_frame].transition_id == -1) {
641  // We do not have any traceback at all within the frames we are to output...
642  // find the most recent weight that we output and apply the same weight to
643  // all the new output; or output the silence weight, if nothing was output.
644  BaseFloat weight = (begin_frame == 0 ? silence_weight :
645  frame_info_[begin_frame - 1].current_weight);
646  for (int32 offset = 0; offset < frames_out; offset++)
647  frame_weight[offset] = weight;
648  } else {
649  int32 current_run_start_offset = 0;
650  for (int32 offset = 0; offset < frames_out; offset++) {
651  int32 frame = begin_frame + offset;
652  int32 transition_id = frame_info_[frame].transition_id;
653  if (transition_id == -1) {
654  // this frame does not yet have a decoder traceback, so just
655  // duplicate the silence/non-silence status of the most recent
656  // frame we have a traceback for (probably a reasonable guess).
657  frame_weight[offset] = frame_weight[offset - 1];
658  } else {
659  int32 phone = trans_model_.TransitionIdToPhone(transition_id);
660  bool is_silence = (silence_phones_.count(phone) != 0);
661  if (is_silence)
662  frame_weight[offset] = silence_weight;
663  // now deal with max-duration issues.
664  if (max_state_duration > 0 &&
665  (offset + 1 == frames_out ||
666  transition_id != frame_info_[frame + 1].transition_id)) {
667  // If this is the last frame of a run...
668  int32 run_length = offset - current_run_start_offset + 1;
669  if (run_length >= max_state_duration) {
670  // treat runs of the same transition-id longer than the max, as
671  // silence, even if they were not silence.
672  for (int32 offset2 = current_run_start_offset;
673  offset2 <= offset; offset2++)
674  frame_weight[offset2] = silence_weight;
675  }
676  if (offset + 1 < frames_out)
677  current_run_start_offset = offset + 1;
678  }
679  }
680  }
681  }
682  // Now commit the stats...
683  for (int32 offset = 0; offset < frames_out; offset++) {
684  int32 frame = begin_frame + offset;
685  BaseFloat old_weight = frame_info_[frame].current_weight,
686  new_weight = frame_weight[offset],
687  weight_diff = new_weight - old_weight;
688  frame_info_[frame].current_weight = new_weight;
689  // Even if the delta-weight is zero for the last frame, we provide it,
690  // because the identity of the most recent frame with a weight is used in
691  // some debugging/checking code.
692  if (weight_diff != 0.0 || offset + 1 == frames_out) {
693  KALDI_VLOG(6) << "Weight for frame " << frame << " changing from "
694  << old_weight << " to " << new_weight;
695  for(int32 i = 0; i < frame_subsampling_factor_; i++) {
696  int32 input_frame = first_decoder_frame + (frame * frame_subsampling_factor_) + i;
697  delta_weights->push_back(std::make_pair(input_frame, weight_diff));
698  }
699  }
700  }
701 }
702 
703 } // namespace kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
int32 Dim() const
Returns the dimensionality of the Gaussian mean vectors.
Definition: diag-gmm.h:74
double ObjfChange(const VectorBase< double > &ivector) const
ObjfChange returns the change in objective function *per frame* from using the default value [ prior_...
void ReadConfigFromFile(const std::string &config_filename, C *c)
This template is provided for convenience in reading config classes from files; this is not the stand...
fst::ArcTpl< LatticeWeight > LatticeArc
Definition: kaldi-lattice.h:40
OnlineIvectorFeature(const OnlineIvectorExtractionInfo &info, OnlineFeatureInterface *base_feature)
Constructor.
double PriorOffset() const
The distribution over iVectors, in our formulation, is not centered at zero; its first dimension has ...
Matrix< double > speaker_cmvn_stats
This struct contains various things that are needed (as const references) by class OnlineIvectorExtra...
Vector< double > current_ivector_
Most recently estimated iVector, will have been estimated at the greatest time t where t <= num_frame...
void Write(std::ostream &os, bool binary) const
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
LatticeIncrementalOnlineDecoderTpl is as LatticeIncrementalDecoderTpl but also supports an efficient ...
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g.
Definition: text-utils.h:68
OnlineIvectorExtractorAdaptationState(const OnlineIvectorExtractionInfo &info)
This constructor initializes adaptation-state with no prior speaker history.
This class does an online version of the cepstral mean and [optionally] variance, but note that this ...
This class stores the adaptation state from the online iVector extractor, which can help you to initi...
BaseFloat VectorToPosteriorEntry(const VectorBase< BaseFloat > &log_likes, int32 num_gselect, BaseFloat min_post, std::vector< std::pair< int32, BaseFloat > > *post_entry)
Given a vector of log-likelihoods (typically of Gaussians in a GMM but could be of pdf-ids)...
Definition: posterior.cc:440
void GetAdaptationState(OnlineIvectorExtractorAdaptationState *adaptation_state) const
Get the adaptation state; you may want to call this before destroying this object, to get adaptation state that can be used to improve decoding of later utterances of this speaker.
Matrix< double > frozen_state
void Read(std::istream &is, bool binary)
kaldi::int32 int32
virtual int32 Dim() const
Dim() will return the iVector dimension.
BestPathIterator BestPathEnd(bool use_final_probs, BaseFloat *final_cost=NULL) const
This function returns an iterator that can be used to trace back the best path.
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
void UpdateStatsForFrames(const std::vector< std::pair< int32, BaseFloat > > &frame_weights)
int32 num_frames_stats_
num_frames_stats_ is the number of frames of data we have already accumulated from this utterance and...
Real Norm(Real p) const
Compute the p-th norm of the vector.
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
Definition: kaldi-io.cc:832
void SetAdaptationState(const OnlineIvectorExtractorAdaptationState &adaptation_state)
Set the adaptation state to a particular value, e.g.
double tot_ubm_loglike_
The following is only needed for diagnostics.
OnlineFeatureInterface * lda_normalized_
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
const size_t count
void Write(std::ostream &os, bool binary) const
OnlineIvectorEstimationStats ivector_stats_
the iVector estimation stats
void ComputeCurrentTraceback(const LatticeFasterOnlineDecoderTpl< FST > &decoder)
void Read(std::istream &is, bool binary)
OnlineIvectorEstimationStats ivector_stats
Stats for online iVector estimation.
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
const OnlineSilenceWeightingConfig & config_
void Scale(Real alpha)
Multiply each element with a scalar value.
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
Definition: io-funcs.cc:191
void SetState(const OnlineCmvnState &cmvn_state)
void LimitFrames(BaseFloat max_remembered_frames, BaseFloat posterior_scale)
Scales down the stats if needed to ensure the number of frames in the speaker-specific CMVN stats doe...
BaseFloat GetMinPost(BaseFloat weight) const
Struct OnlineCmvnState stores the state of CMVN adaptation between utterances (but not the state of t...
void Write(std::ostream &os, bool binary) const
#define KALDI_ERR
Definition: kaldi-error.h:147
This online-feature class implements any affine or linear transform.
int32 NumFrames(int64 num_samples, const FrameExtractionOptions &opts, bool flush)
This function returns the number of frames that we can extract from a wave file with the given number...
#define KALDI_WARN
Definition: kaldi-error.h:150
void Init(const OnlineIvectorExtractionConfig &config)
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
Definition: io-funcs.cc:134
const OnlineIvectorExtractionInfo & info_
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also supports an efficient way to get...
This file contains code for online iVector extraction in a form compatible with OnlineFeatureInterfac...
BestPathIterator TraceBackBestPath(BestPathIterator iter, LatticeArc *arc) const
This function can be used in conjunction with BestPathEnd() to trace back the best path one link at a...
void UpdateFrameWeights(const std::vector< std::pair< int32, BaseFloat > > &delta_weights)
std::vector< OnlineFeatureInterface * > to_delete_
unordered_set< int32 > silence_phones_
virtual void GetFrame(int32 frame, VectorBase< BaseFloat > *feat)
Gets the feature vector for this frame.
A class representing a vector.
Definition: kaldi-vector.h:406
int32 NumFramesDecoded() const
Returns the number of frames decoded so far.
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
void MergePairVectorSumming(std::vector< std::pair< I, F > > *vec)
For a vector of pair<I, F> where I is an integer and F a floating-point or integer type...
Definition: stl-utils.h:288
virtual BaseFloat FrameShiftInSeconds() const
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
OnlineFeatureInterface is an interface for online feature processing (it is also usable in the offlin...
virtual int32 NumFramesReady() const
returns the feature dimension.
BestPathIterator BestPathEnd(bool use_final_probs, BaseFloat *final_cost=NULL) const
This function returns an iterator that can be used to trace back the best path.
This class includes configuration variables relating to the online iVector extraction, but not including configuration for the "base feature", i.e.
const TransitionModel & trans_model_
void UpdateStatsUntilFrameWeighted(int32 frame)
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
int32 TransitionIdToPhone(int32 trans_id) const
std::vector< FrameInfo > frame_info_
This feature type can be used to cache its input, to avoid repetition of computation in a multi-pass ...
BestPathIterator TraceBackBestPath(BestPathIterator iter, LatticeArc *arc) const
This function can be used in conjunction with BestPathEnd() to trace back the best path one link at a...
OnlineFeatureInterface * lda_
OnlineSilenceWeighting(const TransitionModel &trans_model, const OnlineSilenceWeightingConfig &config, int32 frame_subsampling_factor=1)
void Scale(double scale)
Scales the number of frames of stats by 0 <= scale <= 1, to make it as if we had fewer frames of adap...
void GetDeltaWeights(int32 num_frames_ready, int32 first_decoder_frame, std::vector< std::pair< int32, BaseFloat > > *delta_weights)