41 KALDI_WARN <<
"--greedy-ivector-extractor=true implies " 42 <<
"--use-most-recent-ivector=true";
47 std::string note =
"(note: this may be needed " 48 "in the file supplied to --ivector-extractor-config)";
50 KALDI_ERR <<
"--lda-matrix option must be set " << note;
53 KALDI_ERR <<
"--global-cmvn-stats option must be set " << note;
56 KALDI_ERR <<
"--cmvn-config option must be set " << note;
59 KALDI_ERR <<
"--splice-config option must be set " << note;
62 KALDI_ERR <<
"--diag-ubm option must be set " << note;
65 KALDI_ERR <<
"--ivector-extractor option must be set " << note;
73 if (!(full_dim % num_splice == 0 || full_dim % num_splice == 1)){
74 KALDI_WARN <<
"Error getting expected feature dimension: full-dim = " 75 << full_dim <<
", num-splice = " << num_splice;
77 return full_dim / num_splice;
84 spliced_input_dim = base_feat_dim * num_splice;
106 cmvn_state(other.cmvn_state), ivector_stats(other.ivector_stats) { }
116 if (count > max_remembered_frames)
122 max_remembered_frames * posterior_scale;
130 WriteToken(os, binary,
"<OnlineIvectorExtractorAdaptationState>");
135 WriteToken(os, binary,
"</OnlineIvectorExtractorAdaptationState>");
139 ExpectToken(is, binary,
"<OnlineIvectorExtractorAdaptationState>");
144 ExpectToken(is, binary,
"</OnlineIvectorExtractorAdaptationState>");
148 return info_.extractor.IvectorDim();
157 return base_->IsLastFrame(frame);
162 return lda_->NumFramesReady();
166 return lda_->FrameShiftInSeconds();
170 const std::vector<std::pair<int32, BaseFloat> > &delta_weights) {
177 for (
size_t i = 0;
i < delta_weights.size();
i++) {
178 delta_weights_.push(delta_weights[
i]);
179 int32 frame = delta_weights[
i].first;
181 if (frame > most_recent_frame_with_weight_)
182 most_recent_frame_with_weight_ = frame;
184 delta_weights_provided_ =
true;
193 if (abs_weight == 0.0)
195 min_post /= abs_weight;
202 const std::vector<std::pair<int32, BaseFloat> > &frame_weights_in) {
204 std::vector<std::pair<int32, BaseFloat> > frame_weights(frame_weights_in);
208 if (frame_weights.empty())
211 int32 num_frames =
static_cast<int32>(frame_weights.size());
212 int32 feat_dim = lda_normalized_->Dim();
216 std::vector<int32> frames;
217 frames.reserve(frame_weights.size());
218 for (
int32 i = 0;
i < num_frames;
i++)
219 frames.push_back(frame_weights[
i].first);
220 lda_normalized_->GetFrames(frames, &feats);
222 info_.diag_ubm.LogLikelihoods(feats, &log_likes);
226 std::vector<std::vector<std::pair<int32, BaseFloat> > > posteriors(num_frames);
227 for (
int32 i = 0; i < num_frames; i++) {
228 std::vector<std::pair<int32, BaseFloat> > &posterior = posteriors[
i];
231 tot_ubm_loglike_ += weight *
233 GetMinPost(weight), &posterior);
234 for (
size_t j = 0;
j < posterior.size();
j++)
235 posterior[
j].second *= info_.posterior_scale * weight;
239 if (! info_.online_cmvn_iextractor) {
240 lda_->GetFrames(frames, &feats);
242 lda_normalized_->GetFrames(frames, &feats);
244 ivector_stats_.AccStats(info_.extractor, feats, posteriors);
249 KALDI_ASSERT(frame >= 0 && frame < this->NumFramesReady() &&
250 !delta_weights_provided_);
251 updated_with_no_delta_weights_ =
true;
253 int32 ivector_period = info_.ivector_period;
254 int32 num_cg_iters = info_.num_cg_iters;
256 std::vector<std::pair<int32, BaseFloat> > frame_weights;
258 for (; num_frames_stats_ <= frame; num_frames_stats_++) {
259 int32 t = num_frames_stats_;
261 frame_weights.push_back(std::pair<int32, BaseFloat>(t, frame_weight));
262 if ((!info_.use_most_recent_ivector && t % ivector_period == 0) ||
263 (info_.use_most_recent_ivector && t == frame)) {
267 UpdateStatsForFrames(frame_weights);
268 frame_weights.clear();
269 ivector_stats_.GetIvector(num_cg_iters, &current_ivector_);
270 if (!info_.use_most_recent_ivector) {
271 int32 ivec_index = t / ivector_period;
272 KALDI_ASSERT(ivec_index == static_cast<int32>(ivectors_history_.size()));
277 if (!frame_weights.empty())
278 UpdateStatsForFrames(frame_weights);
282 KALDI_ASSERT(frame >= 0 && frame < this->NumFramesReady() &&
283 delta_weights_provided_ &&
284 ! updated_with_no_delta_weights_ &&
285 frame <= most_recent_frame_with_weight_);
286 bool debug_weights =
false;
288 int32 ivector_period = info_.ivector_period;
289 int32 num_cg_iters = info_.num_cg_iters;
291 std::vector<std::pair<int32, BaseFloat> > frame_weights;
292 frame_weights.reserve(delta_weights_.size());
294 for (; num_frames_stats_ <= frame; num_frames_stats_++) {
295 int32 t = num_frames_stats_;
298 while (!delta_weights_.empty() &&
299 delta_weights_.top().first <= t) {
300 int32 frame = delta_weights_.top().first;
301 BaseFloat weight = delta_weights_.top().second;
302 frame_weights.push_back(delta_weights_.top());
303 delta_weights_.pop();
305 if (current_frame_weight_debug_.size() <= frame)
306 current_frame_weight_debug_.resize(frame + 1, 0.0);
307 current_frame_weight_debug_[frame] += weight;
310 if ((!info_.use_most_recent_ivector && t % ivector_period == 0) ||
311 (info_.use_most_recent_ivector && t == frame)) {
312 UpdateStatsForFrames(frame_weights);
313 frame_weights.clear();
314 ivector_stats_.GetIvector(num_cg_iters, &current_ivector_);
315 if (!info_.use_most_recent_ivector) {
316 int32 ivec_index = t / ivector_period;
317 KALDI_ASSERT(ivec_index == static_cast<int32>(ivectors_history_.size()));
322 if (!frame_weights.empty())
323 UpdateStatsForFrames(frame_weights);
329 int32 frame_to_update_until = (info_.greedy_ivector_extractor ?
330 lda_->NumFramesReady() - 1 : frame);
331 if (!delta_weights_provided_)
332 UpdateStatsUntilFrame(frame_to_update_until);
334 UpdateStatsUntilFrameWeighted(frame_to_update_until);
338 if (info_.use_most_recent_ivector) {
339 KALDI_VLOG(5) <<
"due to --use-most-recent-ivector=true, using iVector " 340 <<
"from frame " << num_frames_stats_ <<
" for frame " 347 (*feat)(0) -= info_.extractor.PriorOffset();
349 int32 i = frame / info_.ivector_period;
351 KALDI_ASSERT(static_cast<size_t>(i) < ivectors_history_.size());
353 (*feat)(0) -= info_.extractor.PriorOffset();
358 if (num_frames_stats_ == 0) {
367 temp_ivector(0) -= info_.extractor.PriorOffset();
369 KALDI_VLOG(2) <<
"By the end of the utterance, objf change/frame " 370 <<
"from estimating iVector (vs. default) was " 371 << ivector_stats_.ObjfChange(current_ivector_)
372 <<
" and iVector length was " 373 << temp_ivector.
Norm(2.0);
380 for (
size_t i = 0;
i < to_delete_.size();
i++)
381 delete to_delete_[
i];
382 for (
size_t i = 0;
i < ivectors_history_.size();
i++)
383 delete ivectors_history_[
i];
391 cmvn_->GetState(cmvn_->NumFramesReady() - 1,
394 adaptation_state->
LimitFrames(info_.max_remembered_frames,
395 info_.posterior_scale);
404 ivector_stats_(info_.extractor.IvectorDim(),
405 info_.extractor.PriorOffset(),
407 num_frames_stats_(0), delta_weights_provided_(false),
408 updated_with_no_delta_weights_(false),
409 most_recent_frame_with_weight_(-1), tot_ubm_loglike_(0.0) {
417 lda_ = lda_cache_feature;
448 "SetAdaptationState called after frames were processed.");
468 int32 frame_subsampling_factor):
469 trans_model_(trans_model), config_(config),
470 frame_subsampling_factor_(frame_subsampling_factor),
471 num_frames_output_and_correct_(0) {
473 std::vector<int32> silence_phones;
476 for (
size_t i = 0;
i < silence_phones.size();
i++)
481 template <
typename FST>
489 if (num_frames_prev < num_frames_decoded)
491 if (num_frames_prev > num_frames_decoded &&
492 frame_info_[num_frames_decoded].transition_id != -1)
493 KALDI_ERR <<
"Number of frames decoded decreased";
495 if (num_frames_decoded == 0)
497 int32 frame = num_frames_decoded - 1;
498 bool use_final_probs =
false;
504 while (arc.ilabel == 0)
531 template <
typename FST>
539 if (num_frames_prev < num_frames_decoded)
541 if (num_frames_prev > num_frames_decoded &&
542 frame_info_[num_frames_decoded].transition_id != -1)
543 KALDI_ERR <<
"Number of frames decoded decreased";
545 if (num_frames_decoded == 0)
547 int32 frame = num_frames_decoded - 1;
548 bool use_final_probs =
false;
554 while (arc.ilabel == 0)
584 void OnlineSilenceWeighting::ComputeCurrentTraceback<fst::Fst<fst::StdArc> >(
587 void OnlineSilenceWeighting::ComputeCurrentTraceback<fst::GrammarFst>(
590 void OnlineSilenceWeighting::ComputeCurrentTraceback<fst::Fst<fst::StdArc> >(
593 void OnlineSilenceWeighting::ComputeCurrentTraceback<fst::GrammarFst>(
598 int32 num_frames_ready,
int32 first_decoder_frame,
599 std::vector<std::pair<int32, BaseFloat> > *delta_weights) {
604 KALDI_ASSERT(num_frames_ready > first_decoder_frame || num_frames_ready == 0);
606 num_decoder_frames_ready = (num_frames_ready - first_decoder_frame + fs - 1) / fs;
611 delta_weights->clear();
614 if (
frame_info_.size() <
static_cast<size_t>(num_decoder_frames_ready))
626 int32 begin_frame = std::max<int32>(0, prev_num_frames_processed - 100),
627 frames_out = static_cast<int32>(
frame_info_.size()) - begin_frame;
630 std::vector<BaseFloat> frame_weight(frames_out, 1.0);
640 if (
frame_info_[begin_frame].transition_id == -1) {
644 BaseFloat weight = (begin_frame == 0 ? silence_weight :
646 for (
int32 offset = 0; offset < frames_out; offset++)
647 frame_weight[offset] = weight;
649 int32 current_run_start_offset = 0;
650 for (
int32 offset = 0; offset < frames_out; offset++) {
651 int32 frame = begin_frame + offset;
653 if (transition_id == -1) {
657 frame_weight[offset] = frame_weight[offset - 1];
662 frame_weight[offset] = silence_weight;
664 if (max_state_duration > 0 &&
665 (offset + 1 == frames_out ||
666 transition_id !=
frame_info_[frame + 1].transition_id)) {
668 int32 run_length = offset - current_run_start_offset + 1;
669 if (run_length >= max_state_duration) {
672 for (
int32 offset2 = current_run_start_offset;
673 offset2 <= offset; offset2++)
674 frame_weight[offset2] = silence_weight;
676 if (offset + 1 < frames_out)
677 current_run_start_offset = offset + 1;
683 for (
int32 offset = 0; offset < frames_out; offset++) {
684 int32 frame = begin_frame + offset;
686 new_weight = frame_weight[offset],
687 weight_diff = new_weight - old_weight;
692 if (weight_diff != 0.0 || offset + 1 == frames_out) {
693 KALDI_VLOG(6) <<
"Weight for frame " << frame <<
" changing from " 694 << old_weight <<
" to " << new_weight;
697 delta_weights->push_back(std::make_pair(input_frame, weight_diff));
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
BaseFloat NumFrames() const
int32 Dim() const
Returns the dimensionality of the Gaussian mean vectors.
double ObjfChange(const VectorBase< double > &ivector) const
ObjfChange returns the change in objective function *per frame* from using the default value [ prior_...
void ReadConfigFromFile(const std::string &config_filename, C *c)
This template is provided for convenience in reading config classes from files; this is not the stand...
fst::ArcTpl< LatticeWeight > LatticeArc
OnlineIvectorFeature(const OnlineIvectorExtractionInfo &info, OnlineFeatureInterface *base_feature)
Constructor.
BaseFloat ObjfImprPerFrame() const
Matrix< double > speaker_cmvn_stats
Vector< double > current_ivector_
Most recently estimated iVector, will have been estimated at the greatest time t where t <= num_frame...
void Write(std::ostream &os, bool binary) const
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
LatticeIncrementalOnlineDecoderTpl is as LatticeIncrementalDecoderTpl but also supports an efficient ...
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g.
This class does an online version of the cepstral mean and [optionally] variance, but note that this ...
BaseFloat VectorToPosteriorEntry(const VectorBase< BaseFloat > &log_likes, int32 num_gselect, BaseFloat min_post, std::vector< std::pair< int32, BaseFloat > > *post_entry)
Given a vector of log-likelihoods (typically of Gaussians in a GMM but could be of pdf-ids)...
BaseFloat UbmLogLikePerFrame() const
void PrintDiagnostics() const
void GetAdaptationState(OnlineIvectorExtractorAdaptationState *adaptation_state) const
Get the adaptation state; you may want to call this before destroying this object, to get adaptation state that can be used to improve decoding of later utterances of this speaker.
Matrix< double > frozen_state
void Read(std::istream &is, bool binary)
virtual int32 Dim() const
Dim() will return the iVector dimension.
BestPathIterator BestPathEnd(bool use_final_probs, BaseFloat *final_cost=NULL) const
This function returns an iterator that can be used to trace back the best path.
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
virtual ~OnlineIvectorFeature()
void UpdateStatsForFrames(const std::vector< std::pair< int32, BaseFloat > > &frame_weights)
int32 num_frames_stats_
num_frames_stats_ is the number of frames of data we have already accumulated from this utterance and...
Real Norm(Real p) const
Compute the p-th norm of the vector.
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
void SetAdaptationState(const OnlineIvectorExtractorAdaptationState &adaptation_state)
Set the adaptation state to a particular value, e.g.
double tot_ubm_loglike_
The following is only needed for diagnostics.
OnlineFeatureInterface * lda_normalized_
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
OnlineIvectorEstimationStats ivector_stats_
the iVector estimation stats
void ComputeCurrentTraceback(const LatticeFasterOnlineDecoderTpl< FST > &decoder)
void Read(std::istream &is, bool binary)
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
const OnlineSilenceWeightingConfig & config_
void Scale(Real alpha)
Multiply each element with a scalar value.
int32 NumFramesDecoded() const
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
void SetState(const OnlineCmvnState &cmvn_state)
BaseFloat GetMinPost(BaseFloat weight) const
Struct OnlineCmvnState stores the state of CMVN adaptation between utterances (but not the state of t...
void Write(std::ostream &os, bool binary) const
int32 NumFrames(int64 num_samples, const FrameExtractionOptions &opts, bool flush)
This function returns the number of frames that we can extract from a wave file with the given number...
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
const OnlineIvectorExtractionInfo & info_
MatrixIndexT Dim() const
Returns the dimension of the vector.
LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also supports an efficient way to get...
This file contains code for online iVector extraction in a form compatible with OnlineFeatureInterfac...
BestPathIterator TraceBackBestPath(BestPathIterator iter, LatticeArc *arc) const
This function can be used in conjunction with BestPathEnd() to trace back the best path one link at a...
void UpdateStatsUntilFrame(int32 frame)
std::string silence_phones_str
void UpdateFrameWeights(const std::vector< std::pair< int32, BaseFloat > > &delta_weights)
std::vector< OnlineFeatureInterface * > to_delete_
unordered_set< int32 > silence_phones_
virtual void GetFrame(int32 frame, VectorBase< BaseFloat > *feat)
Gets the feature vector for this frame.
A class representing a vector.
int32 NumFramesDecoded() const
Returns the number of frames decoded so far.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
BaseFloat max_state_duration
void MergePairVectorSumming(std::vector< std::pair< I, F > > *vec)
For a vector of pair<I, F> where I is an integer and F a floating-point or integer type...
virtual BaseFloat FrameShiftInSeconds() const
OnlineFeatureInterface is an interface for online feature processing (it is also usable in the offlin...
virtual int32 NumFramesReady() const
Returns the number of frames that are currently ready.
BestPathIterator BestPathEnd(bool use_final_probs, BaseFloat *final_cost=NULL) const
This function returns an iterator that can be used to trace back the best path.
const TransitionModel & trans_model_
void UpdateStatsUntilFrameWeighted(int32 frame)
Provides a vector abstraction class.
int32 TransitionIdToPhone(int32 trans_id) const
std::vector< FrameInfo > frame_info_
int32 num_frames_output_and_correct_
This feature type can be used to cache its input, to avoid repetition of computation in a multi-pass ...
BestPathIterator TraceBackBestPath(BestPathIterator iter, LatticeArc *arc) const
This function can be used in conjunction with BestPathEnd() to trace back the best path one link at a...
OnlineFeatureInterface * lda_
OnlineSilenceWeighting(const TransitionModel &trans_model, const OnlineSilenceWeightingConfig &config, int32 frame_subsampling_factor=1)
int32 frame_subsampling_factor_
void Scale(double scale)
Scales the number of frames of stats by 0 <= scale <= 1, to make it as if we had fewer frames of adap...
void GetDeltaWeights(int32 num_frames_ready, int32 first_decoder_frame, std::vector< std::pair< int32, BaseFloat > > *delta_weights)