21 #ifndef KALDI_ONLINE2_ONLINE_IVECTOR_FEATURE_H_ 22 #define KALDI_ONLINE2_ONLINE_IVECTOR_FEATURE_H_ 106 ivector_period(10), num_gselect(5),
107 min_post(0.025), posterior_scale(0.1),
108 max_count(0.0), num_cg_iters(15),
109 use_most_recent_ivector(true),
110 greedy_ivector_extractor(false),
111 max_remembered_frames(1000) { }
114 opts->
Register(
"lda-matrix", &lda_mat_rxfilename,
"Filename of LDA matrix, " 115 "e.g. final.mat; used for iVector extraction. ");
116 opts->
Register(
"global-cmvn-stats", &global_cmvn_stats_rxfilename,
117 "(Extended) filename for global CMVN stats, used in iVector " 118 "extraction, obtained for example from " 119 "'matrix-sum scp:data/train/cmvn.scp -', only used for " 120 "iVector extraction");
121 opts->
Register(
"cmvn-config", &cmvn_config_rxfilename,
"Configuration " 122 "file for online CMVN features (e.g. conf/online_cmvn.conf)," 123 "only used for iVector extraction. Contains options " 124 "as for the program 'apply-cmvn-online'");
125 opts->
Register(
"online-cmvn-iextractor", &online_cmvn_iextractor,
126 "add online-cmvn to feature pipeline of ivector extractor, " 127 "use the cmvn setup from the UBM. Note: the default of " 128 "false is what we historically used; we'd use true if " 129 "we were using CMVN'ed features for the neural net.");
130 opts->
Register(
"splice-config", &splice_config_rxfilename,
"Configuration file " 131 "for frame splicing (--left-context and --right-context " 132 "options); used for iVector extraction.");
133 opts->
Register(
"diag-ubm", &diag_ubm_rxfilename,
"Filename of diagonal UBM " 134 "used to obtain posteriors for iVector extraction, e.g. " 136 opts->
Register(
"ivector-extractor", &ivector_extractor_rxfilename,
137 "Filename of iVector extractor, e.g. final.ie");
138 opts->
Register(
"ivector-period", &ivector_period,
"Frequency with which " 139 "we extract iVectors for neural network adaptation");
140 opts->
Register(
"num-gselect", &num_gselect,
"Number of Gaussians to select " 141 "for iVector extraction");
142 opts->
Register(
"min-post", &min_post,
"Threshold for posterior pruning in " 143 "iVector extraction");
144 opts->
Register(
"posterior-scale", &posterior_scale,
"Scale for posteriors in " 145 "iVector extraction (may be viewed as inverse of prior scale)");
146 opts->
Register(
"max-count", &max_count,
"Maximum data count we allow before " 147 "we start scaling the stats down (if nonzero)... helps to make " 148 "iVectors from long utterances look more typical. Interpret " 149 "as a frame-count times --posterior-scale, typically 1/10 of " 150 "a number of frames. Suggest 100.");
151 opts->
Register(
"use-most-recent-ivector", &use_most_recent_ivector,
"If true, " 152 "always use most recent available iVector, rather than the " 153 "one for the designated frame.");
154 opts->
Register(
"greedy-ivector-extractor", &greedy_ivector_extractor,
"If " 155 "true, 'read ahead' as many frames as we currently have available " 156 "when extracting the iVector. May improve iVector quality.");
157 opts->
Register(
"max-remembered-frames", &max_remembered_frames,
"The maximum " 158 "number of frames of adaptation history that we carry through " 159 "to later utterances of the same speaker (having a finite " 160 "number allows the speaker adaptation state to change over " 161 "time). Interpret as a real frame count, i.e. not a count " 162 "scaled by --posterior-scale.");
197 int32 ExpectedFeatureDim()
const;
224 cmvn_state(info.global_cmvn_stats),
225 ivector_stats(info.extractor.IvectorDim(),
226 info.extractor.PriorOffset(),
242 void Write(std::ostream &os,
bool binary)
const;
243 void Read(std::istream &is,
bool binary);
278 virtual int32 Dim()
const;
279 virtual bool IsLastFrame(
int32 frame)
const;
280 virtual int32 NumFramesReady()
const;
281 virtual BaseFloat FrameShiftInSeconds()
const;
287 void SetAdaptationState(
294 void GetAdaptationState(
308 return ivector_stats_.NumFrames() / info_.posterior_scale;
322 void UpdateFrameWeights(
323 const std::vector<std::pair<int32, BaseFloat> > &delta_weights);
331 void UpdateStatsForFrames(
332 const std::vector<std::pair<int32, BaseFloat> > &frame_weights);
342 void UpdateStatsUntilFrame(
int32 frame);
346 void UpdateStatsUntilFrameWeighted(
int32 frame);
348 void PrintDiagnostics()
const;
380 std::priority_queue<std::pair<int32, BaseFloat>,
381 std::vector<std::pair<int32, BaseFloat> >,
432 return !silence_phones_str.empty() && silence_weight != 1.0;
436 silence_weight(1.0), max_state_duration(-1) { }
439 opts->
Register(
"silence-phones", &silence_phones_str,
"(RE weighting in " 440 "iVector estimation for online decoding) List of integer ids of " 441 "silence phones, separated by colons (or commas). Data that " 442 "(according to the traceback of the decoder) corresponds to " 443 "these phones will be downweighted by --silence-weight.");
444 opts->
Register(
"silence-weight", &silence_weight,
"(RE weighting in " 445 "iVector estimation for online decoding) Weighting factor for frames " 446 "that the decoder trace-back identifies as silence; only " 447 "relevant if the --silence-phones option is set.");
448 opts->
Register(
"max-state-duration", &max_state_duration,
"(RE weighting in " 449 "iVector estimation for online decoding) Maximum allowed " 450 "duration of a single transition-id; runs with durations longer " 451 "than this will be weighted down to the silence-weight.");
476 int32 frame_subsampling_factor = 1);
478 bool Active()
const {
return config_.Active(); }
484 template <
typename FST>
486 template <
typename FST>
513 void GetDeltaWeights(
514 int32 num_frames_ready,
int32 first_decoder_frame,
515 std::vector<std::pair<int32, BaseFloat> > *delta_weights);
520 int32 num_frames_ready,
521 std::vector<std::pair<int32, BaseFloat> > *delta_weights) {
522 GetDeltaWeights(num_frames_ready, 0, delta_weights);
543 FrameInfo(): token(NULL), transition_id(-1), current_weight(0.0) {}
568 #endif // KALDI_ONLINE2_ONLINE_IVECTOR_FEATURE_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
BaseFloat NumFrames() const
Vector< double > current_ivector_
Most recently estimated iVector, will have been estimated at the greatest time t where t <= num_frame...
LatticeIncrementalOnlineDecoderTpl is as LatticeIncrementalDecoderTpl but also supports an efficient ...
This class does an online version of the cepstral mean and [optionally] variance normalization, but note that this ...
void GetDeltaWeights(int32 num_frames_ready, std::vector< std::pair< int32, BaseFloat > > *delta_weights)
OnlineFeatureInterface * base_
This class helps us to efficiently estimate iVectors in situations where the data is coming in frame ...
#define KALDI_DISALLOW_COPY_AND_ASSIGN(type)
int32 num_frames_stats_
num_frames_stats_ is the number of frames of data we have already accumulated from this utterance and...
BaseFloat new_data_weight
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
double tot_ubm_loglike_
The following is only needed for diagnostics.
OnlineFeatureInterface * lda_normalized_
std::vector< Vector< BaseFloat > *> ivectors_history_
if info_.use_most_recent_ivector == false, we need to store the iVector we estimated each info_...
void RegisterWithPrefix(std::string prefix, OptionsItf *opts)
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
OnlineIvectorEstimationStats ivector_stats_
the iVector estimation stats
bool updated_with_no_delta_weights_
The following is also used to detect wrong usage of this class; it's set to true if UpdateStatsUntilF...
int32 most_recent_frame_with_weight_
if UpdateFrameWeights was ever called, this keeps track of the most recent frame that ever had a weight...
const OnlineSilenceWeightingConfig & config_
Struct OnlineCmvnState stores the state of CMVN adaptation between utterances (but not the state of t...
void Register(OptionsItf *opts)
std::vector< BaseFloat > current_frame_weight_debug_
this is only used for validating that the frame-weighting code is not buggy.
const OnlineIvectorExtractionInfo & info_
LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also supports an efficient way to get...
std::priority_queue< std::pair< int32, BaseFloat >, std::vector< std::pair< int32, BaseFloat > >, std::greater< std::pair< int32, BaseFloat > > > delta_weights_
delta_weights_ is written to by UpdateFrameWeights, in the case where the iVector estimation is silen...
std::string silence_phones_str
bool delta_weights_provided_
delta_weights_provided_ is set to true if UpdateFrameWeights was ever called; it's used to detect wro...
std::vector< OnlineFeatureInterface * > to_delete_
unordered_set< int32 > silence_phones_
BaseFloat max_state_duration
Definition for Gaussian Mixture Model with diagonal covariances.
OnlineFeatureInterface is an interface for online feature processing (it is also usable in the offlin...
OnlineSilenceWeightingConfig()
const TransitionModel & trans_model_
Provides a vector abstraction class.
std::vector< FrameInfo > frame_info_
int32 num_frames_output_and_correct_
OnlineFeatureInterface * lda_
int32 frame_subsampling_factor_
OnlineIvectorFeature is an online feature-extraction class that's responsible for extracting iVectors...