29 using namespace kaldi;
31 typedef kaldi::int64 int64;
34 "Extract iVectors for utterances every --ivector-period frames, using a trained\n" 35 "iVector extractor and features and Gaussian-level posteriors. Similar to\n" 36 "ivector-extract-online but uses the actual online decoder code to do it,\n" 37 "and does everything in-memory instead of using multiple processes.\n" 38 "Note: the value of the --use-most-recent-ivector config variable is ignored\n" 39 "it's set to false. The <spk2utt-rspecifier> is mandatory, to simplify the code;\n" 40 "if you want to do it separately per utterance, just make it of the form\n" 41 "<utterance-id> <utterance-id>.\n" 42 "The iVectors are output as an archive of matrices, indexed by utterance-id;\n" 43 "each row corresponds to an iVector. If --repeat=true, outputs the whole matrix\n" 44 "of iVectors, not just every (ivector-period)'th frame\n" 45 "The input features are the raw, non-cepstral-mean-normalized features, e.g. MFCC.\n" 47 "Usage: ivector-extract-online2 [options] <spk2utt-rspecifier> <feature-rspecifier> <ivector-wspecifier>\n" 49 " ivector-extract-online2 --config=exp/nnet2_online/nnet_online/conf/ivector_extractor.conf \\\n" 50 " ark:data/train/spk2utt scp:data/train/feats.scp ark,t:ivectors.1.ark\n";
59 int32 length_tolerance = 0;
60 std::string frame_weights_rspecifier;
63 "Number of threads to use for computing derived variables " 64 "of iVector extractor, at process start-up.");
65 po.Register(
"repeat", &repeat,
66 "If true, output the same number of iVectors as input frames " 67 "(including repeated data).");
68 po.Register(
"frame-weights-rspecifier", &frame_weights_rspecifier,
69 "Archive of frame weights to scale stats");
70 po.Register(
"length-tolerance", &length_tolerance,
71 "Tolerance on the difference in number of frames " 72 "for feats and frame weights");
76 if (po.NumArgs() != 3) {
81 std::string spk2utt_rspecifier = po.GetArg(1),
82 feature_rspecifier = po.GetArg(2),
83 ivectors_wspecifier = po.GetArg(3);
85 double tot_ubm_loglike = 0.0, tot_objf_impr = 0.0, tot_t = 0.0,
86 tot_length = 0.0, tot_length_utt_end = 0.0;
87 int32 num_done = 0, num_err = 0;
97 bool warned_dim =
false;
98 for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
99 std::string spk = spk2utt_reader.Key();
100 const std::vector<std::string> &uttlist = spk2utt_reader.Value();
103 for (
size_t i = 0;
i < uttlist.size();
i++) {
104 std::string utt = uttlist[
i];
105 if (!feature_reader.HasKey(utt)) {
106 KALDI_WARN <<
"Did not find audio for utterance " << utt;
112 int32 feat_dim = feats.
NumCols();
113 if (feat_dim == ivector_info.ExpectedFeatureDim() + 3) {
115 KALDI_WARN <<
"Feature dimension is too large by 3, assuming there are " 116 "pitch features and removing the last 3 dims.";
128 ivector_feature.SetAdaptationState(adaptation_state);
130 if (!frame_weights_rspecifier.empty()) {
131 if (!frame_weights_reader.HasKey(utt)) {
132 KALDI_WARN <<
"Did not find weights for utterance " << utt;
138 if (std::abs(weights.
Dim() - feats.
NumRows()) > length_tolerance) {
143 std::vector<std::pair<int32, BaseFloat> > frame_weights;
145 if (
i < weights.
Dim())
146 frame_weights.push_back(std::make_pair(
i, weights(
i)));
148 frame_weights.push_back(std::make_pair(
i, 0.0));
152 ivector_feature.UpdateFrameWeights(frame_weights);
157 num_ivectors = (T +
n - 1) /
n;
160 ivector_feature.Dim());
162 for (int32
i = 0;
i < num_ivectors;
i++) {
165 ivector_feature.GetFrame(t, &ivector);
169 tot_ubm_loglike += T * ivector_feature.UbmLogLikePerFrame();
170 tot_objf_impr += T * ivector_feature.ObjfImprPerFrame();
171 tot_length_utt_end += T * ivectors.Row(num_ivectors - 1).Norm(2.0);
172 for (int32
i = 0;
i < num_ivectors;
i++)
173 tot_length += T * ivectors.Row(
i).Norm(2.0) / num_ivectors;
175 KALDI_VLOG(2) <<
"For utterance " << utt <<
" of speaker " << spk
176 <<
", UBM loglike/frame was " 177 << ivector_feature.UbmLogLikePerFrame()
178 <<
", iVector length (at utterance end) was " 179 << ivectors.Row(num_ivectors-1).Norm(2.0)
180 <<
", objf improvement/frame from iVector estimation was " 181 << ivector_feature.ObjfImprPerFrame();
183 ivector_feature.GetAdaptationState(&adaptation_state);
184 ivector_writer.Write(utt, ivectors);
189 KALDI_LOG <<
"Estimated iVectors for " << num_done <<
" files, " << num_err
191 KALDI_LOG <<
"Average objective-function improvement was " 192 << (tot_objf_impr / tot_t) <<
" per frame, over " 193 << tot_t <<
" frames (weighted).";
194 KALDI_LOG <<
"Average iVector length was " << (tot_length / tot_t)
195 <<
" and at utterance-end was " << (tot_length_utt_end / tot_t)
196 <<
", over " << tot_t <<
" frames (weighted); " 197 <<
" expected length is " 198 << sqrt(ivector_info.extractor.IvectorDim());
200 return (num_done != 0 ? 0 : 1);
201 }
catch(
const std::exception &e) {
202 std::cerr << e.what();
This class takes a Matrix<BaseFloat> and wraps it as an OnlineFeatureInterface: this can be useful wh...
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
A templated class for writing objects to an archive or script file; see The Table concept...
Allows random access to a collection of objects in an archive or script file; see The Table concept...
SubMatrix< Real > ColRange(const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
MatrixIndexT Dim() const
Returns the dimension of the vector.
A class representing a vector.
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Sub-matrix representation.
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
OnlineIvectorFeature is an online feature-extraction class that's responsible for extracting iVectors...