28 int main(
int argc,
char *argv[]) {
29 using namespace kaldi;
31 typedef kaldi::int64 int64;
34 "Extract iVectors for utterances every --ivector-period frames, using a trained\n" 35 "iVector extractor and features and Gaussian-level posteriors. Similar to\n" 36 "ivector-extract-online but uses the actual online decoder code to do it,\n" 37 "and does everything in-memory instead of using multiple processes.\n" 38 "Note: the value of the --use-most-recent-ivector config variable is ignored\n" 39 "it's set to false. The <spk2utt-rspecifier> is mandatory, to simplify the code;\n" 40 "if you want to do it separately per utterance, just make it of the form\n" 41 "<utterance-id> <utterance-id>.\n" 42 "The iVectors are output as an archive of matrices, indexed by utterance-id;\n" 43 "each row corresponds to an iVector. If --repeat=true, outputs the whole matrix\n" 44 "of iVectors, not just every (ivector-period)'th frame\n" 45 "The input features are the raw, non-cepstral-mean-normalized features, e.g. MFCC.\n" 47 "Usage: ivector-extract-online2 [options] <spk2utt-rspecifier> <feature-rspecifier> <ivector-wspecifier>\n" 49 " ivector-extract-online2 --config=exp/nnet2_online/nnet_online/conf/ivector_extractor.conf \\\n" 50 " ark:data/train/spk2utt scp:data/train/feats.scp ark,t:ivectors.1.ark\n";
59 int32 length_tolerance = 0;
60 std::string frame_weights_rspecifier;
63 "Number of threads to use for computing derived variables " 64 "of iVector extractor, at process start-up.");
66 "If true, output the same number of iVectors as input frames " 67 "(including repeated data).");
68 po.
Register(
"frame-weights-rspecifier", &frame_weights_rspecifier,
69 "Archive of frame weights to scale stats");
70 po.
Register(
"length-tolerance", &length_tolerance,
71 "Tolerance on the difference in number of frames " 72 "for feats and frame weights");
81 std::string spk2utt_rspecifier = po.
GetArg(1),
82 feature_rspecifier = po.
GetArg(2),
83 ivectors_wspecifier = po.
GetArg(3);
85 double tot_ubm_loglike = 0.0, tot_objf_impr = 0.0, tot_t = 0.0,
86 tot_length = 0.0, tot_length_utt_end = 0.0;
87 int32 num_done = 0, num_err = 0;
97 bool warned_dim =
false;
98 for (; !spk2utt_reader.
Done(); spk2utt_reader.
Next()) {
99 std::string spk = spk2utt_reader.
Key();
100 const std::vector<std::string> &uttlist = spk2utt_reader.
Value();
103 for (
size_t i = 0;
i < uttlist.size();
i++) {
104 std::string utt = uttlist[
i];
105 if (!feature_reader.
HasKey(utt)) {
106 KALDI_WARN <<
"Did not find audio for utterance " << utt;
112 int32 feat_dim = feats.
NumCols();
115 KALDI_WARN <<
"Feature dimension is too large by 3, assuming there are " 116 "pitch features and removing the last 3 dims.";
130 if (!frame_weights_rspecifier.empty()) {
131 if (!frame_weights_reader.
HasKey(utt)) {
132 KALDI_WARN <<
"Did not find weights for utterance " << utt;
138 if (std::abs(weights.
Dim() - feats.
NumRows()) > length_tolerance) {
143 std::vector<std::pair<int32, BaseFloat> > frame_weights;
145 if (
i < weights.
Dim())
146 frame_weights.push_back(std::make_pair(
i, weights(
i)));
148 frame_weights.push_back(std::make_pair(
i, 0.0));
157 num_ivectors = (T +
n - 1) /
n;
160 ivector_feature.
Dim());
162 for (int32
i = 0;
i < num_ivectors;
i++) {
165 ivector_feature.
GetFrame(t, &ivector);
171 tot_length_utt_end += T * ivectors.Row(num_ivectors - 1).Norm(2.0);
172 for (int32
i = 0;
i < num_ivectors;
i++)
173 tot_length += T * ivectors.Row(
i).Norm(2.0) / num_ivectors;
175 KALDI_VLOG(2) <<
"For utterance " << utt <<
" of speaker " << spk
176 <<
", UBM loglike/frame was " 178 <<
", iVector length (at utterance end) was " 179 << ivectors.Row(num_ivectors-1).Norm(2.0)
180 <<
", objf improvement/frame from iVector estimation was " 184 ivector_writer.
Write(utt, ivectors);
189 KALDI_LOG <<
"Estimated iVectors for " << num_done <<
" files, " << num_err
191 KALDI_LOG <<
"Average objective-function improvement was " 192 << (tot_objf_impr / tot_t) <<
" per frame, over " 193 << tot_t <<
" frames (weighted).";
194 KALDI_LOG <<
"Average iVector length was " << (tot_length / tot_t)
195 <<
" and at utterance-end was " << (tot_length_utt_end / tot_t)
196 <<
", over " << tot_t <<
" frames (weighted); " 197 <<
" expected length is " 200 return (num_done != 0 ? 0 : 1);
201 }
catch(
const std::exception &e) {
202 std::cerr << e.what();
This class takes a Matrix<BaseFloat> and wraps it as an OnlineFeatureInterface: this can be useful wh...
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
BaseFloat ObjfImprPerFrame() const
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
BaseFloat UbmLogLikePerFrame() const
void GetAdaptationState(OnlineIvectorExtractorAdaptationState *adaptation_state) const
Get the adaptation state; you may want to call this before destroying this object, to get adaptation state that can be used to improve decoding of later utterances of this speaker.
A templated class for writing objects to an archive or script file; see The Table concept...
virtual int32 Dim() const
Dim() will return the iVector dimension.
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
void SetAdaptationState(const OnlineIvectorExtractorAdaptationState &adaptation_state)
Set the adaptation state to a particular value, e.g.
SubMatrix< Real > ColRange(const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
const T & Value(const std::string &key)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
MatrixIndexT Dim() const
Returns the dimension of the vector.
bool HasKey(const std::string &key)
This file contains code for online iVector extraction in a form compatible with OnlineFeatureInterfac...
int NumArgs() const
Number of positional parameters (c.f. argc-1).
void UpdateFrameWeights(const std::vector< std::pair< int32, BaseFloat > > &delta_weights)
virtual void GetFrame(int32 frame, VectorBase< BaseFloat > *feat)
Gets the feature vector for this frame.
A class representing a vector.
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Sub-matrix representation.
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
OnlineIvectorFeature is an online feature-extraction class that's responsible for extracting iVectors...