65 using namespace kaldi;
68 typedef kaldi::int64 int64;
71 "Propagate features through an xvector neural network model and write\n" 72 "the output vectors. \"Xvector\" is our term for a vector or\n" 73 "embedding which is the output of a particular type of neural network\n" 74 "architecture found in speaker recognition. This architecture\n" 75 "consists of several layers that operate on frames, a statistics\n" 76 "pooling layer that aggregates over the frame-level representations\n" 77 "and possibly additional layers that operate on segment-level\n" 78 "representations. The xvectors are generally extracted from an\n" 79 "output layer after the statistics pooling layer. By default, one\n" 80 "xvector is extracted directly from the set of features for each\n" 81 "utterance. Optionally, xvectors are extracted from chunks of input\n" 82 "features and averaged, to produce a single vector.\n" 84 "Usage: nnet3-xvector-compute [options] <raw-nnet-in> " 85 "<features-rspecifier> <vector-wspecifier>\n" 86 "e.g.: nnet3-xvector-compute final.raw scp:feats.scp " 87 "ark:nnet_prediction.ark\n" 88 "See also: nnet3-compute\n";
98 std::string use_gpu =
"no";
99 std::string cached_compiler_in;
100 std::string cached_compiler_out;
101 int32 chunk_size = -1,
102 min_chunk_size = 100;
103 bool pad_input =
true;
108 po.Register(
"use-gpu", &use_gpu,
109 "yes|no|optional|wait, only has effect if compiled with CUDA");
110 po.Register(
"chunk-size", &chunk_size,
111 "If set, extracts xectors from specified chunk-size, and averages. " 112 "If not set, extracts an xvector from all available features.");
113 po.Register(
"min-chunk-size", &min_chunk_size,
114 "Minimum chunk-size allowed when extracting xvectors.");
115 po.Register(
"pad-input", &pad_input,
"If true, duplicate the first and " 116 "last frames of the input features as required to equal min-chunk-size.");
117 po.Register(
"cached-compiler-in", &cached_compiler_in,
118 "If set, read the cached compiler from the specified file path.");
119 po.Register(
"cached-compiler-out", &cached_compiler_out,
120 "If set, write the cached compiler to the specified file path.");
123 CuDevice::RegisterDeviceOptions(&po);
128 if (po.NumArgs() != 3) {
134 CuDevice::Instantiate().SelectGpuId(use_gpu);
137 std::string nnet_rxfilename = po.GetArg(1),
138 feature_rspecifier = po.GetArg(2),
139 vector_wspecifier = po.GetArg(3);
149 if (!cached_compiler_in.empty()) {
150 KALDI_LOG <<
"Reading cache from " << cached_compiler_in;
151 bool cache_binary_in;
152 Input ki(cached_compiler_in, &cache_binary_in);
153 compiler.ReadCache(ki.Stream(), cache_binary_in);
158 int32 num_success = 0, num_fail = 0;
159 int64 frame_count = 0;
160 int32 xvector_dim = nnet.
OutputDim(
"output");
164 for (; !feature_reader.Done(); feature_reader.Next()) {
165 std::string utt = feature_reader.Key();
167 if (features.NumRows() == 0) {
168 KALDI_WARN <<
"Zero-length utterance: " << utt;
172 int32 num_rows = features.NumRows(),
173 feat_dim = features.NumCols(),
174 this_chunk_size = chunk_size;
175 if (!pad_input && num_rows < min_chunk_size) {
176 KALDI_WARN <<
"Minimum chunk size of " << min_chunk_size
177 <<
" is greater than the number of rows " 178 <<
"in utterance: " << utt;
181 }
else if (num_rows < chunk_size) {
182 KALDI_LOG <<
"Chunk size of " << chunk_size <<
" is greater than " 183 <<
"the number of rows in utterance: " << utt
184 <<
", using chunk size of " << num_rows;
185 this_chunk_size = num_rows;
186 }
else if (chunk_size == -1) {
187 this_chunk_size = num_rows;
190 int32 num_chunks = ceil(
191 num_rows / static_cast<BaseFloat>(this_chunk_size));
196 for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
200 int32 offset = std::min(
201 this_chunk_size, num_rows - chunk_indx * this_chunk_size);
202 if (!pad_input && offset < min_chunk_size)
205 features, chunk_indx * this_chunk_size, offset, 0, feat_dim);
207 tot_weight += offset;
210 if (pad_input && offset < min_chunk_size) {
212 int32 left_context = (min_chunk_size - offset) / 2;
213 int32 right_context = min_chunk_size - offset - left_context;
214 for (int32
i = 0;
i < left_context;
i++) {
215 padded_features.Row(
i).CopyFromVec(sub_features.Row(0));
217 for (int32
i = 0;
i < right_context;
i++) {
218 padded_features.Row(min_chunk_size -
i - 1).CopyFromVec(sub_features.Row(offset - 1));
220 padded_features.Range(left_context, offset, 0, feat_dim).CopyFromMat(sub_features);
225 xvector_avg.AddVec(offset, xvector);
227 xvector_avg.Scale(1.0 / tot_weight);
228 vector_writer.Write(utt, xvector_avg);
230 frame_count += features.NumRows();
235 CuDevice::Instantiate().PrintProfile();
237 double elapsed = timer.
Elapsed();
239 <<
"s: real-time factor assuming 100 frames/sec is " 240 << (elapsed*100.0/frame_count);
241 KALDI_LOG <<
"Done " << num_success <<
" utterances, failed for " 244 if (!cached_compiler_out.empty()) {
245 KALDI_LOG <<
"Writing cache to " << cached_compiler_out;
246 bool binary_write =
true;
247 Output ko(cached_compiler_out, &binary_write);
248 compiler.WriteCache(ko.Stream(), binary_write);
251 if (num_success != 0)
return 0;
253 }
catch(
const std::exception &e) {
254 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
void CollapseModel(const CollapseModelConfig &config, Nnet *nnet)
This function modifies the neural net for efficiency, in a way that is suitable to be done in test time...
This class enables you to do the compilation and optimization in one call, and also ensures that if t...
void Register(OptionsItf *opts)
void SetBatchnormTestMode(bool test_mode, Nnet *nnet)
This function affects only components of type BatchNormComponent.
A templated class for writing objects to an archive or script file; see The Table concept...
int32 OutputDim(const std::string &output_name) const
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
static void RunNnetComputation(const MatrixBase< BaseFloat > &features, const Nnet &nnet, CachingOptimizingCompiler *compiler, Vector< BaseFloat > *xvector)
void SetDropoutTestMode(bool test_mode, Nnet *nnet)
This function affects components of child-classes of RandomComponent.
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
void Register(OptionsItf *opts)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
A class representing a vector.
NnetOptimizeOptions optimize_config
double Elapsed() const
Returns time in seconds.
Sub-matrix representation.
Config class for the CollapseModel function.