static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
    const Nnet &nnet, CachingOptimizingCompiler *compiler,
    Vector<BaseFloat> *xvector) {
  // ... (set up the ComputationRequest and the IoSpecification output_spec)
  output_spec.name = "output";
  // ...
  request.outputs[0].Swap(&output_spec);
  std::shared_ptr<const NnetComputation> computation(compiler->Compile(request));
  Nnet *nnet_to_update = NULL;  // inference only: no model update.
  NnetComputer computer(NnetComputeOptions(), *computation,
                        nnet, nnet_to_update);
  // ... (AcceptInput() the features, Run(), then copy the "output" row into *xvector)
}
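// Note: CachingOptimizingCompiler caches the compiled computation for each
// ComputationRequest it has seen, so repeated chunk sizes are only compiled
// once; the --cached-compiler-in/--cached-compiler-out options below read and
// write that cache via ReadCache()/WriteCache().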
int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    using namespace kaldi::nnet3;
    typedef kaldi::int64 int64;

    const char *usage =
        "Propagate features through an xvector neural network model and write\n"
        "the output vectors. \"Xvector\" is our term for a vector or\n"
        "embedding which is the output of a particular type of neural network\n"
        "architecture found in speaker recognition. This architecture\n"
        "consists of several layers that operate on frames, a statistics\n"
        "pooling layer that aggregates over the frame-level representations\n"
        "and possibly additional layers that operate on segment-level\n"
        "representations. The xvectors are generally extracted from an\n"
        "output layer after the statistics pooling layer. By default, one\n"
        "xvector is extracted directly from the set of features for each\n"
        "utterance. Optionally, xvectors are extracted from chunks of input\n"
        "features and averaged, to produce a single vector.\n"
        "\n"
        "Usage: nnet3-xvector-compute [options] <raw-nnet-in> "
        "<features-rspecifier> <vector-wspecifier>\n"
        "e.g.: nnet3-xvector-compute final.raw scp:feats.scp "
        "ark:nnet_prediction.ark\n"
        "See also: nnet3-compute\n";
    ParseOptions po(usage);
    Timer timer;
    // ...

    std::string use_gpu = "no";
    std::string cached_compiler_in;
    std::string cached_compiler_out;
    int32 chunk_size = -1,
        min_chunk_size = 100;
    bool pad_input = true;

    // ... (register the other option structs with po)
    po.Register("use-gpu", &use_gpu,
        "yes|no|optional|wait, only has effect if compiled with CUDA");
    po.Register("chunk-size", &chunk_size,
        "If set, extracts xvectors from chunks of the specified size and "
        "averages them. If not set, extracts an xvector from all available "
        "features.");
    po.Register("min-chunk-size", &min_chunk_size,
        "Minimum chunk-size allowed when extracting xvectors.");
    po.Register("pad-input", &pad_input, "If true, duplicate the first and "
        "last frames of the input features as required to equal min-chunk-size.");
    po.Register("cached-compiler-in", &cached_compiler_in,
        "If set, read the cached compiler from the specified file path.");
    po.Register("cached-compiler-out", &cached_compiler_out,
        "If set, write the cached compiler to the specified file path.");
    CuDevice::RegisterDeviceOptions(&po);

    // ... (po.Read(argc, argv); if po.NumArgs() != 3, po.PrintUsage() and exit)

    CuDevice::Instantiate().SelectGpuId(use_gpu);

    std::string nnet_rxfilename = po.GetArg(1),
        feature_rspecifier = po.GetArg(2),
        vector_wspecifier = po.GetArg(3);

    // ... (read the raw nnet from nnet_rxfilename, set batchnorm/dropout test
    //      mode, collapse the model, and construct the caching compiler)

    if (!cached_compiler_in.empty()) {
      KALDI_LOG << "Reading cache from " << cached_compiler_in;
      bool cache_binary_in;
      Input ki(cached_compiler_in, &cache_binary_in);
      // ... (ReadCache() from ki.Stream() into the compiler)
    }
    int32 num_success = 0, num_fail = 0;
    int64 frame_count = 0;
    int32 xvector_dim = nnet.OutputDim("output");

    // ... (open feature_reader on feature_rspecifier and vector_writer on
    //      vector_wspecifier)

    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string utt = feature_reader.Key();
      const Matrix<BaseFloat> &features = feature_reader.Value();
      if (features.NumRows() == 0) {
        KALDI_WARN << "Zero-length utterance: " << utt;
        num_fail++;
        continue;
      }
      int32 num_rows = features.NumRows(),
          feat_dim = features.NumCols(),
          this_chunk_size = chunk_size;
      if (!pad_input && num_rows < min_chunk_size) {
        KALDI_WARN << "Minimum chunk size of " << min_chunk_size
                   << " is greater than the number of rows "
                   << "in utterance: " << utt;
        num_fail++;
        continue;
      } else if (num_rows < chunk_size) {
        KALDI_LOG << "Chunk size of " << chunk_size << " is greater than "
                  << "the number of rows in utterance: " << utt
                  << ", using chunk size of " << num_rows;
        this_chunk_size = num_rows;
      } else if (chunk_size == -1) {
        this_chunk_size = num_rows;
      }

      int32 num_chunks = ceil(
          num_rows / static_cast<BaseFloat>(this_chunk_size));
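      // E.g. (illustrative numbers): num_rows = 450 and this_chunk_size = 200
      // give num_chunks = ceil(450 / 200.0) = 3; the last chunk covers only
      // the remaining 50 frames, which the offset/padding logic below handles.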
      Vector<BaseFloat> xvector_avg(xvector_dim);
      BaseFloat tot_weight = 0.0;

      // Iterate over the feature chunks.
      for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
        // The last chunk may contain fewer than this_chunk_size frames.
        int32 offset = std::min(
            this_chunk_size, num_rows - chunk_indx * this_chunk_size);
        if (!pad_input && offset < min_chunk_size)
          continue;
        SubMatrix<BaseFloat> sub_features(
            features, chunk_indx * this_chunk_size, offset, 0, feat_dim);
        Vector<BaseFloat> xvector;
        tot_weight += offset;

        // Pad the input if this chunk is shorter than the minimum chunk size.
        if (pad_input && offset < min_chunk_size) {
          Matrix<BaseFloat> padded_features(min_chunk_size, feat_dim);
          int32 left_context = (min_chunk_size - offset) / 2;
          int32 right_context = min_chunk_size - offset - left_context;
          for (int32 i = 0; i < left_context; i++) {
            padded_features.Row(i).CopyFromVec(sub_features.Row(0));
          }
          for (int32 i = 0; i < right_context; i++) {
            padded_features.Row(min_chunk_size - i - 1).CopyFromVec(
                sub_features.Row(offset - 1));
          }
          padded_features.Range(left_context, offset, 0, feat_dim).CopyFromMat(
              sub_features);
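          // E.g. (illustrative numbers): offset = 50 and min_chunk_size = 100
          // give left_context = 25 and right_context = 25, so the first frame
          // is repeated 25 times, the last frame 25 times, and the 50 real
          // frames occupy rows [25, 75) of padded_features.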
          // ... (RunNnetComputation() on padded_features, otherwise on
          //      sub_features, to produce this chunk's xvector)
        }
        xvector_avg.AddVec(offset, xvector);  // weight each chunk by its frame count
      }
      xvector_avg.Scale(1.0 / tot_weight);  // frame-weighted average over chunks
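      // E.g. with chunk xvectors x1, x2, x3 extracted from 200, 200 and 50
      // frames, the result is (200*x1 + 200*x2 + 50*x3) / 450 (illustrative
      // numbers).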
      vector_writer.Write(utt, xvector_avg);
      num_success++;
      frame_count += features.NumRows();
    }

    // ...
    CuDevice::Instantiate().PrintProfile();

    double elapsed = timer.Elapsed();
    KALDI_LOG << "Time taken " << elapsed
              << "s: real-time factor assuming 100 frames/sec is "
              << (elapsed * 100.0 / frame_count);
    KALDI_LOG << "Done " << num_success << " utterances, failed for "
              << num_fail;

    if (!cached_compiler_out.empty()) {
      KALDI_LOG << "Writing cache to " << cached_compiler_out;
      bool binary_write = true;
      Output ko(cached_compiler_out, &binary_write);
      // ... (WriteCache() from the compiler to ko.Stream())
    }

    if (num_success != 0) return 0;
    else return 1;
  } catch (const std::exception &e) {
    std::cerr << e.what();
    return -1;
  }
}
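The per-utterance result is a frame-weighted average of the chunk-level
xvectors. Below is a minimal, self-contained sketch of just that averaging
step in plain C++ (no Kaldi types; the function name AverageChunkXvectors and
the sample values are illustrative only, not part of the binary above):

#include <cassert>
#include <cstdio>
#include <vector>

// Frame-weighted average of chunk-level embeddings: each chunk's xvector is
// weighted by the number of frames it was extracted from, mirroring the
// AddVec(offset, xvector) / Scale(1.0 / tot_weight) calls in the loop above.
std::vector<float> AverageChunkXvectors(
    const std::vector<std::vector<float> > &chunk_xvectors,
    const std::vector<int> &chunk_num_frames) {
  assert(!chunk_xvectors.empty());
  assert(chunk_xvectors.size() == chunk_num_frames.size());
  std::vector<float> avg(chunk_xvectors[0].size(), 0.0f);
  float tot_weight = 0.0f;
  for (size_t c = 0; c < chunk_xvectors.size(); c++) {
    tot_weight += chunk_num_frames[c];
    for (size_t d = 0; d < avg.size(); d++)
      avg[d] += chunk_num_frames[c] * chunk_xvectors[c][d];
  }
  for (size_t d = 0; d < avg.size(); d++)
    avg[d] /= tot_weight;
  return avg;
}

int main() {
  // Three chunks of 200, 200 and 50 frames with 2-dimensional "xvectors".
  std::vector<std::vector<float> > xvecs = {{1.0f, 0.0f},
                                            {0.0f, 1.0f},
                                            {1.0f, 1.0f}};
  std::vector<int> frames = {200, 200, 50};
  std::vector<float> avg = AverageChunkXvectors(xvecs, frames);
  // Expected: (250/450, 250/450) = (0.556, 0.556).
  std::printf("%.3f %.3f\n", avg[0], avg[1]);
  return 0;
}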