int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    using namespace kaldi::nnet3;
    typedef kaldi::int32 int32;
    typedef kaldi::int64 int64;

    const char *usage =
        "Propagate features through an xvector neural network model and write\n"
        "the output vectors.  \"Xvector\" is our term for a vector or\n"
        "embedding which is the output of a particular type of neural network\n"
        "architecture found in speaker recognition.  This architecture\n"
        "consists of several layers that operate on frames, a statistics\n"
        "pooling layer that aggregates over the frame-level representations\n"
        "and possibly additional layers that operate on segment-level\n"
        "representations.  The xvectors are generally extracted from an\n"
        "output layer after the statistics pooling layer.  By default, one\n"
        "xvector is extracted directly from the set of features for each\n"
        "utterance.  Optionally, xvectors are extracted from chunks of input\n"
        "features and averaged, to produce a single vector.\n"
        "\n"
        "Usage: nnet3-xvector-compute [options] <raw-nnet-in> "
        "<features-rspecifier> <vector-wspecifier>\n"
        "e.g.: nnet3-xvector-compute final.raw scp:feats.scp "
        "ark:nnet_prediction.ark\n"
        "See also: nnet3-compute\n";
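
    // Example invocation with chunking enabled (illustrative option values):
    //   nnet3-xvector-compute --chunk-size=150 --min-chunk-size=25 final.raw scp:feats.scp ark:nnet_prediction.ark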

    ParseOptions po(usage);
    Timer timer;

    NnetSimpleComputationOptions opts;
    CachingOptimizingCompilerOptions compiler_config;

    std::string use_gpu = "no";
    std::string cached_compiler_in;
    std::string cached_compiler_out;
    int32 chunk_size = -1,
      min_chunk_size = 100;
    bool pad_input = true;

    opts.Register(&po);
    compiler_config.Register(&po);

    po.Register("use-gpu", &use_gpu,
      "yes|no|optional|wait, only has effect if compiled with CUDA");
    po.Register("chunk-size", &chunk_size,
      "If set, extracts xvectors from chunks of the specified size and "
      "averages them.  If not set, extracts an xvector from all available features.");
    po.Register("min-chunk-size", &min_chunk_size,
      "Minimum chunk-size allowed when extracting xvectors.");
    po.Register("pad-input", &pad_input, "If true, duplicate the first and "
      "last frames of the input features as required to equal min-chunk-size.");
    po.Register("cached-compiler-in", &cached_compiler_in,
      "If set, read the cached compiler from the specified file path.");
    po.Register("cached-compiler-out", &cached_compiler_out,
      "If set, write the cached compiler to the specified file path.");

#if HAVE_CUDA==1
    CuDevice::RegisterDeviceOptions(&po);
#endif

    po.Read(argc, argv);

    if (po.NumArgs() != 3) {
      po.PrintUsage();
      exit(1);
    }

#if HAVE_CUDA==1
    CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif

    std::string nnet_rxfilename = po.GetArg(1),
                feature_rspecifier = po.GetArg(2),
                vector_wspecifier = po.GetArg(3);
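
    // Read the raw nnet and prepare it for inference: batch-norm and dropout
    // components are set to test mode and the model is collapsed for speed.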
    Nnet nnet;
    ReadKaldiObject(nnet_rxfilename, &nnet);
    SetBatchnormTestMode(true, &nnet);
    SetDropoutTestMode(true, &nnet);
    CollapseModel(CollapseModelConfig(), &nnet);

    CachingOptimizingCompiler compiler(nnet, opts.optimize_config,
                                       compiler_config);

    // Optionally warm-start the compiler from a cache written by an earlier
    // run, so that computations do not have to be recompiled.
    if (!cached_compiler_in.empty()) {
      KALDI_LOG << "Reading cache from " << cached_compiler_in;
      bool cache_binary_in;
      Input ki(cached_compiler_in, &cache_binary_in);
      compiler.ReadCache(ki.Stream(), cache_binary_in);
    }

    BaseFloatVectorWriter vector_writer(vector_wspecifier);

    int32 num_success = 0, num_fail = 0;
    int64 frame_count = 0;
    int32 xvector_dim = nnet.OutputDim("output");

    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);

    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string utt = feature_reader.Key();
      const Matrix<BaseFloat> &features = feature_reader.Value();
      if (features.NumRows() == 0) {
        KALDI_WARN << "Zero-length utterance: " << utt;
        num_fail++;
        continue;
      }
      int32 num_rows = features.NumRows(),
            feat_dim = features.NumCols(),
            this_chunk_size = chunk_size;
      if (!pad_input && num_rows < min_chunk_size) {
        KALDI_WARN << "Minimum chunk size of " << min_chunk_size
                   << " is greater than the number of rows "
                   << "in utterance: " << utt;
        num_fail++;
        continue;
      } else if (num_rows < chunk_size) {
        KALDI_LOG << "Chunk size of " << chunk_size << " is greater than "
                  << "the number of rows in utterance: " << utt
                  << ", using chunk size of " << num_rows;
        this_chunk_size = num_rows;
      } else if (chunk_size == -1) {
        this_chunk_size = num_rows;
      }

      int32 num_chunks = ceil(
        num_rows / static_cast<BaseFloat>(this_chunk_size));
      Vector<BaseFloat> xvector_avg(xvector_dim, kSetZero);
      BaseFloat tot_weight = 0.0;
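
      // Iterate over the feature chunks.  For example, with --chunk-size=150 a
      // 400-frame utterance yields chunks of 150, 150 and 100 frames; each
      // chunk's xvector is weighted by its frame count in the average below.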
      for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
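        // The last chunk may contain fewer than this_chunk_size frames; use
        // however many frames remain.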
        int32 offset = std::min(
          this_chunk_size, num_rows - chunk_indx * this_chunk_size);
        if (!pad_input && offset < min_chunk_size)
          continue;
        SubMatrix<BaseFloat> sub_features(
          features, chunk_indx * this_chunk_size, offset, 0, feat_dim);
        Vector<BaseFloat> xvector;
        tot_weight += offset;
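
        // If padding is allowed and this chunk is shorter than the minimum
        // chunk size, duplicate the first and last frames to reach
        // min_chunk_size.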
        if (pad_input && offset < min_chunk_size) {
          Matrix<BaseFloat> padded_features(min_chunk_size, feat_dim);
          int32 left_context = (min_chunk_size - offset) / 2;
          int32 right_context = min_chunk_size - offset - left_context;
          for (int32 i = 0; i < left_context; i++) {
            padded_features.Row(i).CopyFromVec(sub_features.Row(0));
          }
          for (int32 i = 0; i < right_context; i++) {
            padded_features.Row(min_chunk_size - i - 1).CopyFromVec(
              sub_features.Row(offset - 1));
          }
          padded_features.Range(left_context, offset, 0, feat_dim).CopyFromMat(
            sub_features);
          RunNnetComputation(padded_features, nnet, &compiler, &xvector);
        } else {
          RunNnetComputation(sub_features, nnet, &compiler, &xvector);
        }
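
        // Accumulate a sum of chunk xvectors weighted by the number of frames
        // in each chunk; the weighted average is formed after the loop.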
        xvector_avg.AddVec(offset, xvector);
      }
      xvector_avg.Scale(1.0 / tot_weight);
      vector_writer.Write(utt, xvector_avg);

      frame_count += features.NumRows();
      num_success++;
    }

#if HAVE_CUDA==1
    CuDevice::Instantiate().PrintProfile();
#endif

    double elapsed = timer.Elapsed();
    KALDI_LOG << "Time taken " << elapsed
              << "s: real-time factor assuming 100 frames/sec is "
              << (elapsed * 100.0 / frame_count);
    KALDI_LOG << "Done " << num_success << " utterances, failed for "
              << num_fail;

    if (!cached_compiler_out.empty()) {
      KALDI_LOG << "Writing cache to " << cached_compiler_out;
      bool binary_write = true;
      Output ko(cached_compiler_out, &binary_write);
      compiler.WriteCache(ko.Stream(), binary_write);
    }

    if (num_success != 0) return 0;
    else return 1;
  } catch(const std::exception &e) {
    std::cerr << e.what();
    return -1;
  }
}