nnet3-xvector-compute.cc File Reference
Include dependency graph for nnet3-xvector-compute.cc:

Go to the source code of this file.

Namespaces

 kaldi
 
 kaldi::nnet3
 

Functions

static void RunNnetComputation (const MatrixBase< BaseFloat > &features, const Nnet &nnet, CachingOptimizingCompiler *compiler, Vector< BaseFloat > *xvector)
 
int main (int argc, char *argv[])
 

Function Documentation

◆ main()

int main(int argc, char *argv[])

Definition at line 63 of file nnet3-xvector-compute.cc.

References NnetSimpleComputationOptions::acoustic_scale, VectorBase< Real >::AddVec(), kaldi::nnet3::CollapseModel(), SequentialTableReader< Holder >::Done(), Timer::Elapsed(), ParseOptions::GetArg(), rnnlm::i, KALDI_LOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), kaldi::kSetZero, SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), NnetSimpleComputationOptions::optimize_config, Nnet::OutputDim(), ParseOptions::PrintUsage(), MatrixBase< Real >::Range(), ParseOptions::Read(), CachingOptimizingCompiler::ReadCache(), kaldi::ReadKaldiObject(), NnetSimpleComputationOptions::Register(), ParseOptions::Register(), CachingOptimizingCompilerOptions::Register(), MatrixBase< Real >::Row(), kaldi::nnet3::RunNnetComputation(), VectorBase< Real >::Scale(), kaldi::nnet3::SetBatchnormTestMode(), kaldi::nnet3::SetDropoutTestMode(), Output::Stream(), Input::Stream(), SequentialTableReader< Holder >::Value(), TableWriter< Holder >::Write(), and CachingOptimizingCompiler::WriteCache().

{
  try {
    using namespace kaldi;
    using namespace kaldi::nnet3;
    typedef kaldi::int32 int32;
    typedef kaldi::int64 int64;

    const char *usage =
        "Propagate features through an xvector neural network model and write\n"
        "the output vectors. \"Xvector\" is our term for a vector or\n"
        "embedding which is the output of a particular type of neural network\n"
        "architecture found in speaker recognition. This architecture\n"
        "consists of several layers that operate on frames, a statistics\n"
        "pooling layer that aggregates over the frame-level representations\n"
        "and possibly additional layers that operate on segment-level\n"
        "representations. The xvectors are generally extracted from an\n"
        "output layer after the statistics pooling layer. By default, one\n"
        "xvector is extracted directly from the set of features for each\n"
        "utterance. Optionally, xvectors are extracted from chunks of input\n"
        "features and averaged, to produce a single vector.\n"
        "\n"
        "Usage: nnet3-xvector-compute [options] <raw-nnet-in> "
        "<features-rspecifier> <vector-wspecifier>\n"
        "e.g.: nnet3-xvector-compute final.raw scp:feats.scp "
        "ark:nnet_prediction.ark\n"
        "See also: nnet3-compute\n";

    ParseOptions po(usage);
    Timer timer;

    NnetSimpleComputationOptions opts;
    CachingOptimizingCompilerOptions compiler_config;

    opts.acoustic_scale = 1.0;  // by default do no scaling in this recipe.

    std::string use_gpu = "no";
    std::string cached_compiler_in;
    std::string cached_compiler_out;
    int32 chunk_size = -1,
        min_chunk_size = 100;
    bool pad_input = true;

    opts.Register(&po);
    compiler_config.Register(&po);

    po.Register("use-gpu", &use_gpu,
        "yes|no|optional|wait, only has effect if compiled with CUDA");
    po.Register("chunk-size", &chunk_size,
        "If set, extracts xvectors from chunks of the specified size and "
        "averages them. If not set, extracts an xvector from all available "
        "features.");
    po.Register("min-chunk-size", &min_chunk_size,
        "Minimum chunk-size allowed when extracting xvectors.");
    po.Register("pad-input", &pad_input, "If true, duplicate the first and "
        "last frames of the input features as required to equal min-chunk-size.");
    po.Register("cached-compiler-in", &cached_compiler_in,
        "If set, read the cached compiler from the specified file path.");
    po.Register("cached-compiler-out", &cached_compiler_out,
        "If set, write the cached compiler to the specified file path.");

#if HAVE_CUDA==1
    CuDevice::RegisterDeviceOptions(&po);
#endif

    po.Read(argc, argv);

    if (po.NumArgs() != 3) {
      po.PrintUsage();
      exit(1);
    }

#if HAVE_CUDA==1
    CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif

    std::string nnet_rxfilename = po.GetArg(1),
        feature_rspecifier = po.GetArg(2),
        vector_wspecifier = po.GetArg(3);

    Nnet nnet;
    ReadKaldiObject(nnet_rxfilename, &nnet);
    SetBatchnormTestMode(true, &nnet);
    SetDropoutTestMode(true, &nnet);
    CollapseModel(CollapseModelConfig(), &nnet);

    CachingOptimizingCompiler compiler(nnet, opts.optimize_config, compiler_config);

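    // Optionally warm-start the compiler with a previously written
    // computation cache, so computation graphs for chunk sizes seen in an
    // earlier run need not be compiled again.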
    if (!cached_compiler_in.empty()) {
      KALDI_LOG << "Reading cache from " << cached_compiler_in;
      bool cache_binary_in;
      Input ki(cached_compiler_in, &cache_binary_in);
      compiler.ReadCache(ki.Stream(), cache_binary_in);
    }

    BaseFloatVectorWriter vector_writer(vector_wspecifier);

    int32 num_success = 0, num_fail = 0;
    int64 frame_count = 0;
    int32 xvector_dim = nnet.OutputDim("output");

    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);

    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string utt = feature_reader.Key();
      const Matrix<BaseFloat> &features (feature_reader.Value());
      if (features.NumRows() == 0) {
        KALDI_WARN << "Zero-length utterance: " << utt;
        num_fail++;
        continue;
      }
      int32 num_rows = features.NumRows(),
          feat_dim = features.NumCols(),
          this_chunk_size = chunk_size;
      if (!pad_input && num_rows < min_chunk_size) {
        KALDI_WARN << "Minimum chunk size of " << min_chunk_size
                   << " is greater than the number of rows "
                   << "in utterance: " << utt;
        num_fail++;
        continue;
      } else if (num_rows < chunk_size) {
        KALDI_LOG << "Chunk size of " << chunk_size << " is greater than "
                  << "the number of rows in utterance: " << utt
                  << ", using chunk size of " << num_rows;
        this_chunk_size = num_rows;
      } else if (chunk_size == -1) {
        this_chunk_size = num_rows;
      }

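      // Number of chunk-sized pieces needed to cover the utterance; the
      // final piece may contain fewer than this_chunk_size frames.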
      int32 num_chunks = ceil(
          num_rows / static_cast<BaseFloat>(this_chunk_size));
      Vector<BaseFloat> xvector_avg(xvector_dim, kSetZero);
      BaseFloat tot_weight = 0.0;

      // Iterate over the feature chunks.
      for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
        // If we're nearing the end of the input, we may need to shift the
        // offset back so that we can get this_chunk_size frames of input to
        // the nnet.
        int32 offset = std::min(
            this_chunk_size, num_rows - chunk_indx * this_chunk_size);
        if (!pad_input && offset < min_chunk_size)
          continue;
        SubMatrix<BaseFloat> sub_features(
            features, chunk_indx * this_chunk_size, offset, 0, feat_dim);
        Vector<BaseFloat> xvector;
        tot_weight += offset;

        // Pad input if the offset is less than the minimum chunk size.
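        // (For illustration: with min_chunk_size = 100 and a final chunk of
        // offset = 20 frames, left_context = 40 and right_context = 40, i.e.
        // the first frame is repeated 40 times before the chunk and the last
        // frame 40 times after it.)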
        if (pad_input && offset < min_chunk_size) {
          Matrix<BaseFloat> padded_features(min_chunk_size, feat_dim);
          int32 left_context = (min_chunk_size - offset) / 2;
          int32 right_context = min_chunk_size - offset - left_context;
          for (int32 i = 0; i < left_context; i++) {
            padded_features.Row(i).CopyFromVec(sub_features.Row(0));
          }
          for (int32 i = 0; i < right_context; i++) {
            padded_features.Row(min_chunk_size - i - 1).CopyFromVec(
                sub_features.Row(offset - 1));
          }
          padded_features.Range(left_context, offset, 0, feat_dim).CopyFromMat(
              sub_features);
          RunNnetComputation(padded_features, nnet, &compiler, &xvector);
        } else {
          RunNnetComputation(sub_features, nnet, &compiler, &xvector);
        }
        xvector_avg.AddVec(offset, xvector);
      }
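      // xvector_avg now holds the frame-weighted sum of the chunk-level
      // xvectors; dividing by tot_weight (the total number of weighted
      // frames) yields their weighted average.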
      xvector_avg.Scale(1.0 / tot_weight);
      vector_writer.Write(utt, xvector_avg);

      frame_count += features.NumRows();
      num_success++;
    }

#if HAVE_CUDA==1
    CuDevice::Instantiate().PrintProfile();
#endif
    double elapsed = timer.Elapsed();
    KALDI_LOG << "Time taken " << elapsed
              << "s: real-time factor assuming 100 frames/sec is "
              << (elapsed * 100.0 / frame_count);
    KALDI_LOG << "Done " << num_success << " utterances, failed for "
              << num_fail;

    if (!cached_compiler_out.empty()) {
      KALDI_LOG << "Writing cache to " << cached_compiler_out;
      bool binary_write = true;
      Output ko(cached_compiler_out, &binary_write);
      compiler.WriteCache(ko.Stream(), binary_write);
    }

    if (num_success != 0) return 0;
    else return 1;
  } catch(const std::exception &e) {
    std::cerr << e.what();
    return -1;
  }
}
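The static helper RunNnetComputation(), declared above, is not reproduced in this listing. The sketch below is a non-authoritative reconstruction of the standard nnet3 forward-computation pattern such a helper typically follows: compile a ComputationRequest covering the whole chunk, run the computation, and copy out the single segment-level output row. It assumes the nnet has an input node named "input" and an output node named "output" and that one output frame is produced per chunk; the exact body in nnet3-xvector-compute.cc may differ.

// Minimal sketch, not the verbatim Kaldi definition (see assumptions above).
static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
    const Nnet &nnet, CachingOptimizingCompiler *compiler,
    Vector<BaseFloat> *xvector) {
  ComputationRequest request;
  request.need_model_derivative = false;
  request.store_component_stats = false;
  // One input spanning all frames of the chunk...
  request.inputs.push_back(
      IoSpecification("input", 0, features.NumRows()));
  // ...and a single segment-level output frame at t = 0.
  IoSpecification output_spec;
  output_spec.name = "output";
  output_spec.has_deriv = false;
  output_spec.indexes.resize(1);
  request.outputs.push_back(output_spec);
  std::shared_ptr<const NnetComputation> computation =
      compiler->Compile(request);
  NnetComputer computer(NnetComputeOptions(), *computation, nnet,
                        /*nnet_to_update=*/NULL);  // no backprop/update
  CuMatrix<BaseFloat> input_feats_cu(features);
  computer.AcceptInput("input", &input_feats_cu);
  computer.Run();
  CuMatrix<BaseFloat> cu_output;
  computer.GetOutputDestructive("output", &cu_output);
  xvector->Resize(cu_output.NumCols());
  xvector->CopyFromVec(cu_output.Row(0));
}

Because the compile step goes through the CachingOptimizingCompiler passed in from main(), repeated chunk sizes reuse the same compiled computation, which is also what makes the --cached-compiler-in/--cached-compiler-out options worthwhile across runs.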