nnet3-xvector-compute.cc
Go to the documentation of this file.
1 // nnet3bin/nnet3-xvector-compute.cc
2 
3 // Copyright 2017 Johns Hopkins University (author: Daniel Povey)
4 // 2017 Johns Hopkins University (author: Daniel Garcia-Romero)
5 // 2017 David Snyder
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 
23 #include "base/kaldi-common.h"
24 #include "util/common-utils.h"
26 #include "base/timer.h"
27 #include "nnet3/nnet-utils.h"
28 
29 namespace kaldi {
30 namespace nnet3 {
31 
32 // Computes an xvector from a chunk of speech features.
33 static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
34  const Nnet &nnet, CachingOptimizingCompiler *compiler,
35  Vector<BaseFloat> *xvector) {
36  ComputationRequest request;
37  request.need_model_derivative = false;
38  request.store_component_stats = false;
39  request.inputs.push_back(
40  IoSpecification("input", 0, features.NumRows()));
41  IoSpecification output_spec;
42  output_spec.name = "output";
43  output_spec.has_deriv = false;
44  output_spec.indexes.resize(1);
45  request.outputs.resize(1);
46  request.outputs[0].Swap(&output_spec);
47  std::shared_ptr<const NnetComputation> computation(compiler->Compile(request));
48  Nnet *nnet_to_update = NULL; // we're not doing any update.
49  NnetComputer computer(NnetComputeOptions(), *computation,
50  nnet, nnet_to_update);
51  CuMatrix<BaseFloat> input_feats_cu(features);
52  computer.AcceptInput("input", &input_feats_cu);
53  computer.Run();
54  CuMatrix<BaseFloat> cu_output;
55  computer.GetOutputDestructive("output", &cu_output);
56  xvector->Resize(cu_output.NumCols());
57  xvector->CopyFromVec(cu_output.Row(0));
58 }
59 
60 } // namespace nnet3
61 } // namespace kaldi
62 
63 int main(int argc, char *argv[]) {
64  try {
65  using namespace kaldi;
66  using namespace kaldi::nnet3;
67  typedef kaldi::int32 int32;
68  typedef kaldi::int64 int64;
69 
70  const char *usage =
71  "Propagate features through an xvector neural network model and write\n"
72  "the output vectors. \"Xvector\" is our term for a vector or\n"
73  "embedding which is the output of a particular type of neural network\n"
74  "architecture found in speaker recognition. This architecture\n"
75  "consists of several layers that operate on frames, a statistics\n"
76  "pooling layer that aggregates over the frame-level representations\n"
77  "and possibly additional layers that operate on segment-level\n"
78  "representations. The xvectors are generally extracted from an\n"
79  "output layer after the statistics pooling layer. By default, one\n"
80  "xvector is extracted directly from the set of features for each\n"
81  "utterance. Optionally, xvectors are extracted from chunks of input\n"
82  "features and averaged, to produce a single vector.\n"
83  "\n"
84  "Usage: nnet3-xvector-compute [options] <raw-nnet-in> "
85  "<features-rspecifier> <vector-wspecifier>\n"
86  "e.g.: nnet3-xvector-compute final.raw scp:feats.scp "
87  "ark:nnet_prediction.ark\n"
88  "See also: nnet3-compute\n";
89 
90  ParseOptions po(usage);
91  Timer timer;
92 
94  CachingOptimizingCompilerOptions compiler_config;
95 
96  opts.acoustic_scale = 1.0; // by default do no scaling in this recipe.
97 
98  std::string use_gpu = "no";
99  std::string cached_compiler_in;
100  std::string cached_compiler_out;
101  int32 chunk_size = -1,
102  min_chunk_size = 100;
103  bool pad_input = true;
104 
105  opts.Register(&po);
106  compiler_config.Register(&po);
107 
108  po.Register("use-gpu", &use_gpu,
109  "yes|no|optional|wait, only has effect if compiled with CUDA");
110  po.Register("chunk-size", &chunk_size,
111  "If set, extracts xectors from specified chunk-size, and averages. "
112  "If not set, extracts an xvector from all available features.");
113  po.Register("min-chunk-size", &min_chunk_size,
114  "Minimum chunk-size allowed when extracting xvectors.");
115  po.Register("pad-input", &pad_input, "If true, duplicate the first and "
116  "last frames of the input features as required to equal min-chunk-size.");
117  po.Register("cached-compiler-in", &cached_compiler_in,
118  "If set, read the cached compiler from the specified file path.");
119  po.Register("cached-compiler-out", &cached_compiler_out,
120  "If set, write the cached compiler to the specified file path.");
121 
122 #if HAVE_CUDA==1
123  CuDevice::RegisterDeviceOptions(&po);
124 #endif
125 
126  po.Read(argc, argv);
127 
128  if (po.NumArgs() != 3) {
129  po.PrintUsage();
130  exit(1);
131  }
132 
133 #if HAVE_CUDA==1
134  CuDevice::Instantiate().SelectGpuId(use_gpu);
135 #endif
136 
137  std::string nnet_rxfilename = po.GetArg(1),
138  feature_rspecifier = po.GetArg(2),
139  vector_wspecifier = po.GetArg(3);
140 
141  Nnet nnet;
142  ReadKaldiObject(nnet_rxfilename, &nnet);
143  SetBatchnormTestMode(true, &nnet);
144  SetDropoutTestMode(true, &nnet);
146 
147  CachingOptimizingCompiler compiler(nnet, opts.optimize_config, compiler_config);
148 
149  if (!cached_compiler_in.empty()) {
150  KALDI_LOG << "Reading cache from " << cached_compiler_in;
151  bool cache_binary_in;
152  Input ki(cached_compiler_in, &cache_binary_in);
153  compiler.ReadCache(ki.Stream(), cache_binary_in);
154  }
155 
156  BaseFloatVectorWriter vector_writer(vector_wspecifier);
157 
158  int32 num_success = 0, num_fail = 0;
159  int64 frame_count = 0;
160  int32 xvector_dim = nnet.OutputDim("output");
161 
162  SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
163 
164  for (; !feature_reader.Done(); feature_reader.Next()) {
165  std::string utt = feature_reader.Key();
166  const Matrix<BaseFloat> &features (feature_reader.Value());
167  if (features.NumRows() == 0) {
168  KALDI_WARN << "Zero-length utterance: " << utt;
169  num_fail++;
170  continue;
171  }
172  int32 num_rows = features.NumRows(),
173  feat_dim = features.NumCols(),
174  this_chunk_size = chunk_size;
175  if (!pad_input && num_rows < min_chunk_size) {
176  KALDI_WARN << "Minimum chunk size of " << min_chunk_size
177  << " is greater than the number of rows "
178  << "in utterance: " << utt;
179  num_fail++;
180  continue;
181  } else if (num_rows < chunk_size) {
182  KALDI_LOG << "Chunk size of " << chunk_size << " is greater than "
183  << "the number of rows in utterance: " << utt
184  << ", using chunk size of " << num_rows;
185  this_chunk_size = num_rows;
186  } else if (chunk_size == -1) {
187  this_chunk_size = num_rows;
188  }
189 
190  int32 num_chunks = ceil(
191  num_rows / static_cast<BaseFloat>(this_chunk_size));
192  Vector<BaseFloat> xvector_avg(xvector_dim, kSetZero);
193  BaseFloat tot_weight = 0.0;
194 
195  // Iterate over the feature chunks.
196  for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
197  // If we're nearing the end of the input, we may need to shift the
198  // offset back so that we can get this_chunk_size frames of input to
199  // the nnet.
200  int32 offset = std::min(
201  this_chunk_size, num_rows - chunk_indx * this_chunk_size);
202  if (!pad_input && offset < min_chunk_size)
203  continue;
204  SubMatrix<BaseFloat> sub_features(
205  features, chunk_indx * this_chunk_size, offset, 0, feat_dim);
206  Vector<BaseFloat> xvector;
207  tot_weight += offset;
208 
209  // Pad input if the offset is less than the minimum chunk size
210  if (pad_input && offset < min_chunk_size) {
211  Matrix<BaseFloat> padded_features(min_chunk_size, feat_dim);
212  int32 left_context = (min_chunk_size - offset) / 2;
213  int32 right_context = min_chunk_size - offset - left_context;
214  for (int32 i = 0; i < left_context; i++) {
215  padded_features.Row(i).CopyFromVec(sub_features.Row(0));
216  }
217  for (int32 i = 0; i < right_context; i++) {
218  padded_features.Row(min_chunk_size - i - 1).CopyFromVec(sub_features.Row(offset - 1));
219  }
220  padded_features.Range(left_context, offset, 0, feat_dim).CopyFromMat(sub_features);
221  RunNnetComputation(padded_features, nnet, &compiler, &xvector);
222  } else {
223  RunNnetComputation(sub_features, nnet, &compiler, &xvector);
224  }
225  xvector_avg.AddVec(offset, xvector);
226  }
227  xvector_avg.Scale(1.0 / tot_weight);
228  vector_writer.Write(utt, xvector_avg);
229 
230  frame_count += features.NumRows();
231  num_success++;
232  }
233 
234 #if HAVE_CUDA==1
235  CuDevice::Instantiate().PrintProfile();
236 #endif
237  double elapsed = timer.Elapsed();
238  KALDI_LOG << "Time taken "<< elapsed
239  << "s: real-time factor assuming 100 frames/sec is "
240  << (elapsed*100.0/frame_count);
241  KALDI_LOG << "Done " << num_success << " utterances, failed for "
242  << num_fail;
243 
244  if (!cached_compiler_out.empty()) {
245  KALDI_LOG << "Writing cache to " << cached_compiler_out;
246  bool binary_write = true;
247  Output ko(cached_compiler_out, &binary_write);
248  compiler.WriteCache(ko.Stream(), binary_write);
249  }
250 
251  if (num_success != 0) return 0;
252  else return 1;
253  } catch(const std::exception &e) {
254  std::cerr << e.what();
255  return -1;
256  }
257 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void CollapseModel(const CollapseModelConfig &config, Nnet *nnet)
This function modifies the neural net for efficiency, in a way that suitable to be done in test time...
Definition: nnet-utils.cc:2100
bool store_component_stats
you should set need_component_stats to true if you need the average-activation and average-derivative...
const CuSubVector< Real > Row(MatrixIndexT i) const
Definition: cu-matrix.h:670
bool need_model_derivative
if need_model_derivative is true, then we'll be doing either model training or model-derivative compu...
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
This class enables you to do the compilation and optimization in one call, and also ensures that if t...
void SetBatchnormTestMode(bool test_mode, Nnet *nnet)
This function affects only components of type BatchNormComponent.
Definition: nnet-utils.cc:564
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
std::vector< IoSpecification > inputs
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
void Write(const std::string &key, const T &value) const
int32 OutputDim(const std::string &output_name) const
Definition: nnet-nnet.cc:677
void Register(const std::string &name, bool *ptr, const std::string &doc)
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
Definition: kaldi-io.cc:832
This file contains some miscellaneous functions dealing with class Nnet.
static void RunNnetComputation(const MatrixBase< BaseFloat > &features, const Nnet &nnet, CachingOptimizingCompiler *compiler, Vector< BaseFloat > *xvector)
void SetDropoutTestMode(bool test_mode, Nnet *nnet)
This function affects components of child-classes of RandomComponent.
Definition: nnet-utils.cc:573
void AcceptInput(const std::string &node_name, CuMatrix< BaseFloat > *input)
e.g.
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
std::istream & Stream()
Definition: kaldi-io.cc:826
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
std::ostream & Stream()
Definition: kaldi-io.cc:701
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
void Scale(Real alpha)
Multiplies all elements by this constant.
std::shared_ptr< const NnetComputation > Compile(const ComputationRequest &request)
Does the compilation and returns a const pointer to the result, which is owned by this class...
int NumArgs() const
Number of positional parameters (c.f. argc-1).
void ReadCache(std::istream &is, bool binary)
std::vector< Index > indexes
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216
void WriteCache(std::ostream &os, bool binary)
A class representing a vector.
Definition: kaldi-vector.h:406
class NnetComputer is responsible for executing the computation described in the "computation" object...
Definition: nnet-compute.h:59
std::vector< IoSpecification > outputs
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
SubMatrix< Real > Range(const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
Return a sub-part of matrix.
Definition: kaldi-matrix.h:202
int main(int argc, char *argv[])
void GetOutputDestructive(const std::string &output_name, CuMatrix< BaseFloat > *output)
#define KALDI_LOG
Definition: kaldi-error.h:153
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
double Elapsed() const
Returns time in seconds.
Definition: timer.h:74
Sub-matrix representation.
Definition: kaldi-matrix.h:988
Config class for the CollapseModel function.
Definition: nnet-utils.h:240
void Run()
This does either the forward or backward computation, depending when it is called (in a typical compu...