nnet-am-compute.cc
// nnet2bin/nnet-am-compute.cc

// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
//           2015  Johns Hopkins University (author: Daniel Garcia-Romero)
//           2015  David Snyder
//           2017  Karel Vesely

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet2/train-nnet.h"
#include "nnet2/am-nnet.h"


int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    using namespace kaldi::nnet2;
    typedef kaldi::int32 int32;
    typedef kaldi::int64 int64;

    const char *usage =
        "Does the neural net computation for each file of input features, and\n"
        "outputs the result as a matrix.  Used mostly for debugging.\n"
        "Note: if you want it to apply a log (e.g. for log-likelihoods), use\n"
        "--apply-log=true.\n"
        "\n"
        "Usage: nnet-am-compute [options] <model-in> <feature-rspecifier> "
        "<feature-or-loglikes-wspecifier>\n"
        "See also: nnet-compute, nnet-logprob\n";

    bool divide_by_priors = false;
    bool apply_log = false;
    bool pad_input = true;
    std::string use_gpu = "no";
    int32 chunk_size = 0;
    ParseOptions po(usage);
    po.Register("divide-by-priors", &divide_by_priors, "If true, divide by "
                "the priors stored in the model and re-normalize; "
                "--apply-log may follow.");
    po.Register("apply-log", &apply_log, "Apply a log to the result of the "
                "computation before outputting.");
    po.Register("pad-input", &pad_input, "If true, duplicate the first and "
                "last frames of the input features as required for temporal "
                "context, so that the number of output frames is not less "
                "than the number of input frames.");
    po.Register("use-gpu", &use_gpu,
                "yes|no|optional|wait, only has effect if compiled with CUDA");
    po.Register("chunk-size", &chunk_size, "Process the feature matrix in "
                "chunks.  This is useful when processing large feature files "
                "on the GPU.  If chunk-size > 0, pad-input must be true.");

    po.Read(argc, argv);

    if (po.NumArgs() != 3) {
      po.PrintUsage();
      exit(1);
    }
    // If chunk_size is greater than 0, pad_input needs to be true.
    KALDI_ASSERT(chunk_size <= 0 || pad_input);

#if HAVE_CUDA==1
    CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif

    std::string nnet_rxfilename = po.GetArg(1),
        features_rspecifier = po.GetArg(2),
        features_or_loglikes_wspecifier = po.GetArg(3);

    TransitionModel trans_model;
    AmNnet am_nnet;
    {
      bool binary_read;
      Input ki(nnet_rxfilename, &binary_read);
      trans_model.Read(ki.Stream(), binary_read);
      am_nnet.Read(ki.Stream(), binary_read);
    }

    Nnet &nnet = am_nnet.GetNnet();

    int64 num_done = 0, num_frames = 0;

    Vector<BaseFloat> inv_priors(am_nnet.Priors());
    KALDI_ASSERT((!divide_by_priors || inv_priors.Dim() == am_nnet.NumPdfs()) &&
                 "Priors in neural network not set up.");
    inv_priors.ApplyPow(-1.0);

    SequentialBaseFloatMatrixReader feature_reader(features_rspecifier);
    BaseFloatMatrixWriter writer(features_or_loglikes_wspecifier);

    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string utt = feature_reader.Key();
      const Matrix<BaseFloat> &feats = feature_reader.Value();

      int32 output_frames = feats.NumRows(), output_dim = nnet.OutputDim();
      if (!pad_input)
        output_frames -= nnet.LeftContext() + nnet.RightContext();
      if (output_frames <= 0) {
        KALDI_WARN << "Skipping utterance " << utt << " because output "
                   << "would be empty.";
        continue;
      }

      Matrix<BaseFloat> output(output_frames, output_dim);
      CuMatrix<BaseFloat> cu_feats(feats);
      CuMatrix<BaseFloat> cu_output(output);
      if (chunk_size > 0 && chunk_size < feats.NumRows()) {
        NnetComputationChunked(nnet, cu_feats, chunk_size, &cu_output);
      } else {
        NnetComputation(nnet, cu_feats, pad_input, &cu_output);
      }
      cu_output.Swap(&output);

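      // With --divide-by-priors=true the per-frame posteriors p(pdf | x) are
      // divided by the priors p(pdf) and re-normalized below; within each
      // frame the result is proportional to the pseudo-likelihoods p(x | pdf)
      // used in hybrid NN/HMM systems, and --apply-log=true then outputs
      // their logarithms.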
      if (divide_by_priors) {
        // Scale each column by the corresponding element of inv_priors.
        output.MulColsVec(inv_priors);
        for (int32 i = 0; i < output.NumRows(); i++) {
          SubVector<BaseFloat> frame(output, i);
          BaseFloat p = frame.Sum();
          if (!(p > 0.0)) {
            KALDI_WARN << "Bad sum of probabilities " << p;
          } else {
            frame.Scale(1.0 / p);  // re-normalize to sum to one.
          }
        }
      }

      if (apply_log) {
        output.ApplyFloor(1.0e-20);  // avoid taking the log of zero.
        output.ApplyLog();
      }
      writer.Write(utt, output);
      num_frames += feats.NumRows();
      num_done++;
    }
#if HAVE_CUDA==1
    CuDevice::Instantiate().PrintProfile();
#endif

    KALDI_LOG << "Processed " << num_done << " feature files, "
              << num_frames << " frames of input.";

    return (num_done == 0 ? 1 : 0);
  } catch(const std::exception &e) {
    std::cerr << e.what() << '\n';
    return -1;
  }
}
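
For reference, a hypothetical invocation might look like the following; the
model file and archive names are placeholders, not files shipped with Kaldi:

  nnet-am-compute --use-gpu=no --divide-by-priors=true --apply-log=true \
      final.mdl "ark:feats.ark" "ark,t:pseudo_loglikes.ark"

With these options the tool would write, for each feature matrix in
feats.ark, a text-format matrix of prior-divided, re-normalized, logged
network outputs to pseudo_loglikes.ark.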