nnet-get-egs.cc
Go to the documentation of this file.
1 // nnet2bin/nnet-get-egs.cc
2 
3 // Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
4 // 2014 Vimal Manohar
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include <sstream>
22 
23 #include "base/kaldi-common.h"
24 #include "util/common-utils.h"
25 #include "hmm/transition-model.h"
27 
28 namespace kaldi {
29 namespace nnet2 {
30 
31 
32 static void ProcessFile(const MatrixBase<BaseFloat> &feats,
33  const Posterior &pdf_post,
34  const std::string &utt_id,
35  int32 left_context,
36  int32 right_context,
37  int32 num_frames,
38  int32 const_feat_dim,
39  int64 *num_frames_written,
40  int64 *num_egs_written,
41  NnetExampleWriter *example_writer) {
42  KALDI_ASSERT(feats.NumRows() == static_cast<int32>(pdf_post.size()));
43  int32 feat_dim = feats.NumCols();
44  KALDI_ASSERT(const_feat_dim < feat_dim);
45  KALDI_ASSERT(num_frames > 0);
46  int32 basic_feat_dim = feat_dim - const_feat_dim;
47 
48  for (int32 t = 0; t < feats.NumRows(); t += num_frames) {
49  int32 this_num_frames = std::min(num_frames,
50  feats.NumRows() - t);
51 
52  int32 tot_frames = left_context + this_num_frames + right_context;
53  NnetExample eg;
54  Matrix<BaseFloat> input_frames(tot_frames, basic_feat_dim);
55  eg.left_context = left_context;
56  eg.spk_info.Resize(const_feat_dim);
57 
58  // Set up "input_frames".
59  for (int32 j = -left_context; j < this_num_frames + right_context; j++) {
60  int32 t2 = j + t;
61  if (t2 < 0) t2 = 0;
62  if (t2 >= feats.NumRows()) t2 = feats.NumRows() - 1;
63  SubVector<BaseFloat> src(feats.Row(t2), 0, basic_feat_dim),
64  dest(input_frames, j + left_context);
65  dest.CopyFromVec(src);
66  if (const_feat_dim > 0) {
67  SubVector<BaseFloat> src(feats.Row(t2), basic_feat_dim, const_feat_dim);
68  // set eg.spk_info to the average of the corresponding dimensions of
69  // the input, taken over the frames whose features we store in the eg.
70  eg.spk_info.AddVec(1.0 / tot_frames, src);
71  }
72  }
73  eg.labels.resize(this_num_frames);
74  for (int32 j = 0; j < this_num_frames; j++)
75  eg.labels[j] = pdf_post[t + j];
76  eg.input_frames = input_frames; // Copy to CompressedMatrix.
77 
78  std::ostringstream os;
79  os << utt_id << "-" << t;
80 
81  std::string key = os.str(); // key is <utt_id>-<frame_id>
82 
83  *num_frames_written += this_num_frames;
84  *num_egs_written += 1;
85 
86  example_writer->Write(key, eg);
87  }
88 }
89 
90 
91 } // namespace nnet2
92 } // namespace kaldi
93 
94 int main(int argc, char *argv[]) {
95  try {
96  using namespace kaldi;
97  using namespace kaldi::nnet2;
98  typedef kaldi::int32 int32;
99  typedef kaldi::int64 int64;
100 
101  const char *usage =
102  "Get frame-by-frame examples of data for neural network training.\n"
103  "Essentially this is a format change from features and posteriors\n"
104  "into a special frame-by-frame format. To split randomly into\n"
105  "different subsets, do nnet-copy-egs with --random=true, but\n"
106  "note that this does not randomize the order of frames.\n"
107  "\n"
108  "Usage: nnet-get-egs [options] <features-rspecifier> "
109  "<pdf-post-rspecifier> <training-examples-out>\n"
110  "\n"
111  "An example [where $feats expands to the actual features]:\n"
112  "nnet-get-egs --left-context=8 --right-context=8 \"$feats\" \\\n"
113  " \"ark:gunzip -c exp/nnet/ali.1.gz | ali-to-pdf exp/nnet/1.nnet ark:- ark:- | ali-to-post ark:- ark:- |\" \\\n"
114  " ark:- \n"
115  "Note: the --left-context and --right-context would be derived from\n"
116  "the output of nnet-info.";
117 
118 
119  int32 left_context = 0, right_context = 0,
120  num_frames = 1, const_feat_dim = 0;
121 
122  ParseOptions po(usage);
123  po.Register("left-context", &left_context, "Number of frames of left "
124  "context the neural net requires.");
125  po.Register("right-context", &right_context, "Number of frames of right "
126  "context the neural net requires.");
127  po.Register("num-frames", &num_frames, "Number of frames with labels "
128  "that each example contains.");
129  po.Register("const-feat-dim", &const_feat_dim, "If specified, the last "
130  "const-feat-dim dimensions of the feature input are treated as "
131  "constant over the context window (so are not spliced)");
132 
133  po.Read(argc, argv);
134 
135  if (po.NumArgs() != 3) {
136  po.PrintUsage();
137  exit(1);
138  }
139 
140  std::string feature_rspecifier = po.GetArg(1),
141  pdf_post_rspecifier = po.GetArg(2),
142  examples_wspecifier = po.GetArg(3);
143 
144  // Read in all the training files.
145  SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier);
146  RandomAccessPosteriorReader pdf_post_reader(pdf_post_rspecifier);
147  NnetExampleWriter example_writer(examples_wspecifier);
148 
149  int32 num_done = 0, num_err = 0;
150  int64 num_frames_written = 0, num_egs_written = 0;
151 
152  for (; !feat_reader.Done(); feat_reader.Next()) {
153  std::string key = feat_reader.Key();
154  const Matrix<BaseFloat> &feats = feat_reader.Value();
155  if (!pdf_post_reader.HasKey(key)) {
156  KALDI_WARN << "No pdf-level posterior for key " << key;
157  num_err++;
158  } else {
159  const Posterior &pdf_post = pdf_post_reader.Value(key);
160  if (pdf_post.size() != feats.NumRows()) {
161  KALDI_WARN << "Posterior has wrong size " << pdf_post.size()
162  << " versus " << feats.NumRows();
163  num_err++;
164  continue;
165  }
166  ProcessFile(feats, pdf_post, key,
167  left_context, right_context, num_frames,
168  const_feat_dim, &num_frames_written, &num_egs_written,
169  &example_writer);
170  num_done++;
171  }
172  }
173 
174  KALDI_LOG << "Finished generating examples, "
175  << "successfully processed " << num_done
176  << " feature files, wrote " << num_egs_written << " examples, "
177  << " with " << num_frames_written << " egs in total; "
178  << num_err << " files had errors.";
179  return (num_egs_written == 0 || num_err > num_done ? 1 : 0);
180  } catch(const std::exception &e) {
181  std::cerr << e.what() << '\n';
182  return -1;
183  }
184 }
CompressedMatrix input_frames
The input data, with NumRows() >= labels.size() + left_context; it includes features to the left and ...
Definition: nnet-example.h:49
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
NnetExample is the input data and corresponding label (or labels) for one or more frames of input...
Definition: nnet-example.h:36
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
int32 left_context
The number of frames of left context (we can work out the #frames of right context from input_frames...
Definition: nnet-example.h:53
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
Definition: posterior.h:42
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
const T & Value(const std::string &key)
int main(int argc, char *argv[])
Definition: nnet-get-egs.cc:94
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
bool HasKey(const std::string &key)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
std::vector< std::vector< std::pair< int32, BaseFloat > > > labels
The label(s) for each frame in a sequence of frames; in the normal case, this will be just [ [ (pdf-i...
Definition: nnet-example.h:43
static void ProcessFile(const MatrixBase< BaseFloat > &feats, const Posterior &pdf_post, const std::string &utt_id, int32 left_context, int32 right_context, int32 num_frames, int32 const_feat_dim, int64 *num_frames_written, int64 *num_egs_written, NnetExampleWriter *example_writer)
Definition: nnet-get-egs.cc:32
#define KALDI_LOG
Definition: kaldi-error.h:153
Note on how to parse this filename: it contains functions related to neural-net training examples...
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
Vector< BaseFloat > spk_info
The speaker-specific input, if any, or an empty vector if we're not using these features.
Definition: nnet-example.h:58