34 const std::string &utt_id,
39 int64 *num_frames_written,
40 int64 *num_egs_written,
46 int32 basic_feat_dim = feat_dim - const_feat_dim;
48 for (
int32 t = 0; t < feats.
NumRows(); t += num_frames) {
49 int32 this_num_frames = std::min(num_frames,
52 int32 tot_frames = left_context + this_num_frames + right_context;
59 for (
int32 j = -left_context;
j < this_num_frames + right_context;
j++) {
64 dest(input_frames,
j + left_context);
65 dest.CopyFromVec(src);
66 if (const_feat_dim > 0) {
70 eg.
spk_info.AddVec(1.0 / tot_frames, src);
73 eg.
labels.resize(this_num_frames);
74 for (
int32 j = 0;
j < this_num_frames;
j++)
78 std::ostringstream os;
79 os << utt_id <<
"-" << t;
81 std::string key = os.str();
83 *num_frames_written += this_num_frames;
84 *num_egs_written += 1;
86 example_writer->
Write(key, eg);
94 int main(
int argc,
char *argv[]) {
96 using namespace kaldi;
99 typedef kaldi::int64 int64;
102 "Get frame-by-frame examples of data for neural network training.\n" 103 "Essentially this is a format change from features and posteriors\n" 104 "into a special frame-by-frame format. To split randomly into\n" 105 "different subsets, do nnet-copy-egs with --random=true, but\n" 106 "note that this does not randomize the order of frames.\n" 108 "Usage: nnet-get-egs [options] <features-rspecifier> " 109 "<pdf-post-rspecifier> <training-examples-out>\n" 111 "An example [where $feats expands to the actual features]:\n" 112 "nnet-get-egs --left-context=8 --right-context=8 \"$feats\" \\\n" 113 " \"ark:gunzip -c exp/nnet/ali.1.gz | ali-to-pdf exp/nnet/1.nnet ark:- ark:- | ali-to-post ark:- ark:- |\" \\\n" 115 "Note: the --left-context and --right-context would be derived from\n" 116 "the output of nnet-info.";
119 int32 left_context = 0, right_context = 0,
120 num_frames = 1, const_feat_dim = 0;
123 po.
Register(
"left-context", &left_context,
"Number of frames of left " 124 "context the neural net requires.");
125 po.
Register(
"right-context", &right_context,
"Number of frames of right " 126 "context the neural net requires.");
127 po.
Register(
"num-frames", &num_frames,
"Number of frames with labels " 128 "that each example contains.");
129 po.
Register(
"const-feat-dim", &const_feat_dim,
"If specified, the last " 130 "const-feat-dim dimensions of the feature input are treated as " 131 "constant over the context window (so are not spliced)");
140 std::string feature_rspecifier = po.
GetArg(1),
141 pdf_post_rspecifier = po.
GetArg(2),
142 examples_wspecifier = po.
GetArg(3);
149 int32 num_done = 0, num_err = 0;
150 int64 num_frames_written = 0, num_egs_written = 0;
152 for (; !feat_reader.
Done(); feat_reader.
Next()) {
153 std::string key = feat_reader.
Key();
155 if (!pdf_post_reader.
HasKey(key)) {
156 KALDI_WARN <<
"No pdf-level posterior for key " << key;
160 if (pdf_post.size() != feats.
NumRows()) {
161 KALDI_WARN <<
"Posterior has wrong size " << pdf_post.size()
162 <<
" versus " << feats.
NumRows();
167 left_context, right_context, num_frames,
168 const_feat_dim, &num_frames_written, &num_egs_written,
174 KALDI_LOG <<
"Finished generating examples, " 175 <<
"successfully processed " << num_done
176 <<
" feature files, wrote " << num_egs_written <<
" examples, " 177 <<
" with " << num_frames_written <<
" egs in total; " 178 << num_err <<
" files had errors.";
179 return (num_egs_written == 0 || num_err > num_done ? 1 : 0);
180 }
catch(
const std::exception &e) {
181 std::cerr << e.what() <<
'\n';
CompressedMatrix input_frames
The input data, with NumRows() >= labels.size() + left_context; it includes features to the left and ...
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
NnetExample is the input data and corresponding label (or labels) for one or more frames of input...
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Base class which provides matrix operations not involving resizing or allocation. ...
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
int32 left_context
The number of frames of left context (we can work out the #frames of right context from input_frames...
A templated class for writing objects to an archive or script file; see The Table concept...
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
const T & Value(const std::string &key)
int main(int argc, char *argv[])
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
bool HasKey(const std::string &key)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
std::vector< std::vector< std::pair< int32, BaseFloat > > > labels
The label(s) for each frame in a sequence of frames; in the normal case, this will be just [ [ (pdf-i...
static void ProcessFile(const MatrixBase< BaseFloat > &feats, const Posterior &pdf_post, const std::string &utt_id, int32 left_context, int32 right_context, int32 num_frames, int32 const_feat_dim, int64 *num_frames_written, int64 *num_egs_written, NnetExampleWriter *example_writer)
Note on how to parse this filename: it contains functions related to neural-net training examples...
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Vector< BaseFloat > spk_info
The speaker-specific input, if any, or an empty vector if we're not using this feature.