doc/nnet3-get-egs_8cc_source.html

 // nnet3bin/nnet3-get-egs.cc

 // Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
 //                2014  Vimal Manohar

 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //  http://www.apache.org/licenses/LICENSE-2.0
 //
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.

 #include <sstream>
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "hmm/transition-model.h"
 #include "hmm/posterior.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-example-utils.h"

 namespace kaldi {
 namespace nnet3 {


 static bool ProcessFile(const GeneralMatrix &feats,
                         const MatrixBase<BaseFloat> *ivector_feats,
                         int32 ivector_period,
                         const Posterior &pdf_post,
                         const std::string &utt_id,
                         bool compress,
                         int32 num_pdfs,
                         int32 length_tolerance,
                         UtteranceSplitter *utt_splitter,
                         NnetExampleWriter *example_writer) {
   int32 num_input_frames = feats.NumRows();
   if (!utt_splitter->LengthsMatch(utt_id, num_input_frames,
                                   static_cast<int32>(pdf_post.size()),
                                   length_tolerance))
     return false;  // LengthsMatch() will have printed a warning.

   std::vector<ChunkTimeInfo> chunks;

   utt_splitter->GetChunksForUtterance(num_input_frames, &chunks);

   if (chunks.empty()) {
     KALDI_WARN << "Not producing egs for utterance " << utt_id
                << " because it is too short: "
                << num_input_frames << " frames.";
   }

   // 'frame_subsampling_factor' is not used in any recipes at the time of
   // writing, this is being supported to unify the code with the 'chain' recipes
   // and in case we need it for some reason in future.
   int32 frame_subsampling_factor =
       utt_splitter->Config().frame_subsampling_factor;

   for (size_t c = 0; c < chunks.size(); c++) {
     const ChunkTimeInfo &chunk = chunks[c];

     int32 tot_input_frames = chunk.left_context + chunk.num_frames +
         chunk.right_context;

     int32 start_frame = chunk.first_frame - chunk.left_context;

     GeneralMatrix input_frames;
     ExtractRowRangeWithPadding(feats, start_frame, tot_input_frames,
                                &input_frames);

     // 'input_frames' now stores the relevant rows (maybe with padding) from the
     // original Matrix or (more likely) CompressedMatrix.  If a CompressedMatrix,
     // it does this without un-compressing and re-compressing, so there is no loss
     // of accuracy.

     NnetExample eg;
     // call the regular input "input".
     eg.io.push_back(NnetIo("input", -chunk.left_context, input_frames));

     if (ivector_feats != NULL) {
       // if applicable, add the iVector feature.
       // choose iVector from a random frame in the chunk
       int32 ivector_frame = RandInt(start_frame,
                                     start_frame + num_input_frames - 1),
           ivector_frame_subsampled = ivector_frame / ivector_period;
       if (ivector_frame_subsampled < 0)
         ivector_frame_subsampled = 0;
       if (ivector_frame_subsampled >= ivector_feats->NumRows())
         ivector_frame_subsampled = ivector_feats->NumRows() - 1;
       Matrix<BaseFloat> ivector(1, ivector_feats->NumCols());
       ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled));
       eg.io.push_back(NnetIo("ivector", 0, ivector));
     }

     // Note: chunk.first_frame and chunk.num_frames will both be
     // multiples of frame_subsampling_factor.
     int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor,
         num_frames_subsampled = chunk.num_frames / frame_subsampling_factor;

     Posterior labels(num_frames_subsampled);

     // TODO: it may be that using these weights is not actually helpful (with
     // chain training, it was not), and that setting them all to 1 is better.
     // We could add a boolean option to this program to control that; but I
     // don't want to add such an option if experiments show that it is not
     // helpful.
     for (int32 i = 0; i < num_frames_subsampled; i++) {
       int32 t = i + start_frame_subsampled;
       if (t < pdf_post.size())
         labels[i] = pdf_post[t];
       for (std::vector<std::pair<int32, BaseFloat> >::iterator
                iter = labels[i].begin(); iter != labels[i].end(); ++iter)
         iter->second *= chunk.output_weights[i];
     }

     eg.io.push_back(NnetIo("output", num_pdfs, 0, labels, frame_subsampling_factor));

     if (compress)
       eg.Compress();

     std::ostringstream os;
     os << utt_id << "-" << chunk.first_frame;

     std::string key = os.str(); // key is <utt_id>-<frame_id>

     example_writer->Write(key, eg);
   }
   return true;
 }

 } // namespace nnet3
 } // namespace kaldi

 // Entry point of nnet3-get-egs: reads features (and optionally online
 // iVectors) sequentially, looks up the pdf-level posterior for each utterance,
 // and writes training examples via ProcessFile().  Utterances with missing or
 // mismatched inputs are skipped with a warning and counted in 'num_err'.
 // Returns utt_splitter.ExitStatus() on normal completion, or -1 if an
 // exception was caught.
 int main(int argc, char *argv[]) {
   try {
     using namespace kaldi;
     using namespace kaldi::nnet3;
     typedef kaldi::int32 int32;
     typedef kaldi::int64 int64;

     const char *usage =
         "Get frame-by-frame examples of data for nnet3 neural network training.\n"
         "Essentially this is a format change from features and posteriors\n"
         "into a special frame-by-frame format.  This program handles the\n"
         "common case where you have some input features, possibly some\n"
         "iVectors, and one set of labels.  If people in future want to\n"
         "do different things they may have to extend this program or create\n"
         "different versions of it for different tasks (the egs format is quite\n"
         "general)\n"
         "\n"
         "Usage:  nnet3-get-egs [options] <features-rspecifier> "
         "<pdf-post-rspecifier> <egs-out>\n"
         "\n"
         "An example [where $feats expands to the actual features]:\n"
         "nnet3-get-egs --num-pdfs=2658 --left-context=12 --right-context=9 --num-frames=8 \"$feats\"\\\n"
         "\"ark:gunzip -c exp/nnet/ali.1.gz | ali-to-pdf exp/nnet/1.nnet ark:- ark:- | ali-to-post ark:- ark:- |\" \\\n"
         "   ark:- \n"
         "See also: nnet3-chain-get-egs, nnet3-get-egs-simple\n";


     bool compress = true;
     // 'length_tolerance' applies to feats vs. iVectors; the separate
     // 'targets_length_tolerance' applies to feats vs. posteriors.
     int32 num_pdfs = -1, length_tolerance = 100,
         targets_length_tolerance = 2,
         online_ivector_period = 1;

     ExampleGenerationConfig eg_config;  // controls num-frames,
                                         // left/right-context, etc.

     std::string online_ivector_rspecifier;

     ParseOptions po(usage);

     po.Register("compress", &compress, "If true, write egs with input features "
                 "in compressed format (recommended).  This is "
                 "only relevant if the features being read are un-compressed; "
                 "if already compressed, we keep the same compressed format when "
                 "dumping egs.");
     po.Register("num-pdfs", &num_pdfs, "Number of pdfs in the acoustic "
                 "model");
     po.Register("ivectors", &online_ivector_rspecifier, "Alias for "
                 "--online-ivectors option, for back compatibility");
     po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of "
                 "ivector features, as a matrix.");
     po.Register("online-ivector-period", &online_ivector_period, "Number of "
                 "frames between iVectors in matrices supplied to the "
                 "--online-ivectors option");
     po.Register("length-tolerance", &length_tolerance, "Tolerance for "
                 "difference in num-frames between feat and ivector matrices");
     po.Register("targets-length-tolerance", &targets_length_tolerance,
                 "Tolerance for "
                 "difference in num-frames (after subsampling) between "
                 "feature matrix and posterior");
     eg_config.Register(&po);

     po.Read(argc, argv);

     if (po.NumArgs() != 3) {
       po.PrintUsage();
       exit(1);
     }

     if (num_pdfs <= 0)
       KALDI_ERR << "--num-pdfs option is required.";

     // The period is used as a divisor when picking an iVector row, so a
     // non-positive value must be rejected up front when iVectors are in use.
     if (!online_ivector_rspecifier.empty() && online_ivector_period <= 0)
       KALDI_ERR << "--online-ivector-period must be positive, got "
                 << online_ivector_period;

     eg_config.ComputeDerived();
     UtteranceSplitter utt_splitter(eg_config);

     std::string feature_rspecifier = po.GetArg(1),
         pdf_post_rspecifier = po.GetArg(2),
         examples_wspecifier = po.GetArg(3);

     // SequentialGeneralMatrixReader can read either a Matrix or
     // CompressedMatrix (or SparseMatrix, but not as relevant here),
     // and it retains the type.  This way, we can generate parts of
     // the feature matrices without uncompressing and re-compressing.
     SequentialGeneralMatrixReader feat_reader(feature_rspecifier);
     RandomAccessPosteriorReader pdf_post_reader(pdf_post_rspecifier);
     NnetExampleWriter example_writer(examples_wspecifier);
     RandomAccessBaseFloatMatrixReader online_ivector_reader(
         online_ivector_rspecifier);

     int32 num_err = 0;

     for (; !feat_reader.Done(); feat_reader.Next()) {
       std::string key = feat_reader.Key();
       const GeneralMatrix &feats = feat_reader.Value();
       if (!pdf_post_reader.HasKey(key)) {
         KALDI_WARN << "No pdf-level posterior for key " << key;
         num_err++;
       } else {
         const Posterior &pdf_post = pdf_post_reader.Value(key);
         const Matrix<BaseFloat> *online_ivector_feats = NULL;
         if (!online_ivector_rspecifier.empty()) {
           if (!online_ivector_reader.HasKey(key)) {
             KALDI_WARN << "No iVectors for utterance " << key;
             num_err++;
             continue;
           } else {
             // this address will be valid until we call HasKey() or Value()
             // again.
             online_ivector_feats = &(online_ivector_reader.Value(key));
           }
         }

         // Skip the utterance if the iVector matrix is empty or its implied
         // length (rows * period) differs too much from the feature length.
         if (online_ivector_feats != NULL &&
             (abs(feats.NumRows() - (online_ivector_feats->NumRows() *
                                     online_ivector_period)) > length_tolerance
              || online_ivector_feats->NumRows() == 0)) {
           KALDI_WARN << "Length difference between feats " << feats.NumRows()
                      << " and iVectors " << online_ivector_feats->NumRows()
                      << " exceeds tolerance " << length_tolerance;
           num_err++;
           continue;
         }

         if (!ProcessFile(feats, online_ivector_feats, online_ivector_period,
                          pdf_post, key, compress, num_pdfs,
                          targets_length_tolerance,
                          &utt_splitter, &example_writer))
           num_err++;
       }
     }
     if (num_err > 0)
       KALDI_WARN << num_err << " utterances had errors and could "
           "not be processed.";
     // utt_splitter prints stats in its destructor.
     return utt_splitter.ExitStatus();
   } catch(const std::exception &e) {
     std::cerr << e.what() << '\n';
     return -1;
   }
 }
kaldi::nnet3::NnetExample
NnetExample is the input data and corresponding label (or labels) for one or more frames of input...
Definition: nnet-example.h:111

nnet-example.h

kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20

kaldi::nnet3::NnetIo
Definition: nnet-example.h:33

kaldi::GeneralMatrix
This class is a wrapper that enables you to store a matrix in one of three forms: either as a Matrix<...
Definition: sparse-matrix.h:282

kaldi::nnet3::UtteranceSplitter::LengthsMatch
bool LengthsMatch(const std::string &utt, int32 utterance_length, int32 supervision_length, int32 length_tolerance=0) const
Definition: nnet-example-utils.cc:553

kaldi::nnet3::ExampleGenerationConfig::frame_subsampling_factor
int32 frame_subsampling_factor
Definition: nnet-example-utils.h:88

kaldi::MatrixBase::NumCols
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67

kaldi::MatrixBase
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49

kaldi::ParseOptions::PrintUsage
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
Definition: parse-options.cc:393

kaldi::SequentialTableReader::Key
std::string Key()
Definition: kaldi-table-inl.h:918

kaldi::nnet3::ChunkTimeInfo::num_frames
int32 num_frames
Definition: nnet-example-utils.h:155

kaldi::nnet3::UtteranceSplitter::ExitStatus
int32 ExitStatus()
Definition: nnet-example-utils.h:198

kaldi::nnet3::UtteranceSplitter
Definition: nnet-example-utils.h:169

kaldi::nnet3::ExampleGenerationConfig
Definition: nnet-example-utils.h:82

kaldi::TableWriter
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368

kaldi::int32
kaldi::int32 int32
Definition: online-tcp-source.cc:27

common-utils.h

kaldi::Matrix< BaseFloat >

posterior.h

kaldi::nnet3::ChunkTimeInfo::left_context
int32 left_context
Definition: nnet-example-utils.h:156

kaldi::TableWriter::Write
void Write(const std::string &key, const T &value) const
Definition: kaldi-table-inl.h:1511

kaldi::nnet3
Definition: dnn3_code_compilation.dox:22

kaldi::ParseOptions::Register
void Register(const std::string &name, bool *ptr, const std::string &doc)
Definition: parse-options.cc:56

kaldi::RandomAccessTableReader
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233

kaldi::Posterior
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
Definition: posterior.h:42

kaldi::ParseOptions
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36

kaldi::MatrixBase::Row
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188

kaldi::RandomAccessTableReader::Value
const T & Value(const std::string &key)
Definition: kaldi-table-inl.h:2561

transition-model.h

kaldi::nnet3::ProcessFile
static bool ProcessFile(const discriminative::SplitDiscriminativeSupervisionOptions &config, const TransitionModel &tmodel, const MatrixBase< BaseFloat > &feats, const MatrixBase< BaseFloat > *ivector_feats, int32 ivector_period, const discriminative::DiscriminativeSupervision &supervision, const std::string &utt_id, bool compress, UtteranceSplitter *utt_splitter, NnetDiscriminativeExampleWriter *example_writer)
Definition: nnet3-discriminative-get-egs.cc:39

kaldi::SequentialTableReader
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287

nnet-example-utils.h

kaldi::nnet3::ExampleGenerationConfig::Register
void Register(OptionsItf *po)
Definition: nnet-example-utils.h:110

kaldi::ParseOptions::Read
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
Definition: parse-options.cc:311

kaldi::SequentialTableReader::Done
bool Done()
Definition: kaldi-table-inl.h:948

kaldi::ExtractRowRangeWithPadding
void ExtractRowRangeWithPadding(const GeneralMatrix &in, int32 row_offset, int32 num_rows, GeneralMatrix *out)
This function extracts a row-range of a GeneralMatrix and writes as a GeneralMatrix containing the sa...
Definition: sparse-matrix.cc:1233

KALDI_ERR
#define KALDI_ERR
Definition: kaldi-error.h:147

kaldi::nnet3::NnetExample::Compress
void Compress()
Compresses any (input) features that are not sparse.
Definition: nnet-example.cc:130

kaldi::ParseOptions::GetArg
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
Definition: parse-options.cc:202

KALDI_WARN
#define KALDI_WARN
Definition: kaldi-error.h:150

kaldi::SequentialTableReader::Next
void Next()
Definition: kaldi-table-inl.h:942

kaldi::RandomAccessTableReader::HasKey
bool HasKey(const std::string &key)
Definition: kaldi-table-inl.h:2551

kaldi::nnet3::UtteranceSplitter::Config
const ExampleGenerationConfig & Config() const
Definition: nnet-example-utils.h:175

rnnlm::i
int i
Definition: mikolov-rnnlm-lib.cc:66

kaldi::ParseOptions::NumArgs
int NumArgs() const
Number of positional parameters (c.f. argc-1).
Definition: parse-options.cc:198

kaldi::SequentialTableReader::Value
T & Value()
Definition: kaldi-table-inl.h:934

kaldi::MatrixBase::NumRows
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64

kaldi::GeneralMatrix::NumRows
MatrixIndexT NumRows() const
Definition: sparse-matrix.cc:781

kaldi::nnet3::ChunkTimeInfo::first_frame
int32 first_frame
Definition: nnet-example-utils.h:154

kaldi::nnet3::UtteranceSplitter::GetChunksForUtterance
void GetChunksForUtterance(int32 utterance_length, std::vector< ChunkTimeInfo > *chunk_info)
Definition: nnet-example-utils.cc:822

kaldi::nnet3::ChunkTimeInfo
struct ChunkTimeInfo is used by class UtteranceSplitter to output information about how we split an u...
Definition: nnet-example-utils.h:153

kaldi::nnet3::NnetExample::io
std::vector< NnetIo > io
"io" contains the input and output.
Definition: nnet-example.h:116

kaldi::nnet3::ChunkTimeInfo::right_context
int32 right_context
Definition: nnet-example-utils.h:157

main
int main(int argc, char *argv[])
Definition: nnet3-get-egs.cc:140

kaldi-common.h

kaldi::nnet3::ChunkTimeInfo::output_weights
std::vector< BaseFloat > output_weights
Definition: nnet-example-utils.h:165

kaldi::RandInt
int32 RandInt(int32 min_val, int32 max_val, struct RandomState *state)
Definition: kaldi-math.cc:95

kaldi::nnet3::ExampleGenerationConfig::ComputeDerived
void ComputeDerived()
This function decodes 'num_frames_str' into 'num_frames', and ensures that the members of 'num_frames...
Definition: nnet-example-utils.cc:302