online2-wav-dump-features.cc File Reference
Include dependency graph for online2-wav-dump-features.cc:

Go to the source code of this file.

Functions

int main (int argc, char *argv[])
 

Function Documentation

◆ main()

int main ( int  argc,
char *  argv[] 
)

Definition at line 30 of file online2-wav-dump-features.cc.

References WaveData::Data(), VectorBase< Real >::Dim(), SequentialTableReader< Holder >::Done(), ParseOptions::GetArg(), RandomAccessTableReader< Holder >::HasKey(), rnnlm::i, OnlineNnet2FeaturePipelineInfo::ivector_extractor_info, OnlineNnet2FeaturePipelineInfo::IvectorDim(), KALDI_LOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), kaldi::kUndefined, SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), ParseOptions::PrintUsage(), ParseOptions::Read(), ParseOptions::Register(), OnlineNnet2FeaturePipelineConfig::Register(), MatrixBase< Real >::Row(), WaveData::SampFreq(), RandomAccessTableReader< Holder >::Value(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

30  {
31  try {
32  using namespace kaldi;
33  using namespace fst;
34 
35  typedef kaldi::int32 int32;
36  typedef kaldi::int64 int64;
37 
38  const char *usage =
39  "Reads in wav file(s) and processes them as in online2-wav-nnet2-latgen-faster,\n"
40  "but instead of decoding, dumps the features. Most of the parameters\n"
41  "are set via configuration variables.\n"
42  "\n"
43  "Usage: online2-wav-dump-features [options] <spk2utt-rspecifier> <wav-rspecifier> <feature-wspecifier>\n"
44  "The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n"
45  "you want to generate features utterance by utterance.\n"
46  "Alternate usage: online2-wav-dump-features [options] --print-ivector-dim=true\n"
47  "See steps/online/nnet2/{dump_nnet_activations,get_egs.sh} for examples.\n";
48 
49  ParseOptions po(usage);
50 
51  // feature_config includes configuration for the iVector adaptation,
52  // as well as the basic features.
53  OnlineNnet2FeaturePipelineConfig feature_config;
54  BaseFloat chunk_length_secs = 0.05;
55  bool print_ivector_dim = false;
56 
57  po.Register("chunk-length", &chunk_length_secs,
58  "Length of chunk size in seconds, that we process.");
59  po.Register("print-ivector-dim", &print_ivector_dim,
60  "If true, print iVector dimension (possibly zero) and exit. This "
61  "version requires no arguments.");
62 
63  feature_config.Register(&po);
64 
65  po.Read(argc, argv);
66 
67  if (!print_ivector_dim && po.NumArgs() != 3) {
68  po.PrintUsage();
69  return 1;
70  }
71 
72  OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
73 
74  if (print_ivector_dim) {
75  std::cout << feature_info.IvectorDim() << std::endl;
76  exit(0);
77  }
78 
79  std::string spk2utt_rspecifier = po.GetArg(1),
80  wav_rspecifier = po.GetArg(2),
81  feats_wspecifier = po.GetArg(3);
82 
83 
84  int32 num_done = 0, num_err = 0;
85  int64 num_frames_tot = 0;
86 
87  SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
88  RandomAccessTableReader<WaveHolder> wav_reader(wav_rspecifier);
89  BaseFloatMatrixWriter feats_writer(feats_wspecifier);
90 
91  for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
92  std::string spk = spk2utt_reader.Key();
93  const std::vector<std::string> &uttlist = spk2utt_reader.Value();
94  OnlineIvectorExtractorAdaptationState adaptation_state(
95  feature_info.ivector_extractor_info);
96  for (size_t i = 0; i < uttlist.size(); i++) {
97  std::string utt = uttlist[i];
98  if (!wav_reader.HasKey(utt)) {
99  KALDI_WARN << "Did not find audio for utterance " << utt;
100  num_err++;
101  continue;
102  }
103  const WaveData &wave_data = wav_reader.Value(utt);
104  // get the data for channel zero (if the signal is not mono, we only
105  // take the first channel).
106  SubVector<BaseFloat> data(wave_data.Data(), 0);
107 
108  OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
109  feature_pipeline.SetAdaptationState(adaptation_state);
110 
111  std::vector<Vector<BaseFloat> *> feature_data;
112 
113  // We retrieve data from the feature pipeline while adding the wav data bit
114  // by bit... for features like pitch features, this may make a
115  // difference to what we get, and we want to make sure that the data we
116  // get is exactly compatible with online decoding.
117 
118  BaseFloat samp_freq = wave_data.SampFreq();
119  int32 chunk_length = int32(samp_freq * chunk_length_secs);
120  if (chunk_length == 0) chunk_length = 1;
121 
122  int32 samp_offset = 0;
123  while (samp_offset < data.Dim()) {
124  int32 samp_remaining = data.Dim() - samp_offset;
125  int32 num_samp = chunk_length < samp_remaining ? chunk_length
126  : samp_remaining;
127 
128  SubVector<BaseFloat> wave_part(data, samp_offset, num_samp);
129  feature_pipeline.AcceptWaveform(samp_freq, wave_part);
130  samp_offset += num_samp;
131  if (samp_offset == data.Dim()) // no more input. flush out last frames
132  feature_pipeline.InputFinished();
133 
134  while (static_cast<int32>(feature_data.size()) <
135  feature_pipeline.NumFramesReady()) {
136  int32 t = static_cast<int32>(feature_data.size());
137  feature_data.push_back(new Vector<BaseFloat>(feature_pipeline.Dim(),
138  kUndefined));
139  feature_pipeline.GetFrame(t, feature_data.back());
140  }
141  }
142  int32 T = static_cast<int32>(feature_data.size());
143  if (T == 0) {
144  KALDI_WARN << "Got no frames of data for utterance " << utt;
145  num_err++;
146  continue;
147  }
148  Matrix<BaseFloat> feats(T, feature_pipeline.Dim());
149  for (int32 t = 0; t < T; t++) {
150  feats.Row(t).CopyFromVec(*(feature_data[t]));
151  delete feature_data[t];
152  }
153  num_frames_tot += T;
154  feats_writer.Write(utt, feats);
155  feature_pipeline.GetAdaptationState(&adaptation_state);
156  num_done++;
157  }
158  }
159  KALDI_LOG << "Processed " << num_done << " utterances, "
160  << num_err << " with errors; " << num_frames_tot
161  << " frames in total.";
162  return (num_done != 0 ? 0 : 1);
163  } catch(const std::exception& e) {
164  std::cerr << e.what();
165  return -1;
166  }
167 } // main()
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
This configuration class is to set up OnlineNnet2FeaturePipelineInfo, which in turn is the configurat...
For an extended explanation of the framework of which grammar-fsts are a part, please see Support for...
Definition: graph.dox:21
This class stores the adaptation state from the online iVector extractor, which can help you to initi...
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
BaseFloat SampFreq() const
Definition: wave-reader.h:126
const Matrix< BaseFloat > & Data() const
Definition: wave-reader.h:124
This class is responsible for storing configuration variables, objects and options for OnlineNnet2Fea...
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
#define KALDI_WARN
Definition: kaldi-error.h:150
This class's purpose is to read in Wave files.
Definition: wave-reader.h:106
A class representing a vector.
Definition: kaldi-vector.h:406
OnlineNnet2FeaturePipeline is a class that's responsible for putting together the various parts of th...
#define KALDI_LOG
Definition: kaldi-error.h:153
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501