compute-spectrogram-feats.cc
Go to the documentation of this file.
1 // featbin/compute-spectrogram-feats.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include "base/kaldi-common.h"
22 #include "feat/wave-reader.h"
23 #include "util/common-utils.h"
24 
25 
26 int main(int argc, char *argv[]) {
27  try {
28  using namespace kaldi;
29  const char *usage =
30  "Create spectrogram feature files.\n"
31  "Usage: compute-spectrogram-feats [options...] <wav-rspecifier> "
32  "<feats-wspecifier>\n";
33 
34  // Construct all the global objects.
35  ParseOptions po(usage);
36  SpectrogramOptions spec_opts;
37  // Define defaults for global options.
38  bool subtract_mean = false;
39  int32 channel = -1;
40  BaseFloat min_duration = 0.0;
41  std::string output_format = "kaldi";
42  std::string utt2dur_wspecifier;
43 
44  // Register the option struct
45  spec_opts.Register(&po);
46  // Register the options
47  po.Register("output-format", &output_format,
48  "Format of the output files [kaldi, htk]");
49  po.Register("subtract-mean", &subtract_mean, "Subtract mean of each "
50  "feature file [CMS]; not recommended to do it this way. ");
51  po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, "
52  "0 -> left, 1 -> right)");
53  po.Register("min-duration", &min_duration, "Minimum duration of segments "
54  "to process (in seconds).");
55  po.Register("write-utt2dur", &utt2dur_wspecifier, "Wspecifier to write "
56  "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'.");
57 
58  po.Read(argc, argv);
59 
60  if (po.NumArgs() != 2) {
61  po.PrintUsage();
62  exit(1);
63  }
64 
65  std::string wav_rspecifier = po.GetArg(1);
66 
67  std::string output_wspecifier = po.GetArg(2);
68 
69  Spectrogram spec(spec_opts);
70 
71  SequentialTableReader<WaveHolder> reader(wav_rspecifier);
72  BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter<something>.
74 
75  if (output_format == "kaldi") {
76  if (!kaldi_writer.Open(output_wspecifier))
77  KALDI_ERR << "Could not initialize output with wspecifier "
78  << output_wspecifier;
79  } else if (output_format == "htk") {
80  if (!htk_writer.Open(output_wspecifier))
81  KALDI_ERR << "Could not initialize output with wspecifier "
82  << output_wspecifier;
83  } else {
84  KALDI_ERR << "Invalid output_format string " << output_format;
85  }
86 
87  DoubleWriter utt2dur_writer(utt2dur_wspecifier);
88 
89  int32 num_utts = 0, num_success = 0;
90  for (; !reader.Done(); reader.Next()) {
91  num_utts++;
92  std::string utt = reader.Key();
93  const WaveData &wave_data = reader.Value();
94  if (wave_data.Duration() < min_duration) {
95  KALDI_WARN << "File: " << utt << " is too short ("
96  << wave_data.Duration() << " sec): producing no output.";
97  continue;
98  }
99  int32 num_chan = wave_data.Data().NumRows(), this_chan = channel;
100  { // This block works out the channel (0=left, 1=right...)
101  KALDI_ASSERT(num_chan > 0); // should have been caught in
102  // reading code if no channels.
103  if (channel == -1) {
104  this_chan = 0;
105  if (num_chan != 1)
106  KALDI_WARN << "Channel not specified but you have data with "
107  << num_chan << " channels; defaulting to zero";
108  } else {
109  if (this_chan >= num_chan) {
110  KALDI_WARN << "File with id " << utt << " has "
111  << num_chan << " channels but you specified channel "
112  << channel << ", producing no output.";
113  continue;
114  }
115  }
116  }
117 
118  SubVector<BaseFloat> waveform(wave_data.Data(), this_chan);
119  Matrix<BaseFloat> features;
120  try {
121  spec.ComputeFeatures(waveform, wave_data.SampFreq(), 1.0, &features);
122  } catch (...) {
123  KALDI_WARN << "Failed to compute features for utterance " << utt;
124  continue;
125  }
126  if (subtract_mean) {
127  Vector<BaseFloat> mean(features.NumCols());
128  mean.AddRowSumMat(1.0, features);
129  mean.Scale(1.0 / features.NumRows());
130  for (int32 i = 0; i < features.NumRows(); i++)
131  features.Row(i).AddVec(-1.0, mean);
132  }
133  if (output_format == "kaldi") {
134  kaldi_writer.Write(utt, features);
135  } else {
136  std::pair<Matrix<BaseFloat>, HtkHeader> p;
137  p.first.Resize(features.NumRows(), features.NumCols());
138  p.first.CopyFromMat(features);
139  int32 frame_shift = spec_opts.frame_opts.frame_shift_ms * 10000;
140  HtkHeader header = {
141  features.NumRows(),
142  frame_shift,
143  static_cast<int16>(sizeof(float)*features.NumCols()),
144  007 | 020000
145  };
146  p.second = header;
147  htk_writer.Write(utt, p);
148  }
149  if (utt2dur_writer.IsOpen()) {
150  utt2dur_writer.Write(utt, wave_data.Duration());
151  }
152  if(num_utts % 10 == 0)
153  KALDI_LOG << "Processed " << num_utts << " utterances";
154  KALDI_VLOG(2) << "Processed features for key " << utt;
155  num_success++;
156  }
157  KALDI_LOG << " Done " << num_success << " out of " << num_utts
158  << " utterances.";
159  return (num_success != 0 ? 0 : 1);
160  } catch(const std::exception& e) {
161  std::cerr << e.what();
162  return -1;
163  }
164 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool Open(const std::string &wspecifier)
void AddRowSumMat(Real alpha, const MatrixBase< Real > &M, Real beta=1.0)
Does *this = alpha * (sum of rows of M) + beta * *this.
void ComputeFeatures(const VectorBase< BaseFloat > &wave, BaseFloat sample_freq, BaseFloat vtln_warp, Matrix< BaseFloat > *output)
Computes the features for one file (one sequence of features).
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
BaseFloat SampFreq() const
Definition: wave-reader.h:126
void Register(OptionsItf *opts)
const Matrix< BaseFloat > & Data() const
Definition: wave-reader.h:124
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
This class's purpose is to read in Wave files.
Definition: wave-reader.h:106
int NumArgs() const
Number of positional parameters (c.f. argc-1).
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
BaseFloat Duration() const
Definition: wave-reader.h:129
This templated class is intended for offline feature extraction, i.e.
SpectrogramOptions contains basic options for computing spectrogram features.
#define KALDI_LOG
Definition: kaldi-error.h:153
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * v (with casting between floats and doubles) ...
int main(int argc, char *argv[])
A structure containing the HTK header.
Definition: kaldi-matrix.h:955
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
FrameExtractionOptions frame_opts