27 int main(
int argc,
char *argv[]) {
29 using namespace kaldi;
31 "Create Mel-filter bank (FBANK) feature files.\n" 32 "Usage: compute-fbank-feats [options...] <wav-rspecifier> " 33 "<feats-wspecifier>\n";
39 bool subtract_mean =
false;
41 std::string vtln_map_rspecifier;
42 std::string utt2spk_rspecifier;
45 std::string output_format =
"kaldi";
46 std::string utt2dur_wspecifier;
51 po.
Register(
"output-format", &output_format,
52 "Format of the output files [kaldi, htk]");
53 po.
Register(
"subtract-mean", &subtract_mean,
"Subtract mean of each " 54 "feature file [CMS]; not recommended to do it this way. ");
56 "Vtln warp factor (only applicable if vtln-map not specified)");
57 po.
Register(
"vtln-map", &vtln_map_rspecifier,
"Map from utterance or " 58 "speaker-id to vtln warp factor (rspecifier)");
59 po.
Register(
"utt2spk", &utt2spk_rspecifier,
"Utterance to speaker-id map " 60 "(if doing VTLN and you have warps per speaker)");
61 po.
Register(
"channel", &channel,
"Channel to extract (-1 -> expect mono, " 62 "0 -> left, 1 -> right)");
63 po.
Register(
"min-duration", &min_duration,
"Minimum duration of segments " 64 "to process (in seconds).");
65 po.
Register(
"write-utt2dur", &utt2dur_wspecifier,
"Wspecifier to write " 66 "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'.");
75 std::string wav_rspecifier = po.
GetArg(1);
77 std::string output_wspecifier = po.
GetArg(2);
79 Fbank fbank(fbank_opts);
81 if (utt2spk_rspecifier !=
"" && vtln_map_rspecifier !=
"")
82 KALDI_ERR << (
"The --utt2spk option is only needed if " 83 "the --vtln-map option is used.");
91 if (output_format ==
"kaldi") {
92 if (!kaldi_writer.
Open(output_wspecifier))
93 KALDI_ERR <<
"Could not initialize output with wspecifier " 95 }
else if (output_format ==
"htk") {
96 if (!htk_writer.
Open(output_wspecifier))
97 KALDI_ERR <<
"Could not initialize output with wspecifier " 100 KALDI_ERR <<
"Invalid output_format string " << output_format;
105 int32 num_utts = 0, num_success = 0;
106 for (; !reader.
Done(); reader.
Next()) {
108 std::string utt = reader.
Key();
110 if (wave_data.
Duration() < min_duration) {
111 KALDI_WARN <<
"File: " << utt <<
" is too short (" 112 << wave_data.
Duration() <<
" sec): producing no output.";
122 KALDI_WARN <<
"Channel not specified but you have data with " 123 << num_chan <<
" channels; defaulting to zero";
125 if (this_chan >= num_chan) {
126 KALDI_WARN <<
"File with id " << utt <<
" has " 127 << num_chan <<
" channels but you specified channel " 128 << channel <<
", producing no output.";
134 if (vtln_map_rspecifier !=
"") {
135 if (!vtln_map_reader.HasKey(utt)) {
136 KALDI_WARN <<
"No vtln-map entry for utterance-id (or speaker-id) " 140 vtln_warp_local = vtln_map_reader.Value(utt);
142 vtln_warp_local = vtln_warp;
149 vtln_warp_local, &features);
151 KALDI_WARN <<
"Failed to compute features for utterance " << utt;
157 mean.Scale(1.0 / features.NumRows());
158 for (
int32 i = 0;
i < features.NumRows();
i++)
159 features.Row(
i).
AddVec(-1.0, mean);
161 if (output_format ==
"kaldi") {
162 kaldi_writer.
Write(utt, features);
164 std::pair<Matrix<BaseFloat>,
HtkHeader> p;
165 p.first.Resize(features.NumRows(), features.NumCols());
166 p.first.CopyFromMat(features);
170 static_cast<int16
>(
sizeof(
float)*features.NumCols()),
171 static_cast<uint16>(007 |
175 htk_writer.
Write(utt, p);
177 if (utt2dur_writer.
IsOpen()) {
180 if (num_utts % 10 == 0)
181 KALDI_LOG <<
"Processed " << num_utts <<
" utterances";
182 KALDI_VLOG(2) <<
"Processed features for key " << utt;
185 KALDI_LOG <<
" Done " << num_success <<
" out of " << num_utts
187 return (num_success != 0 ? 0 : 1);
188 }
catch(
const std::exception &e) {
189 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
bool Open(const std::string &wspecifier)
void Register(OptionsItf *opts)
void AddRowSumMat(Real alpha, const MatrixBase< Real > &M, Real beta=1.0)
Does *this = alpha * (sum of rows of M) + beta * *this.
void ComputeFeatures(const VectorBase< BaseFloat > &wave, BaseFloat sample_freq, BaseFloat vtln_warp, Matrix< BaseFloat > *output)
Computes the features for one file (one sequence of features).
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
This class is for when you are reading something in random access, but it may actually be stored per-...
A templated class for writing objects to an archive or script file; see The Table concept...
BaseFloat SampFreq() const
const Matrix< BaseFloat > & Data() const
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
This class's purpose is to read in Wave files.
int NumArgs() const
Number of positional parameters (c.f. argc-1).
int main(int argc, char *argv[])
A class representing a vector.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
BaseFloat Duration() const
This templated class is intended for offline feature extraction, i.e.
FbankOptions contains basic options for computing filterbank features.
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...