29 using namespace kaldi;
31 "Create Mel-filter bank (FBANK) feature files.\n" 32 "Usage: compute-fbank-feats [options...] <wav-rspecifier> " 33 "<feats-wspecifier>\n";
39 bool subtract_mean =
false;
41 std::string vtln_map_rspecifier;
42 std::string utt2spk_rspecifier;
45 std::string output_format =
"kaldi";
46 std::string utt2dur_wspecifier;
51 po.Register(
"output-format", &output_format,
52 "Format of the output files [kaldi, htk]");
53 po.Register(
"subtract-mean", &subtract_mean,
"Subtract mean of each " 54 "feature file [CMS]; not recommended to do it this way. ");
55 po.Register(
"vtln-warp", &vtln_warp,
56 "Vtln warp factor (only applicable if vtln-map not specified)");
57 po.Register(
"vtln-map", &vtln_map_rspecifier,
"Map from utterance or " 58 "speaker-id to vtln warp factor (rspecifier)");
59 po.Register(
"utt2spk", &utt2spk_rspecifier,
"Utterance to speaker-id map " 60 "(if doing VTLN and you have warps per speaker)");
61 po.Register(
"channel", &channel,
"Channel to extract (-1 -> expect mono, " 62 "0 -> left, 1 -> right)");
63 po.Register(
"min-duration", &min_duration,
"Minimum duration of segments " 64 "to process (in seconds).");
65 po.Register(
"write-utt2dur", &utt2dur_wspecifier,
"Wspecifier to write " 66 "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'.");
70 if (po.NumArgs() != 2) {
75 std::string wav_rspecifier = po.GetArg(1);
77 std::string output_wspecifier = po.GetArg(2);
79 Fbank fbank(fbank_opts);
81 if (utt2spk_rspecifier !=
"" && vtln_map_rspecifier !=
"")
82 KALDI_ERR << (
"The --utt2spk option is only needed if " 83 "the --vtln-map option is used.");
91 if (output_format ==
"kaldi") {
92 if (!kaldi_writer.
Open(output_wspecifier))
93 KALDI_ERR <<
"Could not initialize output with wspecifier " 95 }
else if (output_format ==
"htk") {
96 if (!htk_writer.
Open(output_wspecifier))
97 KALDI_ERR <<
"Could not initialize output with wspecifier " 100 KALDI_ERR <<
"Invalid output_format string " << output_format;
105 int32 num_utts = 0, num_success = 0;
106 for (; !reader.Done(); reader.Next()) {
108 std::string utt = reader.Key();
109 const WaveData &wave_data = reader.Value();
110 if (wave_data.
Duration() < min_duration) {
111 KALDI_WARN <<
"File: " << utt <<
" is too short (" 112 << wave_data.
Duration() <<
" sec): producing no output.";
122 KALDI_WARN <<
"Channel not specified but you have data with " 123 << num_chan <<
" channels; defaulting to zero";
125 if (this_chan >= num_chan) {
126 KALDI_WARN <<
"File with id " << utt <<
" has " 127 << num_chan <<
" channels but you specified channel " 128 << channel <<
", producing no output.";
134 if (vtln_map_rspecifier !=
"") {
135 if (!vtln_map_reader.HasKey(utt)) {
136 KALDI_WARN <<
"No vtln-map entry for utterance-id (or speaker-id) " 140 vtln_warp_local = vtln_map_reader.Value(utt);
142 vtln_warp_local = vtln_warp;
148 fbank.ComputeFeatures(waveform, wave_data.
SampFreq(),
149 vtln_warp_local, &features);
151 KALDI_WARN <<
"Failed to compute features for utterance " << utt;
157 mean.Scale(1.0 / features.NumRows());
158 for (
int32 i = 0;
i < features.NumRows();
i++)
159 features.Row(
i).
AddVec(-1.0, mean);
161 if (output_format ==
"kaldi") {
162 kaldi_writer.
Write(utt, features);
164 std::pair<Matrix<BaseFloat>,
HtkHeader> p;
165 p.first.Resize(features.NumRows(), features.NumCols());
166 p.first.CopyFromMat(features);
170 static_cast<int16
>(
sizeof(
float)*features.NumCols()),
171 static_cast<uint16>(007 |
175 htk_writer.
Write(utt, p);
177 if (utt2dur_writer.IsOpen()) {
178 utt2dur_writer.Write(utt, wave_data.
Duration());
180 if (num_utts % 10 == 0)
181 KALDI_LOG <<
"Processed " << num_utts <<
" utterances";
182 KALDI_VLOG(2) <<
"Processed features for key " << utt;
185 KALDI_LOG <<
" Done " << num_success <<
" out of " << num_utts
187 return (num_success != 0 ? 0 : 1);
188 }
catch(
const std::exception &e) {
189 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for...
bool Open(const std::string &wspecifier)
void Register(OptionsItf *opts)
void AddRowSumMat(Real alpha, const MatrixBase< Real > &M, Real beta=1.0)
Does *this = alpha * (sum of rows of M) + beta * *this.
This class is for when you are reading something in random access, but it may actually be stored per-...
A templated class for writing objects to an archive or script file; see The Table concept...
BaseFloat SampFreq() const
const Matrix< BaseFloat > & Data() const
void Write(const std::string &key, const T &value) const
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
This class's purpose is to read in Wave files.
A class representing a vector.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
BaseFloat Duration() const
This templated class is intended for offline feature extraction, i.e. where you have access to the entire signal at the start.
FbankOptions contains basic options for computing filterbank features.
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector: *this = *this + alpha * v (with casting between floats and doubles) ...
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...