28 using namespace kaldi;
30 "Create MFCC feature files.\n" 31 "Usage: compute-mfcc-feats [options...] <wav-rspecifier> " 32 "<feats-wspecifier>\n";
38 bool subtract_mean =
false;
40 std::string vtln_map_rspecifier;
41 std::string utt2spk_rspecifier;
44 std::string output_format =
"kaldi";
45 std::string utt2dur_wspecifier;
51 po.Register(
"output-format", &output_format,
"Format of the output " 52 "files [kaldi, htk]");
53 po.Register(
"subtract-mean", &subtract_mean,
"Subtract mean of each " 54 "feature file [CMS]; not recommended to do it this way. ");
55 po.Register(
"vtln-warp", &vtln_warp,
"Vtln warp factor (only applicable " 56 "if vtln-map not specified)");
57 po.Register(
"vtln-map", &vtln_map_rspecifier,
"Map from utterance or " 58 "speaker-id to vtln warp factor (rspecifier)");
59 po.Register(
"utt2spk", &utt2spk_rspecifier,
"Utterance to speaker-id map " 60 "rspecifier (if doing VTLN and you have warps per speaker)");
61 po.Register(
"channel", &channel,
"Channel to extract (-1 -> expect mono, " 62 "0 -> left, 1 -> right)");
63 po.Register(
"min-duration", &min_duration,
"Minimum duration of segments " 64 "to process (in seconds).");
65 po.Register(
"write-utt2dur", &utt2dur_wspecifier,
"Wspecifier to write " 66 "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'.");
70 if (po.NumArgs() != 2) {
75 std::string wav_rspecifier = po.GetArg(1);
77 std::string output_wspecifier = po.GetArg(2);
81 if (utt2spk_rspecifier !=
"" && vtln_map_rspecifier ==
"")
82 KALDI_ERR << (
"The --utt2spk option is only needed if " 83 "the --vtln-map option is used.");
91 if (output_format ==
"kaldi") {
92 if (!kaldi_writer.
Open(output_wspecifier))
93 KALDI_ERR <<
"Could not initialize output with wspecifier " 95 }
else if (output_format ==
"htk") {
96 if (!htk_writer.
Open(output_wspecifier))
97 KALDI_ERR <<
"Could not initialize output with wspecifier " 100 KALDI_ERR <<
"Invalid output_format string " << output_format;
105 int32 num_utts = 0, num_success = 0;
106 for (; !reader.Done(); reader.Next()) {
108 std::string utt = reader.Key();
109 const WaveData &wave_data = reader.Value();
110 if (wave_data.
Duration() < min_duration) {
111 KALDI_WARN <<
"File: " << utt <<
" is too short (" 112 << wave_data.
Duration() <<
" sec): producing no output.";
122 KALDI_WARN <<
"Channel not specified but you have data with " 123 << num_chan <<
" channels; defaulting to zero";
125 if (this_chan >= num_chan) {
126 KALDI_WARN <<
"File with id " << utt <<
" has " 127 << num_chan <<
" channels but you specified channel " 128 << channel <<
", producing no output.";
134 if (vtln_map_rspecifier !=
"") {
135 if (!vtln_map_reader.HasKey(utt)) {
136 KALDI_WARN <<
"No vtln-map entry for utterance-id (or speaker-id) " 140 vtln_warp_local = vtln_map_reader.Value(utt);
142 vtln_warp_local = vtln_warp;
148 mfcc.ComputeFeatures(waveform, wave_data.
SampFreq(),
149 vtln_warp_local, &features);
151 KALDI_WARN <<
"Failed to compute features for utterance " << utt;
157 mean.Scale(1.0 / features.NumRows());
158 for (
int32 i = 0;
i < features.NumRows();
i++)
159 features.Row(
i).
AddVec(-1.0, mean);
161 if (output_format ==
"kaldi") {
162 kaldi_writer.
Write(utt, features);
164 std::pair<Matrix<BaseFloat>,
HtkHeader> p;
165 p.first.Resize(features.NumRows(), features.NumCols());
166 p.first.CopyFromMat(features);
170 static_cast<int16
>(
sizeof(
float)*(features.NumCols())),
171 static_cast<uint16
>( 006 |
175 htk_writer.
Write(utt, p);
177 if (utt2dur_writer.IsOpen()) {
178 utt2dur_writer.Write(utt, wave_data.
Duration());
180 if (num_utts % 10 == 0)
181 KALDI_LOG <<
"Processed " << num_utts <<
" utterances";
182 KALDI_VLOG(2) <<
"Processed features for key " << utt;
185 KALDI_LOG <<
" Done " << num_success <<
" out of " << num_utts
187 return (num_success != 0 ? 0 : 1);
188 }
catch(
const std::exception &e) {
189 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
bool Open(const std::string &wspecifier)
void AddRowSumMat(Real alpha, const MatrixBase< Real > &M, Real beta=1.0)
Does *this = alpha * (sum of rows of M) + beta * *this.
MfccOptions contains basic options for computing MFCC features.
This class is for when you are reading something in random access, but it may actually be stored per-...
A templated class for writing objects to an archive or script file; see The Table concept...
BaseFloat SampFreq() const
const Matrix< BaseFloat > & Data() const
void Write(const std::string &key, const T &value) const
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
This class's purpose is to read in Wave files.
A class representing a vector.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
void Register(OptionsItf *opts)
BaseFloat Duration() const
This templated class is intended for offline feature extraction, i.e. where you have access to the entire signal at the start.
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...