26 int main(
int argc,
char *argv[]) {
28 using namespace kaldi;
30 "Create spectrogram feature files.\n" 31 "Usage: compute-spectrogram-feats [options...] <wav-rspecifier> " 32 "<feats-wspecifier>\n";
38 bool subtract_mean =
false;
41 std::string output_format =
"kaldi";
42 std::string utt2dur_wspecifier;
47 po.
Register(
"output-format", &output_format,
48 "Format of the output files [kaldi, htk]");
49 po.
Register(
"subtract-mean", &subtract_mean,
"Subtract mean of each " 50 "feature file [CMS]; not recommended to do it this way. ");
51 po.
Register(
"channel", &channel,
"Channel to extract (-1 -> expect mono, " 52 "0 -> left, 1 -> right)");
53 po.
Register(
"min-duration", &min_duration,
"Minimum duration of segments " 54 "to process (in seconds).");
55 po.
Register(
"write-utt2dur", &utt2dur_wspecifier,
"Wspecifier to write " 56 "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'.");
65 std::string wav_rspecifier = po.
GetArg(1);
67 std::string output_wspecifier = po.
GetArg(2);
75 if (output_format ==
"kaldi") {
76 if (!kaldi_writer.
Open(output_wspecifier))
77 KALDI_ERR <<
"Could not initialize output with wspecifier " 79 }
else if (output_format ==
"htk") {
80 if (!htk_writer.
Open(output_wspecifier))
81 KALDI_ERR <<
"Could not initialize output with wspecifier " 84 KALDI_ERR <<
"Invalid output_format string " << output_format;
89 int32 num_utts = 0, num_success = 0;
90 for (; !reader.
Done(); reader.
Next()) {
92 std::string utt = reader.
Key();
94 if (wave_data.
Duration() < min_duration) {
95 KALDI_WARN <<
"File: " << utt <<
" is too short (" 96 << wave_data.
Duration() <<
" sec): producing no output.";
106 KALDI_WARN <<
"Channel not specified but you have data with " 107 << num_chan <<
" channels; defaulting to zero";
109 if (this_chan >= num_chan) {
110 KALDI_WARN <<
"File with id " << utt <<
" has " 111 << num_chan <<
" channels but you specified channel " 112 << channel <<
", producing no output.";
123 KALDI_WARN <<
"Failed to compute features for utterance " << utt;
129 mean.Scale(1.0 / features.NumRows());
130 for (
int32 i = 0;
i < features.NumRows();
i++)
131 features.Row(
i).
AddVec(-1.0, mean);
133 if (output_format ==
"kaldi") {
134 kaldi_writer.
Write(utt, features);
136 std::pair<Matrix<BaseFloat>,
HtkHeader> p;
137 p.first.Resize(features.NumRows(), features.NumCols());
138 p.first.CopyFromMat(features);
143 static_cast<int16
>(
sizeof(
float)*features.NumCols()),
147 htk_writer.
Write(utt, p);
149 if (utt2dur_writer.
IsOpen()) {
152 if(num_utts % 10 == 0)
153 KALDI_LOG <<
"Processed " << num_utts <<
" utterances";
154 KALDI_VLOG(2) <<
"Processed features for key " << utt;
157 KALDI_LOG <<
" Done " << num_success <<
" out of " << num_utts
159 return (num_success != 0 ? 0 : 1);
160 }
catch(
const std::exception& e) {
161 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
bool Open(const std::string &wspecifier)
void AddRowSumMat(Real alpha, const MatrixBase< Real > &M, Real beta=1.0)
Does *this = alpha * (sum of rows of M) + beta * *this.
void ComputeFeatures(const VectorBase< BaseFloat > &wave, BaseFloat sample_freq, BaseFloat vtln_warp, Matrix< BaseFloat > *output)
Computes the features for one file (one sequence of features).
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
BaseFloat SampFreq() const
void Register(OptionsItf *opts)
const Matrix< BaseFloat > & Data() const
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
This class's purpose is to read in Wave files.
int NumArgs() const
Number of positional parameters (cf. argc-1).
A class representing a vector.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
BaseFloat Duration() const
This templated class is intended for offline feature extraction.
SpectrogramOptions contains basic options for computing spectrogram features.
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Adds a vector: *this = *this + alpha * v (with casting between floats and doubles).
int main(int argc, char *argv[])
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
FrameExtractionOptions frame_opts