extend-wav-with-silence.cc File Reference
Include dependency graph for extend-wav-with-silence.cc:

Go to the source code of this file.

Namespaces

 kaldi
 This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for mispronunciation detection tasks (the reference is given in the full namespace documentation).
 

Functions

void FindQuietestSegment (const Vector< BaseFloat > &wav_in, BaseFloat samp_rate, Vector< BaseFloat > *wav_sil, BaseFloat search_dur=0.5, BaseFloat seg_dur=0.1, BaseFloat seg_shift_dur=0.05)
 
void ExtendWaveWithSilence (const Vector< BaseFloat > &wav_in, BaseFloat samp_rate, Vector< BaseFloat > *wav_out, BaseFloat sil_search_len, BaseFloat sil_extract_len, BaseFloat sil_extract_shift)
 
int main (int argc, char *argv[])
 

Function Documentation

◆ main()

int main ( int  argc,
char *  argv[] 
)

Definition at line 43 of file extend-wav-with-silence.cc.

References kaldi::ClassifyRspecifier(), WaveData::Data(), kaldi::ExtendWaveWithSilence(), ParseOptions::GetArg(), rnnlm::i, KALDI_ASSERT, KALDI_ERR, KALDI_LOG, kaldi::kNoRspecifier, ParseOptions::NumArgs(), MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), kaldi::PrintableRxfilename(), kaldi::PrintableWxfilename(), ParseOptions::PrintUsage(), ParseOptions::Read(), WaveHolder::Read(), ParseOptions::Register(), MatrixBase< Real >::Row(), WaveData::SampFreq(), Output::Stream(), Input::Stream(), WaveHolder::Value(), and WaveHolder::Write().

43  {  // Entry point: appends extra samples to every channel of the input wave(s); see the usage text below.
44  try {
45  typedef kaldi::int32 int32;
46  using namespace kaldi;
47  const char *usage =
48  "Extend wave data with a fairly long silence at the end (e.g. 5 seconds).\n"
49  "The input waveforms are assumed having silences at the begin/end and those\n"
50  "segments are extracted and appended to the end of the utterance.\n"
51  "Note this is for use in testing endpointing in decoding.\n"
52  "\n"
53  "Usage: extend-wav-with-silence [options] <wav-rspecifier> <wav-wspecifier>\n"
54  " extend-wav-with-silence [options] <wav-rxfilename> <wav-wxfilename>\n";
55 
56  ParseOptions po(usage);
57  BaseFloat sil_len = 5.0,  // option defaults; all four are in seconds per the help strings below
58  sil_search_len = 0.5,
59  sil_extract_len = 0.05,
60  sil_extract_shift = 0.025;
61  po.Register("extra-silence-length", &sil_len, "the length of silence that will be "
62  "appended to the end of each waveform, in seconds.");
63  po.Register("silence-search-length", &sil_search_len, "the length at the beginning "
64  "or end of each waveform in which to search for the quietest segment of "
65  "silence, in seconds.");
66  po.Register("silence-extract-length", &sil_extract_len, "the length of silence segments "
67  "to be extracted from the waveform, which must be smaller than silence-"
68  "search-length, in seconds.");
69  po.Register("silence-extract-shift", &sil_extract_shift, "the shift length when searching "
70  "for segments of silences, typically samller than silence-extract-length, "  // NOTE(review): samller is a typo for smaller; fixing it changes the runtime help text
71  "in seconds.");
72 
73  po.Read(argc, argv);
74 
75  if (po.NumArgs() != 2) {  // exactly two positional arguments (input, output) are required
76  po.PrintUsage();
77  exit(1);
78  }
79 
80  if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) {  // table mode: the first argument is an rspecifier
81  SequentialTableReader<WaveHolder> reader(po.GetArg(1));
82  TableWriter<WaveHolder> writer(po.GetArg(2));
83  int32 num_success = 0;  // count of waves written, reported in the log line after the loop
84 
85  for(; !reader.Done(); reader.Next()){
86  std::string wav_key = reader.Key();
87  const WaveData &wave = reader.Value();
88  BaseFloat samp_freq = wave.SampFreq(); // read sampling frequency
89  const Matrix<BaseFloat> &wave_data = wave.Data();
90  int32 num_chan = wave_data.NumRows(), // number of channels in recording
91  num_ext_samp = (int32)(samp_freq * sil_len); // number of samples that will be extended
92  KALDI_ASSERT(num_ext_samp > 0);  // fails when samp_freq * sil_len truncates to zero or is negative
93  Matrix<BaseFloat> new_wave(wave_data.NumRows(), wave_data.NumCols() + num_ext_samp);  // same rows, num_ext_samp extra columns
94  for(int32 i = 0; i < num_chan; i++){  // each channel (row) is extended independently
95  Vector<BaseFloat> wav_this_chan(wave_data.Row(i));
96  Vector<BaseFloat> wav_extend(wav_this_chan.Dim() + num_ext_samp);  // output buffer, pre-sized to the extended length
97  ExtendWaveWithSilence(wav_this_chan, samp_freq, &wav_extend,
98  sil_search_len, sil_extract_len, sil_extract_shift);
99  KALDI_ASSERT(wav_extend.Dim() == wav_this_chan.Dim() + num_ext_samp);  // the call must not have changed the output size
100  new_wave.CopyRowFromVec(wav_extend, i);
101  }
102  WaveData wave_out(samp_freq, new_wave);  // output keeps the input sampling frequency
103  writer.Write(wav_key, wave_out);
104  num_success++;
105  }
106  KALDI_LOG << "Successfully extended " << num_success << " files.";
107  return 0;
108  } else {  // single-file mode: arguments are used as an rxfilename and a wxfilename
109  std::string wav_rxfilename = po.GetArg(1);
110  std::string wav_wxfilename = po.GetArg(2);
111  bool binary = true;
112  Input ki(wav_rxfilename, &binary);
113  WaveHolder wh;
114  if (!wh.Read(ki.Stream())) {
115  KALDI_ERR << "Read failure from "
116  << PrintableRxfilename(wav_rxfilename);
117  }
118 
119  const WaveData& wave = wh.Value();
120 
121  BaseFloat samp_freq = wave.SampFreq(); // read sampling frequency
122  const Matrix<BaseFloat> &wave_data = wave.Data();
123  int32 num_chan = wave_data.NumRows(), // number of channels in recording
124  num_ext_samp = (int32)(samp_freq * sil_len); // number of samples that will be extended
125  KALDI_ASSERT(num_ext_samp > 0);  // fails when samp_freq * sil_len truncates to zero or is negative
126  Matrix<BaseFloat> new_wave(wave_data.NumRows(), wave_data.NumCols() + num_ext_samp);  // same rows, num_ext_samp extra columns
127  for(int32 i = 0; i < num_chan; i++){  // NOTE(review): this loop duplicates the table-mode loop body above
128  Vector<BaseFloat> wav_this_chan(wave_data.Row(i));
129  Vector<BaseFloat> wav_extend(wav_this_chan.Dim() + num_ext_samp);  // output buffer, pre-sized to the extended length
130  ExtendWaveWithSilence(wav_this_chan, samp_freq, &wav_extend,
131  sil_search_len, sil_extract_len, sil_extract_shift);
132  KALDI_ASSERT(wav_extend.Dim() == wav_this_chan.Dim() + num_ext_samp);  // the call must not have changed the output size
133  new_wave.CopyRowFromVec(wav_extend, i);
134  }
135  WaveData wave_out(samp_freq, new_wave);  // output keeps the input sampling frequency
136 
137  Output ko(wav_wxfilename, binary, false);
138  if (!WaveHolder::Write(ko.Stream(), true, wave_out)) {  // second argument is the holder's `binary` flag
139  KALDI_ERR << "Write failure to "
140  << PrintableWxfilename(wav_wxfilename);
141  }
142  // we do not print any log messages here
143  }
144  } catch(const std::exception &e) {  // any std::exception: print its message to stderr and return -1
145  std::cerr << e.what();
146  return -1;
147  }
148 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool Read(std::istream &is)
Definition: wave-reader.h:191
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
BaseFloat SampFreq() const
Definition: wave-reader.h:126
const Matrix< BaseFloat > & Data() const
Definition: wave-reader.h:124
RspecifierType ClassifyRspecifier(const std::string &rspecifier, std::string *rxfilename, RspecifierOptions *opts)
Definition: kaldi-table.cc:225
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
#define KALDI_ERR
Definition: kaldi-error.h:147
This class's purpose is to read in Wave files.
Definition: wave-reader.h:106
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
std::string PrintableRxfilename(const std::string &rxfilename)
PrintableRxfilename turns the rxfilename into a more human-readable form for error reporting...
Definition: kaldi-io.cc:61
std::string PrintableWxfilename(const std::string &wxfilename)
PrintableWxfilename turns the wxfilename into a more human-readable form for error reporting...
Definition: kaldi-io.cc:73
#define KALDI_LOG
Definition: kaldi-error.h:153
static bool Write(std::ostream &os, bool binary, const T &t)
Definition: wave-reader.h:162
void ExtendWaveWithSilence(const Vector< BaseFloat > &wav_in, BaseFloat samp_rate, Vector< BaseFloat > *wav_out, BaseFloat sil_search_len, BaseFloat sil_extract_len, BaseFloat sil_extract_shift)