extend-wav-with-silence.cc
Go to the documentation of this file.
1 // online2bin/extend-wav-with-silence.cc
2 
3 // 2014 IMSL, PKU-HKUST (author: Wei Shi)
4 // 2015 Tom Ko
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include "base/kaldi-common.h"
22 #include "util/common-utils.h"
23 #include "feat/wave-reader.h"
24 
25 namespace kaldi{
26 void FindQuietestSegment(const Vector<BaseFloat> &wav_in,
27  BaseFloat samp_rate,
28  Vector<BaseFloat> *wav_sil,
29  BaseFloat search_dur = 0.5,
30  BaseFloat seg_dur = 0.1,
31  BaseFloat seg_shift_dur = 0.05);
32 
33 void ExtendWaveWithSilence(const Vector<BaseFloat> &wav_in,
34  BaseFloat samp_rate,
35  Vector<BaseFloat> *wav_out,
36  BaseFloat sil_search_len,
37  BaseFloat sil_extract_len,
38  BaseFloat sil_extract_shift);
39 
40 }
41 
42 
43 int main(int argc, char *argv[]) {
44  try {
45  typedef kaldi::int32 int32;
46  using namespace kaldi;
47  const char *usage =
48  "Extend wave data with a fairly long silence at the end (e.g. 5 seconds).\n"
49  "The input waveforms are assumed having silences at the begin/end and those\n"
50  "segments are extracted and appended to the end of the utterance.\n"
51  "Note this is for use in testing endpointing in decoding.\n"
52  "\n"
53  "Usage: extend-wav-with-silence [options] <wav-rspecifier> <wav-wspecifier>\n"
54  " extend-wav-with-silence [options] <wav-rxfilename> <wav-wxfilename>\n";
55 
56  ParseOptions po(usage);
57  BaseFloat sil_len = 5.0,
58  sil_search_len = 0.5,
59  sil_extract_len = 0.05,
60  sil_extract_shift = 0.025;
61  po.Register("extra-silence-length", &sil_len, "the length of silence that will be "
62  "appended to the end of each waveform, in seconds.");
63  po.Register("silence-search-length", &sil_search_len, "the length at the beginning "
64  "or end of each waveform in which to search for the quietest segment of "
65  "silence, in seconds.");
66  po.Register("silence-extract-length", &sil_extract_len, "the length of silence segments "
67  "to be extracted from the waveform, which must be smaller than silence-"
68  "search-length, in seconds.");
69  po.Register("silence-extract-shift", &sil_extract_shift, "the shift length when searching "
70  "for segments of silences, typically samller than silence-extract-length, "
71  "in seconds.");
72 
73  po.Read(argc, argv);
74 
75  if (po.NumArgs() != 2) {
76  po.PrintUsage();
77  exit(1);
78  }
79 
80  if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) {
82  TableWriter<WaveHolder> writer(po.GetArg(2));
83  int32 num_success = 0;
84 
85  for(; !reader.Done(); reader.Next()){
86  std::string wav_key = reader.Key();
87  const WaveData &wave = reader.Value();
88  BaseFloat samp_freq = wave.SampFreq(); // read sampling fequency
89  const Matrix<BaseFloat> &wave_data = wave.Data();
90  int32 num_chan = wave_data.NumRows(), // number of channels in recording
91  num_ext_samp = (int32)(samp_freq * sil_len); // number of samples that will be extended
92  KALDI_ASSERT(num_ext_samp > 0);
93  Matrix<BaseFloat> new_wave(wave_data.NumRows(), wave_data.NumCols() + num_ext_samp);
94  for(int32 i = 0; i < num_chan; i++){
95  Vector<BaseFloat> wav_this_chan(wave_data.Row(i));
96  Vector<BaseFloat> wav_extend(wav_this_chan.Dim() + num_ext_samp);
97  ExtendWaveWithSilence(wav_this_chan, samp_freq, &wav_extend,
98  sil_search_len, sil_extract_len, sil_extract_shift);
99  KALDI_ASSERT(wav_extend.Dim() == wav_this_chan.Dim() + num_ext_samp);
100  new_wave.CopyRowFromVec(wav_extend, i);
101  }
102  WaveData wave_out(samp_freq, new_wave);
103  writer.Write(wav_key, wave_out);
104  num_success++;
105  }
106  KALDI_LOG << "Successfully extended " << num_success << " files.";
107  return 0;
108  } else {
109  std::string wav_rxfilename = po.GetArg(1);
110  std::string wav_wxfilename = po.GetArg(2);
111  bool binary = true;
112  Input ki(wav_rxfilename, &binary);
113  WaveHolder wh;
114  if (!wh.Read(ki.Stream())) {
115  KALDI_ERR << "Read failure from "
116  << PrintableRxfilename(wav_rxfilename);
117  }
118 
119  const WaveData& wave = wh.Value();
120 
121  BaseFloat samp_freq = wave.SampFreq(); // read sampling fequency
122  const Matrix<BaseFloat> &wave_data = wave.Data();
123  int32 num_chan = wave_data.NumRows(), // number of channels in recording
124  num_ext_samp = (int32)(samp_freq * sil_len); // number of samples that will be extended
125  KALDI_ASSERT(num_ext_samp > 0);
126  Matrix<BaseFloat> new_wave(wave_data.NumRows(), wave_data.NumCols() + num_ext_samp);
127  for(int32 i = 0; i < num_chan; i++){
128  Vector<BaseFloat> wav_this_chan(wave_data.Row(i));
129  Vector<BaseFloat> wav_extend(wav_this_chan.Dim() + num_ext_samp);
130  ExtendWaveWithSilence(wav_this_chan, samp_freq, &wav_extend,
131  sil_search_len, sil_extract_len, sil_extract_shift);
132  KALDI_ASSERT(wav_extend.Dim() == wav_this_chan.Dim() + num_ext_samp);
133  new_wave.CopyRowFromVec(wav_extend, i);
134  }
135  WaveData wave_out(samp_freq, new_wave);
136 
137  Output ko(wav_wxfilename, binary, false);
138  if (!WaveHolder::Write(ko.Stream(), true, wave_out)) {
139  KALDI_ERR << "Write failure to "
140  << PrintableWxfilename(wav_wxfilename);
141  }
142  // we do not print any log messages here
143  }
144  } catch(const std::exception &e) {
145  std::cerr << e.what();
146  return -1;
147  }
148 }
149 
150 namespace kaldi{
151 
153  BaseFloat samp_rate,
154  Vector<BaseFloat> *wav_out,
155  BaseFloat sil_search_len,
156  BaseFloat sil_extract_len,
157  BaseFloat sil_extract_shift){
158  Vector<BaseFloat> quietest_seg;
159  FindQuietestSegment(wav_in, samp_rate, &quietest_seg,
160  sil_search_len, sil_extract_len, sil_extract_shift);
161 
162  int32 window_size = quietest_seg.Dim(),
163  window_size_half = window_size / 2;
164  KALDI_ASSERT(window_size > 0);
165  Vector<BaseFloat> window(window_size);
166  Vector<BaseFloat> windowed_silence(window_size);
167  Vector<BaseFloat> half_window(window_size_half);
168  for(int32 i = 0; i < window.Dim(); i++){
169  BaseFloat i_fl = static_cast<BaseFloat>(i);
170  window(i) = 0.54 - 0.46*cos(M_2PI * i_fl / (window_size-1));
171  }
172  half_window = window.Range(window_size_half, window_size_half);
173  windowed_silence.AddVecVec(1.0, window, quietest_seg, 0.0);
174 
175  wav_out->Range(0, wav_in.Dim()).CopyFromVec(wav_in);
176  SubVector<BaseFloat> wav_ext(*wav_out, wav_in.Dim() - window_size_half,
177  wav_out->Dim() - wav_in.Dim() + window_size_half);
178  for(int32 i = 0; i < window_size_half; i++) // windowing the first half window
179  wav_ext(i) *= half_window(i);
180 
181  int32 tmp_offset = 0;
182  for(; tmp_offset + window_size < wav_ext.Dim();) {
183  wav_ext.Range(tmp_offset, window_size).AddVec(1.0, windowed_silence);
184  tmp_offset += window_size_half;
185  }
186 
187  for(int32 i = tmp_offset; i < wav_ext.Dim(); i++)
188  wav_ext(i) += windowed_silence(i-tmp_offset);
189 
190 }
191 
192 // Try to find the quietest seq_dur(default 0.1) second segment in the
193 // search_dur(default 0.5) seconds at the beginning and the end
194 // of input waveform by simply find a segment with the least energy.
196  BaseFloat samp_rate,
197  Vector<BaseFloat> *wav_sil,
198  BaseFloat search_dur,
199  BaseFloat seg_dur,
200  BaseFloat seg_shift_dur){
201  KALDI_ASSERT(seg_dur < search_dur);
202 
203  int32 search_len = (int32) (search_dur * samp_rate),
204  seg_len = (int32) (seg_dur * samp_rate),
205  seg_shift = (int32) (seg_shift_dur *samp_rate),
206  start = 0;
207  double min_energy;
208  Vector<BaseFloat> wav_min_energy;
209  Vector<BaseFloat> seg_tmp(wav_in.Range(0, seg_len));
210  wav_min_energy = seg_tmp;
211  min_energy = VecVec(seg_tmp, seg_tmp);
212  for(start = 0; start + seg_len < search_len; ){
213  SubVector<BaseFloat> seg_this(wav_in, start, seg_len);
214  seg_tmp = seg_this;
215  double energy_this = VecVec(seg_this, seg_this);
216  if(energy_this < min_energy && energy_this > 0.0){
217  min_energy = energy_this;
218  wav_min_energy = seg_tmp;
219  }
220  start += seg_shift;
221  }
222 
223  for(start = wav_in.Dim() - search_len; start + seg_len < wav_in.Dim(); ){
224  SubVector<BaseFloat> seg_this(wav_in, start, seg_len);
225  seg_tmp = seg_this;
226  double energy_this = VecVec(seg_this, seg_this);
227  if(energy_this < min_energy && energy_this > 0.0){
228  min_energy = energy_this;
229  wav_min_energy = seg_tmp;
230  }
231  start += seg_shift;
232  }
233 
234  if (min_energy == 0.0) {
235  KALDI_WARN << "Zero energy silence being used.";
236  }
237  *wav_sil = wav_min_energy;
238 }
239 
240 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool Read(std::istream &is)
Definition: wave-reader.h:191
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
BaseFloat SampFreq() const
Definition: wave-reader.h:126
const Matrix< BaseFloat > & Data() const
Definition: wave-reader.h:124
void Register(const std::string &name, bool *ptr, const std::string &doc)
RspecifierType ClassifyRspecifier(const std::string &rspecifier, std::string *rxfilename, RspecifierOptions *opts)
Definition: kaldi-table.cc:225
void AddVecVec(Real alpha, const VectorBase< Real > &v, const VectorBase< Real > &r, Real beta)
Add element-by-element product of vectors:
std::istream & Stream()
Definition: kaldi-io.cc:826
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
std::ostream & Stream()
Definition: kaldi-io.cc:701
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
This class's purpose is to read in Wave files.
Definition: wave-reader.h:106
int NumArgs() const
Number of positional parameters (c.f. argc-1).
int main(int argc, char *argv[])
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
#define M_2PI
Definition: kaldi-math.h:52
std::string PrintableRxfilename(const std::string &rxfilename)
PrintableRxfilename turns the rxfilename into a more human-readable form for error reporting...
Definition: kaldi-io.cc:61
std::string PrintableWxfilename(const std::string &wxfilename)
PrintableWxfilename turns the wxfilename into a more human-readable form for error reporting...
Definition: kaldi-io.cc:73
#define KALDI_LOG
Definition: kaldi-error.h:153
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
static bool Write(std::ostream &os, bool binary, const T &t)
Definition: wave-reader.h:162
void FindQuietestSegment(const Vector< BaseFloat > &wav_in, BaseFloat samp_rate, Vector< BaseFloat > *wav_sil, BaseFloat search_dur=0.5, BaseFloat seg_dur=0.1, BaseFloat seg_shift_dur=0.05)
void ExtendWaveWithSilence(const Vector< BaseFloat > &wav_in, BaseFloat samp_rate, Vector< BaseFloat > *wav_out, BaseFloat sil_search_len, BaseFloat sil_extract_len, BaseFloat sil_extract_shift)
SubVector< Real > Range(const MatrixIndexT o, const MatrixIndexT l)
Returns a sub-vector of a vector (a range of elements).
Definition: kaldi-vector.h:94