28 Vector<BaseFloat> *wav_sil,
35 Vector<BaseFloat> *wav_out,
43 int main(
int argc,
char *argv[]) {
46 using namespace kaldi;
48 "Extend wave data with a fairly long silence at the end (e.g. 5 seconds).\n" 49 "The input waveforms are assumed having silences at the begin/end and those\n" 50 "segments are extracted and appended to the end of the utterance.\n" 51 "Note this is for use in testing endpointing in decoding.\n" 53 "Usage: extend-wav-with-silence [options] <wav-rspecifier> <wav-wspecifier>\n" 54 " extend-wav-with-silence [options] <wav-rxfilename> <wav-wxfilename>\n";
59 sil_extract_len = 0.05,
60 sil_extract_shift = 0.025;
61 po.
Register(
"extra-silence-length", &sil_len,
"the length of silence that will be " 62 "appended to the end of each waveform, in seconds.");
63 po.
Register(
"silence-search-length", &sil_search_len,
"the length at the beginning " 64 "or end of each waveform in which to search for the quietest segment of " 65 "silence, in seconds.");
66 po.
Register(
"silence-extract-length", &sil_extract_len,
"the length of silence segments " 67 "to be extracted from the waveform, which must be smaller than silence-" 68 "search-length, in seconds.");
69 po.
Register(
"silence-extract-shift", &sil_extract_shift,
"the shift length when searching " 70 "for segments of silences, typically samller than silence-extract-length, " 83 int32 num_success = 0;
85 for(; !reader.Done(); reader.Next()){
86 std::string wav_key = reader.Key();
87 const WaveData &wave = reader.Value();
90 int32 num_chan = wave_data.
NumRows(),
91 num_ext_samp = (
int32)(samp_freq * sil_len);
94 for(int32
i = 0;
i < num_chan;
i++){
95 Vector<BaseFloat> wav_this_chan(wave_data.
Row(
i));
96 Vector<BaseFloat> wav_extend(wav_this_chan.Dim() + num_ext_samp);
98 sil_search_len, sil_extract_len, sil_extract_shift);
99 KALDI_ASSERT(wav_extend.Dim() == wav_this_chan.Dim() + num_ext_samp);
100 new_wave.CopyRowFromVec(wav_extend,
i);
102 WaveData wave_out(samp_freq, new_wave);
103 writer.Write(wav_key, wave_out);
106 KALDI_LOG <<
"Successfully extended " << num_success <<
" files.";
109 std::string wav_rxfilename = po.
GetArg(1);
110 std::string wav_wxfilename = po.
GetArg(2);
112 Input ki(wav_rxfilename, &binary);
123 int32 num_chan = wave_data.
NumRows(),
124 num_ext_samp = (
int32)(samp_freq * sil_len);
127 for(int32
i = 0;
i < num_chan;
i++){
128 Vector<BaseFloat> wav_this_chan(wave_data.
Row(
i));
129 Vector<BaseFloat> wav_extend(wav_this_chan.Dim() + num_ext_samp);
131 sil_search_len, sil_extract_len, sil_extract_shift);
132 KALDI_ASSERT(wav_extend.Dim() == wav_this_chan.Dim() + num_ext_samp);
133 new_wave.CopyRowFromVec(wav_extend,
i);
135 WaveData wave_out(samp_freq, new_wave);
137 Output ko(wav_wxfilename, binary,
false);
144 }
catch(
const std::exception &e) {
145 std::cerr << e.what();
160 sil_search_len, sil_extract_len, sil_extract_shift);
162 int32 window_size = quietest_seg.
Dim(),
163 window_size_half = window_size / 2;
170 window(
i) = 0.54 - 0.46*cos(
M_2PI * i_fl / (window_size-1));
172 half_window = window.
Range(window_size_half, window_size_half);
173 windowed_silence.
AddVecVec(1.0, window, quietest_seg, 0.0);
175 wav_out->
Range(0, wav_in.
Dim()).CopyFromVec(wav_in);
177 wav_out->
Dim() - wav_in.
Dim() + window_size_half);
178 for(
int32 i = 0;
i < window_size_half;
i++)
179 wav_ext(
i) *= half_window(
i);
181 int32 tmp_offset = 0;
182 for(; tmp_offset + window_size < wav_ext.Dim();) {
183 wav_ext.Range(tmp_offset, window_size).AddVec(1.0, windowed_silence);
184 tmp_offset += window_size_half;
187 for(
int32 i = tmp_offset;
i < wav_ext.Dim();
i++)
188 wav_ext(
i) += windowed_silence(
i-tmp_offset);
203 int32 search_len = (
int32) (search_dur * samp_rate),
204 seg_len = (
int32) (seg_dur * samp_rate),
205 seg_shift = (
int32) (seg_shift_dur *samp_rate),
210 wav_min_energy = seg_tmp;
211 min_energy =
VecVec(seg_tmp, seg_tmp);
212 for(start = 0; start + seg_len < search_len; ){
215 double energy_this =
VecVec(seg_this, seg_this);
216 if(energy_this < min_energy && energy_this > 0.0){
217 min_energy = energy_this;
218 wav_min_energy = seg_tmp;
223 for(start = wav_in.
Dim() - search_len; start + seg_len < wav_in.
Dim(); ){
226 double energy_this =
VecVec(seg_this, seg_this);
227 if(energy_this < min_energy && energy_this > 0.0){
228 min_energy = energy_this;
229 wav_min_energy = seg_tmp;
234 if (min_energy == 0.0) {
235 KALDI_WARN <<
"Zero energy silence being used.";
237 *wav_sil = wav_min_energy;
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
bool Read(std::istream &is)
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept.
BaseFloat SampFreq() const
const Matrix< BaseFloat > & Data() const
void Register(const std::string &name, bool *ptr, const std::string &doc)
RspecifierType ClassifyRspecifier(const std::string &rspecifier, std::string *rxfilename, RspecifierOptions *opts)
void AddVecVec(Real alpha, const VectorBase< Real > &v, const VectorBase< Real > &r, Real beta)
Add element-by-element product of vectors: this <-- alpha * v .* r + beta * this.
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more documentation.
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
A templated class for reading objects sequentially from an archive or script file; see The Table concept.
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
MatrixIndexT Dim() const
Returns the dimension of the vector.
This class's purpose is to read in Wave files.
int NumArgs() const
Number of positional parameters (c.f. argc-1).
int main(int argc, char *argv[])
A class representing a vector.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
std::string PrintableRxfilename(const std::string &rxfilename)
PrintableRxfilename turns the rxfilename into a more human-readable form for error reporting, i.e. it does quoting and escaping and replaces "" or "-" with "standard input".
std::string PrintableWxfilename(const std::string &wxfilename)
PrintableWxfilename turns the wxfilename into a more human-readable form for error reporting, i.e. it does quoting and escaping and replaces "" or "-" with "standard output".
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Represents a non-allocating general vector which can be defined as a sub-vector of a higher-level vector [or as the row of a matrix].
static bool Write(std::ostream &os, bool binary, const T &t)
void FindQuietestSegment(const Vector< BaseFloat > &wav_in, BaseFloat samp_rate, Vector< BaseFloat > *wav_sil, BaseFloat search_dur=0.5, BaseFloat seg_dur=0.1, BaseFloat seg_shift_dur=0.05)
void ExtendWaveWithSilence(const Vector< BaseFloat > &wav_in, BaseFloat samp_rate, Vector< BaseFloat > *wav_out, BaseFloat sil_search_len, BaseFloat sil_extract_len, BaseFloat sil_extract_shift)
SubVector< Real > Range(const MatrixIndexT o, const MatrixIndexT l)
Returns a sub-vector of a vector (a range of elements).