wav-reverberate.cc File Reference
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "feat/wave-reader.h"
#include "feat/signal.h"
Include dependency graph for wav-reverberate.cc:

Go to the source code of this file.

Namespaces

 kaldi
 This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for mispronunciations detection tasks, the reference:
 

Functions

void AddVectorsOfUnequalLength (const VectorBase< BaseFloat > &signal1, Vector< BaseFloat > *signal2)
 
void AddVectorsWithOffset (const Vector< BaseFloat > &signal1, int32 offset, Vector< BaseFloat > *signal2)
 
BaseFloat MaxAbsolute (const Vector< BaseFloat > &vector)
 
BaseFloat ComputeEarlyReverbEnergy (const Vector< BaseFloat > &rir, const Vector< BaseFloat > &signal, BaseFloat samp_freq)
 
float DoReverberation (const Vector< BaseFloat > &rir, BaseFloat samp_freq, Vector< BaseFloat > *signal)
 
void AddNoise (Vector< BaseFloat > *noise, BaseFloat snr_db, BaseFloat time, BaseFloat samp_freq, BaseFloat signal_power, Vector< BaseFloat > *signal)
 
void ReadCommaSeparatedCommand (const std::string &s, std::vector< BaseFloat > *v)
 
int main (int argc, char *argv[])
 

Function Documentation

◆ main()

int main ( int  argc,
char *  argv[] 
)

Definition at line 131 of file wav-reverberate.cc.

References kaldi::AddNoise(), kaldi::AddVectorsOfUnequalLength(), VectorBase< Real >::CopyRowFromMat(), MatrixBase< Real >::CopyRowFromVec(), WaveData::Data(), VectorBase< Real >::Dim(), kaldi::DoReverberation(), ParseOptions::GetArg(), rnnlm::i, KALDI_ASSERT, KALDI_ERR, KALDI_VLOG, KALDI_WARN, VectorBase< Real >::Max(), ParseOptions::NumArgs(), MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), ParseOptions::PrintUsage(), VectorBase< Real >::Range(), ParseOptions::Read(), WaveHolder::Read(), kaldi::ReadCommaSeparatedCommand(), ParseOptions::Register(), Vector< Real >::Resize(), WaveData::SampFreq(), VectorBase< Real >::Scale(), VectorBase< Real >::SetZero(), kaldi::SplitStringToVector(), Output::Stream(), Input::Stream(), WaveHolder::Value(), kaldi::VecVec(), and WaveData::Write().

131  {
132  try {
133  using namespace kaldi;
134 
135  const char *usage =
136  "Corrupts the wave files supplied via input pipe with the specified\n"
137  "room-impulse response (rir_matrix) and additive noise distortions\n"
138  "(specified by corresponding files).\n"
139  "Usage: wav-reverberate [options...] <wav-in-rxfilename> "
140  "<wav-out-wxfilename>\n"
141  "e.g.\n"
142  "wav-reverberate --duration=20.25 --impulse-response=rir.wav "
143  "--additive-signals='noise1.wav,noise2.wav' --snrs='20.0,15.0' "
144  "--start-times='0,17.8' input.wav output.wav\n";
145 
146  ParseOptions po(usage);
147  std::string rir_file;
148  std::string additive_signals;
149  std::string snrs;
150  std::string start_times;
151  bool multi_channel_output = false;
152  bool shift_output = true;
153  int32 input_channel = 0;
154  int32 rir_channel = 0;
155  int32 noise_channel = 0;
156  bool normalize_output = true;
157  BaseFloat volume = 0;
158  BaseFloat duration = 0;
159 
160  po.Register("multi-channel-output", &multi_channel_output,
161  "Specifies if the output should be multi-channel or not");
162  po.Register("shift-output", &shift_output,
163  "If true, the reverberated waveform will be shifted by the "
164  "amount of the peak position of the RIR and the length of "
165  "the output waveform will be equal to the input waveform. "
166  "If false, the length of the output waveform will be "
167  "equal to (original input length + rir length - 1). "
168  "This value is true by default and "
169  "it only affects the output when RIR file is provided.");
170  po.Register("input-wave-channel", &input_channel,
171  "Specifies the channel to be used from input as only a "
172  "single channel will be used to generate reverberated output");
173  po.Register("rir-channel", &rir_channel,
174  "Specifies the channel of the room impulse response, "
175  "it will only be used when multi-channel-output is false");
176  po.Register("noise-channel", &noise_channel,
177  "Specifies the channel of the noise file, "
178  "it will only be used when multi-channel-output is false");
179  po.Register("impulse-response", &rir_file,
180  "File with the impulse response for reverberating the input wave"
181  "It can be either a file in wav format or a piped command. "
182  "E.g. --impulse-response='rir.wav' or 'sox rir.wav - |' ");
183  po.Register("additive-signals", &additive_signals,
184  "A comma separated list of additive signals. "
185  "They can be either filenames or piped commands. "
186  "E.g. --additive-signals='noise1.wav,noise2.wav' or "
187  "'sox noise1.wav - |,sox noise2.wav - |'. "
188  "Requires --snrs and --start-times.");
189  po.Register("snrs", &snrs,
190  "A comma separated list of SNRs(dB). "
191  "The additive signals will be scaled according to these SNRs. "
192  "E.g. --snrs='20.0,0.0,5.0,10.0' ");
193  po.Register("start-times", &start_times,
194  "A comma separated list of start times referring to the "
195  "input signal. The additive signals will be added to the "
196  "input signal starting at the offset. If the start time "
197  "exceed the length of the input signal, the addition will "
198  "be ignored.");
199  po.Register("normalize-output", &normalize_output,
200  "If true, then after reverberating and "
201  "possibly adding noise, scale so that the signal "
202  "energy is the same as the original input signal. "
203  "See also the --volume option.");
204  po.Register("duration", &duration,
205  "If nonzero, it specified the duration (secs) of the output "
206  "signal. If the duration t is less than the length of the "
207  "input signal, the first t secs of the signal is trimmed, "
208  "otherwise, the signal will be repeated to "
209  "fulfill the duration specified.");
210  po.Register("volume", &volume,
211  "If nonzero, a scaling factor on the signal that is applied "
212  "after reverberating and possibly adding noise. "
213  "If you set this option to a nonzero value, it will be as "
214  "if you had also specified --normalize-output=false.");
215 
216  po.Read(argc, argv);
217  if (po.NumArgs() != 2) {
218  po.PrintUsage();
219  exit(1);
220  }
221 
222  if (multi_channel_output) {
223  if (rir_channel != 0 || noise_channel != 0)
224  KALDI_WARN << "options for --rir-channel and --noise-channel"
225  "are ignored as --multi-channel-output is true.";
226  }
227 
228  std::string input_wave_file = po.GetArg(1);
229  std::string output_wave_file = po.GetArg(2);
230 
231  WaveData input_wave;
232  {
233  WaveHolder waveholder;
234  Input ki(input_wave_file);
235  waveholder.Read(ki.Stream());
236  input_wave = waveholder.Value();
237  }
238 
239  const Matrix<BaseFloat> &input_matrix = input_wave.Data();
240  BaseFloat samp_freq_input = input_wave.SampFreq();
241  int32 num_samp_input = input_matrix.NumCols(), // #samples in the input
242  num_input_channel = input_matrix.NumRows(); // #channels in the input
243  KALDI_VLOG(1) << "sampling frequency of input: " << samp_freq_input
244  << " #samples: " << num_samp_input
245  << " #channel: " << num_input_channel;
246  KALDI_ASSERT(input_channel < num_input_channel);
247 
248  Matrix<BaseFloat> rir_matrix;
249  BaseFloat samp_freq_rir = samp_freq_input;
250  int32 num_samp_rir = 0,
251  num_rir_channel = 0;
252  if (!rir_file.empty()) {
253  WaveData rir_wave;
254  {
255  WaveHolder waveholder;
256  Input ki(rir_file);
257  waveholder.Read(ki.Stream());
258  rir_wave = waveholder.Value();
259  }
260  rir_matrix = rir_wave.Data();
261  samp_freq_rir = rir_wave.SampFreq();
262  num_samp_rir = rir_matrix.NumCols();
263  num_rir_channel = rir_matrix.NumRows();
264  KALDI_VLOG(1) << "sampling frequency of rir: " << samp_freq_rir
265  << " #samples: " << num_samp_rir
266  << " #channel: " << num_rir_channel;
267  if (!multi_channel_output) {
268  KALDI_ASSERT(rir_channel < num_rir_channel);
269  }
270  }
271 
272  std::vector<Matrix<BaseFloat> > additive_signal_matrices;
273  if (!additive_signals.empty()) {
274  if (snrs.empty() || start_times.empty())
275  KALDI_ERR << "--additive-signals option requires "
276  "--snrs and --start-times to be set.";
277  std::vector<std::string> split_string;
278  SplitStringToVector(additive_signals, ",", true, &split_string);
279  for (size_t i = 0; i < split_string.size(); i++) {
280  WaveHolder waveholder;
281  Input ki(split_string[i]);
282  waveholder.Read(ki.Stream());
283  WaveData additive_signal_wave = waveholder.Value();
284  Matrix<BaseFloat> additive_signal_matrix = additive_signal_wave.Data();
285  BaseFloat samp_freq = additive_signal_wave.SampFreq();
286  KALDI_ASSERT(samp_freq == samp_freq_input);
287  int32 num_samp = additive_signal_matrix.NumCols(),
288  num_channel = additive_signal_matrix.NumRows();
289  KALDI_VLOG(1) << "sampling frequency of additive signal: " << samp_freq
290  << " #samples: " << num_samp
291  << " #channel: " << num_channel;
292  if (multi_channel_output) {
293  KALDI_ASSERT(num_rir_channel == num_channel);
294  } else {
295  KALDI_ASSERT(noise_channel < num_channel);
296  }
297 
298  additive_signal_matrices.push_back(additive_signal_matrix);
299  }
300  }
301 
302  std::vector<BaseFloat> snr_vector;
303  if (!snrs.empty()) {
304  ReadCommaSeparatedCommand(snrs, &snr_vector);
305  }
306 
307  std::vector<BaseFloat> start_time_vector;
308  if (!start_times.empty()) {
309  ReadCommaSeparatedCommand(start_times, &start_time_vector);
310  }
311 
312  int32 shift_index = 0;
313  int32 num_output_channels = (multi_channel_output ? num_rir_channel : 1);
314  int32 num_samp_output = (duration > 0 ? samp_freq_input * duration :
315  (shift_output ? num_samp_input :
316  num_samp_input + num_samp_rir - 1));
317  Matrix<BaseFloat> out_matrix(num_output_channels, num_samp_output);
318 
319  for (int32 output_channel = 0; output_channel < num_output_channels; output_channel++) {
320  Vector<BaseFloat> input(num_samp_input);
321  input.CopyRowFromMat(input_matrix, input_channel);
322  float power_before_reverb = VecVec(input, input) / input.Dim();
323 
324  int32 this_rir_channel = (multi_channel_output ? output_channel : rir_channel);
325 
326  float early_energy = power_before_reverb;
327  if (!rir_file.empty()) {
328  Vector<BaseFloat> rir;
329  rir.Resize(num_samp_rir);
330  rir.CopyRowFromMat(rir_matrix, this_rir_channel);
331  rir.Scale(1.0 / (1 << 15));
332  early_energy = DoReverberation(rir, samp_freq_rir, &input);
333  if (shift_output) {
334  // find the position of the peak of the impulse response
335  // and shift the output waveform by this amount
336  rir.Max(&shift_index);
337  }
338  }
339 
340  if (additive_signal_matrices.size() > 0) {
341  Vector<BaseFloat> noise(0);
342  int32 this_noise_channel = (multi_channel_output ? output_channel : noise_channel);
343  KALDI_ASSERT(additive_signal_matrices.size() == snr_vector.size());
344  KALDI_ASSERT(additive_signal_matrices.size() == start_time_vector.size());
345  for (int32 i = 0; i < additive_signal_matrices.size(); i++) {
346  noise.Resize(additive_signal_matrices[i].NumCols());
347  noise.CopyRowFromMat(additive_signal_matrices[i], this_noise_channel);
348  AddNoise(&noise, snr_vector[i], start_time_vector[i],
349  samp_freq_input, early_energy, &input);
350  }
351  }
352 
353  float power_after_reverb = VecVec(input, input) / input.Dim();
354 
355  if (volume > 0)
356  input.Scale(volume);
357  else if (normalize_output)
358  input.Scale(sqrt(power_before_reverb / power_after_reverb));
359 
360  if (num_samp_output <= num_samp_input) {
361  // trim the signal from the start
362  out_matrix.CopyRowFromVec(input.Range(shift_index, num_samp_output), output_channel);
363  } else {
364  // repeat the signal to fill up the duration
365  Vector<BaseFloat> extended_input(num_samp_output);
366  extended_input.SetZero();
367  AddVectorsOfUnequalLength(input.Range(shift_index, num_samp_input), &extended_input);
368  out_matrix.CopyRowFromVec(extended_input, output_channel);
369  }
370  }
371 
372  WaveData out_wave(samp_freq_input, out_matrix);
373  Output ko(output_wave_file, false);
374  out_wave.Write(ko.Stream());
375 
376  return 0;
377  } catch(const std::exception &e) {
378  std::cerr << e.what();
379  return -1;
380  }
381 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool Read(std::istream &is)
Definition: wave-reader.h:191
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
void ReadCommaSeparatedCommand(const std::string &s, std::vector< BaseFloat > *v)
float DoReverberation(const Vector< BaseFloat > &rir, BaseFloat samp_freq, Vector< BaseFloat > *signal)
kaldi::int32 int32
BaseFloat SampFreq() const
Definition: wave-reader.h:126
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
const Matrix< BaseFloat > & Data() const
Definition: wave-reader.h:124
void CopyRowFromMat(const MatrixBase< Real > &M, MatrixIndexT row)
Extracts a row of the matrix M.
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
void AddVectorsOfUnequalLength(const VectorBase< BaseFloat > &signal1, Vector< BaseFloat > *signal2)
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
#define KALDI_ERR
Definition: kaldi-error.h:147
Real Max() const
Returns the maximum value of any element, or -infinity for the empty vector.
#define KALDI_WARN
Definition: kaldi-error.h:150
void Scale(Real alpha)
Multiplies all elements by this constant.
This class's purpose is to read in Wave files.
Definition: wave-reader.h:106
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void AddNoise(Vector< BaseFloat > *noise, BaseFloat snr_db, BaseFloat time, BaseFloat samp_freq, BaseFloat signal_power, Vector< BaseFloat > *signal)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37