FeatureExtraction

Files

file  resample.h
 
 

Classes

struct  ExampleFeatureComputerOptions
 This class is only added for documentation, it is not intended to ever be used. More...
 
class  ExampleFeatureComputer
 This class is only added for documentation, it is not intended to ever be used. More...
 
class  OfflineFeatureTpl< F >
 This templated class is intended for offline feature extraction, i.e. More...
 
struct  FbankOptions
 FbankOptions contains basic options for computing filterbank features. More...
 
class  FbankComputer
 Class for computing mel-filterbank features; see Computing MFCC features for more information. More...
 
struct  DeltaFeaturesOptions
 
class  DeltaFeatures
 
struct  ShiftedDeltaFeaturesOptions
 
class  ShiftedDeltaFeatures
 
struct  SlidingWindowCmnOptions
 
struct  MfccOptions
 MfccOptions contains basic options for computing MFCC features. More...
 
class  MfccComputer
 
struct  PlpOptions
 PlpOptions contains basic options for computing PLP features. More...
 
class  PlpComputer
 This is the new-style interface to the PLP computation. More...
 
struct  SpectrogramOptions
 SpectrogramOptions contains basic options for computing spectrogram features. More...
 
class  SpectrogramComputer
 Class for computing spectrogram features. More...
 
struct  FrameExtractionOptions
 
struct  FeatureWindowFunction
 
struct  MelBanksOptions
 
class  MelBanks
 
struct  PitchExtractionOptions
 
struct  ProcessPitchOptions
 
class  OnlinePitchFeature
 
class  OnlineProcessPitch
 This online-feature class implements post processing of pitch features. More...
 
class  ArbitraryResample
 Class ArbitraryResample allows you to resample a signal (assumed zero outside the sample region, not periodic) at arbitrary specified time values, which don't have to be linearly spaced. More...
 
class  LinearResample
 LinearResample is a special case of ArbitraryResample, where we want to resample a signal at linearly spaced intervals (this means we want to upsample or downsample the signal). More...
 

Typedefs

typedef OfflineFeatureTpl< FbankComputer >  Fbank
 
typedef OfflineFeatureTpl< MfccComputer >  Mfcc
 
typedef OfflineFeatureTpl< PlpComputer >  Plp
 
typedef OfflineFeatureTpl< SpectrogramComputer >  Spectrogram
 

Functions

void ComputePowerSpectrum (VectorBase< BaseFloat > *waveform)
 
void ComputeDeltas (const DeltaFeaturesOptions &delta_opts, const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
 
void ComputeShiftedDeltas (const ShiftedDeltaFeaturesOptions &delta_opts, const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
 
void SpliceFrames (const MatrixBase< BaseFloat > &input_features, int32 left_context, int32 right_context, Matrix< BaseFloat > *output_features)
 
void ReverseFrames (const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
 
void InitIdftBases (int32 n_bases, int32 dimension, Matrix< BaseFloat > *mat_out)
 
void SlidingWindowCmn (const SlidingWindowCmnOptions &opts, const MatrixBase< BaseFloat > &input, MatrixBase< BaseFloat > *output)
 Applies sliding-window cepstral mean and/or variance normalization. More...
 
int32 NumFrames (int64 num_samples, const FrameExtractionOptions &opts, bool flush=true)
 This function returns the number of frames that we can extract from a wave file with the given number of samples in it (assumed to have the same sampling rate as specified in 'opts'). More...
 
int64 FirstSampleOfFrame (int32 frame, const FrameExtractionOptions &opts)
 
void Dither (VectorBase< BaseFloat > *waveform, BaseFloat dither_value)
 
void Preemphasize (VectorBase< BaseFloat > *waveform, BaseFloat preemph_coeff)
 
void ProcessWindow (const FrameExtractionOptions &opts, const FeatureWindowFunction &window_function, VectorBase< BaseFloat > *window, BaseFloat *log_energy_pre_window=NULL)
 This function does all the windowing steps after actually extracting the windowed signal: depending on the configuration, it does dithering, dc offset removal, preemphasis, and multiplication by the windowing function. More...
 
void ExtractWindow (int64 sample_offset, const VectorBase< BaseFloat > &wave, int32 f, const FrameExtractionOptions &opts, const FeatureWindowFunction &window_function, Vector< BaseFloat > *window, BaseFloat *log_energy_pre_window)
 
void ComputeLifterCoeffs (BaseFloat Q, VectorBase< BaseFloat > *coeffs)
 
BaseFloat Durbin (int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp)
 
BaseFloat ComputeLpc (const VectorBase< BaseFloat > &autocorr_in, Vector< BaseFloat > *lpc_out)
 
void Lpc2Cepstrum (int n, const BaseFloat *pLPC, BaseFloat *pCepst)
 
void GetEqualLoudnessVector (const MelBanks &mel_banks, Vector< BaseFloat > *ans)
 
void ComputeKaldiPitch (const PitchExtractionOptions &opts, const VectorBase< BaseFloat > &wave, Matrix< BaseFloat > *output)
 This function extracts (pitch, NCCF) per frame, using the pitch extraction method described in "A Pitch Extraction Algorithm Tuned for Automatic Speech Recognition", Pegah Ghahremani, Bagher BabaAli, Daniel Povey, Korbinian Riedhammer, Jan Trmal and Sanjeev Khudanpur, ICASSP 2014. More...
 
void ProcessPitch (const ProcessPitchOptions &opts, const MatrixBase< BaseFloat > &input, Matrix< BaseFloat > *output)
 This function takes the raw (NCCF, pitch) quantities computed by ComputeKaldiPitch and processes them into features. More...
 
void ComputeAndProcessKaldiPitch (const PitchExtractionOptions &pitch_opts, const ProcessPitchOptions &process_opts, const VectorBase< BaseFloat > &wave, Matrix< BaseFloat > *output)
 This function combines ComputeKaldiPitch and ProcessPitch. More...
 
void ResampleWaveform (BaseFloat orig_freq, const VectorBase< BaseFloat > &wave, BaseFloat new_freq, Vector< BaseFloat > *new_wave)
 Downsample or upsample a waveform. More...
 
void DownsampleWaveForm (BaseFloat orig_freq, const VectorBase< BaseFloat > &wave, BaseFloat new_freq, Vector< BaseFloat > *new_wave)
 This function is deprecated. More...
 

Detailed Description

Typedef Documentation

◆ Fbank

Definition at line 143 of file feature-fbank.h.

◆ Mfcc

Definition at line 147 of file feature-mfcc.h.

◆ Plp

Definition at line 169 of file feature-plp.h.

◆ Spectrogram

Function Documentation

◆ ComputeAndProcessKaldiPitch()

void ComputeAndProcessKaldiPitch ( const PitchExtractionOptions &  pitch_opts,
const ProcessPitchOptions &  process_opts,
const VectorBase< BaseFloat > &  wave,
Matrix< BaseFloat > *  output 
)

This function combines ComputeKaldiPitch and ProcessPitch.

The reason why we need a separate function to do this is in order to be able to accurately simulate the online pitch-processing, for testing and for training models matched to the "first-pass" features. It is sensitive to the variables in pitch_opts that relate to online processing, i.e. max_frames_latency, frames_per_chunk, simulate_first_pass_online, recompute_frame.

Definition at line 1597 of file pitch-functions.cc.

References OnlinePitchFeature::AcceptWaveform(), VectorBase< Real >::Dim(), OnlineProcessPitch::Dim(), PitchExtractionOptions::frame_shift_ms, PitchExtractionOptions::frames_per_chunk, OnlineProcessPitch::GetFrame(), OnlinePitchFeature::InputFinished(), KALDI_ASSERT, KALDI_WARN, kaldi::kCopyData, OnlineProcessPitch::NumFramesReady(), Matrix< Real >::Resize(), MatrixBase< Real >::RowRange(), PitchExtractionOptions::samp_freq, and PitchExtractionOptions::simulate_first_pass_online.

Referenced by main(), kaldi::UnitTestSimple(), and kaldi::UnitTestSnipEdges().

1601  {
1602 
1603  OnlinePitchFeature pitch_extractor(pitch_opts);
1604 
1605  if (pitch_opts.simulate_first_pass_online) {
1606  KALDI_ASSERT(pitch_opts.frames_per_chunk > 0 &&
1607  "--simulate-first-pass-online option does not make sense "
1608  "unless you specify --frames-per-chunk");
1609  }
1610 
1611  OnlineProcessPitch post_process(process_opts, &pitch_extractor);
1612 
1613  int32 cur_rows = 100;
1614  Matrix<BaseFloat> feats(cur_rows, post_process.Dim());
1615 
1616  int32 cur_offset = 0, cur_frame = 0,
1617  samp_per_chunk = pitch_opts.frames_per_chunk *
1618  pitch_opts.samp_freq * pitch_opts.frame_shift_ms / 1000.0f;
1619 
1620  // We request the first-pass features as soon as they are available,
1621  // regardless of whether opts.simulate_first_pass_online == true. If
1622  // opts.simulate_first_pass_online == true this should
1623  // not affect the features generated, but it helps us to test the code
1624  // in a way that's closer to what online decoding would see.
1625 
1626  while (cur_offset < wave.Dim()) {
1627  int32 num_samp;
1628  if (samp_per_chunk > 0)
1629  num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset);
1630  else // user left opts.frames_per_chunk at zero.
1631  num_samp = wave.Dim();
1632  SubVector<BaseFloat> wave_chunk(wave, cur_offset, num_samp);
1633  pitch_extractor.AcceptWaveform(pitch_opts.samp_freq, wave_chunk);
1634  cur_offset += num_samp;
1635  if (cur_offset == wave.Dim())
1636  pitch_extractor.InputFinished();
1637 
1638  // Get each frame as soon as it is ready.
1639  for (; cur_frame < post_process.NumFramesReady(); cur_frame++) {
1640  if (cur_frame >= cur_rows) {
1641  cur_rows *= 2;
1642  feats.Resize(cur_rows, post_process.Dim(), kCopyData);
1643  }
1644  SubVector<BaseFloat> row(feats, cur_frame);
1645  post_process.GetFrame(cur_frame, &row);
1646  }
1647  }
1648 
1649  if (pitch_opts.simulate_first_pass_online) {
1650  if (cur_frame == 0) {
1651  KALDI_WARN << "No features output since wave file too short";
1652  output->Resize(0, 0);
1653  } else {
1654  *output = feats.RowRange(0, cur_frame);
1655  }
1656  } else {
1657  // want the "final" features for second pass, so get them again.
1658  output->Resize(post_process.NumFramesReady(), post_process.Dim());
1659  for (int32 frame = 0; frame < post_process.NumFramesReady(); frame++) {
1660  SubVector<BaseFloat> row(*output, frame);
1661  post_process.GetFrame(frame, &row);
1662  }
1663  }
1664 }
kaldi::int32 int32
#define KALDI_WARN
Definition: kaldi-error.h:150
SubMatrix< Real > RowRange(const MatrixIndexT row_offset, const MatrixIndexT num_rows) const
Definition: kaldi-matrix.h:209
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).

◆ ComputeDeltas()

void ComputeDeltas ( const DeltaFeaturesOptions &  delta_opts,
const MatrixBase< BaseFloat > &  input_features,
Matrix< BaseFloat > *  output_features 
)

Definition at line 160 of file feature-functions.cc.

References MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), DeltaFeaturesOptions::order, DeltaFeatures::Process(), and Matrix< Real >::Resize().

Referenced by OnlineProcessPitch::GetDeltaPitchFeature(), main(), kaldi::TestOnlineDeltaFeature(), kaldi::TestOnlineDeltaInput(), UnitTestCompareWithDeltaFeatures(), UnitTestHTKCompare1(), UnitTestHTKCompare2(), UnitTestHTKCompare3(), UnitTestHTKCompare4(), UnitTestHTKCompare5(), and UnitTestHTKCompare6().

162  {
163  output_features->Resize(input_features.NumRows(),
164  input_features.NumCols()
165  *(delta_opts.order + 1));
166  DeltaFeatures delta(delta_opts);
167  for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
168  SubVector<BaseFloat> row(*output_features, r);
169  delta.Process(input_features, r, &row);
170  }
171 }
kaldi::int32 int32
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).

◆ ComputeKaldiPitch()

void ComputeKaldiPitch ( const PitchExtractionOptions &  opts,
const VectorBase< BaseFloat > &  wave,
Matrix< BaseFloat > *  output 
)

This function extracts (pitch, NCCF) per frame, using the pitch extraction method described in "A Pitch Extraction Algorithm Tuned for Automatic Speech Recognition", Pegah Ghahremani, Bagher BabaAli, Daniel Povey, Korbinian Riedhammer, Jan Trmal and Sanjeev Khudanpur, ICASSP 2014.

The output will have as many rows as there are frames, and two columns, in the order (NCCF, pitch).

Definition at line 1291 of file pitch-functions.cc.

References OnlinePitchFeature::AcceptWaveform(), kaldi::ComputeKaldiPitchFirstPass(), VectorBase< Real >::Dim(), PitchExtractionOptions::frame_shift_ms, PitchExtractionOptions::frames_per_chunk, OnlinePitchFeature::GetFrame(), OnlinePitchFeature::InputFinished(), KALDI_ASSERT, KALDI_WARN, OnlinePitchFeature::NumFramesReady(), Matrix< Real >::Resize(), PitchExtractionOptions::samp_freq, and PitchExtractionOptions::simulate_first_pass_online.

Referenced by main(), kaldi::UnitTestDiffSampleRate(), kaldi::UnitTestKeele(), kaldi::UnitTestKeeleNccfBallast(), kaldi::UnitTestPenaltyFactor(), kaldi::UnitTestPieces(), kaldi::UnitTestPitchExtractionSpeed(), kaldi::UnitTestPitchExtractorCompareKeele(), kaldi::UnitTestProcess(), and kaldi::UnitTestSearch().

1293  {
1294  if (opts.simulate_first_pass_online) {
1295  ComputeKaldiPitchFirstPass(opts, wave, output);
1296  return;
1297  }
1298  OnlinePitchFeature pitch_extractor(opts);
1299 
1300  if (opts.frames_per_chunk == 0) {
1301  pitch_extractor.AcceptWaveform(opts.samp_freq, wave);
1302  } else {
1303  // the user may set opts.frames_per_chunk for better compatibility with
1304  // online operation.
1305  KALDI_ASSERT(opts.frames_per_chunk > 0);
1306  int32 cur_offset = 0, samp_per_chunk =
1307  opts.frames_per_chunk * opts.samp_freq * opts.frame_shift_ms / 1000.0f;
1308  while (cur_offset < wave.Dim()) {
1309  int32 num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset);
1310  SubVector<BaseFloat> wave_chunk(wave, cur_offset, num_samp);
1311  pitch_extractor.AcceptWaveform(opts.samp_freq, wave_chunk);
1312  cur_offset += num_samp;
1313  }
1314  }
1315  pitch_extractor.InputFinished();
1316  int32 num_frames = pitch_extractor.NumFramesReady();
1317  if (num_frames == 0) {
1318  KALDI_WARN << "No frames output in pitch extraction";
1319  output->Resize(0, 0);
1320  return;
1321  }
1322  output->Resize(num_frames, 2);
1323  for (int32 frame = 0; frame < num_frames; frame++) {
1324  SubVector<BaseFloat> row(*output, frame);
1325  pitch_extractor.GetFrame(frame, &row);
1326  }
1327 }
void ComputeKaldiPitchFirstPass(const PitchExtractionOptions &opts, const VectorBase< BaseFloat > &wave, Matrix< BaseFloat > *output)
This function is called from ComputeKaldiPitch when the user specifies opts.simulate_first_pass_onlin...
kaldi::int32 int32
#define KALDI_WARN
Definition: kaldi-error.h:150
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).

◆ ComputeLifterCoeffs()

void ComputeLifterCoeffs ( BaseFloat  Q,
VectorBase< BaseFloat > *  coeffs 
)

Definition at line 253 of file mel-computations.cc.

References VectorBase< Real >::Dim(), rnnlm::i, and M_PI.

Referenced by MfccComputer::MfccComputer(), and PlpComputer::PlpComputer().

253  {
254  // Compute liftering coefficients (scaling on cepstral coeffs)
255  // coeffs are numbered slightly differently from HTK: the zeroth
256  // index is C0, which is not affected.
257  for (int32 i = 0; i < coeffs->Dim(); i++)
258  (*coeffs)(i) = 1.0 + 0.5 * Q * sin (M_PI * i / Q);
259 }
#define M_PI
Definition: kaldi-math.h:44
kaldi::int32 int32

◆ ComputeLpc()

BaseFloat ComputeLpc ( const VectorBase< BaseFloat > &  autocorr_in,
Vector< BaseFloat > *  lpc_out 
)

Definition at line 326 of file mel-computations.cc.

References VectorBase< Real >::Data(), VectorBase< Real >::Dim(), kaldi::Durbin(), KALDI_ASSERT, KALDI_WARN, kaldi::Log(), and rnnlm::n.

Referenced by PlpComputer::Compute().

327  {
328  int32 n = autocorr_in.Dim() - 1;
329  KALDI_ASSERT(lpc_out->Dim() == n);
330  Vector<BaseFloat> tmp(n);
331  BaseFloat ans = Durbin(n, autocorr_in.Data(),
332  lpc_out->Data(),
333  tmp.Data());
334  if (ans <= 0.0)
335  KALDI_WARN << "Zero energy in LPC computation";
336  return -Log(1.0 / ans); // forms the C0 value
337 }
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
double Log(double x)
Definition: kaldi-math.h:100
struct rnnlm::@11::@12 n
#define KALDI_WARN
Definition: kaldi-error.h:150
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp)

◆ ComputePowerSpectrum()

void ComputePowerSpectrum ( VectorBase< BaseFloat > *  waveform)

Definition at line 29 of file feature-functions.cc.

References VectorBase< Real >::Dim(), and rnnlm::i.

Referenced by SpectrogramComputer::Compute(), MfccComputer::Compute(), FbankComputer::Compute(), and PlpComputer::Compute().

29  {
30  int32 dim = waveform->Dim();
31 
32  // no, letting it be non-power-of-two for now.
33  // KALDI_ASSERT(dim > 0 && (dim & (dim-1) == 0)); // make sure a power of two.. actually my FFT code
34  // does not require this (dan) but this is better in case we use different code [dan].
35 
36  // RealFft(waveform, true); // true == forward (not inverse) FFT; makes no difference here,
37  // as we just want power spectrum.
38 
39  // now we have in waveform, first half of complex spectrum
40  // it's stored as [real0, realN/2, real1, im1, real2, im2, ...]
41  int32 half_dim = dim/2;
42  BaseFloat first_energy = (*waveform)(0) * (*waveform)(0),
43  last_energy = (*waveform)(1) * (*waveform)(1); // handle this special case
44  for (int32 i = 1; i < half_dim; i++) {
45  BaseFloat real = (*waveform)(i*2), im = (*waveform)(i*2 + 1);
46  (*waveform)(i) = real*real + im*im;
47  }
48  (*waveform)(0) = first_energy;
49  (*waveform)(half_dim) = last_energy; // Will actually never be used, and anyway
50  // if the signal has been bandlimited sensibly this should be zero.
51 }
double real
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29

◆ ComputeShiftedDeltas()

void ComputeShiftedDeltas ( const ShiftedDeltaFeaturesOptions &  delta_opts,
const MatrixBase< BaseFloat > &  input_features,
Matrix< BaseFloat > *  output_features 
)

Definition at line 173 of file feature-functions.cc.

References ShiftedDeltaFeaturesOptions::num_blocks, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), ShiftedDeltaFeatures::Process(), and Matrix< Real >::Resize().

Referenced by main(), UnitTestCompareWithDeltaFeatures(), UnitTestEndEffects(), and UnitTestParams().

175  {
176  output_features->Resize(input_features.NumRows(),
177  input_features.NumCols()
178  * (delta_opts.num_blocks + 1));
179  ShiftedDeltaFeatures delta(delta_opts);
180 
181  for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
182  SubVector<BaseFloat> row(*output_features, r);
183  delta.Process(input_features, r, &row);
184  }
185 }
kaldi::int32 int32
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).

◆ Dither()

void Dither ( VectorBase< BaseFloat > *  waveform,
BaseFloat  dither_value 
)

Definition at line 90 of file feature-window.cc.

References VectorBase< Real >::Data(), VectorBase< Real >::Dim(), rnnlm::i, and kaldi::RandGauss().

Referenced by kaldi::ProcessWindow().

90  {
91  if (dither_value == 0.0)
92  return;
93  int32 dim = waveform->Dim();
94  BaseFloat *data = waveform->Data();
95  RandomState rstate;
96  for (int32 i = 0; i < dim; i++)
97  data[i] += RandGauss(&rstate) * dither_value;
98 }
float RandGauss(struct RandomState *state=NULL)
Definition: kaldi-math.h:155
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29

◆ DownsampleWaveForm()

void kaldi::DownsampleWaveForm ( BaseFloat  orig_freq,
const VectorBase< BaseFloat > &  wave,
BaseFloat  new_freq,
Vector< BaseFloat > *  new_wave 
)
inline

This function is deprecated.

It is provided for backward compatibility, to avoid breaking older code.

Definition at line 279 of file resample.h.

References kaldi::ResampleWaveform().

280  {
281  ResampleWaveform(orig_freq, wave, new_freq, new_wave);
282 }
void ResampleWaveform(BaseFloat orig_freq, const VectorBase< BaseFloat > &wave, BaseFloat new_freq, Vector< BaseFloat > *new_wave)
Downsample or upsample a waveform.
Definition: resample.cc:368

◆ Durbin()

BaseFloat Durbin ( int  n,
const BaseFloat *  pAC,
BaseFloat *  pLP,
BaseFloat *  pTmp 
)

Definition at line 267 of file mel-computations.cc.

References rnnlm::i, rnnlm::j, and rnnlm::n.

Referenced by kaldi::ComputeLpc().

267  {
268  BaseFloat ki; // reflection coefficient
269  int i;
270  int j;
271 
272  BaseFloat E = pAC[0];
273 
274  for (i = 0; i < n; i++) {
275  // next reflection coefficient
276  ki = pAC[i + 1];
277  for (j = 0; j < i; j++)
278  ki += pLP[j] * pAC[i - j];
279  ki = ki / E;
280 
281  // new error
282  BaseFloat c = 1 - ki * ki;
283  if (c < 1.0e-5) // remove NaNs for constant signal
284  c = 1.0e-5;
285  E *= c;
286 
287  // new LP coefficients
288  pTmp[i] = -ki;
289  for (j = 0; j < i; j++)
290  pTmp[j] = pLP[j] - ki * pLP[i - j - 1];
291 
292  for (j = 0; j <= i; j++)
293  pLP[j] = pTmp[j];
294  }
295 
296  return E;
297 }
float BaseFloat
Definition: kaldi-types.h:29
struct rnnlm::@11::@12 n

◆ ExtractWindow()

void ExtractWindow ( int64  sample_offset,
const VectorBase< BaseFloat > &  wave,
int32  f,
const FrameExtractionOptions &  opts,
const FeatureWindowFunction &  window_function,
Vector< BaseFloat > *  window,
BaseFloat *  log_energy_pre_window 
)

Definition at line 166 of file feature-window.cc.

References VectorBase< Real >::Dim(), kaldi::FirstSampleOfFrame(), KALDI_ASSERT, kaldi::kUndefined, FrameExtractionOptions::PaddedWindowSize(), kaldi::ProcessWindow(), VectorBase< Real >::Range(), Vector< Real >::Resize(), FrameExtractionOptions::snip_edges, and FrameExtractionOptions::WindowSize().

Referenced by OfflineFeatureTpl< F >::Compute(), and OnlineGenericBaseFeature< C >::ComputeFeatures().

172  {
173  KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0);
174  int32 frame_length = opts.WindowSize(),
175  frame_length_padded = opts.PaddedWindowSize();
176  int64 num_samples = sample_offset + wave.Dim(),
177  start_sample = FirstSampleOfFrame(f, opts),
178  end_sample = start_sample + frame_length;
179 
180  if (opts.snip_edges) {
181  KALDI_ASSERT(start_sample >= sample_offset &&
182  end_sample <= num_samples);
183  } else {
184  KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset);
185  }
186 
187  if (window->Dim() != frame_length_padded)
188  window->Resize(frame_length_padded, kUndefined);
189 
190  // wave_start and wave_end are start and end indexes into 'wave', for the
191  // piece of wave that we're trying to extract.
192  int32 wave_start = int32(start_sample - sample_offset),
193  wave_end = wave_start + frame_length;
194  if (wave_start >= 0 && wave_end <= wave.Dim()) {
195  // the normal case-- no edge effects to consider.
196  window->Range(0, frame_length).CopyFromVec(
197  wave.Range(wave_start, frame_length));
198  } else {
199  // Deal with any end effects by reflection, if needed. This code will only
200  // be reached for about two frames per utterance, so we don't concern
201  // ourselves excessively with efficiency.
202  int32 wave_dim = wave.Dim();
203  for (int32 s = 0; s < frame_length; s++) {
204  int32 s_in_wave = s + wave_start;
205  while (s_in_wave < 0 || s_in_wave >= wave_dim) {
206  // reflect around the beginning or end of the wave.
207  // e.g. -1 -> 0, -2 -> 1.
208  // dim -> dim - 1, dim + 1 -> dim - 2.
209  // the code supports repeated reflections, although this
210  // would only be needed in pathological cases.
211  if (s_in_wave < 0) s_in_wave = - s_in_wave - 1;
212  else s_in_wave = 2 * wave_dim - 1 - s_in_wave;
213  }
214  (*window)(s) = wave(s_in_wave);
215  }
216  }
217 
218  if (frame_length_padded > frame_length)
219  window->Range(frame_length, frame_length_padded - frame_length).SetZero();
220 
221  SubVector<BaseFloat> frame(*window, 0, frame_length);
222 
223  ProcessWindow(opts, window_function, &frame, log_energy_pre_window);
224 }
kaldi::int32 int32
int64 FirstSampleOfFrame(int32 frame, const FrameExtractionOptions &opts)
void ProcessWindow(const FrameExtractionOptions &opts, const FeatureWindowFunction &window_function, VectorBase< BaseFloat > *window, BaseFloat *log_energy_pre_window)
This function does all the windowing steps after actually extracting the windowed signal: depending o...
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ FirstSampleOfFrame()

int64 FirstSampleOfFrame ( int32  frame,
const FrameExtractionOptions &  opts 
)

Definition at line 30 of file feature-window.cc.

References FrameExtractionOptions::snip_edges, FrameExtractionOptions::WindowShift(), and FrameExtractionOptions::WindowSize().

Referenced by OnlineGenericBaseFeature< C >::ComputeFeatures(), kaldi::ExtractWindow(), and kaldi::NumFrames().

31  {
32  int64 frame_shift = opts.WindowShift();
33  if (opts.snip_edges) {
34  return frame * frame_shift;
35  } else {
36  int64 midpoint_of_frame = frame_shift * frame + frame_shift / 2,
37  beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
38  return beginning_of_frame;
39  }
40 }

◆ GetEqualLoudnessVector()

void GetEqualLoudnessVector ( const MelBanks &  mel_banks,
Vector< BaseFloat > *  ans 
)

Definition at line 311 of file mel-computations.cc.

References MelBanks::GetCenterFreqs(), rnnlm::i, rnnlm::n, MelBanks::NumBins(), and Vector< Real >::Resize().

Referenced by PlpComputer::GetEqualLoudness().

312  {
313  int32 n = mel_banks.NumBins();
314  // Central frequency of each mel bin.
315  const Vector<BaseFloat> &f0 = mel_banks.GetCenterFreqs();
316  ans->Resize(n);
317  for (int32 i = 0; i < n; i++) {
318  BaseFloat fsq = f0(i) * f0(i);
319  BaseFloat fsub = fsq / (fsq + 1.6e5);
320  (*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6));
321  }
322 }
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
struct rnnlm::@11::@12 n

◆ InitIdftBases()

void InitIdftBases ( int32  n_bases,
int32  dimension,
Matrix< BaseFloat > *  mat_out 
)

Definition at line 188 of file feature-functions.cc.

References rnnlm::i, rnnlm::j, M_PI, and Matrix< Real >::Resize().

Referenced by PlpComputer::PlpComputer().

188  {
189  BaseFloat angle = M_PI / static_cast<BaseFloat>(dimension - 1);
190  BaseFloat scale = 1.0f / (2.0 * static_cast<BaseFloat>(dimension - 1));
191  mat_out->Resize(n_bases, dimension);
192  for (int32 i = 0; i < n_bases; i++) {
193  (*mat_out)(i, 0) = 1.0 * scale;
194  BaseFloat i_fl = static_cast<BaseFloat>(i);
195  for (int32 j = 1; j < dimension - 1; j++) {
196  BaseFloat j_fl = static_cast<BaseFloat>(j);
197  (*mat_out)(i, j) = 2.0 * scale * cos(angle * i_fl * j_fl);
198  }
199 
200  (*mat_out)(i, dimension -1)
201  = scale * cos(angle * i_fl * static_cast<BaseFloat>(dimension-1));
202  }
203 }
#define M_PI
Definition: kaldi-math.h:44
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).

◆ Lpc2Cepstrum()

void Lpc2Cepstrum ( int  n,
const BaseFloat *  pLPC,
BaseFloat *  pCepst 
)

Definition at line 300 of file mel-computations.cc.

References rnnlm::i, rnnlm::j, and rnnlm::n.

Referenced by PlpComputer::Compute().

300  {
301  for (int32 i = 0; i < n; i++) {
302  double sum = 0.0;
303  int j;
304  for (j = 0; j < i; j++) {
305  sum += static_cast<BaseFloat>(i - j) * pLPC[j] * pCepst[i - j - 1];
306  }
307  pCepst[i] = -pLPC[i] - sum / static_cast<BaseFloat>(i + 1);
308  }
309 }
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
struct rnnlm::@11::@12 n

◆ NumFrames()

int32 NumFrames ( int64  num_samples,
const FrameExtractionOptions &  opts,
bool  flush = true 
)

This function returns the number of frames that we can extract from a wave file with the given number of samples in it (assumed to have the same sampling rate as specified in 'opts').

Parameters
[in] num_samples: The number of samples in the wave file.
[in] opts: The frame-extraction options class.
[in] flush: True if we are asserting that this number of samples is 'all there is', false if we are expecting more data to possibly come in. This only makes a difference to the answer if opts.snip_edges == false. For offline feature extraction you always want flush == true. In an online-decoding context, once you know (or decide) that no more data is coming in, you'd call it with flush == true at the end to flush out any remaining data.

Definition at line 42 of file feature-window.cc.

References kaldi::FirstSampleOfFrame(), FrameExtractionOptions::snip_edges, FrameExtractionOptions::WindowShift(), and FrameExtractionOptions::WindowSize().

Referenced by OfflineFeatureTpl< F >::Compute(), OnlineFeInput< E >::Compute(), OnlineGenericBaseFeature< C >::ComputeFeatures(), and OnlineIvectorFeature::PrintDiagnostics().

44  {
45  int64 frame_shift = opts.WindowShift();
46  int64 frame_length = opts.WindowSize();
47  if (opts.snip_edges) {
48  // with --snip-edges=true (the default), we use a HTK-like approach to
49  // determining the number of frames-- all frames have to fit completely into
50  // the waveform, and the first frame begins at sample zero.
51  if (num_samples < frame_length)
52  return 0;
53  else
54  return (1 + ((num_samples - frame_length) / frame_shift));
55  // You can understand the expression above as follows: 'num_samples -
56  // frame_length' is how much room we have to shift the frame within the
57  // waveform; 'frame_shift' is how much we shift it each time; and the ratio
58  // is how many times we can shift it (integer arithmetic rounds down).
59  } else {
60  // if --snip-edges=false, the number of frames is determined by rounding the
61  // (file-length / frame-shift) to the nearest integer. The point of this
62  // formula is to make the number of frames an obvious and predictable
63  // function of the frame shift and signal length, which makes many
64  // segmentation-related questions simpler.
65  //
66  // Because integer division in C++ rounds toward zero, we add (half the
67  // frame-shift minus epsilon) before dividing, to have the effect of
68  // rounding towards the closest integer.
69  int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
70 
71  if (flush)
72  return num_frames;
73 
74  // note: 'end' always means the last plus one, i.e. one past the last.
75  int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts)
76  + frame_length;
77 
78  // the following code is optimized more for clarity than efficiency.
79  // If flush == false, we can't output frames that extend past the end
80  // of the signal.
81  while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
82  num_frames--;
83  end_sample_of_last_frame -= frame_shift;
84  }
85  return num_frames;
86  }
87 }
kaldi::int32 int32
int64 FirstSampleOfFrame(int32 frame, const FrameExtractionOptions &opts)

◆ Preemphasize()

void Preemphasize ( VectorBase< BaseFloat > *  waveform,
BaseFloat  preemph_coeff 
)

Definition at line 101 of file feature-window.cc.

References VectorBase< Real >::Dim(), rnnlm::i, and KALDI_ASSERT.

Referenced by kaldi::ProcessWindow().

101  {
102  if (preemph_coeff == 0.0) return;
103  KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
104  for (int32 i = waveform->Dim()-1; i > 0; i--)
105  (*waveform)(i) -= preemph_coeff * (*waveform)(i-1);
106  (*waveform)(0) -= preemph_coeff * (*waveform)(0);
107 }
kaldi::int32 int32
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ ProcessPitch()

void ProcessPitch ( const ProcessPitchOptions opts,
const MatrixBase< BaseFloat > &  input,
Matrix< BaseFloat > *  output 
)

This function takes the raw (NCCF, pitch) quantities computed by ComputeKaldiPitch and processes them into features.

By default it will output three-dimensional features, (POV-feature, mean-subtracted-log-pitch, delta-of-raw-pitch), but this is configurable in the options. The number of rows of "output" will be the number of frames (rows) in "input", and the number of columns will be the number of different types of features requested (by default, 3; 4 is the max). The four config variables --add-pov-feature, --add-normalized-log-pitch, --add-delta-pitch, --add-raw-log-pitch determine which features we create; by default we create the first three.

Definition at line 1581 of file pitch-functions.cc.

References OnlineProcessPitch::Dim(), OnlineProcessPitch::GetFrame(), OnlineProcessPitch::NumFramesReady(), and Matrix< Real >::Resize().

Referenced by main(), kaldi::UnitTestPieces(), and kaldi::UnitTestProcess().

1583  {
1584  OnlineMatrixFeature pitch_feat(input);
1585 
1586  OnlineProcessPitch online_process_pitch(opts, &pitch_feat);
1587 
1588  output->Resize(online_process_pitch.NumFramesReady(),
1589  online_process_pitch.Dim());
1590  for (int32 t = 0; t < online_process_pitch.NumFramesReady(); t++) {
1591  SubVector<BaseFloat> row(*output, t);
1592  online_process_pitch.GetFrame(t, &row);
1593  }
1594 }
kaldi::int32 int32
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).

◆ ProcessWindow()

void ProcessWindow ( const FrameExtractionOptions opts,
const FeatureWindowFunction window_function,
VectorBase< BaseFloat > *  window,
BaseFloat *  log_energy_pre_window = NULL 
)

This function does all the windowing steps after actually extracting the windowed signal: depending on the configuration, it does dithering, dc offset removal, preemphasis, and multiplication by the windowing function.

Parameters
[in] opts: The options class to be used.
[in] window_function: The windowing function -- should have been initialized using 'opts'.
[in,out] window: A vector of size opts.WindowSize(). Note: it will typically be a sub-vector of a larger vector of size opts.PaddedWindowSize(), with the remaining samples zero, as the FFT code is more efficient if it operates on data with power-of-two size.
[out] log_energy_pre_window: If non-NULL, then after dithering and DC offset removal, this function will write to this pointer the log of the total energy (i.e. sum-squared) of the frame.

Definition at line 137 of file feature-window.cc.

References VectorBase< Real >::Add(), VectorBase< Real >::Dim(), FrameExtractionOptions::dither, kaldi::Dither(), KALDI_ASSERT, kaldi::Log(), VectorBase< Real >::MulElements(), FrameExtractionOptions::preemph_coeff, kaldi::Preemphasize(), FrameExtractionOptions::remove_dc_offset, VectorBase< Real >::Sum(), kaldi::VecVec(), FeatureWindowFunction::window, and FrameExtractionOptions::WindowSize().

Referenced by kaldi::ExtractWindow().

140  {
141  int32 frame_length = opts.WindowSize();
142  KALDI_ASSERT(window->Dim() == frame_length);
143 
144  if (opts.dither != 0.0)
145  Dither(window, opts.dither);
146 
147  if (opts.remove_dc_offset)
148  window->Add(-window->Sum() / frame_length);
149 
150  if (log_energy_pre_window != NULL) {
151  BaseFloat energy = std::max<BaseFloat>(VecVec(*window, *window),
152  std::numeric_limits<float>::epsilon());
153  *log_energy_pre_window = Log(energy);
154  }
155 
156  if (opts.preemph_coeff != 0.0)
157  Preemphasize(window, opts.preemph_coeff);
158 
159  window->MulElements(window_function.window);
160 }
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
double Log(double x)
Definition: kaldi-math.h:100
void Preemphasize(VectorBase< BaseFloat > *waveform, BaseFloat preemph_coeff)
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37
void Dither(VectorBase< BaseFloat > *waveform, BaseFloat dither_value)

◆ ResampleWaveform()

void ResampleWaveform ( BaseFloat  orig_freq,
const VectorBase< BaseFloat > &  wave,
BaseFloat  new_freq,
Vector< BaseFloat > *  new_wave 
)

Downsample or upsample a waveform.

This is a convenience wrapper for the class 'LinearResample'. The low-pass filter cutoff used in 'LinearResample' is 0.99 of the Nyquist, where the Nyquist is half of the minimum of (orig_freq, new_freq). The resampling is done with a symmetric FIR filter with N_z (number of zeros) as 6.

We compared the downsampling results with those from the sox resampling toolkit. Sox's design is inspired by Laurent De Soras' paper, https://ccrma.stanford.edu/~jos/resample/Implementation.html

Note: we expect that while orig_freq and new_freq are of type BaseFloat, they are actually required to have exact integer values (like 16000 or 8000) with a ratio between them that can be expressed as a rational number with reasonably small integer factors.

Definition at line 368 of file resample.cc.

References LinearResample::Resample().

Referenced by OfflineFeatureTpl< F >::ComputeFeatures(), and kaldi::DownsampleWaveForm().

369  {
370  BaseFloat min_freq = std::min(orig_freq, new_freq);
371  BaseFloat lowpass_cutoff = 0.99 * 0.5 * min_freq;
372  int32 lowpass_filter_width = 6;
373  LinearResample resampler(orig_freq, new_freq,
374  lowpass_cutoff, lowpass_filter_width);
375  resampler.Resample(wave, true, new_wave);
376 }
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29

◆ ReverseFrames()

void ReverseFrames ( const MatrixBase< BaseFloat > &  input_features,
Matrix< BaseFloat > *  output_features 
)

Definition at line 228 of file feature-functions.cc.

References VectorBase< Real >::CopyFromVec(), KALDI_ERR, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), and Matrix< Real >::Resize().

229  {
230  int32 T = input_features.NumRows(), D = input_features.NumCols();
231  if (T == 0 || D == 0)
232  KALDI_ERR << "ReverseFrames: empty input";
233  output_features->Resize(T, D);
234  for (int32 t = 0; t < T; t++) {
235  SubVector<BaseFloat> dst_row(*output_features, t);
236  SubVector<BaseFloat> src_row(input_features, T-1-t);
237  dst_row.CopyFromVec(src_row);
238  }
239 }
kaldi::int32 int32
#define KALDI_ERR
Definition: kaldi-error.h:147
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).

◆ SlidingWindowCmn()

void SlidingWindowCmn ( const SlidingWindowCmnOptions opts,
const MatrixBase< BaseFloat > &  input,
MatrixBase< BaseFloat > *  output 
)

Applies sliding-window cepstral mean and/or variance normalization.

See the strings registering the options in the options class for information on how this works and what the options are. The 'input' and 'output' matrices must have the same dimensions.

Definition at line 350 of file feature-functions.cc.

References MatrixBase< Real >::CopyFromMat(), KALDI_ASSERT, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), kaldi::SameDim(), and kaldi::SlidingWindowCmnInternal().

Referenced by main(), SlidingWindowCmnOptions::Register(), and kaldi::UnitTestOnlineCmvn().

352  {
353  KALDI_ASSERT(SameDim(input, *output) && input.NumRows() > 0);
354  Matrix<double> input_dbl(input), output_dbl(input.NumRows(), input.NumCols());
355  // call double-precision version
356  SlidingWindowCmnInternal(opts, input_dbl, &output_dbl);
357  output->CopyFromMat(output_dbl);
358 }
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts, const MatrixBase< double > &input, MatrixBase< double > *output)
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ SpliceFrames()

void SpliceFrames ( const MatrixBase< BaseFloat > &  input_features,
int32  left_context,
int32  right_context,
Matrix< BaseFloat > *  output_features 
)

Definition at line 205 of file feature-functions.cc.

References rnnlm::j, KALDI_ASSERT, KALDI_ERR, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), and Matrix< Real >::Resize().

Referenced by OnlineLdaInput::Dim(), main(), kaldi::TestOnlineLdaInput(), and kaldi::TestOnlineSpliceFrames().

208  {
209  int32 T = input_features.NumRows(), D = input_features.NumCols();
210  if (T == 0 || D == 0)
211  KALDI_ERR << "SpliceFrames: empty input";
212  KALDI_ASSERT(left_context >= 0 && right_context >= 0);
213  int32 N = 1 + left_context + right_context;
214  output_features->Resize(T, D*N);
215  for (int32 t = 0; t < T; t++) {
216  SubVector<BaseFloat> dst_row(*output_features, t);
217  for (int32 j = 0; j < N; j++) {
218  int32 t2 = t + j - left_context;
219  if (t2 < 0) t2 = 0;
220  if (t2 >= T) t2 = T-1;
221  SubVector<BaseFloat> dst(dst_row, j*D, D),
222  src(input_features, t2);
223  dst.CopyFromVec(src);
224  }
225  }
226 }
kaldi::int32 int32
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).