feature-window.cc
Go to the documentation of this file.
1 // feat/feature-window.cc
2 
3 // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
4 // 2013-2016 Johns Hopkins University (author: Daniel Povey)
5 // 2014 IMSL, PKU-HKUST (author: Wei Shi)
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 
23 #include "feat/feature-window.h"
25 
26 
27 namespace kaldi {
28 
29 
31  const FrameExtractionOptions &opts) {
32  int64 frame_shift = opts.WindowShift();
33  if (opts.snip_edges) {
34  return frame * frame_shift;
35  } else {
36  int64 midpoint_of_frame = frame_shift * frame + frame_shift / 2,
37  beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
38  return beginning_of_frame;
39  }
40 }
41 
42 int32 NumFrames(int64 num_samples,
43  const FrameExtractionOptions &opts,
44  bool flush) {
45  int64 frame_shift = opts.WindowShift();
46  int64 frame_length = opts.WindowSize();
47  if (opts.snip_edges) {
48  // with --snip-edges=true (the default), we use a HTK-like approach to
49  // determining the number of frames-- all frames have to fit completely into
50  // the waveform, and the first frame begins at sample zero.
51  if (num_samples < frame_length)
52  return 0;
53  else
54  return (1 + ((num_samples - frame_length) / frame_shift));
55  // You can understand the expression above as follows: 'num_samples -
56  // frame_length' is how much room we have to shift the frame within the
57  // waveform; 'frame_shift' is how much we shift it each time; and the ratio
58  // is how many times we can shift it (integer arithmetic rounds down).
59  } else {
60  // if --snip-edges=false, the number of frames is determined by rounding the
61  // (file-length / frame-shift) to the nearest integer. The point of this
62  // formula is to make the number of frames an obvious and predictable
63  // function of the frame shift and signal length, which makes many
64  // segmentation-related questions simpler.
65  //
66  // Because integer division in C++ rounds toward zero, we add (half the
67  // frame-shift minus epsilon) before dividing, to have the effect of
68  // rounding towards the closest integer.
69  int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
70 
71  if (flush)
72  return num_frames;
73 
74  // note: 'end' always means the last plus one, i.e. one past the last.
75  int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts)
76  + frame_length;
77 
78  // the following code is optimized more for clarity than efficiency.
79  // If flush == false, we can't output frames that extend past the end
80  // of the signal.
81  while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
82  num_frames--;
83  end_sample_of_last_frame -= frame_shift;
84  }
85  return num_frames;
86  }
87 }
88 
89 
90 void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value) {
91  if (dither_value == 0.0)
92  return;
93  int32 dim = waveform->Dim();
94  BaseFloat *data = waveform->Data();
95  RandomState rstate;
96  for (int32 i = 0; i < dim; i++)
97  data[i] += RandGauss(&rstate) * dither_value;
98 }
99 
100 
101 void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff) {
102  if (preemph_coeff == 0.0) return;
103  KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
104  for (int32 i = waveform->Dim()-1; i > 0; i--)
105  (*waveform)(i) -= preemph_coeff * (*waveform)(i-1);
106  (*waveform)(0) -= preemph_coeff * (*waveform)(0);
107 }
108 
110  int32 frame_length = opts.WindowSize();
111  KALDI_ASSERT(frame_length > 0);
112  window.Resize(frame_length);
113  double a = M_2PI / (frame_length-1);
114  for (int32 i = 0; i < frame_length; i++) {
115  double i_fl = static_cast<double>(i);
116  if (opts.window_type == "hanning") {
117  window(i) = 0.5 - 0.5*cos(a * i_fl);
118  } else if (opts.window_type == "sine") {
119  // when you are checking ws wikipedia, please
120  // note that 0.5 * a = M_PI/(frame_length-1)
121  window(i) = sin(0.5 * a * i_fl);
122  } else if (opts.window_type == "hamming") {
123  window(i) = 0.54 - 0.46*cos(a * i_fl);
124  } else if (opts.window_type == "povey") { // like hamming but goes to zero at edges.
125  window(i) = pow(0.5 - 0.5*cos(a * i_fl), 0.85);
126  } else if (opts.window_type == "rectangular") {
127  window(i) = 1.0;
128  } else if (opts.window_type == "blackman") {
129  window(i) = opts.blackman_coeff - 0.5*cos(a * i_fl) +
130  (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
131  } else {
132  KALDI_ERR << "Invalid window type " << opts.window_type;
133  }
134  }
135 }
136 
138  const FeatureWindowFunction &window_function,
140  BaseFloat *log_energy_pre_window) {
141  int32 frame_length = opts.WindowSize();
142  KALDI_ASSERT(window->Dim() == frame_length);
143 
144  if (opts.dither != 0.0)
145  Dither(window, opts.dither);
146 
147  if (opts.remove_dc_offset)
148  window->Add(-window->Sum() / frame_length);
149 
150  if (log_energy_pre_window != NULL) {
151  BaseFloat energy = std::max<BaseFloat>(VecVec(*window, *window),
152  std::numeric_limits<float>::epsilon());
153  *log_energy_pre_window = Log(energy);
154  }
155 
156  if (opts.preemph_coeff != 0.0)
157  Preemphasize(window, opts.preemph_coeff);
158 
159  window->MulElements(window_function.window);
160 }
161 
162 
163 // ExtractWindow extracts a windowed frame of waveform with a power-of-two,
164 // padded size. It does mean subtraction, pre-emphasis and dithering as
165 // requested.
166 void ExtractWindow(int64 sample_offset,
167  const VectorBase<BaseFloat> &wave,
168  int32 f, // with 0 <= f < NumFrames(feats, opts)
169  const FrameExtractionOptions &opts,
170  const FeatureWindowFunction &window_function,
172  BaseFloat *log_energy_pre_window) {
173  KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0);
174  int32 frame_length = opts.WindowSize(),
175  frame_length_padded = opts.PaddedWindowSize();
176  int64 num_samples = sample_offset + wave.Dim(),
177  start_sample = FirstSampleOfFrame(f, opts),
178  end_sample = start_sample + frame_length;
179 
180  if (opts.snip_edges) {
181  KALDI_ASSERT(start_sample >= sample_offset &&
182  end_sample <= num_samples);
183  } else {
184  KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset);
185  }
186 
187  if (window->Dim() != frame_length_padded)
188  window->Resize(frame_length_padded, kUndefined);
189 
190  // wave_start and wave_end are start and end indexes into 'wave', for the
191  // piece of wave that we're trying to extract.
192  int32 wave_start = int32(start_sample - sample_offset),
193  wave_end = wave_start + frame_length;
194  if (wave_start >= 0 && wave_end <= wave.Dim()) {
195  // the normal case-- no edge effects to consider.
196  window->Range(0, frame_length).CopyFromVec(
197  wave.Range(wave_start, frame_length));
198  } else {
199  // Deal with any end effects by reflection, if needed. This code will only
200  // be reached for about two frames per utterance, so we don't concern
201  // ourselves excessively with efficiency.
202  int32 wave_dim = wave.Dim();
203  for (int32 s = 0; s < frame_length; s++) {
204  int32 s_in_wave = s + wave_start;
205  while (s_in_wave < 0 || s_in_wave >= wave_dim) {
206  // reflect around the beginning or end of the wave.
207  // e.g. -1 -> 0, -2 -> 1.
208  // dim -> dim - 1, dim + 1 -> dim - 2.
209  // the code supports repeated reflections, although this
210  // would only be needed in pathological cases.
211  if (s_in_wave < 0) s_in_wave = - s_in_wave - 1;
212  else s_in_wave = 2 * wave_dim - 1 - s_in_wave;
213  }
214  (*window)(s) = wave(s_in_wave);
215  }
216  }
217 
218  if (frame_length_padded > frame_length)
219  window->Range(frame_length, frame_length_padded - frame_length).SetZero();
220 
221  SubVector<BaseFloat> frame(*window, 0, frame_length);
222 
223  ProcessWindow(opts, window_function, &frame, log_energy_pre_window);
224 }
225 
226 } // namespace kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
Vector< BaseFloat > window
float RandGauss(struct RandomState *state=NULL)
Definition: kaldi-math.h:155
kaldi::int32 int32
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
void ExtractWindow(int64 sample_offset, const VectorBase< BaseFloat > &wave, int32 f, const FrameExtractionOptions &opts, const FeatureWindowFunction &window_function, Vector< BaseFloat > *window, BaseFloat *log_energy_pre_window)
float BaseFloat
Definition: kaldi-types.h:29
double Log(double x)
Definition: kaldi-math.h:100
void MulElements(const VectorBase< Real > &v)
Multiply element-by-element by another vector.
int64 FirstSampleOfFrame(int32 frame, const FrameExtractionOptions &opts)
#define KALDI_ERR
Definition: kaldi-error.h:147
int32 NumFrames(int64 num_samples, const FrameExtractionOptions &opts, bool flush)
This function returns the number of frames that we can extract from a wave file with the given number...
Real * Data()
Returns a pointer to the start of the vector's data.
Definition: kaldi-vector.h:70
void ProcessWindow(const FrameExtractionOptions &opts, const FeatureWindowFunction &window_function, VectorBase< BaseFloat > *window, BaseFloat *log_energy_pre_window)
This function does all the windowing steps after actually extracting the windowed signal: depending o...
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
Real Sum() const
Returns sum of the elements.
void Preemphasize(VectorBase< BaseFloat > *waveform, BaseFloat preemph_coeff)
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define M_2PI
Definition: kaldi-math.h:52
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
void Add(Real c)
Add a constant to each element of a vector.
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
void Dither(VectorBase< BaseFloat > *waveform, BaseFloat dither_value)
SubVector< Real > Range(const MatrixIndexT o, const MatrixIndexT l)
Returns a sub-vector of a vector (a range of elements).
Definition: kaldi-vector.h:94