feature-functions.cc
Go to the documentation of this file.
1 // feat/feature-functions.cc
2 
3 // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
4 // 2013 Johns Hopkins University (author: Daniel Povey)
5 // 2014 IMSL, PKU-HKUST (author: Wei Shi)
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 
23 #include "feat/feature-functions.h"
25 
26 
27 namespace kaldi {
28 
// NOTE(review): the signature line of this function is missing from this
// listing; the index at the end of the file suggests it is
// ComputePowerSpectrum(VectorBase<BaseFloat> *waveform) -- confirm.
// Works in place on *waveform: on exit, elements 0 .. dim/2 hold squared
// magnitudes computed from the packed values that were there on entry.
// Elements 1 and dim/2 are indexed unconditionally, so this presumably
// expects dim >= 2 -- TODO confirm against callers.
30  int32 dim = waveform->Dim();
31 
32  // no, letting it be non-power-of-two for now.
33  // KALDI_ASSERT(dim > 0 && (dim & (dim-1) == 0)); // make sure a power of two.. actually my FFT code
34  // does not require this (dan) but this is better in case we use different code [dan].
35 
36  // RealFft(waveform, true); // true == forward (not inverse) FFT; makes no difference here,
37  // as we just want power spectrum.
38 
39  // now we have in waveform, first half of complex spectrum
40  // it's stored as [real0, realN/2, real1, im1, real2, im2, ...]
41  int32 half_dim = dim/2;
// Elements 0 and 1 are the two purely real entries of the packed layout.
// Their squares are saved first because the loop below overwrites index 1.
42  BaseFloat first_energy = (*waveform)(0) * (*waveform)(0),
43  last_energy = (*waveform)(1) * (*waveform)(1); // handle this special case
// Iteration i reads elements 2i and 2i+1 and writes element i; since i <= 2i,
// no value is overwritten before it has been read.
44  for (int32 i = 1; i < half_dim; i++) {
45  BaseFloat real = (*waveform)(i*2), im = (*waveform)(i*2 + 1);
46  (*waveform)(i) = real*real + im*im;
47  }
48  (*waveform)(0) = first_energy;
49  (*waveform)(half_dim) = last_energy; // Will actually never be used, and anyway
50  // if the signal has been bandlimited sensibly this should be zero.
51 }
52 
53 
// NOTE(review): the signature line is missing from this listing; the index at
// the end of the file suggests this is the body of
// DeltaFeatures::DeltaFeatures(const DeltaFeaturesOptions &opts) -- confirm.
// Builds scales_[0 .. opts.order], one vector of filter coefficients per
// delta order. scales_[0] is the single coefficient 1.0; scales_[i] is
// scales_[i-1] convolved with the ramp j = -window .. window and divided by
// the sum of j*j, so it has 2*window more elements than scales_[i-1].
55  KALDI_ASSERT(opts.order >= 0 && opts.order < 1000); // just make sure we don't get binary junk.
56  // opts will normally be 2 or 3.
57  KALDI_ASSERT(opts.window > 0 && opts.window < 1000); // again, basic sanity check.
58  // normally the window size will be two.
59 
60  scales_.resize(opts.order+1);
61  scales_[0].Resize(1);
62  scales_[0](0) = 1.0; // trivial window for 0th order delta [i.e. baseline feats]
63 
64  for (int32 i = 1; i <= opts.order; i++) {
65  Vector<BaseFloat> &prev_scales = scales_[i-1],
66  &cur_scales = scales_[i];
67  int32 window = opts.window; // this code is designed to still
68  // work if instead we later make it an array and do opts.window[i-1],
69  // or something like that. "window" is a parameter specifying delta-window
70  // width which is actually 2*window + 1.
71  KALDI_ASSERT(window != 0);
// prev_offset / cur_offset are the indices of the centre coefficient in the
// previous and current vectors; they turn signed offsets into array indices.
72  int32 prev_offset = (static_cast<int32>(prev_scales.Dim()-1))/2,
73  cur_offset = prev_offset + window;
74  cur_scales.Resize(prev_scales.Dim() + 2*window); // also zeros it.
75 
76  BaseFloat normalizer = 0.0;
77  for (int32 j = -window; j <= window; j++) {
78  normalizer += j*j;
// Accumulate the convolution: the coefficient at offset j+k receives
// j * prev_scales[k].
79  for (int32 k = -prev_offset; k <= prev_offset; k++) {
80  cur_scales(j+k+cur_offset) +=
81  static_cast<BaseFloat>(j) * prev_scales(k+prev_offset);
82  }
83  }
// normalizer is > 0 here because window > 0 was asserted above.
84  cur_scales.Scale(1.0 / normalizer);
85  }
86 }
87 
89  int32 frame,
90  VectorBase<BaseFloat> *output_frame) const {
// NOTE(review): the first line of this signature is missing from the listing;
// the index at the end of the file suggests DeltaFeatures::Process(
// const MatrixBase<BaseFloat> &input_feats, ...) -- confirm.
// Fills *output_frame with (opts_.order + 1) consecutive blocks of feat_dim
// values for row `frame`: block i is the weighted sum of the rows around
// `frame`, using the coefficients in scales_[i].
// Only the upper bound of `frame` is asserted here.
91  KALDI_ASSERT(frame < input_feats.NumRows());
92  int32 num_frames = input_feats.NumRows(),
93  feat_dim = input_feats.NumCols();
94  KALDI_ASSERT(static_cast<int32>(output_frame->Dim()) == feat_dim * (opts_.order+1));
95  output_frame->SetZero();
96  for (int32 i = 0; i <= opts_.order; i++) {
97  const Vector<BaseFloat> &scales = scales_[i];
98  int32 max_offset = (scales.Dim() - 1) / 2;
// View onto the i-th block of the output vector.
99  SubVector<BaseFloat> output(*output_frame, i*feat_dim, feat_dim);
100  for (int32 j = -max_offset; j <= max_offset; j++) {
101  // if asked to read
// Offsets that fall outside [0, num_frames) are clamped, so the first or
// last row is reused at the edges of the input.
102  int32 offset_frame = frame + j;
103  if (offset_frame < 0) offset_frame = 0;
104  else if (offset_frame >= num_frames)
105  offset_frame = num_frames - 1;
106  BaseFloat scale = scales(j + max_offset);
// Skip coefficients that are exactly zero; they would add nothing.
107  if (scale != 0.0)
108  output.AddVec(scale, input_feats.Row(offset_frame));
109  }
110  }
111 }
112 
114  const ShiftedDeltaFeaturesOptions &opts): opts_(opts) {
// NOTE(review): the first line of this constructor's signature is missing from
// the listing; presumably ShiftedDeltaFeatures::ShiftedDeltaFeatures( --
// confirm.
// Builds the single coefficient vector scales_ of length 2*window + 1:
// element (j + window) is j divided by the sum of j*j over j = -window..window.
115  KALDI_ASSERT(opts.window > 0 && opts.window < 1000);
116 
117  // Default window is 1.
118  int32 window = opts.window;
119  KALDI_ASSERT(window != 0);
120  scales_.Resize(1 + 2*window); // also zeros it.
121  BaseFloat normalizer = 0.0;
122  for (int32 j = -window; j <= window; j++) {
123  normalizer += j*j;
124  scales_(j + window) += static_cast<BaseFloat>(j);
125  }
// normalizer is > 0 here because window > 0 was asserted above.
126  scales_.Scale(1.0 / normalizer);
127 }
128 
130  int32 frame,
131  SubVector<BaseFloat> *output_frame) const {
// NOTE(review): the first line of this signature is missing from the listing;
// the index at the end of the file suggests ShiftedDeltaFeatures::Process(
// const MatrixBase<BaseFloat> &input_feats, ...) -- confirm.
// Fills *output_frame with the original row `frame` followed by
// opts_.num_blocks blocks of feat_dim values. Block i is a weighted sum,
// using scales_, of the rows centred on frame + i * opts_.block_shift.
// Only the upper bound of `frame` is asserted here.
132  KALDI_ASSERT(frame < input_feats.NumRows());
133  int32 num_frames = input_feats.NumRows(),
134  feat_dim = input_feats.NumCols();
135  KALDI_ASSERT(static_cast<int32>(output_frame->Dim())
136  == feat_dim * (opts_.num_blocks + 1));
137  output_frame->SetZero();
138 
139  // The original features
// The output was just zeroed, so adding with weight 1.0 copies the row.
140  SubVector<BaseFloat> output(*output_frame, 0, feat_dim);
141  output.AddVec(1.0, input_feats.Row(frame));
142 
143  // Concatenate the delta-blocks. Each block is block_shift
144  // (usually 3) frames apart.
145  for (int32 i = 0; i < opts_.num_blocks; i++) {
146  int32 max_offset = (scales_.Dim() - 1) / 2;
// This inner `output` shadows the one above and views block (i + 1).
147  SubVector<BaseFloat> output(*output_frame, (i + 1) * feat_dim, feat_dim);
148  for (int32 j = -max_offset; j <= max_offset; j++) {
// Row indices outside [0, num_frames) are clamped to the first or last row.
149  int32 offset_frame = frame + j + i * opts_.block_shift;
150  if (offset_frame < 0) offset_frame = 0;
151  else if (offset_frame >= num_frames)
152  offset_frame = num_frames - 1;
153  BaseFloat scale = scales_(j + max_offset);
// Skip coefficients that are exactly zero; they would add nothing.
154  if (scale != 0.0)
155  output.AddVec(scale, input_feats.Row(offset_frame));
156  }
157  }
158 }
159 
160 void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
161  const MatrixBase<BaseFloat> &input_features,
162  Matrix<BaseFloat> *output_features) {
163  output_features->Resize(input_features.NumRows(),
164  input_features.NumCols()
165  *(delta_opts.order + 1));
166  DeltaFeatures delta(delta_opts);
167  for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
168  SubVector<BaseFloat> row(*output_features, r);
169  delta.Process(input_features, r, &row);
170  }
171 }
172 
174  const MatrixBase<BaseFloat> &input_features,
175  Matrix<BaseFloat> *output_features) {
// NOTE(review): the first line of this signature is missing from the listing;
// the index at the end of the file suggests ComputeShiftedDeltas(
// const ShiftedDeltaFeaturesOptions &delta_opts, ...) -- confirm.
// Resizes *output_features to NumRows() x NumCols() * (num_blocks + 1) and
// fills each row by calling ShiftedDeltaFeatures::Process() for that frame.
176  output_features->Resize(input_features.NumRows(),
177  input_features.NumCols()
178  * (delta_opts.num_blocks + 1));
179  ShiftedDeltaFeatures delta(delta_opts);
180 
181  for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
// View onto output row r; Process() writes through it.
182  SubVector<BaseFloat> row(*output_features, r);
183  delta.Process(input_features, r, &row);
184  }
185 }
186 
187 
188 void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out) {
189  BaseFloat angle = M_PI / static_cast<BaseFloat>(dimension - 1);
190  BaseFloat scale = 1.0f / (2.0 * static_cast<BaseFloat>(dimension - 1));
191  mat_out->Resize(n_bases, dimension);
192  for (int32 i = 0; i < n_bases; i++) {
193  (*mat_out)(i, 0) = 1.0 * scale;
194  BaseFloat i_fl = static_cast<BaseFloat>(i);
195  for (int32 j = 1; j < dimension - 1; j++) {
196  BaseFloat j_fl = static_cast<BaseFloat>(j);
197  (*mat_out)(i, j) = 2.0 * scale * cos(angle * i_fl * j_fl);
198  }
199 
200  (*mat_out)(i, dimension -1)
201  = scale * cos(angle * i_fl * static_cast<BaseFloat>(dimension-1));
202  }
203 }
204 
205 void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
206  int32 left_context,
207  int32 right_context,
208  Matrix<BaseFloat> *output_features) {
209  int32 T = input_features.NumRows(), D = input_features.NumCols();
210  if (T == 0 || D == 0)
211  KALDI_ERR << "SpliceFrames: empty input";
212  KALDI_ASSERT(left_context >= 0 && right_context >= 0);
213  int32 N = 1 + left_context + right_context;
214  output_features->Resize(T, D*N);
215  for (int32 t = 0; t < T; t++) {
216  SubVector<BaseFloat> dst_row(*output_features, t);
217  for (int32 j = 0; j < N; j++) {
218  int32 t2 = t + j - left_context;
219  if (t2 < 0) t2 = 0;
220  if (t2 >= T) t2 = T-1;
221  SubVector<BaseFloat> dst(dst_row, j*D, D),
222  src(input_features, t2);
223  dst.CopyFromVec(src);
224  }
225  }
226 }
227 
228 void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
229  Matrix<BaseFloat> *output_features) {
230  int32 T = input_features.NumRows(), D = input_features.NumCols();
231  if (T == 0 || D == 0)
232  KALDI_ERR << "ReverseFrames: empty input";
233  output_features->Resize(T, D);
234  for (int32 t = 0; t < T; t++) {
235  SubVector<BaseFloat> dst_row(*output_features, t);
236  SubVector<BaseFloat> src_row(input_features, T-1-t);
237  dst_row.CopyFromVec(src_row);
238  }
239 }
240 
241 
// NOTE(review): the signature line is missing from this listing; this looks
// like the body of the options' Check() member that is called as opts.Check()
// later in this file -- confirm.
// Sanity-checks the option values with assertions: cmn_window must be
// positive, and when `center` is set min_window must lie in (0, cmn_window].
243  KALDI_ASSERT(cmn_window > 0);
244  if (center)
245  KALDI_ASSERT(min_window > 0 && min_window <= cmn_window);
246  // else ignored so value doesn't matter.
247 }
248 
249 // Internal version of SlidingWindowCmn with double-precision arguments.
// NOTE(review): the first line of the signature is missing from this listing;
// the index at the end of the file gives SlidingWindowCmnInternal(
// const SlidingWindowCmnOptions &opts, ...) -- confirm.
// For each frame t it chooses a window [window_start, window_end) of input
// rows, keeps running sums (and sums of squares, when variance normalization
// is on) over that window, and writes row t of *output as the input row minus
// the window mean, optionally scaled by the inverse standard deviation.
251  const MatrixBase<double> &input,
252  MatrixBase<double> *output) {
253  opts.Check();
254  int32 num_frames = input.NumRows(), dim = input.NumCols(),
255  last_window_start = -1, last_window_end = -1,
256  warning_count = 0;
// Running per-dimension sum and sum of squares over the current window.
257  Vector<double> cur_sum(dim), cur_sumsq(dim);
258 
259  for (int32 t = 0; t < num_frames; t++) {
260  int32 window_start, window_end; // note: window_end will be one
261  // past the end of the window we use for normalization.
262  if (opts.center) {
263  window_start = t - (opts.cmn_window / 2);
264  window_end = window_start + opts.cmn_window;
265  } else {
// Non-centered: the window ends at the current frame. Before clamping it
// spans rows t - cmn_window .. t, i.e. cmn_window + 1 rows.
266  window_start = t - opts.cmn_window;
267  window_end = t + 1;
268  }
269  if (window_start < 0) { // shift window right if starts <0.
270  window_end -= window_start;
271  window_start = 0; // or: window_start -= window_start
272  }
273  if (!opts.center) {
// NOTE(review): on this path window_end is at least t + 1 here, so the
// condition below appears to always hold -- confirm intent.
274  if (window_end > t)
275  window_end = std::max(t + 1, opts.min_window);
276  }
// Shift the window left if it runs past the last frame, without letting it
// start before row 0.
277  if (window_end > num_frames) {
278  window_start -= (window_end - num_frames);
279  window_end = num_frames;
280  if (window_start < 0) window_start = 0;
281  }
282  if (last_window_start == -1) {
// First frame: compute the window statistics from scratch.
// (Presumably AddRowSumMat sums the rows of input_part and AddDiagMat2 with
// kTrans gives per-column sums of squares -- TODO confirm against the matrix
// library documentation.)
283  SubMatrix<double> input_part(input,
284  window_start, window_end - window_start,
285  0, dim);
286  cur_sum.AddRowSumMat(1.0, input_part , 0.0);
287  if (opts.normalize_variance)
288  cur_sumsq.AddDiagMat2(1.0, input_part, kTrans, 0.0);
289  } else {
// Later frames: update the statistics incrementally. The assertions require
// that each window boundary advanced by at most one row since the last frame.
290  if (window_start > last_window_start) {
291  KALDI_ASSERT(window_start == last_window_start + 1);
292  SubVector<double> frame_to_remove(input, last_window_start);
293  cur_sum.AddVec(-1.0, frame_to_remove);
294  if (opts.normalize_variance)
295  cur_sumsq.AddVec2(-1.0, frame_to_remove);
296  }
297  if (window_end > last_window_end) {
298  KALDI_ASSERT(window_end == last_window_end + 1);
// last_window_end is one past the previous window, so it is the new row.
299  SubVector<double> frame_to_add(input, last_window_end);
300  cur_sum.AddVec(1.0, frame_to_add);
301  if (opts.normalize_variance)
302  cur_sumsq.AddVec2(1.0, frame_to_add);
303  }
304  }
305  int32 window_frames = window_end - window_start;
306  last_window_start = window_start;
307  last_window_end = window_end;
308 
309  KALDI_ASSERT(window_frames > 0);
// Mean normalization: output row = input row - cur_sum / window_frames.
310  SubVector<double> input_frame(input, t),
311  output_frame(*output, t);
312  output_frame.CopyFromVec(input_frame);
313  output_frame.AddVec(-1.0 / window_frames, cur_sum);
314 
315  if (opts.normalize_variance) {
// With a single row in the window the output row is simply set to zero.
316  if (window_frames == 1) {
317  output_frame.Set(0.0);
318  } else {
// variance = sumsq / N - (sum / N)^2, computed per dimension.
319  Vector<double> variance(cur_sumsq);
320  variance.Scale(1.0 / window_frames);
321  variance.AddVec2(-1.0 / (window_frames * window_frames), cur_sum);
322  // now "variance" is the variance of the features in the window,
323  // around their own mean.
324  int32 num_floored;
// Floor the variance before the power of -0.5 is applied below.
325  variance.ApplyFloor(1.0e-10, &num_floored);
326  if (num_floored > 0 && num_frames > 1) {
// warning_count counts every flooring event; once it reaches
// opts.max_warnings a single "suppressing" notice is printed and no further
// flooring warnings follow.
327  if (opts.max_warnings == warning_count) {
328  KALDI_WARN << "Suppressing the remaining variance flooring "
329  << "warnings. Run program with --max-warnings=-1 to "
330  << "see all warnings.";
331  }
332  // If opts.max_warnings is a negative number, we won't restrict the
333  // number of times that the warning is printed out.
334  else if (opts.max_warnings < 0
335  || opts.max_warnings > warning_count) {
336  KALDI_WARN << "Flooring when normalizing variance, floored "
337  << num_floored << " elements; num-frames was "
338  << window_frames;
339  }
340  warning_count++;
341  }
342  variance.ApplyPow(-0.5); // get inverse standard deviation.
343  output_frame.MulElements(variance);
344  }
345  }
346  }
347 }
348 
349 
351  const MatrixBase<BaseFloat> &input,
352  MatrixBase<BaseFloat> *output) {
// NOTE(review): the first line of this signature is missing from the listing;
// the index at the end of the file gives SlidingWindowCmn(
// const SlidingWindowCmnOptions &opts, ...) -- confirm.
// BaseFloat wrapper: copies the input into a double-precision matrix, runs
// SlidingWindowCmnInternal on it and copies the result into *output.
// *output must already have the same dimensions as `input`, and the input
// must have at least one row.
353  KALDI_ASSERT(SameDim(input, *output) && input.NumRows() > 0);
354  Matrix<double> input_dbl(input), output_dbl(input.NumRows(), input.NumCols());
355  // call double-precision version
356  SlidingWindowCmnInternal(opts, input_dbl, &output_dbl);
357  output->CopyFromMat(output_dbl);
358 }
359 
360 
361 
362 } // namespace kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
double real
#define M_PI
Definition: kaldi-math.h:44
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
void AddDiagMat2(Real alpha, const MatrixBase< Real > &M, MatrixTransposeType trans=kNoTrans, Real beta=1.0)
Add the diagonal of a matrix times itself: *this = diag(M M^T) + beta * *this (if trans == kNoTrans)...
kaldi::int32 int32
void CopyFromMat(const MatrixBase< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Copy given matrix. (no resize is done).
void ReverseFrames(const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
void AddVec2(const Real alpha, const VectorBase< Real > &v)
Add vector : *this = *this + alpha * rv^2 [element-wise squaring].
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=nullptr)
Applies floor to all elements.
Definition: kaldi-vector.h:149
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
ShiftedDeltaFeaturesOptions opts_
float BaseFloat
Definition: kaldi-types.h:29
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
void MulElements(const VectorBase< Real > &v)
Multiply element-by-element by another vector.
void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts, const MatrixBase< double > &input, MatrixBase< double > *output)
DeltaFeaturesOptions opts_
void Process(const MatrixBase< BaseFloat > &input_feats, int32 frame, VectorBase< BaseFloat > *output_frame) const
#define KALDI_ERR
Definition: kaldi-error.h:147
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts, const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
#define KALDI_WARN
Definition: kaldi-error.h:150
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
void InitIdftBases(int32 n_bases, int32 dimension, Matrix< BaseFloat > *mat_out)
void Scale(Real alpha)
Multiplies all elements by this constant.
DeltaFeatures(const DeltaFeaturesOptions &opts)
void ComputePowerSpectrum(VectorBase< BaseFloat > *waveform)
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts, const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
void Set(Real f)
Set all members of a vector to a specified value.
void SpliceFrames(const MatrixBase< BaseFloat > &input_features, int32 left_context, int32 right_context, Matrix< BaseFloat > *output_features)
void ApplyPow(Real power)
Take all elements of vector to a power.
Definition: kaldi-vector.h:179
std::vector< Vector< BaseFloat > > scales_
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
void SetZero()
Set vector to all zeros.
void Process(const MatrixBase< BaseFloat > &input_feats, int32 frame, SubVector< BaseFloat > *output_frame) const
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
Sub-matrix representation.
Definition: kaldi-matrix.h:988
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts, const MatrixBase< BaseFloat > &input, MatrixBase< BaseFloat > *output)
Applies sliding-window cepstral mean and/or variance normalization.
ShiftedDeltaFeatures(const ShiftedDeltaFeaturesOptions &opts)