mel-computations.cc
Go to the documentation of this file.
1 // feat/mel-computations.cc
2 
3 // Copyright 2009-2011 Phonexia s.r.o.; Karel Vesely; Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <float.h>
23 #include <algorithm>
24 #include <iostream>
25 
26 #include "feat/feature-functions.h"
27 #include "feat/feature-window.h"
28 #include "feat/mel-computations.h"
29 
30 namespace kaldi {
31 
32 
34  const FrameExtractionOptions &frame_opts,
35  BaseFloat vtln_warp_factor):
36  htk_mode_(opts.htk_mode) {
37  int32 num_bins = opts.num_bins;
38  if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins";
39  BaseFloat sample_freq = frame_opts.samp_freq;
40  int32 window_length_padded = frame_opts.PaddedWindowSize();
41  KALDI_ASSERT(window_length_padded % 2 == 0);
42  int32 num_fft_bins = window_length_padded / 2;
43  BaseFloat nyquist = 0.5 * sample_freq;
44 
45  BaseFloat low_freq = opts.low_freq, high_freq;
46  if (opts.high_freq > 0.0)
47  high_freq = opts.high_freq;
48  else
49  high_freq = nyquist + opts.high_freq;
50 
51  if (low_freq < 0.0 || low_freq >= nyquist
52  || high_freq <= 0.0 || high_freq > nyquist
53  || high_freq <= low_freq)
54  KALDI_ERR << "Bad values in options: low-freq " << low_freq
55  << " and high-freq " << high_freq << " vs. nyquist "
56  << nyquist;
57 
58  BaseFloat fft_bin_width = sample_freq / window_length_padded;
59  // fft-bin width [think of it as Nyquist-freq / half-window-length]
60 
61  BaseFloat mel_low_freq = MelScale(low_freq);
62  BaseFloat mel_high_freq = MelScale(high_freq);
63 
64  debug_ = opts.debug_mel;
65 
66  // divide by num_bins+1 in next line because of end-effects where the bins
67  // spread out to the sides.
68  BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1);
69 
70  BaseFloat vtln_low = opts.vtln_low,
71  vtln_high = opts.vtln_high;
72  if (vtln_high < 0.0) {
73  vtln_high += nyquist;
74  }
75 
76  if (vtln_warp_factor != 1.0 &&
77  (vtln_low < 0.0 || vtln_low <= low_freq
78  || vtln_low >= high_freq
79  || vtln_high <= 0.0 || vtln_high >= high_freq
80  || vtln_high <= vtln_low))
81  KALDI_ERR << "Bad values in options: vtln-low " << vtln_low
82  << " and vtln-high " << vtln_high << ", versus "
83  << "low-freq " << low_freq << " and high-freq "
84  << high_freq;
85 
86  bins_.resize(num_bins);
87  center_freqs_.Resize(num_bins);
88 
89  for (int32 bin = 0; bin < num_bins; bin++) {
90  BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta,
91  center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
92  right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;
93 
94  if (vtln_warp_factor != 1.0) {
95  left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
96  vtln_warp_factor, left_mel);
97  center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
98  vtln_warp_factor, center_mel);
99  right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
100  vtln_warp_factor, right_mel);
101  }
102  center_freqs_(bin) = InverseMelScale(center_mel);
103  // this_bin will be a vector of coefficients that is only
104  // nonzero where this mel bin is active.
105  Vector<BaseFloat> this_bin(num_fft_bins);
106  int32 first_index = -1, last_index = -1;
107  for (int32 i = 0; i < num_fft_bins; i++) {
108  BaseFloat freq = (fft_bin_width * i); // Center frequency of this fft
109  // bin.
110  BaseFloat mel = MelScale(freq);
111  if (mel > left_mel && mel < right_mel) {
112  BaseFloat weight;
113  if (mel <= center_mel)
114  weight = (mel - left_mel) / (center_mel - left_mel);
115  else
116  weight = (right_mel-mel) / (right_mel-center_mel);
117  this_bin(i) = weight;
118  if (first_index == -1)
119  first_index = i;
120  last_index = i;
121  }
122  }
123  KALDI_ASSERT(first_index != -1 && last_index >= first_index
124  && "You may have set --num-mel-bins too large.");
125 
126  bins_[bin].first = first_index;
127  int32 size = last_index + 1 - first_index;
128  bins_[bin].second.Resize(size);
129  bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size));
130 
131  // Replicate a bug in HTK, for testing purposes.
132  if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0)
133  bins_[bin].second(0) = 0.0;
134 
135  }
136  if (debug_) {
137  for (size_t i = 0; i < bins_.size(); i++) {
138  KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first
139  << ", vec = " << bins_[i].second;
140  }
141  }
142 }
143 
146  bins_(other.bins_),
147  debug_(other.debug_),
148  htk_mode_(other.htk_mode_) { }
149 
150 BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
151  BaseFloat vtln_high_cutoff,
152  BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation
153  BaseFloat high_freq,
154  BaseFloat vtln_warp_factor,
155  BaseFloat freq) {
159 
181 
182 
183  if (freq < low_freq || freq > high_freq) return freq; // in case this gets called
184  // for out-of-range frequencies, just return the freq.
185 
186  KALDI_ASSERT(vtln_low_cutoff > low_freq &&
187  "be sure to set the --vtln-low option higher than --low-freq");
188  KALDI_ASSERT(vtln_high_cutoff < high_freq &&
189  "be sure to set the --vtln-high option lower than --high-freq [or negative]");
190  BaseFloat one = 1.0;
191  BaseFloat l = vtln_low_cutoff * std::max(one, vtln_warp_factor);
192  BaseFloat h = vtln_high_cutoff * std::min(one, vtln_warp_factor);
193  BaseFloat scale = 1.0 / vtln_warp_factor;
194  BaseFloat Fl = scale * l; // F(l);
195  BaseFloat Fh = scale * h; // F(h);
196  KALDI_ASSERT(l > low_freq && h < high_freq);
197  // slope of left part of the 3-piece linear function
198  BaseFloat scale_left = (Fl - low_freq) / (l - low_freq);
199  // [slope of center part is just "scale"]
200 
201  // slope of right part of the 3-piece linear function
202  BaseFloat scale_right = (high_freq - Fh) / (high_freq - h);
203 
204  if (freq < l) {
205  return low_freq + scale_left * (freq - low_freq);
206  } else if (freq < h) {
207  return scale * freq;
208  } else { // freq >= h
209  return high_freq + scale_right * (freq - high_freq);
210  }
211 }
212 
213 BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
214  BaseFloat vtln_high_cutoff,
215  BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation
216  BaseFloat high_freq,
217  BaseFloat vtln_warp_factor,
218  BaseFloat mel_freq) {
219  return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
220  low_freq, high_freq,
221  vtln_warp_factor, InverseMelScale(mel_freq)));
222 }
223 
224 
225 // "power_spectrum" contains fft energies.
226 void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
227  VectorBase<BaseFloat> *mel_energies_out) const {
228  int32 num_bins = bins_.size();
229  KALDI_ASSERT(mel_energies_out->Dim() == num_bins);
230 
231  for (int32 i = 0; i < num_bins; i++) {
232  int32 offset = bins_[i].first;
233  const Vector<BaseFloat> &v(bins_[i].second);
234  BaseFloat energy = VecVec(v, power_spectrum.Range(offset, v.Dim()));
235  // HTK-like flooring- for testing purposes (we prefer dither)
236  if (htk_mode_ && energy < 1.0) energy = 1.0;
237  (*mel_energies_out)(i) = energy;
238 
239  // The following assert was added due to a problem with OpenBlas that
240  // we had at one point (it was a bug in that library). Just to detect
241  // it early.
242  KALDI_ASSERT(!KALDI_ISNAN((*mel_energies_out)(i)));
243  }
244 
245  if (debug_) {
246  fprintf(stderr, "MEL BANKS:\n");
247  for (int32 i = 0; i < num_bins; i++)
248  fprintf(stderr, " %f", (*mel_energies_out)(i));
249  fprintf(stderr, "\n");
250  }
251 }
252 
254  // Compute liftering coefficients (scaling on cepstral coeffs)
255  // coeffs are numbered slightly differently from HTK: the zeroth
256  // index is C0, which is not affected.
257  for (int32 i = 0; i < coeffs->Dim(); i++)
258  (*coeffs)(i) = 1.0 + 0.5 * Q * sin (M_PI * i / Q);
259 }
260 
261 
262 // Durbin's recursion - converts autocorrelation coefficients to the LPC
263 // pTmp - temporal place [n]
264 // pAC - autocorrelation coefficients [n + 1]
265 // pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}})
266 // F(z) = 1 / (1 - A(z)), 1 is not stored in the demoninator
267 BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp) {
268  BaseFloat ki; // reflection coefficient
269  int i;
270  int j;
271 
272  BaseFloat E = pAC[0];
273 
274  for (i = 0; i < n; i++) {
275  // next reflection coefficient
276  ki = pAC[i + 1];
277  for (j = 0; j < i; j++)
278  ki += pLP[j] * pAC[i - j];
279  ki = ki / E;
280 
281  // new error
282  BaseFloat c = 1 - ki * ki;
283  if (c < 1.0e-5) // remove NaNs for constan signal
284  c = 1.0e-5;
285  E *= c;
286 
287  // new LP coefficients
288  pTmp[i] = -ki;
289  for (j = 0; j < i; j++)
290  pTmp[j] = pLP[j] - ki * pLP[i - j - 1];
291 
292  for (j = 0; j <= i; j++)
293  pLP[j] = pTmp[j];
294  }
295 
296  return E;
297 }
298 
299 
300 void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst) {
301  for (int32 i = 0; i < n; i++) {
302  double sum = 0.0;
303  int j;
304  for (j = 0; j < i; j++) {
305  sum += static_cast<BaseFloat>(i - j) * pLPC[j] * pCepst[i - j - 1];
306  }
307  pCepst[i] = -pLPC[i] - sum / static_cast<BaseFloat>(i + 1);
308  }
309 }
310 
311 void GetEqualLoudnessVector(const MelBanks &mel_banks,
312  Vector<BaseFloat> *ans) {
313  int32 n = mel_banks.NumBins();
314  // Central frequency of each mel bin.
315  const Vector<BaseFloat> &f0 = mel_banks.GetCenterFreqs();
316  ans->Resize(n);
317  for (int32 i = 0; i < n; i++) {
318  BaseFloat fsq = f0(i) * f0(i);
319  BaseFloat fsub = fsq / (fsq + 1.6e5);
320  (*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6));
321  }
322 }
323 
324 
325 // Compute LP coefficients from autocorrelation coefficients.
327  Vector<BaseFloat> *lpc_out) {
328  int32 n = autocorr_in.Dim() - 1;
329  KALDI_ASSERT(lpc_out->Dim() == n);
330  Vector<BaseFloat> tmp(n);
331  BaseFloat ans = Durbin(n, autocorr_in.Data(),
332  lpc_out->Data(),
333  tmp.Data());
334  if (ans <= 0.0)
335  KALDI_WARN << "Zero energy in LPC computation";
336  return -Log(1.0 / ans); // forms the C0 value
337 }
338 
339 
340 } // namespace kaldi
MelBanks(const MelBanksOptions &opts, const FrameExtractionOptions &frame_opts, BaseFloat vtln_warp_factor)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
static BaseFloat MelScale(BaseFloat freq)
std::vector< std::pair< int32, Vector< BaseFloat > > > bins_
static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff, BaseFloat vtln_high_cutoff, BaseFloat low_freq, BaseFloat high_freq, BaseFloat vtln_warp_factor, BaseFloat freq)
Vector< BaseFloat > center_freqs_
#define M_PI
Definition: kaldi-math.h:44
void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst)
int32 NumBins() const
kaldi::int32 int32
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
float BaseFloat
Definition: kaldi-types.h:29
static BaseFloat InverseMelScale(BaseFloat mel_freq)
double Log(double x)
Definition: kaldi-math.h:100
struct rnnlm::@11::@12 n
void ComputeLifterCoeffs(BaseFloat Q, VectorBase< BaseFloat > *coeffs)
const Vector< BaseFloat > & GetCenterFreqs() const
BaseFloat ComputeLpc(const VectorBase< BaseFloat > &autocorr_in, Vector< BaseFloat > *lpc_out)
#define KALDI_ERR
Definition: kaldi-error.h:147
static BaseFloat VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, BaseFloat vtln_high_cutoff, BaseFloat low_freq, BaseFloat high_freq, BaseFloat vtln_warp_factor, BaseFloat mel_freq)
#define KALDI_WARN
Definition: kaldi-error.h:150
Real * Data()
Returns a pointer to the start of the vector&#39;s data.
Definition: kaldi-vector.h:70
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
void Compute(const VectorBase< BaseFloat > &fft_energies, VectorBase< BaseFloat > *mel_energies_out) const
Compute Mel energies (note: not log enerties).
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ISNAN
Definition: kaldi-math.h:72
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void GetEqualLoudnessVector(const MelBanks &mel_banks, Vector< BaseFloat > *ans)
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp)
#define KALDI_LOG
Definition: kaldi-error.h:153
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37
SubVector< Real > Range(const MatrixIndexT o, const MatrixIndexT l)
Returns a sub-vector of a vector (a range of elements).
Definition: kaldi-vector.h:94