mel-computations.h
Go to the documentation of this file.
1 // feat/mel-computations.h
2 
3 // Copyright 2009-2011 Phonexia s.r.o.; Microsoft Corporation
4 // 2016 Johns Hopkins University (author: Daniel Povey)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #ifndef KALDI_FEAT_MEL_COMPUTATIONS_H_
22 #define KALDI_FEAT_MEL_COMPUTATIONS_H_
23 
24 #include <math.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <complex>
28 #include <utility>
29 #include <vector>
30 
31 #include "base/kaldi-common.h"
32 #include "util/common-utils.h"
33 #include "matrix/matrix-lib.h"
34 
35 
36 namespace kaldi {
39 
40 struct FrameExtractionOptions; // defined in feature-window.h
41 
42 
44  int32 num_bins; // e.g. 25; number of triangular bins
45  BaseFloat low_freq; // e.g. 20; lower frequency cutoff
46  BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative
47  // ->added to the Nyquist frequency to get the cutoff.
48  BaseFloat vtln_low; // vtln lower cutoff of warping function.
49  BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added
50  // to the Nyquist frequency to get the cutoff.
51  bool debug_mel;
52  // htk_mode is a "hidden" config, it does not show up on command line.
53  // Enables more exact compatibility with HTK, for testing purposes. Affects
54  // mel-energy flooring and reproduces a bug in HTK.
55  bool htk_mode;
56  explicit MelBanksOptions(int num_bins = 25)
57  : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100),
58  vtln_high(-500), debug_mel(false), htk_mode(false) {}
59 
60  void Register(OptionsItf *opts) {
61  opts->Register("num-mel-bins", &num_bins,
62  "Number of triangular mel-frequency bins");
63  opts->Register("low-freq", &low_freq,
64  "Low cutoff frequency for mel bins");
65  opts->Register("high-freq", &high_freq,
66  "High cutoff frequency for mel bins (if <= 0, offset from Nyquist)");
67  opts->Register("vtln-low", &vtln_low,
68  "Low inflection point in piecewise linear VTLN warping function");
69  opts->Register("vtln-high", &vtln_high,
70  "High inflection point in piecewise linear VTLN warping function"
71  " (if negative, offset from high-mel-freq");
72  opts->Register("debug-mel", &debug_mel,
73  "Print out debugging information for mel bin computation");
74  }
75 };
76 
77 
78 class MelBanks {
79  public:
80 
81  static inline BaseFloat InverseMelScale(BaseFloat mel_freq) {
82  return 700.0f * (expf (mel_freq / 1127.0f) - 1.0f);
83  }
84 
85  static inline BaseFloat MelScale(BaseFloat freq) {
86  return 1127.0f * logf (1.0f + freq / 700.0f);
87  }
88 
89  static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff,
90  BaseFloat vtln_high_cutoff, // discontinuities in warp func
92  BaseFloat high_freq, // upper+lower frequency cutoffs in
93  // the mel computation
94  BaseFloat vtln_warp_factor,
95  BaseFloat freq);
96 
97  static BaseFloat VtlnWarpMelFreq(BaseFloat vtln_low_cutoff,
98  BaseFloat vtln_high_cutoff,
99  BaseFloat low_freq,
100  BaseFloat high_freq,
101  BaseFloat vtln_warp_factor,
102  BaseFloat mel_freq);
103 
104 
105  MelBanks(const MelBanksOptions &opts,
106  const FrameExtractionOptions &frame_opts,
107  BaseFloat vtln_warp_factor);
108 
111  void Compute(const VectorBase<BaseFloat> &fft_energies,
112  VectorBase<BaseFloat> *mel_energies_out) const;
113 
114  int32 NumBins() const { return bins_.size(); }
115 
116  // returns vector of central freq of each bin; needed by plp code.
117  const Vector<BaseFloat> &GetCenterFreqs() const { return center_freqs_; }
118 
119  const std::vector<std::pair<int32, Vector<BaseFloat> > >& GetBins() const {
120  return bins_;
121  }
122 
123  // Copy constructor
124  MelBanks(const MelBanks &other);
125  private:
126  // Disallow assignment
127  MelBanks &operator = (const MelBanks &other);
128 
129  // center frequencies of bins, numbered from 0 ... num_bins-1.
130  // Needed by GetCenterFreqs().
132 
133  // the "bins_" vector is a vector, one for each bin, of a pair:
134  // (the first nonzero fft-bin), (the vector of weights).
135  std::vector<std::pair<int32, Vector<BaseFloat> > > bins_;
136 
137  bool debug_;
138  bool htk_mode_;
139 };
140 
141 
142 // Compute liftering coefficients (scaling on cepstral coeffs)
143 // coeffs are numbered slightly differently from HTK: the zeroth
144 // index is C0, which is not affected.
146 
147 
148 // Durbin's recursion - converts autocorrelation coefficients to the LPC
149 // pTmp - temporal place [n]
150 // pAC - autocorrelation coefficients [n + 1]
151 // pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}})
152 // F(z) = 1 / (1 - A(z)), 1 is not stored in the denominator
153 // Returns log energy of residual (I think)
154 BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp);
155 
156 // Compute LP coefficients from autocorrelation coefficients.
157 // Returns log energy of residual (I think)
158 BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
159  Vector<BaseFloat> *lpc_out);
160 
161 void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst);
162 
163 
164 
165 void GetEqualLoudnessVector(const MelBanks &mel_banks,
166  Vector<BaseFloat> *ans);
167 
169 } // namespace kaldi
170 
171 #endif // KALDI_FEAT_MEL_COMPUTATIONS_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
static BaseFloat MelScale(BaseFloat freq)
std::vector< std::pair< int32, Vector< BaseFloat > > > bins_
Vector< BaseFloat > center_freqs_
void Register(OptionsItf *opts)
void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst)
int32 NumBins() const
kaldi::int32 int32
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
static BaseFloat InverseMelScale(BaseFloat mel_freq)
struct rnnlm::@11::@12 n
void ComputeLifterCoeffs(BaseFloat Q, VectorBase< BaseFloat > *coeffs)
const Vector< BaseFloat > & GetCenterFreqs() const
BaseFloat ComputeLpc(const VectorBase< BaseFloat > &autocorr_in, Vector< BaseFloat > *lpc_out)
MelBanksOptions(int num_bins=25)
A class representing a vector.
Definition: kaldi-vector.h:406
void GetEqualLoudnessVector(const MelBanks &mel_banks, Vector< BaseFloat > *ans)
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp)
const std::vector< std::pair< int32, Vector< BaseFloat > > > & GetBins() const