feature-functions.h
Go to the documentation of this file.
1 // feat/feature-functions.h
2 
3 // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
4 // 2014 IMSL, PKU-HKUST (author: Wei Shi)
5 // 2016 Johns Hopkins University (author: Daniel Povey)
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 
23 #ifndef KALDI_FEAT_FEATURE_FUNCTIONS_H_
24 #define KALDI_FEAT_FEATURE_FUNCTIONS_H_
25 
26 #include <string>
27 #include <vector>
28 
29 #include "matrix/matrix-lib.h"
30 #include "util/common-utils.h"
31 #include "base/kaldi-error.h"
32 
33 namespace kaldi {
36 
37 
38 // ComputePowerSpectrum takes a complex FFT (as produced by the FFT
39 // functions in matrix/matrix-functions.h) and converts it, in place, into
40 // a power spectrum. If the complex FFT is a vector of size n (representing
41 // half the complex FFT of a real signal of size n, as described there),
42 // this function computes in the first (n/2) + 1 elements of it, the
43 // energies of the fft bins from zero to the Nyquist frequency. Contents of the
44 // remaining (n/2) - 1 elements are undefined at output.
45 void ComputePowerSpectrum(VectorBase<BaseFloat> *complex_fft);
46 
47 
50  int32 window; // e.g. 2; controls window size (window size is 2*window + 1)
51  // the behavior at the edges is to replicate the first or last frame.
52  // this is not configurable.
53 
54  DeltaFeaturesOptions(int32 order = 2, int32 window = 2):
55  order(order), window(window) { }
56  void Register(OptionsItf *opts) {
57  opts->Register("delta-order", &order, "Order of delta computation");
58  opts->Register("delta-window", &window,
59  "Parameter controlling window for delta computation (actual window"
60  " size for each delta order is 1 + 2*delta-window-size)");
61  }
62 };
63 
65  public:
66  // This class provides a low-level function to compute delta features.
67  // The function takes as input a matrix of features and a frame index
68  // that it should compute the deltas on. It puts its output in an object
69  // of type VectorBase, of size (original-feature-dimension) * (opts.order+1).
70  // This is not the most efficient way to do the computation, but it's
71  // state-free and thus easier to understand
72 
73  explicit DeltaFeatures(const DeltaFeaturesOptions &opts);
74 
75  void Process(const MatrixBase<BaseFloat> &input_feats,
76  int32 frame,
77  VectorBase<BaseFloat> *output_frame) const;
78  private:
80  std::vector<Vector<BaseFloat> > scales_; // a scaling window for each
81  // of the orders, including zero: multiply the features for each
82  // dimension by this window.
83 };
84 
86  int32 window, // The time delay and advance
87  num_blocks,
88  block_shift; // Distance between consecutive blocks
89 
91  window(1), num_blocks(7), block_shift(3) { }
92  void Register(OptionsItf *opts) {
93  opts->Register("delta-window", &window, "Size of delta advance and delay.");
94  opts->Register("num-blocks", &num_blocks, "Number of delta blocks in advance"
95  " of each frame to be concatenated");
96  opts->Register("block-shift", &block_shift, "Distance between each block");
97  }
98 };
99 
101  public:
102  // This class provides a low-level function to compute shifted
103  // delta cepstra (SDC).
104  // The function takes as input a matrix of features and a frame index
105  // that it should compute the deltas on. It puts its output in an object
106  // of type VectorBase, of size original-feature-dimension + (1 * num_blocks).
107 
109 
110  void Process(const MatrixBase<BaseFloat> &input_feats,
111  int32 frame,
112  SubVector<BaseFloat> *output_frame) const;
113  private:
115  Vector<BaseFloat> scales_; // a scaling window for each
116 
117 };
118 
119 // ComputeDeltas is a convenience function that computes deltas on a whole
120 // matrix of features at once. If you want to deal with features coming in bit
121 // by bit, you would have to use the DeltaFeatures class directly and do the
122 // computation frame by frame. TODO: come up with a nicer mechanism for
123 // features that arrive incrementally.
124 void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
125  const MatrixBase<BaseFloat> &input_features,
126  Matrix<BaseFloat> *output_features);
127 
128 // ComputeShiftedDeltas computes deltas from a feature file by applying
129 // ShiftedDeltaFeatures over the frames. This function is provided for
130 // convenience, however, ShiftedDeltaFeatures can be used directly.
132  const MatrixBase<BaseFloat> &input_features,
133  Matrix<BaseFloat> *output_features);
134 
135 // SpliceFrames will normally be used together with LDA.
136 // It splices frames together to make a window. At the
137 // start and end of an utterance, it duplicates the first
138 // and last frames.
139 // Will throw if input features are empty.
140 // left_context and right_context must be nonnegative;
141 // both represent a number of frames (e.g. 4, 4 is
142 // a good choice).
143 void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
144  int32 left_context,
145  int32 right_context,
146  Matrix<BaseFloat> *output_features);
147 
148 // ReverseFrames reverses the frames of input_features in time (used for backwards decoding).
149 void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
150  Matrix<BaseFloat> *output_features);
151 
152 // NOTE(review): undocumented here; presumably fills *mat_out with n_bases IDFT bases of the given dimension -- confirm against the implementation.
153 void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out);
154 
155 
156 // This is used for speaker-id. Also see OnlineCmnOptions in ../online2/, which
157 // is online CMN with no latency, for online speech recognition.
163  bool center;
164 
166  cmn_window(600),
167  min_window(100),
168  max_warnings(5),
169  normalize_variance(false),
170  center(false) { }
171 
172  void Register(OptionsItf *opts) {
173  opts->Register("cmn-window", &cmn_window, "Window in frames for running "
174  "average CMN computation");
175  opts->Register("min-cmn-window", &min_window, "Minimum CMN window "
176  "used at start of decoding (adds latency only at start). "
177  "Only applicable if center == false, ignored if center==true");
178  opts->Register("max-warnings", &max_warnings, "Maximum warnings to report "
179  "per utterance. 0 to disable, -1 to show all.");
180  opts->Register("norm-vars", &normalize_variance, "If true, normalize "
181  "variance to one."); // naming this as in apply-cmvn.cc
182  opts->Register("center", &center, "If true, use a window centered on the "
183  "current frame (to the extent possible, modulo end effects). "
184  "If false, window is to the left.");
185  }
186  void Check() const;
187 };
188 
189 
195  const MatrixBase<BaseFloat> &input,
196  MatrixBase<BaseFloat> *output);
197 
198 
200 } // namespace kaldi
201 
202 
203 
204 #endif // KALDI_FEAT_FEATURE_FUNCTIONS_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
kaldi::int32 int32
void ReverseFrames(const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
ShiftedDeltaFeaturesOptions opts_
DeltaFeaturesOptions opts_
DeltaFeaturesOptions(int32 order=2, int32 window=2)
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts, const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
void Register(OptionsItf *opts)
void InitIdftBases(int32 n_bases, int32 dimension, Matrix< BaseFloat > *mat_out)
void ComputePowerSpectrum(VectorBase< BaseFloat > *waveform)
A class representing a vector.
Definition: kaldi-vector.h:406
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts, const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
void SpliceFrames(const MatrixBase< BaseFloat > &input_features, int32 left_context, int32 right_context, Matrix< BaseFloat > *output_features)
std::vector< Vector< BaseFloat > > scales_
void Register(OptionsItf *opts)
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts, const MatrixBase< BaseFloat > &input, MatrixBase< BaseFloat > *output)
Applies sliding-window cepstral mean and/or variance normalization.