pitch-functions.h
Go to the documentation of this file.
1 // feat/pitch-functions.h
2 
3 // Copyright 2013 Pegah Ghahremani
4 // 2014 IMSL, PKU-HKUST (author: Wei Shi)
5 // 2014 Yanqing Sun, Junjie Wang,
6 // Daniel Povey, Korbinian Riedhammer
7 // Xin Lei
8 
9 // See ../../COPYING for clarification regarding multiple authors
10 //
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 //
15 // http://www.apache.org/licenses/LICENSE-2.0
16 //
17 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
19 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
20 // MERCHANTABILITY OR NON-INFRINGEMENT.
21 // See the Apache 2 License for the specific language governing permissions and
22 // limitations under the License.
23 
24 #ifndef KALDI_FEAT_PITCH_FUNCTIONS_H_
25 #define KALDI_FEAT_PITCH_FUNCTIONS_H_
26 
27 #include <cassert>
28 #include <cstdlib>
29 #include <string>
30 #include <vector>
31 
32 #include "base/kaldi-error.h"
33 #include "feat/mel-computations.h"
34 #include "itf/online-feature-itf.h"
35 #include "matrix/matrix-lib.h"
36 #include "util/common-utils.h"
37 
38 namespace kaldi {
41 
43  // FrameExtractionOptions frame_opts;
44  BaseFloat samp_freq; // sample frequency in hertz
45  BaseFloat frame_shift_ms; // in milliseconds.
46  BaseFloat frame_length_ms; // in milliseconds.
47  BaseFloat preemph_coeff; // Preemphasis coefficient. [use is deprecated.]
48  BaseFloat min_f0; // min f0 to search (Hz)
49  BaseFloat max_f0; // max f0 to search (Hz)
50  BaseFloat soft_min_f0; // Minimum f0, applied in soft way, must not
51  // exceed min-f0
52  BaseFloat penalty_factor; // cost factor for F0 change
53  BaseFloat lowpass_cutoff; // cutoff frequency for Low pass filter
54  BaseFloat resample_freq; // Frequency that we down-sample the signal to;
55  // must be more than twice lowpass_cutoff
56  BaseFloat delta_pitch; // the pitch tolerance in pruning lags
57  BaseFloat nccf_ballast; // Increasing this factor reduces NCCF for
58  // quiet frames, helping ensure pitch
59  // continuity in unvoiced region
60  int32 lowpass_filter_width; // Integer that determines filter width of
61  // lowpass filter
62  int32 upsample_filter_width; // Integer that determines filter width when
63  // upsampling NCCF
64 
65  // Below are newer config variables, not present in the original paper,
66  // that relate to the online pitch extraction algorithm.
67 
68  // The maximum number of frames of latency that we allow the pitch-processing
69  // to introduce, for online operation. If you set this to a large value,
70  // there would be no inaccuracy from the Viterbi traceback (but it might make
71  // you wait to see the pitch). This is not very relevant for the online
72  // operation: normalization-right-context is more relevant, you
73  // can just leave this value at zero.
75 
76  // Only relevant for the function ComputeKaldiPitch which is called by
77  // compute-kaldi-pitch-feats. If nonzero, we provide the input as chunks of
78  // this size. This affects the energy normalization which has a small effect
79  // on the resulting features, especially at the beginning of a file. For best
80  // compatibility with online operation (e.g. if you plan to train models for
81  // the online-decoding setup), you might want to set this to a small value,
82  // like one frame.
84 
85  // Only relevant for the function ComputeKaldiPitch which is called by
86  // compute-kaldi-pitch-feats, and only relevant if frames_per_chunk is
87  // nonzero. If true, it will query the features as soon as they are
88  // available, which simulates the first-pass features you would get in online
89  // decoding. If false, the features you will get will be the same as those
90  // available at the end of the utterance, after InputFinished() has been
91  // called: e.g. during lattice rescoring.
93 
94  // Only relevant for online operation or when emulating online operation
95  // (e.g. when setting frames_per_chunk). This is the frame-index on which we
96  // recompute the NCCF (e.g. frame-index 500 = after 5 seconds); if the
97  // segment ends before this we do it when the segment ends. We do this by
98  // re-computing the signal average energy, which affects the NCCF via the
99  // "ballast term", scaling the resampled NCCF by a factor derived from the
100  // average change in the "ballast term", and re-doing the backtrace
101  // computation. Making this infinity would be the most exact, but would
102  // introduce unwanted latency at the end of long utterances, for little
103  // benefit.
105 
106  // This is a "hidden config" used only for testing the online pitch
107  // extraction. If true, we compute the signal root-mean-squared for the
108  // ballast term, only up to the current frame, rather than the end of the
109  // current chunk of signal. This makes the output insensitive to the
110  // chunking, which is useful for testing purposes.
114  samp_freq(16000),
115  frame_shift_ms(10.0),
116  frame_length_ms(25.0),
117  preemph_coeff(0.0),
118  min_f0(50),
119  max_f0(400),
120  soft_min_f0(10.0),
121  penalty_factor(0.1),
122  lowpass_cutoff(1000),
123  resample_freq(4000),
124  delta_pitch(0.005),
125  nccf_ballast(7000),
126  lowpass_filter_width(1),
127  upsample_filter_width(5),
128  max_frames_latency(0),
129  frames_per_chunk(0),
130  simulate_first_pass_online(false),
131  recompute_frame(500),
132  nccf_ballast_online(false),
133  snip_edges(true) { }
134 
135  void Register(OptionsItf *opts) {
136  opts->Register("sample-frequency", &samp_freq,
137  "Waveform data sample frequency (must match the waveform "
138  "file, if specified there)");
139  opts->Register("frame-length", &frame_length_ms, "Frame length in "
140  "milliseconds");
141  opts->Register("frame-shift", &frame_shift_ms, "Frame shift in "
142  "milliseconds");
143  opts->Register("preemphasis-coefficient", &preemph_coeff,
144  "Coefficient for use in signal preemphasis (deprecated)");
145  opts->Register("min-f0", &min_f0,
146  "min. F0 to search for (Hz)");
147  opts->Register("max-f0", &max_f0,
148  "max. F0 to search for (Hz)");
149  opts->Register("soft-min-f0", &soft_min_f0,
150  "Minimum f0, applied in soft way, must not exceed min-f0");
151  opts->Register("penalty-factor", &penalty_factor,
152  "cost factor for FO change.");
153  opts->Register("lowpass-cutoff", &lowpass_cutoff,
154  "cutoff frequency for LowPass filter (Hz) ");
155  opts->Register("resample-frequency", &resample_freq,
156  "Frequency that we down-sample the signal to. Must be "
157  "more than twice lowpass-cutoff");
158  opts->Register("delta-pitch", &delta_pitch,
159  "Smallest relative change in pitch that our algorithm "
160  "measures");
161  opts->Register("nccf-ballast", &nccf_ballast,
162  "Increasing this factor reduces NCCF for quiet frames");
163  opts->Register("nccf-ballast-online", &nccf_ballast_online,
164  "This is useful mainly for debug; it affects how the NCCF "
165  "ballast is computed.");
166  opts->Register("lowpass-filter-width", &lowpass_filter_width,
167  "Integer that determines filter width of "
168  "lowpass filter, more gives sharper filter");
169  opts->Register("upsample-filter-width", &upsample_filter_width,
170  "Integer that determines filter width when upsampling NCCF");
171  opts->Register("frames-per-chunk", &frames_per_chunk, "Only relevant for "
172  "offline pitch extraction (e.g. compute-kaldi-pitch-feats), "
173  "you can set it to a small nonzero value, such as 10, for "
174  "better feature compatibility with online decoding (affects "
175  "energy normalization in the algorithm)");
176  opts->Register("simulate-first-pass-online", &simulate_first_pass_online,
177  "If true, compute-kaldi-pitch-feats will output features "
178  "that correspond to what an online decoder would see in the "
179  "first pass of decoding-- not the final version of the "
180  "features, which is the default. Relevant if "
181  "--frames-per-chunk > 0");
182  opts->Register("recompute-frame", &recompute_frame, "Only relevant for "
183  "online pitch extraction, or for compatibility with online "
184  "pitch extraction. A non-critical parameter; the frame at "
185  "which we recompute some of the forward pointers, after "
186  "revising our estimate of the signal energy. Relevant if"
187  "--frames-per-chunk > 0");
188  opts->Register("max-frames-latency", &max_frames_latency, "Maximum number "
189  "of frames of latency that we allow pitch tracking to "
190  "introduce into the feature processing (affects output only "
191  "if --frames-per-chunk > 0 and "
192  "--simulate-first-pass-online=true");
193  opts->Register("snip-edges", &snip_edges, "If this is set to false, the "
194  "incomplete frames near the ending edge won't be snipped, "
195  "so that the number of frames is the file size divided by "
196  "the frame-shift. This makes different types of features "
197  "give the same number of frames.");
198  }
201  // Because of floating point representation, it is more reliable to divide
202  // by 1000 instead of multiplying by 0.001, but it is a bit slower.
204  return static_cast<int32>(resample_freq * frame_length_ms / 1000.0);
205  }
208  return static_cast<int32>(resample_freq * frame_shift_ms / 1000.0);
209  }
210 };
211 
213  BaseFloat pitch_scale; // the final normalized-log-pitch feature is scaled
214  // with this value
215  BaseFloat pov_scale; // the final POV feature is scaled with this value
216  BaseFloat pov_offset; // An offset that can be added to the final POV
217  // feature (useful for online-decoding, where we don't
218  // do CMN to the pitch-derived features).
219 
221  BaseFloat delta_pitch_noise_stddev; // stddev of noise we add to delta-pitch
222  int32 normalization_left_context; // left-context used for sliding-window
223  // normalization
224  int32 normalization_right_context; // this should be reduced in online
225  // decoding to reduce latency
226 
229 
234 
236  pitch_scale(2.0),
237  pov_scale(2.0),
238  pov_offset(0.0),
239  delta_pitch_scale(10.0),
240  delta_pitch_noise_stddev(0.005),
241  normalization_left_context(75),
242  normalization_right_context(75),
243  delta_window(2),
244  delay(0),
245  add_pov_feature(true),
246  add_normalized_log_pitch(true),
247  add_delta_pitch(true),
248  add_raw_log_pitch(false) { }
249 
250 
251  void Register(ParseOptions *opts) {
     // Registers every pitch post-processing option with *opts, binding each
     // command-line option name to the corresponding member variable and
     // supplying the help text shown to the user. The help strings are built
     // from adjacent string literals, so exactly one of two neighbouring
     // fragments should carry the separating space.
252  opts->Register("pitch-scale", &pitch_scale,
253  "Scaling factor for the final normalized log-pitch value");
254  opts->Register("pov-scale", &pov_scale,
255  "Scaling factor for final POV (probability of voicing) "
256  "feature");
257  opts->Register("pov-offset", &pov_offset,
258  "This can be used to add an offset to the POV feature. "
259  "Intended for use in online decoding as a substitute for "
260  "CMN.");
261  opts->Register("delta-pitch-scale", &delta_pitch_scale,
262  "Term to scale the final delta log-pitch feature");
263  opts->Register("delta-pitch-noise-stddev", &delta_pitch_noise_stddev,
264  "Standard deviation for noise we add to the delta log-pitch "
265  "(before scaling); should be about the same as delta-pitch "
266  "option to pitch creation. The purpose is to get rid of "
267  "peaks in the delta-pitch caused by discretization of pitch "
268  "values.");
269  opts->Register("normalization-left-context", &normalization_left_context,
270  "Left-context (in frames) for moving window normalization");
271  opts->Register("normalization-right-context", &normalization_right_context,
272  "Right-context (in frames) for moving window normalization");
273  opts->Register("delta-window", &delta_window,
274  "Number of frames on each side of central frame, to use for "
275  "delta window.");
276  opts->Register("delay", &delay,
277  "Number of frames by which the pitch information is "
278  "delayed.");
279  opts->Register("add-pov-feature", &add_pov_feature,
280  "If true, the warped NCCF is added to output features");
281  opts->Register("add-normalized-log-pitch", &add_normalized_log_pitch,
282  "If true, the log-pitch with POV-weighted mean subtraction "
283  "over 1.5 second window is added to output features");
284  opts->Register("add-delta-pitch", &add_delta_pitch,
285  "If true, time derivative of log-pitch is added to output "
286  "features");
287  opts->Register("add-raw-log-pitch", &add_raw_log_pitch,
288  "If true, log(pitch) is added to output features");
289  }
290 };
291 
292 
293 // We don't want to expose the pitch-extraction internals here as it's
294 // quite complex, so we use a private implementation.
296 
297 
298 // Note: to start on a new waveform, just construct a new version
299 // of this object.
301  public:
302  explicit OnlinePitchFeature(const PitchExtractionOptions &opts);
303 
304  virtual int32 Dim() const { return 2; /* two values per frame: (NCCF, pitch) */ }
305 
306  virtual int32 NumFramesReady() const;
307 
308  virtual BaseFloat FrameShiftInSeconds() const;
309 
310  virtual bool IsLastFrame(int32 frame) const;
311 
314  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
315 
316  virtual void AcceptWaveform(BaseFloat sampling_rate,
317  const VectorBase<BaseFloat> &waveform);
318 
319  virtual void InputFinished();
320 
321  virtual ~OnlinePitchFeature();
322 
323  private:
325 };
326 
327 
333  public:
334  virtual int32 Dim() const { return dim_; } // dim_ is the output feature dimension
335 
336  virtual bool IsLastFrame(int32 frame) const {
337  // frame <= -1: defer to the source's answer for frame -1.
338  if (frame <= -1) return src_->IsLastFrame(-1);
339  // frame >= delay: ask the source about frame (frame - delay).
340  if (frame >= opts_.delay) return src_->IsLastFrame(frame - opts_.delay);
341  // 0 <= frame < delay: false if the source reports -1 as its last frame,
342  // otherwise the source's answer for frame 0.
     return !src_->IsLastFrame(-1) && src_->IsLastFrame(0);
343  }
344  virtual BaseFloat FrameShiftInSeconds() const {
345  return src_->FrameShiftInSeconds(); // forwarded unchanged from the source feature
346  }
347 
348  virtual int32 NumFramesReady() const;
349 
350  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
351 
352  virtual ~OnlineProcessPitch() { }
353 
354  // Does not take ownership of "src".
357 
358  private:
359  enum { kRawFeatureDim = 2}; // anonymous enum to define a constant.
360  // kRawFeatureDim defines the dimension
361  // of the input: (nccf, pitch)
362 
365  int32 dim_; // Output feature dimension, set in initializer.
366 
368  int32 cur_num_frames; // value of src_->NumFramesReady() when
369  // "mean_pitch" was set.
370  bool input_finished; // true if input data was finished when
371  // "mean_pitch" was computed.
372  double sum_pov; // sum of pov over relevant range
373  double sum_log_pitch_pov; // sum of log(pitch) * pov over relevant range
374 
375  NormalizationStats(): cur_num_frames(-1), input_finished(false),
376  sum_pov(0.0), sum_log_pitch_pov(0.0) { }
377  };
378 
379  std::vector<BaseFloat> delta_feature_noise_;
380 
381  std::vector<NormalizationStats> normalization_stats_;
382 
385  inline BaseFloat GetPovFeature(int32 frame) const;
386 
389  inline BaseFloat GetDeltaPitchFeature(int32 frame);
390 
393  inline BaseFloat GetRawLogPitchFeature(int32 frame) const;
394 
397  inline BaseFloat GetNormalizedLogPitchFeature(int32 frame);
398 
400  inline void GetNormalizationWindow(int32 frame,
401  int32 src_frames_ready,
402  int32 *window_begin,
403  int32 *window_end) const;
404 
407  inline void UpdateNormalizationStats(int32 frame);
408 };
409 
410 
418  const VectorBase<BaseFloat> &wave,
419  Matrix<BaseFloat> *output);
420 
// Processes the raw (NCCF, pitch) quantities computed by ComputeKaldiPitch
// (passed in "input") into features, which are written to "output".
// "opts" supplies the post-processing configuration.
431 void ProcessPitch(const ProcessPitchOptions &opts,
432  const MatrixBase<BaseFloat> &input,
433  Matrix<BaseFloat> *output);
434 
443  const ProcessPitchOptions &process_opts,
444  const VectorBase<BaseFloat> &wave,
445  Matrix<BaseFloat> *output);
446 
447 
449 } // namespace kaldi
450 #endif // KALDI_FEAT_PITCH_FUNCTIONS_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
std::vector< BaseFloat > delta_feature_noise_
virtual int32 Dim() const
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
void ComputeKaldiPitch(const PitchExtractionOptions &opts, const VectorBase< BaseFloat > &wave, Matrix< BaseFloat > *output)
This function extracts (pitch, NCCF) per frame, using the pitch extraction method described in "A Pit...
virtual BaseFloat FrameShiftInSeconds() const
This online-feature class implements post processing of pitch features.
kaldi::int32 int32
ProcessPitchOptions opts_
OnlinePitchFeatureImpl * impl_
void Register(const std::string &name, bool *ptr, const std::string &doc)
int32 NccfWindowShift() const
Returns the window-shift in samples, after resampling.
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
virtual int32 Dim() const
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
std::vector< NormalizationStats > normalization_stats_
void ComputeAndProcessKaldiPitch(const PitchExtractionOptions &pitch_opts, const ProcessPitchOptions &process_opts, const VectorBase< BaseFloat > &wave, Matrix< BaseFloat > *output)
This function combines ComputeKaldiPitch and ProcessPitch.
void Register(OptionsItf *opts)
Add a virtual class for "source" features such as MFCC or PLP or pitch features.
int32 NccfWindowSize() const
Returns the window-size in samples, after resampling.
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
OnlineFeatureInterface is an interface for online feature processing (it is also usable in the offlin...
void ProcessPitch(const ProcessPitchOptions &opts, const MatrixBase< BaseFloat > &input, Matrix< BaseFloat > *output)
This function takes the raw (NCCF, pitch) quantities computed by ComputeKaldiPitch and turns them into features.
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
OnlineFeatureInterface * src_
void Register(ParseOptions *opts)