pitch-functions.cc
Go to the documentation of this file.
1 // feat/pitch-functions.cc
2 
3 // Copyright 2013 Pegah Ghahremani
4 // 2014 IMSL, PKU-HKUST (author: Wei Shi)
5 // 2014 Yanqing Sun, Junjie Wang,
6 // Daniel Povey, Korbinian Riedhammer
7 // Xin Lei
8 
9 // See ../../COPYING for clarification regarding multiple authors
10 //
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 //
15 // http://www.apache.org/licenses/LICENSE-2.0
16 //
17 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
19 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
20 // MERCHANTABLITY OR NON-INFRINGEMENT.
21 // See the Apache 2 License for the specific language governing permissions and
22 // limitations under the License.
23 
24 #include <algorithm>
25 #include <limits>
26 
27 #include "feat/feature-functions.h"
28 #include "feat/mel-computations.h"
29 #include "feat/online-feature.h"
30 #include "feat/pitch-functions.h"
31 #include "feat/resample.h"
33 
34 namespace kaldi {
35 
// Tail of a function whose signature is not part of this listing.
// Clamps n to [-1, 1] and returns pow(1.0001 - n, 0.15) - 1.0.
45  if (n > 1.0) {
46  n = 1.0;
47  } else if (n < -1.0) {
48  n = -1.0;
49  }
// After clamping, 1.0001 - n lies in [0.0001, 2.0001], so the base passed to
// pow() is strictly positive.
50  BaseFloat f = pow((1.0001 - n), 0.15) - 1.0;
// f - f is zero only when f is finite; NaN or inf makes the assertion fail.
51  KALDI_ASSERT(f - f == 0); // check for NaN,inf.
52  return f;
53 }
54 
// Tail of a function whose signature is not part of this listing.
// Maps n to a value p in (0, 1): the magnitude of n (capped at 1) is turned
// into a log-odds value r by a fixed formula, and p is the logistic function
// of r.
79  BaseFloat ndash = fabs(n);
80  if (ndash > 1.0) ndash = 1.0; // just in case it was slightly outside [-1, 1]
81 
82  BaseFloat r = -5.2 + 5.4 * Exp(7.5 * (ndash - 1.0)) + 4.8 * ndash -
83  2.0 * Exp(-10.0 * ndash) + 4.2 * Exp(20.0 * (ndash - 1.0));
84  // r is the approximate log-prob-ratio of voicing, log(p/(1-p)).
// Invert the log-odds: p = 1 / (1 + exp(-r)).
85  BaseFloat p = 1.0 / (1 + Exp(-1.0 * r));
// p - p is zero only when p is finite.
86  KALDI_ASSERT(p - p == 0); // Check for NaN/inf
87  return p;
88 }
89 
// Fragment of a correlation routine (its first signature line, which declares
// "wave", is not part of this listing). For each lag in
// [first_lag, last_lag] it stores, at index lag - first_lag:
//   (*inner_prod): the dot product of the window at offset 0 with the window
//                  at offset lag;
//   (*norm_prod):  e1 * e2, the product of the two windows' energies.
// Both windows hold nccf_window_size samples and are taken from a copy of
// "wave" from which a constant has been subtracted (see below).
// NOTE(review): the windows reach up to sample last_lag + nccf_window_size - 1,
// so "wave" presumably must be at least that long -- confirm against caller.
103  int32 first_lag, int32 last_lag,
104  int32 nccf_window_size,
105  VectorBase<BaseFloat> *inner_prod,
106  VectorBase<BaseFloat> *norm_prod) {
107  Vector<BaseFloat> zero_mean_wave(wave);
108  // TODO: possibly fix this, the mean normalization is done in a strange way.
// The constant subtracted is the mean of only the first nccf_window_size
// samples, yet it is removed from every sample of the copy.
109  SubVector<BaseFloat> wave_part(wave, 0, nccf_window_size);
110  // subtract mean-frame from wave
111  zero_mean_wave.Add(-wave_part.Sum() / nccf_window_size);
112  BaseFloat e1, e2, sum;
// sub_vec1 is the reference window at offset 0; e1 is its energy and does not
// depend on the lag.
113  SubVector<BaseFloat> sub_vec1(zero_mean_wave, 0, nccf_window_size);
114  e1 = VecVec(sub_vec1, sub_vec1);
115  for (int32 lag = first_lag; lag <= last_lag; lag++) {
// sub_vec2 is the window shifted by "lag" samples; e2 is its energy.
116  SubVector<BaseFloat> sub_vec2(zero_mean_wave, lag, nccf_window_size);
117  e2 = VecVec(sub_vec2, sub_vec2);
118  sum = VecVec(sub_vec1, sub_vec2);
119  (*inner_prod)(lag - first_lag) = sum;
120  (*norm_prod)(lag - first_lag) = e1 * e2;
121  }
122 }
123 
131 void ComputeNccf(const VectorBase<BaseFloat> &inner_prod,
132  const VectorBase<BaseFloat> &norm_prod,
133  BaseFloat nccf_ballast,
134  VectorBase<BaseFloat> *nccf_vec) {
135  KALDI_ASSERT(inner_prod.Dim() == norm_prod.Dim() &&
136  inner_prod.Dim() == nccf_vec->Dim());
137  for (int32 lag = 0; lag < inner_prod.Dim(); lag++) {
138  BaseFloat numerator = inner_prod(lag),
139  denominator = pow(norm_prod(lag) + nccf_ballast, 0.5),
140  nccf;
141  if (denominator != 0.0) {
142  nccf = numerator / denominator;
143  } else {
144  KALDI_ASSERT(numerator == 0.0);
145  nccf = 0.0;
146  }
147  KALDI_ASSERT(nccf < 1.01 && nccf > -1.01);
148  (*nccf_vec)(lag) = nccf;
149  }
150 }
151 
// Fragment of the lag-selection routine (first signature line, which declares
// "opts", is not part of this listing). Fills *lags with a geometric
// progression: it starts at 1 / opts.max_f0, multiplies by
// (1 + opts.delta_pitch) at each step, and stops once a value would exceed
// 1 / opts.min_f0. *lags is resized to the number of values generated.
158  Vector<BaseFloat> *lags) {
159  // choose lags relative to acceptable pitch tolerance
160  BaseFloat min_lag = 1.0 / opts.max_f0, max_lag = 1.0 / opts.min_f0;
161 
// Collect into a std::vector first because the count is not known up front.
162  std::vector<BaseFloat> tmp_lags;
163  for (BaseFloat lag = min_lag; lag <= max_lag; lag *= 1.0 + opts.delta_pitch)
164  tmp_lags.push_back(lag);
165  lags->Resize(tmp_lags.size());
166  std::copy(tmp_lags.begin(), tmp_lags.end(), lags->Data());
167 }
168 
169 
// Fragment of the local-cost routine (first signature line, which declares
// "nccf_pitch", is not part of this listing). Writes to *local_cost, per
// element i:
//   1 - nccf_pitch(i) + opts.soft_min_f0 * lags(i) * nccf_pitch(i).
// nccf_pitch, lags and *local_cost must all have the same dimension.
179  const VectorBase<BaseFloat> &lags,
180  const PitchExtractionOptions &opts,
181  VectorBase<BaseFloat> *local_cost) {
182  // from the paper, eq. 5, local_cost = 1 - Phi(t,i)(1 - soft_min_f0 L_i)
183  // nccf is the nccf on this frame measured at the lags in "lags".
184  KALDI_ASSERT(nccf_pitch.Dim() == local_cost->Dim() &&
185  nccf_pitch.Dim() == lags.Dim());
// The expression is built up in three steps: the constant 1, then the two
// terms of the expanded product.
186  local_cost->Set(1.0);
187  // add the term -Phi(t,i):
188  local_cost->AddVec(-1.0, nccf_pitch);
189  // add the term soft_min_f0 Phi(t,i) L_i
190  local_cost->AddVecVec(opts.soft_min_f0, lags, nccf_pitch, 1.0);
191 }
192 
193 
194 
195 // class PitchFrameInfo is used inside class OnlinePitchFeatureImpl.
196 // It stores the information we need to keep around for a single frame
197 // of the pitch computation.
199  public:
205  void Cleanup(PitchFrameInfo *prev_frame);
206 
214  void SetBestState(int32 best_state,
215  std::vector<std::pair<int32, BaseFloat> > &lag_nccf);
216 
223  int32 ComputeLatency(int32 max_latency);
224 
226  bool UpdatePreviousBestState(PitchFrameInfo *prev_frame);
227 
230  explicit PitchFrameInfo(int32 num_states);
231 
234 
237  void SetNccfPov(const VectorBase<BaseFloat> &nccf_pov);
238 
253  void ComputeBacktraces(const PitchExtractionOptions &opts,
254  const VectorBase<BaseFloat> &nccf_pitch,
255  const VectorBase<BaseFloat> &lags,
256  const VectorBase<BaseFloat> &prev_forward_cost,
257  std::vector<std::pair<int32, int32> > *index_info,
258  VectorBase<BaseFloat> *this_forward_cost);
259  private:
260  // struct StateInfo is the information we keep for a single one of the
261  // log-spaced lags, for a single frame. This is a state in the Viterbi
262  // computation.
263  struct StateInfo {
270  StateInfo(): backpointer(0), pov_nccf(0.0) { }
271  };
272  std::vector<StateInfo> state_info_;
276 
279 
282 };
283 
284 
285 // This constructor is used for frame -1; every state is default-constructed
286 // (pov_nccf 0.0, backpointer 0), cur_best_state_ is -1 and prev_info_ is NULL.
288  :state_info_(num_states), state_offset_(0),
289  cur_best_state_(-1), prev_info_(NULL) { }
290 
291 
// When true, the backtrace computation in this file takes its exhaustive
// branch, which tries every (i, j) state pair, instead of the
// bounds-tightening search.
292 bool pitch_use_naive_search = false; // This is used in unit-tests.
293 
294 
296  state_info_(prev_info->state_info_.size()), state_offset_(0),
297  cur_best_state_(-1), prev_info_(prev_info) { }
298 
// Tail of a function whose signature is not part of this listing.
// Copies nccf_pov(i) into state_info_[i].pov_nccf for every state; the
// dimension of nccf_pov must equal the number of states.
300  int32 num_states = nccf_pov.Dim();
301  KALDI_ASSERT(num_states == state_info_.size());
302  for (int32 i = 0; i < num_states; i++)
303  state_info_[i].pov_nccf = nccf_pov(i);
304 }
305 
// Fragment of the backtrace computation (its first signature line is not part
// of this listing).
//
// For every state i of this frame it finds the previous-frame state j that
// minimizes
//     (j - i)^2 * inter_frame_factor + prev_forward_cost[j],
// stores that j in state_info_[i].backpointer and the minimum in
// this_forward_cost[i]. After the search, the local cost computed by
// ComputeLocalCost() is added to *this_forward_cost_vec, and cur_best_state_
// is reset to -1.
//
// *index_info is scratch space holding one (lower, upper) bound pair per
// state; it is resized to num_states only if it is empty on entry.
307  const PitchExtractionOptions &opts,
308  const VectorBase<BaseFloat> &nccf_pitch,
309  const VectorBase<BaseFloat> &lags,
310  const VectorBase<BaseFloat> &prev_forward_cost_vec,
311  std::vector<std::pair<int32, int32> > *index_info,
312  VectorBase<BaseFloat> *this_forward_cost_vec) {
313  int32 num_states = nccf_pitch.Dim();
314 
315  Vector<BaseFloat> local_cost(num_states, kUndefined);
316  ComputeLocalCost(nccf_pitch, lags, opts, &local_cost);
317 
// inter_frame_factor scales the squared state distance (j - i)^2 in the
// transition cost.
318  const BaseFloat delta_pitch_sq = pow(Log(1.0 + opts.delta_pitch), 2.0),
319  inter_frame_factor = delta_pitch_sq * opts.penalty_factor;
320 
321  // index prev_forward_cost and this_forward_cost using raw pointer
322  // indexing not operator (), since this is the very inner loop and a lot of
323  // time is taken here.
324  const BaseFloat *prev_forward_cost = prev_forward_cost_vec.Data();
325  BaseFloat *this_forward_cost = this_forward_cost_vec->Data();
326 
327  if (index_info->empty())
328  index_info->resize(num_states);
329 
330  // make it a reference for more concise indexing.
331  std::vector<std::pair<int32, int32> > &bounds = *index_info;
332 
333  /* bounds[i].first will be a lower bound on the backpointer for state i,
334  bounds[i].second will be an upper bound on it. We progressively tighten
335  these bounds till we know the backpointers exactly.
336  */
337 
338  if (pitch_use_naive_search) {
339  // This branch is only taken in unit-testing code.
// Exhaustive search: every j is tried for every i.
340  for (int32 i = 0; i < num_states; i++) {
341  BaseFloat best_cost = std::numeric_limits<BaseFloat>::infinity();
342  int32 best_j = -1;
343  for (int32 j = 0; j < num_states; j++) {
344  BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor
345  + prev_forward_cost[j];
346  if (this_cost < best_cost) {
347  best_cost = this_cost;
348  best_j = j;
349  }
350  }
351  this_forward_cost[i] = best_cost;
352  state_info_[i].backpointer = best_j;
353  }
354  } else {
// Initial forward sweep: the search for state i starts at the backpointer
// found for state i - 1 and stops at the first j that does not improve.
355  int32 last_backpointer = 0;
356  for (int32 i = 0; i < num_states; i++) {
357  int32 start_j = last_backpointer;
358  BaseFloat best_cost = (start_j - i) * (start_j - i) * inter_frame_factor
359  + prev_forward_cost[start_j];
360  int32 best_j = start_j;
361 
362  for (int32 j = start_j + 1; j < num_states; j++) {
363  BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor
364  + prev_forward_cost[j];
365  if (this_cost < best_cost) {
366  best_cost = this_cost;
367  best_j = j;
368  } else { // as soon as the costs stop improving, we stop searching.
369  break; // this is a loose lower bound we're getting.
370  }
371  }
372  state_info_[i].backpointer = best_j;
373  this_forward_cost[i] = best_cost;
374  bounds[i].first = best_j; // this is now a lower bound on the
375  // backpointer.
376  bounds[i].second = num_states - 1; // we have no meaningful upper bound
377  // yet.
378  last_backpointer = best_j;
379  }
380 
381  // We iterate, progressively refining the upper and lower bounds until they
382  // meet and we know that the resulting backtraces are optimal. Each
383  // iteration takes time linear in num_states. We won't normally iterate as
384  // far as num_states; normally we only do two iterations; when printing out
385  // the number of iterations, it's rarely more than that (once I saw seven
386  // iterations). Anyway, this part of the computation does not dominate.
387  for (int32 iter = 0; iter < num_states; iter++) {
388  bool changed = false;
389  if (iter % 2 == 0) { // go backwards through the states
390  last_backpointer = num_states - 1;
391  for (int32 i = num_states - 1; i >= 0; i--) {
// The backpointer chosen for state i + 1 caps the upper bound for state i.
392  int32 lower_bound = bounds[i].first,
393  upper_bound = std::min(last_backpointer, bounds[i].second);
394  if (upper_bound == lower_bound) {
395  last_backpointer = lower_bound;
396  continue;
397  }
398  BaseFloat best_cost = this_forward_cost[i];
399  int32 best_j = state_info_[i].backpointer, initial_best_j = best_j;
400 
401  if (best_j == upper_bound) {
402  // if best_j already equals upper bound, don't bother tightening the
403  // upper bound, we'll tighten the lower bound when the time comes.
404  last_backpointer = best_j;
405  continue;
406  }
407  // Below, we have j > lower_bound + 1 because we know we've already
408  // evaluated lower_bound and lower_bound + 1 [via knowledge of
409  // this algorithm.]
410  for (int32 j = upper_bound; j > lower_bound + 1; j--) {
411  BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor
412  + prev_forward_cost[j];
413  if (this_cost < best_cost) {
414  best_cost = this_cost;
415  best_j = j;
416  } else { // as soon as the costs stop improving, we stop searching,
417  // unless the best j is still lower than j, in which case
418  // we obviously need to keep moving.
419  if (best_j > j)
420  break; // this is a loose upper bound we're getting.
421  }
422  }
423  // our "best_j" is now an upper bound on the backpointer.
424  bounds[i].second = best_j;
425  if (best_j != initial_best_j) {
426  this_forward_cost[i] = best_cost;
427  state_info_[i].backpointer = best_j;
428  changed = true;
429  }
430  last_backpointer = best_j;
431  }
432  } else { // go forwards through the states.
433  last_backpointer = 0;
434  for (int32 i = 0; i < num_states; i++) {
// The backpointer chosen for state i - 1 raises the lower bound for state i.
435  int32 lower_bound = std::max(last_backpointer, bounds[i].first),
436  upper_bound = bounds[i].second;
437  if (upper_bound == lower_bound) {
438  last_backpointer = lower_bound;
439  continue;
440  }
441  BaseFloat best_cost = this_forward_cost[i];
442  int32 best_j = state_info_[i].backpointer, initial_best_j = best_j;
443 
444  if (best_j == lower_bound) {
445  // if best_j already equals lower bound, we don't bother tightening
446  // the lower bound, we'll tighten the upper bound when the time
447  // comes.
448  last_backpointer = best_j;
449  continue;
450  }
451  // Below, the loop stops before upper_bound - 1; NOTE(review): presumably
452  // upper_bound and upper_bound - 1 were already evaluated -- confirm.
453  for (int32 j = lower_bound; j < upper_bound - 1; j++) {
454  BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor
455  + prev_forward_cost[j];
456  if (this_cost < best_cost) {
457  best_cost = this_cost;
458  best_j = j;
459  } else { // as soon as the costs stop improving, we stop searching,
460  // unless the best j is still higher than j, in which case
461  // we obviously need to keep moving.
462  if (best_j < j)
463  break; // this is a loose lower bound we're getting.
464  }
465  }
466  // our "best_j" is now a lower bound on the backpointer.
467  bounds[i].first = best_j;
468  if (best_j != initial_best_j) {
469  this_forward_cost[i] = best_cost;
470  state_info_[i].backpointer = best_j;
471  changed = true;
472  }
473  last_backpointer = best_j;
474  }
475  }
// A full sweep that changed no backpointer ends the refinement.
476  if (!changed)
477  break;
478  }
479  }
480  // The next statement is needed due to RecomputeBacktraces: we have to
481  // invalidate the previously computed best-state info.
482  cur_best_state_ = -1;
// Up to here this_forward_cost holds only transition + previous cost; the
// per-state local cost is added last.
483  this_forward_cost_vec->AddVec(1.0, local_cost);
484 }
485 
// Fragment of the traceback routine (its first signature line is not part of
// this listing).
//
// Starting at this frame with state "best_state", follows the stored
// backpointers through the prev_info_ chain. For every frame that has a
// predecessor it writes the pair (state, pov_nccf of that state) into
// lag_nccf, filling from the last element towards the front. It records the
// state in each frame's cur_best_state_, and returns early as soon as a frame
// already has that same state recorded.
// NOTE(review): lag_nccf is written through a reverse iterator with no bounds
// check, so it presumably must hold at least one entry per frame in the
// chain that has a predecessor -- confirm against the caller.
487  int32 best_state,
488  std::vector<std::pair<int32, BaseFloat> > &lag_nccf) {
489 
490  // This function would naturally be recursive, but we have coded this to avoid
491  // recursion, which would otherwise eat up the stack. Think of it as a static
492  // member function, except we do use "this" right at the beginning.
493 
494  std::vector<std::pair<int32, BaseFloat> >::reverse_iterator iter = lag_nccf.rbegin();
495 
496  PitchFrameInfo *this_info = this; // it will change in the loop.
497  while (this_info != NULL) {
498  PitchFrameInfo *prev_info = this_info->prev_info_;
499  if (best_state == this_info->cur_best_state_)
500  return; // no change
501  if (prev_info != NULL) // don't write anything for frame -1.
502  iter->first = best_state;
// state_offset_ converts the state number into an index into state_info_.
503  size_t state_info_index = best_state - this_info->state_offset_;
504  KALDI_ASSERT(state_info_index < this_info->state_info_.size());
505  this_info->cur_best_state_ = best_state;
// The best state of the previous frame is this state's backpointer.
506  best_state = this_info->state_info_[state_info_index].backpointer;
507  if (prev_info != NULL) // don't write anything for frame -1.
508  iter->second = this_info->state_info_[state_info_index].pov_nccf;
509  this_info = prev_info;
510  if (this_info != NULL) ++iter;
511  }
512 }
513 
// Tail of a function whose signature is not part of this listing.
//
// Traces the lowest and the highest state of this frame back through the
// backpointers, one frame at a time. Returns how many frames back it had
// moved when both traces first reached the same state, or the count reached
// when the loop ends (at max_latency, or at the end of the prev_info_ chain).
// Returns 0 immediately if max_latency <= 0.
515  if (max_latency <= 0) return 0;
516 
517  int32 latency = 0;
518 
519  // This function would naturally be recursive, but we have coded this to avoid
520  // recursion, which would otherwise eat up the stack. Think of it as a static
521  // member function, except we do use "this" right at the beginning.
522  // This function is called only on the most recent PitchFrameInfo object.
523  int32 num_states = state_info_.size();
// Start from the two extreme states; these are the two traces followed back.
524  int32 min_living_state = 0, max_living_state = num_states - 1;
525  PitchFrameInfo *this_info = this; // it will change in the loop.
526 
527 
528  for (; this_info != NULL && latency < max_latency;) {
529  int32 offset = this_info->state_offset_;
530  KALDI_ASSERT(min_living_state >= offset &&
531  max_living_state - offset < this_info->state_info_.size());
// Step both traces back one frame via their backpointers.
532  min_living_state =
533  this_info->state_info_[min_living_state - offset].backpointer;
534  max_living_state =
535  this_info->state_info_[max_living_state - offset].backpointer;
536  if (min_living_state == max_living_state) {
537  return latency;
538  }
539  this_info = this_info->prev_info_;
540  if (this_info != NULL) // avoid incrementing latency for frame -1,
541  latency++; // as it's not a real frame.
542  }
543  return latency;
544 }
545 
547  KALDI_ERR << "Cleanup not implemented.";
548 }
549 
550 
551 // struct NccfInfo is used to cache certain quantities that we need for online
552 // operation, for the first "recompute_frame" frames of the file (e.g. 300);
553 // after that many frames, or after the user calls InputFinished(), we redo the
554 // initial backtraces, as we'll then have a better estimate of the average signal
555 // energy.
556 struct NccfInfo {
557 
558  Vector<BaseFloat> nccf_pitch_resampled; // resampled nccf_pitch
559  BaseFloat avg_norm_prod; // average value of e1 * e2.
560  BaseFloat mean_square_energy; // mean_square energy we used when computing the
561  // original ballast term for
562  // "nccf_pitch_resampled".
563 
564  NccfInfo(BaseFloat avg_norm_prod,
565  BaseFloat mean_square_energy):
566  avg_norm_prod(avg_norm_prod),
567  mean_square_energy(mean_square_energy) { }
568 };
569 
570 
571 
572 // We could inherit from OnlineBaseFeature as we have the same interface,
573 // but this would unnecessarily force a lot of our functions to be virtual.
575  public:
576  explicit OnlinePitchFeatureImpl(const PitchExtractionOptions &opts);
577 
578  int32 Dim() const { return 2; }
579 
580  BaseFloat FrameShiftInSeconds() const;
581 
582  int32 NumFramesReady() const;
583 
584  bool IsLastFrame(int32 frame) const;
585 
586  void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
587 
588  void AcceptWaveform(BaseFloat sampling_rate,
589  const VectorBase<BaseFloat> &waveform);
590 
591  void InputFinished();
592 
594 
595 
596  // Copy-constructor, can be used to obtain a new copy of this object,
597  // any state from this utterance.
599 
600  private:
601 
608  int32 NumFramesAvailable(int64 num_downsampled_samples, bool snip_edges) const;
609 
622  void ExtractFrame(const VectorBase<BaseFloat> &downsampled_wave_part,
623  int64 frame_index,
624  VectorBase<BaseFloat> *window);
625 
626 
635  void RecomputeBacktraces();
636 
637 
641  void UpdateRemainder(const VectorBase<BaseFloat> &downsampled_wave_part);
642 
643 
644  // The following variables don't change throughout the lifetime
645  // of this object.
647 
648  // the first lag of the downsampled signal at which we measure NCCF
650  // the last lag of the downsampled signal at which we measure NCCF
652 
653  // The log-spaced lags at which we will resample the NCCF
655 
656  // This object is used to resample from evenly spaced to log-evenly-spaced
657  // nccf values. It's a pointer for convenience of initialization, so we don't
658  // have to use the initializer from the constructor.
660 
661  // The following objects may change during the lifetime of this object.
662 
663  // This object is used to resample the signal.
665 
666  // frame_info_ is indexed by [frame-index + 1]. frame_info_[0] is an object
667  // that corresponds to frame -1, which is not a real frame.
668  std::vector<PitchFrameInfo*> frame_info_;
669 
670 
671  // nccf_info_ is indexed by frame-index, from frame 0 to at most
672  // opts_.recompute_frame - 1. It contains some information we'll
673  // need to recompute the tracebacks after getting a better estimate
674  // of the average energy of the signal.
675  std::vector<NccfInfo*> nccf_info_;
676 
677  // Current number of frames which we can't output because Viterbi has not
678  // converged for them, or opts_.max_frames_latency if we have reached that
679  // limit.
681 
682  // The forward-cost at the current frame (the last frame in frame_info_);
683  // this has the same dimension as lags_. We normalize each time so
684  // the lowest cost is zero, for numerical accuracy and so we can use float.
686 
687  // stores the constant part of forward_cost_.
689 
690  // The resampled-lag index and the NCCF (as computed for POV, without ballast
691  // term) for each frame, as determined by Viterbi traceback from the best
692  // final state.
693  std::vector<std::pair<int32, BaseFloat> > lag_nccf_;
694 
696 
700 
703  double signal_sum_;
704 
712 };
713 
714 
// Fragment of the constructor (listing lines 715 and 719 are not part of this
// listing, so the first call after the initializer list is incomplete here).
// From the visible code it:
//   - derives nccf_first_lag_ / nccf_last_lag_ from 1/max_f0 and 1/min_f0,
//     widened by upsample_filter_width / (2 * resample_freq) on each side;
//   - fills lags_ via SelectLags();
//   - allocates nccf_resampler_ with "new" (released elsewhere in this file);
//   - pushes one PitchFrameInfo for the fake frame -1 and zeroes
//     forward_cost_.
716  const PitchExtractionOptions &opts):
717  opts_(opts), forward_cost_remainder_(0.0), input_finished_(false),
718  signal_sumsq_(0.0), signal_sum_(0.0), downsampled_samples_processed_(0) {
720  opts.lowpass_cutoff,
721  opts.lowpass_filter_width);
722 
// The lag range is widened on both sides, by an amount that depends on
// upsample_filter_width.
723  double outer_min_lag = 1.0 / opts.max_f0 -
724  (opts.upsample_filter_width/(2.0 * opts.resample_freq));
725  double outer_max_lag = 1.0 / opts.min_f0 +
726  (opts.upsample_filter_width/(2.0 * opts.resample_freq));
727  nccf_first_lag_ = ceil(opts.resample_freq * outer_min_lag);
728  nccf_last_lag_ = floor(opts.resample_freq * outer_max_lag);
729 
730  frames_latency_ = 0; // will be set in AcceptWaveform()
731 
732  // Choose the lags at which we resample the NCCF.
733  SelectLags(opts, &lags_);
734 
735  // upsample_cutoff is the filter cutoff for upsampling the NCCF, which is the
736  // Nyquist of the resampling frequency. The NCCF is (almost completely)
737  // bandlimited to around "lowpass_cutoff" (1000 by default), and when the
738  // spectrum of this bandlimited signal is convolved with the spectrum of an
739  // impulse train with frequency "resample_freq", which are separated by 4kHz,
740  // we get energy at -5000,-3000, -1000...1000, 3000..5000, etc. Filtering at
741  // half the resampling frequency (2000 by default) is sufficient to get only
742  // the first repetition.
743  BaseFloat upsample_cutoff = opts.resample_freq * 0.5;
744 
745 
746  Vector<BaseFloat> lags_offset(lags_);
747  // lags_offset equals lags_ (which are the log-spaced lag values we want to
748  // measure the NCCF at) with nccf_first_lag_ / opts.resample_freq subtracted
749  // from each element, so we can treat the measured NCCF values as starting
750  // from sample zero in a signal that starts at the point start /
751  // opts.resample_freq. This is necessary because the ArbitraryResample code
752  // assumes that the input signal starts from sample zero.
753  lags_offset.Add(-nccf_first_lag_ / opts.resample_freq);
754 
// Number of integer lags, first and last inclusive.
755  int32 num_measured_lags = nccf_last_lag_ + 1 - nccf_first_lag_;
756 
757  nccf_resampler_ = new ArbitraryResample(num_measured_lags, opts.resample_freq,
758  upsample_cutoff, lags_offset,
759  opts.upsample_filter_width);
760 
761  // add a PitchFrameInfo object for frame -1 (not a real frame).
762  frame_info_.push_back(new PitchFrameInfo(lags_.Dim()));
763  // zeroes forward_cost_; this is what we want for the fake frame -1.
764  forward_cost_.Resize(lags_.Dim());
765 }
766 
767 
// Fragment of the frame-count routine (its first signature line is not part
// of this listing). Returns how many frames can be computed from
// num_downsampled_samples samples.
//
// The required frame length is opts_.NccfWindowSize(), plus nccf_last_lag_
// while the input is not finished. With fewer samples than that the result
// is 0. Otherwise:
//   snip_edges == true:  (samples - frame_length) / frame_shift + 1, using
//                        integer division;
//   snip_edges == false: samples / frame_shift rounded to nearest once the
//                        input is finished, else
//                        (samples - frame_length / 2) / frame_shift rounded
//                        to nearest.
// NOTE(review): the snip_edges == false branches do the arithmetic in float
// (via "* 1.0f"), which looks like it could lose precision for very large
// sample counts -- confirm whether that matters for expected input lengths.
769  int64 num_downsampled_samples, bool snip_edges) const {
770  int32 frame_shift = opts_.NccfWindowShift(),
771  frame_length = opts_.NccfWindowSize();
772  // Use the "full frame length" to compute the number
773  // of frames only if the input is not finished.
774  if (!input_finished_)
775  frame_length += nccf_last_lag_;
776  if (num_downsampled_samples < frame_length) {
777  return 0;
778  } else {
779  if (!snip_edges) {
780  if (input_finished_) {
// Adding 0.5f before the cast rounds to the nearest integer.
781  return static_cast<int32>(num_downsampled_samples * 1.0f /
782  frame_shift + 0.5f);
783  } else {
784  return static_cast<int32>((num_downsampled_samples - frame_length / 2) *
785  1.0f / frame_shift + 0.5f);
786  }
787  } else {
788  return static_cast<int32>((num_downsampled_samples - frame_length) /
789  frame_shift + 1);
790  }
791  }
792 }
793 
795  const VectorBase<BaseFloat> &downsampled_wave_part) {
796  // frame_info_ has an extra element at frame-1, so subtract
797  // one from the length.
798  int64 num_frames = static_cast<int64>(frame_info_.size()) - 1,
799  next_frame = num_frames,
800  frame_shift = opts_.NccfWindowShift(),
801  next_frame_sample = frame_shift * next_frame;
802 
803  signal_sumsq_ += VecVec(downsampled_wave_part, downsampled_wave_part);
804  signal_sum_ += downsampled_wave_part.Sum();
805 
806  // next_frame_sample is the first sample index we'll need for the
807  // next frame.
808  int64 next_downsampled_samples_processed =
809  downsampled_samples_processed_ + downsampled_wave_part.Dim();
810 
811  if (next_frame_sample > next_downsampled_samples_processed) {
812  // this could only happen in the weird situation that the full frame length
813  // is less than the frame shift.
814  int32 full_frame_length = opts_.NccfWindowSize() + nccf_last_lag_;
815  KALDI_ASSERT(full_frame_length < frame_shift && "Code error");
817  } else {
818  Vector<BaseFloat> new_remainder(next_downsampled_samples_processed -
819  next_frame_sample);
820  // note: next_frame_sample is the index into the entire signal, of
821  // new_remainder(0).
822  // i is the absolute index of the signal.
823  for (int64 i = next_frame_sample;
824  i < next_downsampled_samples_processed; i++) {
825  if (i >= downsampled_samples_processed_) { // in current signal.
826  new_remainder(i - next_frame_sample) =
827  downsampled_wave_part(i - downsampled_samples_processed_);
828  } else { // in old remainder; only reach here if waveform supplied is
829  new_remainder(i - next_frame_sample) = // tiny.
832  }
833  }
834  downsampled_signal_remainder_.Swap(&new_remainder);
835  }
836  downsampled_samples_processed_ = next_downsampled_samples_processed;
837 }
838 
840  const VectorBase<BaseFloat> &downsampled_wave_part,
841  int64 sample_index,
842  VectorBase<BaseFloat> *window) {
843  int32 full_frame_length = window->Dim();
844  int32 offset = static_cast<int32>(sample_index -
846 
847  // Treat edge cases first
848  if (sample_index < 0) {
849  // Part of the frame is before the beginning of the signal. This
850  // should only happen if opts_.snip_edges == false, when we are
851  // processing the first few frames of signal. In this case
852  // we pad with zeros.
853  KALDI_ASSERT(opts_.snip_edges == false);
854  int32 sub_frame_length = sample_index + full_frame_length;
855  int32 sub_frame_index = full_frame_length - sub_frame_length;
856  KALDI_ASSERT(sub_frame_length > 0 && sub_frame_index > 0);
857  window->SetZero();
858  SubVector<BaseFloat> sub_window(*window, sub_frame_index, sub_frame_length);
859  ExtractFrame(downsampled_wave_part, 0, &sub_window);
860  return;
861  }
862 
863  if (offset + full_frame_length > downsampled_wave_part.Dim()) {
864  // Requested frame is past end of the signal. This should only happen if
865  // input_finished_ == true, when we're flushing out the last couple of
866  // frames of signal. In this case we pad with zeros.
868  int32 sub_frame_length = downsampled_wave_part.Dim() - offset;
869  KALDI_ASSERT(sub_frame_length > 0);
870  window->SetZero();
871  SubVector<BaseFloat> sub_window(*window, 0, sub_frame_length);
872  ExtractFrame(downsampled_wave_part, sample_index, &sub_window);
873  return;
874  }
875 
876  // "offset" is the offset of the start of the frame, into this
877  // signal.
878  if (offset >= 0) {
879  // frame is full inside the new part of the signal.
880  window->CopyFromVec(downsampled_wave_part.Range(offset, full_frame_length));
881  } else {
882  // frame is partly in the remainder and partly in the new part.
883  int32 remainder_offset = downsampled_signal_remainder_.Dim() + offset;
884  KALDI_ASSERT(remainder_offset >= 0); // or we didn't keep enough remainder.
885  KALDI_ASSERT(offset + full_frame_length > 0); // or we should have
886  // processed this frame last
887  // time.
888 
889  int32 old_length = -offset, new_length = offset + full_frame_length;
890  window->Range(0, old_length).CopyFromVec(
891  downsampled_signal_remainder_.Range(remainder_offset, old_length));
892  window->Range(old_length, new_length).CopyFromVec(
893  downsampled_wave_part.Range(0, new_length));
894  }
895  if (opts_.preemph_coeff != 0.0) {
896  BaseFloat preemph_coeff = opts_.preemph_coeff;
897  for (int32 i = window->Dim() - 1; i > 0; i--)
898  (*window)(i) -= preemph_coeff * (*window)(i-1);
899  (*window)(0) *= (1.0 - preemph_coeff);
900  }
901 }
902 
// Tail of a function whose signature is not part of this listing.
// Returns true only if the input has been marked finished and "frame" is the
// last of the NumFramesReady() frames. "frame" must be less than that count.
904  int32 T = NumFramesReady();
905  KALDI_ASSERT(frame < T);
906  return (input_finished_ && frame + 1 == T);
907 }
908 
// Tail of a function whose signature is not part of this listing.
// Converts the configured frame shift from milliseconds to seconds.
910  return opts_.frame_shift_ms / 1000.0f;
911 }
912 
// Tail of a function whose signature is not part of this listing.
// Returns the number of traced-back frames (the size of lag_nccf_) minus
// frames_latency_; frames_latency_ must not exceed that size.
914  int32 num_frames = lag_nccf_.size(),
915  latency = frames_latency_;
916  KALDI_ASSERT(latency <= num_frames);
917  return num_frames - latency;
918 }
919 
920 
// Fragment of the frame accessor (its first signature line, which declares
// "frame", is not part of this listing). Writes two values into *feat, which
// must have dimension 2:
//   (*feat)(0): the NCCF value stored for this frame in lag_nccf_;
//   (*feat)(1): the reciprocal of the lag selected for this frame.
// "frame" must be less than NumFramesReady().
922  VectorBase<BaseFloat> *feat) {
923  KALDI_ASSERT(frame < NumFramesReady() && feat->Dim() == 2);
924  (*feat)(0) = lag_nccf_[frame].second;
// lag_nccf_[frame].first is an index into lags_.
925  (*feat)(1) = 1.0 / lags_(lag_nccf_[frame].first);
926 }
927 
929  input_finished_ = true;
930  // Process an empty waveform; this has an effect because
931  // after setting input_finished_ to true, NumFramesAvailable()
932  // will return a slightly larger number.
934  int32 num_frames = static_cast<size_t>(frame_info_.size() - 1);
935  if (num_frames < opts_.recompute_frame && !opts_.nccf_ballast_online)
937  frames_latency_ = 0;
938  KALDI_VLOG(3) << "Pitch-tracking Viterbi cost is "
939  << (forward_cost_remainder_ / num_frames)
940  << " per frame, over " << num_frames << " frames.";
941 }
942 
943 // see comment with declaration. This is only relevant for online
944 // operation (it gets called for non-online mode, but is a no-op).
947  int32 num_frames = static_cast<int32>(frame_info_.size()) - 1;
948 
949  // The assertion reflects how we believe this function will be called.
950  KALDI_ASSERT(num_frames <= opts_.recompute_frame);
951  KALDI_ASSERT(nccf_info_.size() == static_cast<size_t>(num_frames));
952  if (num_frames == 0)
953  return;
954  double num_samp = downsampled_samples_processed_, sum = signal_sum_,
955  sumsq = signal_sumsq_, mean = sum / num_samp;
956  BaseFloat mean_square = sumsq / num_samp - mean * mean;
957 
958  bool must_recompute = false;
959  BaseFloat threshold = 0.01;
960  for (int32 frame = 0; frame < num_frames; frame++)
961  if (!ApproxEqual(nccf_info_[frame]->mean_square_energy,
962  mean_square, threshold))
963  must_recompute = true;
964 
965  if (!must_recompute) {
966  // Nothing to do. We'll reach here, for instance, if everything was in one
967  // chunk and opts_.nccf_ballast_online == false. This is the case for
968  // offline processing.
969  for (size_t i = 0; i < nccf_info_.size(); i++)
970  delete nccf_info_[i];
971  nccf_info_.clear();
972  return;
973  }
974 
975  int32 num_states = forward_cost_.Dim(),
976  basic_frame_length = opts_.NccfWindowSize();
977 
978  BaseFloat new_nccf_ballast = pow(mean_square * basic_frame_length, 2) *
980 
981  double forward_cost_remainder = 0.0;
982  Vector<BaseFloat> forward_cost(num_states), // start off at zero.
983  next_forward_cost(forward_cost);
984  std::vector<std::pair<int32, int32 > > index_info;
985 
986  for (int32 frame = 0; frame < num_frames; frame++) {
987  NccfInfo &nccf_info = *nccf_info_[frame];
988  BaseFloat old_mean_square = nccf_info_[frame]->mean_square_energy,
989  avg_norm_prod = nccf_info_[frame]->avg_norm_prod,
990  old_nccf_ballast = pow(old_mean_square * basic_frame_length, 2) *
992  nccf_scale = pow((old_nccf_ballast + avg_norm_prod) /
993  (new_nccf_ballast + avg_norm_prod),
994  static_cast<BaseFloat>(0.5));
995  // The "nccf_scale" is an estimate of the scaling factor by which the NCCF
996  // would change on this frame, on average, by changing the ballast term from
997  // "old_nccf_ballast" to "new_nccf_ballast". It's not exact because the
998  // "avg_norm_prod" is just an average of the product e1 * e2 of frame
999  // energies of the (frame, shifted-frame), but these won't change that much
1000  // within a frame, and even if they do, the inaccuracy of the scaled NCCF
1001  // will still be very small if the ballast term didn't change much, or if
1002  // it's much larger or smaller than e1*e2. By doing it as a simple scaling,
1003  // we save the overhead of the NCCF resampling, which is a considerable part
1004  // of the whole computation.
1005  nccf_info.nccf_pitch_resampled.Scale(nccf_scale);
1006 
1007  frame_info_[frame + 1]->ComputeBacktraces(
1008  opts_, nccf_info.nccf_pitch_resampled, lags_,
1009  forward_cost, &index_info, &next_forward_cost);
1010 
1011  forward_cost.Swap(&next_forward_cost);
1012  BaseFloat remainder = forward_cost.Min();
1013  forward_cost_remainder += remainder;
1014  forward_cost.Add(-remainder);
1015  }
1016  KALDI_VLOG(3) << "Forward-cost per frame changed from "
1017  << (forward_cost_remainder_ / num_frames) << " to "
1018  << (forward_cost_remainder / num_frames);
1019 
1020  forward_cost_remainder_ = forward_cost_remainder;
1021  forward_cost_.Swap(&forward_cost);
1022 
1023  int32 best_final_state;
1024  forward_cost_.Min(&best_final_state);
1025 
1026  if (lag_nccf_.size() != static_cast<size_t>(num_frames))
1027  lag_nccf_.resize(num_frames);
1028 
1029  frame_info_.back()->SetBestState(best_final_state, lag_nccf_);
1030  frames_latency_ =
1031  frame_info_.back()->ComputeLatency(opts_.max_frames_latency);
1032  for (size_t i = 0; i < nccf_info_.size(); i++)
1033  delete nccf_info_[i];
1034  nccf_info_.clear();
1035 }
1036 
// NOTE(review): the line that opens this definition is missing from this
// listing (the numbering jumps from 1036 to 1038); presumably this is the
// destructor of the class that owns these members -- confirm against the
// original file.
// Releases, with delete, the two resampler objects and then every element
// still held in frame_info_ and in nccf_info_.
1038  delete nccf_resampler_;
1039  delete signal_resampler_;
1040  for (size_t i = 0; i < frame_info_.size(); i++)
1041  delete frame_info_[i];
1042  for (size_t i = 0; i < nccf_info_.size(); i++)
1043  delete nccf_info_[i];
1044 }
1045 
// NOTE(review): the line that opens this definition (listing line 1046) is
// missing; given the forwarding call impl_->AcceptWaveform(...) elsewhere in
// this file, this is presumably the implementation class's AcceptWaveform --
// confirm against the original file.
//
// Processes one more chunk of waveform:
//   1. resamples "wave" with signal_resampler_;
//   2. works out how many new frames can be processed and, for each of them,
//      computes two NCCF rows (one with a ballast term, used for pitch, and
//      one with zero ballast, used for POV);
//   3. resamples both NCCF matrices with nccf_resampler_;
//   4. runs ComputeBacktraces() frame by frame, keeping forward_cost_
//      renormalized so that its smallest element is zero;
//   5. traces back from the best final state and updates frames_latency_.
// "sampling_rate" is not read anywhere in the visible body.
1047  BaseFloat sampling_rate,
1048  const VectorBase<BaseFloat> &wave) {
1049  // flush out the last few samples of input waveform only if input_finished_ ==
1050  // true.
1051  const bool flush = input_finished_;
1052 
1053  Vector<BaseFloat> downsampled_wave;
1054  signal_resampler_->Resample(wave, flush, &downsampled_wave);
1055 
1056  // these variables will be used to compute the root-mean-square value of the
1057  // signal for the ballast term.
1058  double cur_sumsq = signal_sumsq_, cur_sum = signal_sum_;
1059  int64 cur_num_samp = downsampled_samples_processed_,
1060  prev_frame_end_sample = 0;
// When the ballast is not computed online, the statistics of the whole new
// chunk are added up front; otherwise they are accumulated frame by frame
// inside the loop below.
1061  if (!opts_.nccf_ballast_online) {
1062  cur_sumsq += VecVec(downsampled_wave, downsampled_wave);
1063  cur_sum += downsampled_wave.Sum();
1064  cur_num_samp += downsampled_wave.Dim();
1065  }
1066 
1067  // end_frame is the total number of frames we can now process, including
1068  // previously processed ones.
1069  int32 end_frame = NumFramesAvailable(
1070  downsampled_samples_processed_ + downsampled_wave.Dim(), opts_.snip_edges);
1071  // "start_frame" is the first frame-index we process
// frame_info_ holds one more entry than the number of frames processed so far
// (hence the "- 1"); presumably the extra entry stands for frame -1 -- confirm.
1072  int32 start_frame = frame_info_.size() - 1,
1073  num_new_frames = end_frame - start_frame;
1074 
1075  if (num_new_frames == 0) {
1076  UpdateRemainder(downsampled_wave);
1077  return;
1078  // continuing to the rest of the code would generate
1079  // an error when sizing matrices with zero rows, and
1080  // anyway is a waste of time.
1081  }
1082 
1083  int32 num_measured_lags = nccf_last_lag_ + 1 - nccf_first_lag_,
1084  num_resampled_lags = lags_.Dim(),
1085  frame_shift = opts_.NccfWindowShift(),
1086  basic_frame_length = opts_.NccfWindowSize(),
1087  full_frame_length = basic_frame_length + nccf_last_lag_;
1088 
// Per-frame scratch vectors, plus one NCCF row per new frame for each of the
// two NCCF variants.
1089  Vector<BaseFloat> window(full_frame_length),
1090  inner_prod(num_measured_lags),
1091  norm_prod(num_measured_lags);
1092  Matrix<BaseFloat> nccf_pitch(num_new_frames, num_measured_lags),
1093  nccf_pov(num_new_frames, num_measured_lags);
1094 
1095  Vector<BaseFloat> cur_forward_cost(num_resampled_lags);
1096 
1097 
1098  // Because the resampling of the NCCF is more efficient when grouped together,
1099  // we first compute the NCCF for all frames, then resample as a matrix, then
1100  // do the Viterbi [that happens inside the constructor of PitchFrameInfo].
1101 
1102  for (int32 frame = start_frame; frame < end_frame; frame++) {
1103  // start_sample is index into the whole wave, not just this part.
1104  int64 start_sample;
1105  if (opts_.snip_edges) {
1106  // Usual case: offset starts at 0
1107  start_sample = static_cast<int64>(frame) * frame_shift;
1108  } else {
1109  // When we are not snipping the edges, the first offsets may be
1110  // negative. In this case we will pad with zeros, it should not impact
1111  // the pitch tracker.
1112  start_sample =
1113  static_cast<int64>((frame + 0.5) * frame_shift) - full_frame_length / 2;
1114  }
1115  ExtractFrame(downsampled_wave, start_sample, &window);
1116  if (opts_.nccf_ballast_online) {
1117  // use only up to end of current frame to compute root-mean-square value.
1118  // end_sample will be the sample-index into "downsampled_wave", so
1119  // not really comparable to start_sample.
// NOTE(review): listing line 1121, which completes the expression below, is
// missing from this listing.
1120  int64 end_sample = start_sample + full_frame_length -
1122  KALDI_ASSERT(end_sample > 0); // or should have processed this frame last
1123  // time. Note: end_sample is one past last
1124  // sample.
// NOTE(review): listing line 1126 is missing from this listing; the visible
// statement clamps end_sample to the length of downsampled_wave.
1125  if (end_sample > downsampled_wave.Dim()) {
1127  end_sample = downsampled_wave.Dim();
1128  }
// Accumulate statistics only over the samples not yet counted, i.e. from the
// previous frame's end up to this frame's end.
1129  SubVector<BaseFloat> new_part(downsampled_wave, prev_frame_end_sample,
1130  end_sample - prev_frame_end_sample);
1131  cur_num_samp += new_part.Dim();
1132  cur_sumsq += VecVec(new_part, new_part);
1133  cur_sum += new_part.Sum();
1134  prev_frame_end_sample = end_sample;
1135  }
// Mean of squares minus square of the mean, over the samples counted so far.
1136  double mean_square = cur_sumsq / cur_num_samp -
1137  pow(cur_sum / cur_num_samp, 2.0);
1138 
1139  ComputeCorrelation(window, nccf_first_lag_, nccf_last_lag_,
1140  basic_frame_length, &inner_prod, &norm_prod);
// NOTE(review): listing line 1143, which completes the nccf_ballast_pitch
// expression, is missing from this listing.  The POV variant uses a ballast
// of exactly zero.
1141  double nccf_ballast_pov = 0.0,
1142  nccf_ballast_pitch = pow(mean_square * basic_frame_length, 2) *
1144  avg_norm_prod = norm_prod.Sum() / norm_prod.Dim();
1145  SubVector<BaseFloat> nccf_pitch_row(nccf_pitch, frame - start_frame);
1146  ComputeNccf(inner_prod, norm_prod, nccf_ballast_pitch,
1147  &nccf_pitch_row);
1148  SubVector<BaseFloat> nccf_pov_row(nccf_pov, frame - start_frame);
1149  ComputeNccf(inner_prod, norm_prod, nccf_ballast_pov,
1150  &nccf_pov_row);
// For frames before opts_.recompute_frame, remember avg_norm_prod and
// mean_square in a new NccfInfo appended to nccf_info_.
1151  if (frame < opts_.recompute_frame)
1152  nccf_info_.push_back(new NccfInfo(avg_norm_prod, mean_square));
1153  }
1154 
1155  Matrix<BaseFloat> nccf_pitch_resampled(num_new_frames, num_resampled_lags);
1156  nccf_resampler_->Resample(nccf_pitch, &nccf_pitch_resampled);
1157  nccf_pitch.Resize(0, 0); // no longer needed.
1158  Matrix<BaseFloat> nccf_pov_resampled(num_new_frames, num_resampled_lags);
1159  nccf_resampler_->Resample(nccf_pov, &nccf_pov_resampled);
1160  nccf_pov.Resize(0, 0); // no longer needed.
1161 
1162  // We've finished dealing with the waveform so we can call UpdateRemainder
1163  // now; we need to call it before we possibly call RecomputeBacktraces()
1164  // below, which is why we don't do it at the very end.
1165  UpdateRemainder(downsampled_wave);
1166 
1167  std::vector<std::pair<int32, int32 > > index_info;
1168 
// Second pass over the new frames: create a PitchFrameInfo chained to the
// previous one, run ComputeBacktraces() on the resampled NCCF row and
// append the new object to frame_info_.
1169  for (int32 frame = start_frame; frame < end_frame; frame++) {
1170  int32 frame_idx = frame - start_frame;
1171  PitchFrameInfo *prev_info = frame_info_.back(),
1172  *cur_info = new PitchFrameInfo(prev_info);
1173  cur_info->SetNccfPov(nccf_pov_resampled.Row(frame_idx));
1174  cur_info->ComputeBacktraces(opts_, nccf_pitch_resampled.Row(frame_idx),
1175  lags_, forward_cost_, &index_info,
1176  &cur_forward_cost);
1177  forward_cost_.Swap(&cur_forward_cost);
1178  // Renormalize forward_cost so smallest element is zero.
1179  BaseFloat remainder = forward_cost_.Min();
1180  forward_cost_remainder_ += remainder;
1181  forward_cost_.Add(-remainder);
1182  frame_info_.push_back(cur_info);
// Keep a copy of the resampled pitch NCCF row for frames before
// opts_.recompute_frame.
1183  if (frame < opts_.recompute_frame)
1184  nccf_info_[frame]->nccf_pitch_resampled =
1185  nccf_pitch_resampled.Row(frame_idx);
// NOTE(review): listing line 1187, the statement guarded by the condition
// below, is missing from this listing; judging by the comment before
// UpdateRemainder() above it is presumably a call to RecomputeBacktraces()
// -- confirm against the original file.
1186  if (frame == opts_.recompute_frame - 1 && !opts_.nccf_ballast_online)
1188  }
1189 
1190  // Trace back the best-path.
1191  int32 best_final_state;
1192  forward_cost_.Min(&best_final_state);
1193  lag_nccf_.resize(frame_info_.size() - 1); // will keep any existing data.
1194  frame_info_.back()->SetBestState(best_final_state, lag_nccf_);
1195  frames_latency_ =
1196  frame_info_.back()->ComputeLatency(opts_.max_frames_latency);
1197  KALDI_VLOG(4) << "Latency is " << frames_latency_;
1198 }
1199 
1200 
1201 
1202 // Some functions that forward from OnlinePitchFeature to
1203 // OnlinePitchFeatureImpl.
// NOTE(review): in this listing the line that opens each of the following
// definitions is missing (the numbering skips 1204, 1208, 1211, 1215, 1219,
// 1223, 1229 and 1233), so only the bodies are visible.  Apart from the
// constructor and the final cleanup body, each visible body does nothing
// but delegate to the same-named member of impl_.
1205  return impl_->NumFramesReady();
1206 }
1207 
// Constructor initializer list: allocates the implementation object with
// new, passing it "opts".
1209  :impl_(new OnlinePitchFeatureImpl(opts)) { }
1210 
1212  return impl_->IsLastFrame(frame);
1213 }
1214 
1216  return impl_->FrameShiftInSeconds();
1217 }
1218 
1220  impl_->GetFrame(frame, feat);
1221 }
1222 
1224  BaseFloat sampling_rate,
1225  const VectorBase<BaseFloat> &waveform) {
1226  impl_->AcceptWaveform(sampling_rate, waveform);
1227 }
1228 
1230  impl_->InputFinished();
1231 }
1232 
// Releases the implementation object allocated with new in the constructor
// initializer above.
1234  delete impl_;
1235 }
1236 
1237 
// NOTE(review): the line that opens this definition (listing line 1248) and
// its original documentation comment are missing from this listing.  The
// parameter list matches the call ComputeKaldiPitchFirstPass(opts, wave,
// output) made later in this file, so this is presumably that function --
// confirm against the original file.
//
// Feeds "wave" to an OnlinePitchFeature in chunks and copies each frame out
// as soon as the extractor reports it ready, rather than waiting until all
// input has been seen.  On return "*output" holds one two-column row per
// frame obtained, or is resized to 0 x 0 if no frame was produced.
1249  const PitchExtractionOptions &opts,
1250  const VectorBase<BaseFloat> &wave,
1251  Matrix<BaseFloat> *output) {
1252 
// Row capacity of "feats"; doubled on demand inside the loop below.
1253  int32 cur_rows = 100;
1254  Matrix<BaseFloat> feats(cur_rows, 2);
1255 
1256  OnlinePitchFeature pitch_extractor(opts);
1257  KALDI_ASSERT(opts.frames_per_chunk > 0 &&
1258  "--simulate-first-pass-online option does not make sense "
1259  "unless you specify --frames-per-chunk");
1260 
// samp_per_chunk is the chunk length in samples; the floating-point product
// is truncated when stored in the int32.
// NOTE(review): if that product truncates to 0, num_samp below is always 0
// and cur_offset never advances, so the while loop would not terminate.
1261  int32 cur_offset = 0, cur_frame = 0, samp_per_chunk =
1262  opts.frames_per_chunk * opts.samp_freq * opts.frame_shift_ms / 1000.0f;
1263 
1264  while (cur_offset < wave.Dim()) {
1265  int32 num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset);
1266  SubVector<BaseFloat> wave_chunk(wave, cur_offset, num_samp);
1267  pitch_extractor.AcceptWaveform(opts.samp_freq, wave_chunk);
1268  cur_offset += num_samp;
// Once the last chunk has been handed over, tell the extractor that no more
// input will come.
1269  if (cur_offset == wave.Dim())
1270  pitch_extractor.InputFinished();
1271  // Get each frame as soon as it is ready.
1272  for (; cur_frame < pitch_extractor.NumFramesReady(); cur_frame++) {
1273  if (cur_frame >= cur_rows) {
1274  cur_rows *= 2;
1275  feats.Resize(cur_rows, 2, kCopyData);
1276  }
1277  SubVector<BaseFloat> row(feats, cur_frame);
1278  pitch_extractor.GetFrame(cur_frame, &row);
1279  }
1280  }
1281  if (cur_frame == 0) {
1282  KALDI_WARN << "No features output since wave file too short";
1283  output->Resize(0, 0);
1284  } else {
// Copy only the rows that were actually filled.
1285  *output = feats.RowRange(0, cur_frame);
1286  }
1287 }
1288 
1289 
1290 
// NOTE(review): the line that opens this definition (listing line 1291) is
// missing from this listing; presumably this is ComputeKaldiPitch(opts, wave,
// output) -- confirm against the original file.
//
// Runs pitch extraction over the whole of "wave" and writes one two-column
// row per frame to "*output".  If opts.simulate_first_pass_online is set the
// work is delegated to ComputeKaldiPitchFirstPass(); otherwise all input is
// fed to the extractor first and the frames are read only after
// InputFinished() has been called.
1292  const VectorBase<BaseFloat> &wave,
1293  Matrix<BaseFloat> *output) {
1294  if (opts.simulate_first_pass_online) {
1295  ComputeKaldiPitchFirstPass(opts, wave, output);
1296  return;
1297  }
1298  OnlinePitchFeature pitch_extractor(opts);
1299 
// frames_per_chunk == 0 means "give the extractor the whole wave at once".
1300  if (opts.frames_per_chunk == 0) {
1301  pitch_extractor.AcceptWaveform(opts.samp_freq, wave);
1302  } else {
1303  // the user may set opts.frames_per_chunk for better compatibility with
1304  // online operation.
1305  KALDI_ASSERT(opts.frames_per_chunk > 0);
// NOTE(review): if this product truncates to 0 when stored in the int32,
// num_samp is always 0 and the loop below would not terminate.
1306  int32 cur_offset = 0, samp_per_chunk =
1307  opts.frames_per_chunk * opts.samp_freq * opts.frame_shift_ms / 1000.0f;
1308  while (cur_offset < wave.Dim()) {
1309  int32 num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset);
1310  SubVector<BaseFloat> wave_chunk(wave, cur_offset, num_samp);
1311  pitch_extractor.AcceptWaveform(opts.samp_freq, wave_chunk);
1312  cur_offset += num_samp;
1313  }
1314  }
1315  pitch_extractor.InputFinished();
1316  int32 num_frames = pitch_extractor.NumFramesReady();
1317  if (num_frames == 0) {
1318  KALDI_WARN << "No frames output in pitch extraction";
1319  output->Resize(0, 0);
1320  return;
1321  }
1322  output->Resize(num_frames, 2);
1323  for (int32 frame = 0; frame < num_frames; frame++) {
1324  SubVector<BaseFloat> row(*output, frame);
1325  pitch_extractor.GetFrame(frame, &row);
1326  }
1327 }
1328 
1329 
1330 /*
1331  This comment describes our investigation of how much latency the
1332  online-processing algorithm introduces, i.e. how many frames you would
1333  typically have to wait until the traceback converges, if you were to set the
1334  --max-frames-latency to a very large value.
1335 
1336  This was done on a couple of files of language-id data.
1337 
1338  /home/dpovey/kaldi-online/src/featbin/compute-kaldi-pitch-feats --frames-per-chunk=10 --max-frames-latency=100 --verbose=4 --sample-frequency=8000 --resample-frequency=2600 "scp:head -n 2 data/train/wav.scp |" ark:/dev/null 2>&1 | grep Latency | wc
1339  4871 24355 443991
1340  /home/dpovey/kaldi-online/src/featbin/compute-kaldi-pitch-feats --frames-per-chunk=10 --max-frames-latency=100 --verbose=4 --sample-frequency=8000 --resample-frequency=2600 "scp:head -n 2 data/train/wav.scp |" ark:/dev/null 2>&1 | grep Latency | grep 100 | wc
1341  1534 7670 141128
1342 
1343 # as above, but with 50 instead of 100 in the --max-frames-latency and grep statements.
1344  2070 10350 188370
1345 # as above, but with 10 instead of 50.
1346  4067 20335 370097
1347 
1348  This says that out of 4871 selected frames [we measured the latency every 10
1349  frames, since --frames-per-chunk=10], in 1534 frames (31%), the latency was
1350  >= 100 frames, i.e. >= 1 second. Including the other numbers, we can see
1351  that
1352 
1353  31% of frames had latency >= 1 second
1354  42% of frames had latency >= 0.5 second
1355  83% of frames had latency >= 0.1 second.
1356 
1357  This doesn't necessarily mean that we actually have a latency of >= 1 second 31% of
1358  the time when using these features, since by using the --max-frames-latency option
1359  (default: 30 frames), it will limit the latency to, say, 0.3 seconds, and trace back
1360  from the best current pitch. Most of the time this will probably cause no change in
1361  the pitch traceback since the best current pitch is probably the "right" point to
1362  trace back from. In any case, in online decoding we will most likely rescore
1363  the features at the end, and the traceback gets recomputed, so there will
1364  be no inaccuracy (assuming the first-pass lattice had everything we needed).
1365 
1366  Probably the greater source of inaccuracy due to the online algorithm is the
1367  online energy-normalization, which affects the NCCF-ballast term, and which,
1368  for reasons of efficiency, we don't attempt to "correct" in a later rescoring
1369  pass. This will make the most difference in the first few frames of the file,
1370  before the first voicing, where it will tend to produce more pitch movement
1371  than the offline version of the algorithm.
1372 */
1373 
1374 
1375 // Function to do data accumulation for on-line usage
1376 template<typename Real>
1377 inline void AppendVector(const VectorBase<Real> &src, Vector<Real> *dst) {
1378  if (src.Dim() == 0) return;
1379  dst->Resize(dst->Dim() + src.Dim(), kCopyData);
1380  dst->Range(dst->Dim() - src.Dim(), src.Dim()).CopyFromVec(src);
1381 }
1382 
// NOTE(review): the line that opens this definition (listing line 1398) is
// missing from this listing; the initializer list below suggests this is the
// OnlineProcessPitch constructor -- confirm against the original file.
//
// Stores the options and the source-feature pointer, and sets dim_ to the
// number of output features enabled in "opts" (one for each of
// add_pov_feature, add_normalized_log_pitch, add_delta_pitch and
// add_raw_log_pitch that is true).
1399  const ProcessPitchOptions &opts,
1400  OnlineFeatureInterface *src):
1401  opts_(opts), src_(src),
1402  dim_ ((opts.add_pov_feature ? 1 : 0)
1403  + (opts.add_normalized_log_pitch ? 1 : 0)
1404  + (opts.add_delta_pitch ? 1 : 0)
1405  + (opts.add_raw_log_pitch ? 1 : 0)) {
// At least one output feature must be enabled.
1406  KALDI_ASSERT(dim_ > 0 &&
1407  " At least one of the pitch features should be chosen. "
1408  "Check your post-process-pitch options.");
// The source must produce features of dimension kRawFeatureDim.
1409  KALDI_ASSERT(src->Dim() == kRawFeatureDim &&
1410  "Input feature must be pitch feature (should have dimension 2)");
1411 }
1412 
1413 
// NOTE(review): the line that opens this definition (listing line 1414) is
// missing from this listing; presumably this is OnlineProcessPitch::GetFrame
// -- confirm against the original file.
//
// Fills "*feat" (which must have dimension dim_) with the enabled features
// for "frame", written in the order: POV, normalized log pitch, delta pitch,
// raw log pitch.
1415  VectorBase<BaseFloat> *feat) {
// Apply the configured delay, clamping at frame 0 for the first opts_.delay
// frames.
1416  int32 frame_delayed = frame < opts_.delay ? 0 : frame - opts_.delay;
1417  KALDI_ASSERT(feat->Dim() == dim_ &&
1418  frame_delayed < NumFramesReady());
1419  int32 index = 0;
1420  if (opts_.add_pov_feature)
1421  (*feat)(index++) = GetPovFeature(frame_delayed);
// NOTE(review): listing line 1422 (the condition guarding the next statement)
// is missing from this listing; presumably it tests
// opts_.add_normalized_log_pitch -- confirm.
1423  (*feat)(index++) = GetNormalizedLogPitchFeature(frame_delayed);
1424  if (opts_.add_delta_pitch)
1425  (*feat)(index++) = GetDeltaPitchFeature(frame_delayed);
// NOTE(review): listing line 1426 (the condition guarding the next statement)
// is missing from this listing; presumably it tests opts_.add_raw_log_pitch
// -- confirm.
1427  (*feat)(index++) = GetRawLogPitchFeature(frame_delayed);
// Every slot of the output must have been written.
1428  KALDI_ASSERT(index == dim_);
1429 }
1430 
// NOTE(review): listing lines 1431-1432 (the opening line of this definition
// and, presumably, the declaration of "tmp") are missing from this listing.
//
// Reads the source feature for "frame", takes its first element as the NCCF,
// maps it through NccfToPovFeature() and applies opts_.pov_scale and
// opts_.pov_offset.
1433  src_->GetFrame(frame, &tmp); // (NCCF, pitch) from pitch extractor
1434  BaseFloat nccf = tmp(0);
1435  return opts_.pov_scale * NccfToPovFeature(nccf)
1436  + opts_.pov_offset;
1437 }
1438 
// NOTE(review): the line that opens this definition (listing line 1439) is
// missing from this listing; presumably this is
// OnlineProcessPitch::GetDeltaPitchFeature -- confirm against the original
// file.
//
// Returns the delta of the raw log-pitch at "frame", computed by running
// ComputeDeltas() over a small window of frames around it, with a per-frame
// random value added.
1440  // Rather than computing the delta pitch directly in code here,
1441  // which might seem easier, we accumulate a small window of features
1442  // and call ComputeDeltas. This might seem like overkill; the reason
1443  // we do it this way is to ensure that the end effects (at file
1444  // beginning and end) are handled in a consistent way.
1445  int32 context = opts_.delta_window;
// The window is [frame - context, frame + context], clipped to the frames
// the source currently has ready.
1446  int32 start_frame = std::max(0, frame - context),
1447  end_frame = std::min(frame + context + 1, src_->NumFramesReady()),
1448  frames_in_window = end_frame - start_frame;
1449  Matrix<BaseFloat> feats(frames_in_window, 1),
1450  delta_feats;
1451 
1452  for (int32 f = start_frame; f < end_frame; f++)
1453  feats(f - start_frame, 0) = GetRawLogPitchFeature(f);
1454 
1455  DeltaFeaturesOptions delta_opts;
1456  delta_opts.order = 1;
1457  delta_opts.window = opts_.delta_window;
1458  ComputeDeltas(delta_opts, feats, &delta_feats);
// delta_feature_noise_ caches one random value per frame index, so repeated
// calls for the same frame reuse the same noise.
// NOTE(review): listing line 1461, which completes the push_back() argument,
// is missing from this listing.
1459  while (delta_feature_noise_.size() <= static_cast<size_t>(frame)) {
1460  delta_feature_noise_.push_back(RandGauss() *
1462  }
1463  // note: delta_feats will have two columns, second contains deltas.
// NOTE(review): listing line 1465, which completes the return expression, is
// missing from this listing.
1464  return (delta_feats(frame - start_frame, 1) + delta_feature_noise_[frame]) *
1466 }
1467 
// NOTE(review): listing lines 1468-1469 (the opening line of this definition
// and, presumably, the declaration of "tmp") are missing from this listing.
//
// Reads the source feature for "frame", takes its second element as the
// pitch, asserts that it is positive and returns its logarithm.
1470  src_->GetFrame(frame, &tmp);
1471  BaseFloat pitch = tmp(1);
1472  KALDI_ASSERT(pitch > 0);
1473  return Log(pitch);
1474 }
1475 
// NOTE(review): the line that opens this definition (listing line 1476) is
// missing from this listing; presumably this is
// OnlineProcessPitch::GetNormalizedLogPitchFeature -- confirm.
//
// Returns the raw log-pitch of "frame" minus a POV-weighted average
// log-pitch (sum_log_pitch_pov / sum_pov from this frame's normalization
// stats), multiplied by opts_.pitch_scale.
// Bring normalization_stats_[frame] up to date before reading it.
1477  UpdateNormalizationStats(frame);
1478  BaseFloat log_pitch = GetRawLogPitchFeature(frame),
1479  avg_log_pitch = normalization_stats_[frame].sum_log_pitch_pov /
1480  normalization_stats_[frame].sum_pov,
1481  normalized_log_pitch = log_pitch - avg_log_pitch;
1482  return normalized_log_pitch * opts_.pitch_scale;
1483 }
1484 
1485 
1486 // inline
// NOTE(review): the line that opens this definition (listing line 1487) is
// missing from this listing; the visible code uses "t" as the frame index.
//
// Computes the half-open frame range [*window_begin, *window_end) used for
// normalization around frame t: it extends normalization_left_context frames
// to the left and normalization_right_context frames to the right, clipped
// to [0, src_frames_ready).
1488  int32 src_frames_ready,
1489  int32 *window_begin,
1490  int32 *window_end) const {
1491  int32 left_context = opts_.normalization_left_context;
1492  int32 right_context = opts_.normalization_right_context;
1493  *window_begin = std::max(0, t - left_context);
1494  *window_end = std::min(t + right_context + 1, src_frames_ready);
1495 }
1496 
1497 
1498 // Makes sure the entry in normalization_stats_ for this frame is up to date;
1499 // called from GetNormalizedLogPitchFeature.
1500 // the cur_num_frames and input_finished variables are needed because the
1501 // pitch features for a given frame may change as we see more data.
// NOTE(review): the line that opens this definition (listing line 1502) is
// missing from this listing, as are listing lines 1532, 1541 and 1559, which
// presumably each declare the temporary vector "tmp" used right after them.
//
// Three cases are handled below:
//   - the stored stats already match the current (cur_num_frames,
//     input_finished) pair: nothing to do;
//   - the previous frame's stats match that pair: copy them and adjust for
//     the at most one frame leaving and one frame entering the window;
//   - otherwise: recompute the sums from scratch over this frame's window.
1503  KALDI_ASSERT(frame >= 0);
// size() is unsigned while "frame" is signed; the assertion above ensures
// frame is non-negative before this comparison.
1504  if (normalization_stats_.size() <= frame)
1505  normalization_stats_.resize(frame + 1);
1506  int32 cur_num_frames = src_->NumFramesReady();
1507  bool input_finished = src_->IsLastFrame(cur_num_frames - 1);
1508 
1509  NormalizationStats &this_stats = normalization_stats_[frame];
1510  if (this_stats.cur_num_frames == cur_num_frames &&
1511  this_stats.input_finished == input_finished) {
1512  // Stats are fully up-to-date.
1513  return;
1514  }
1515  int32 this_window_begin, this_window_end;
1516  GetNormalizationWindow(frame, cur_num_frames,
1517  &this_window_begin, &this_window_end);
1518 
1519  if (frame > 0) {
1520  const NormalizationStats &prev_stats = normalization_stats_[frame - 1];
1521  if (prev_stats.cur_num_frames == cur_num_frames &&
1522  prev_stats.input_finished == input_finished) {
1523  // we'll derive this_stats efficiently from prev_stats.
1524  // Checking that cur_num_frames and input_finished have not changed
1525  // ensures that the underlying features will not have changed.
// The copy also carries over cur_num_frames and input_finished, so
// this_stats is marked as current by this assignment.
1526  this_stats = prev_stats;
1527  int32 prev_window_begin, prev_window_end;
1528  GetNormalizationWindow(frame - 1, cur_num_frames,
1529  &prev_window_begin, &prev_window_end);
// The window start moved right by one: subtract the contribution of the
// frame that dropped out (prev_window_begin).
1530  if (this_window_begin != prev_window_begin) {
1531  KALDI_ASSERT(this_window_begin == prev_window_begin + 1);
1533  src_->GetFrame(prev_window_begin, &tmp);
1534  BaseFloat accurate_pov = NccfToPov(tmp(0)),
1535  log_pitch = Log(tmp(1));
1536  this_stats.sum_pov -= accurate_pov;
1537  this_stats.sum_log_pitch_pov -= accurate_pov * log_pitch;
1538  }
// The window end moved right by one: add the contribution of the frame that
// entered (index prev_window_end, since the end is exclusive).
1539  if (this_window_end != prev_window_end) {
1540  KALDI_ASSERT(this_window_end == prev_window_end + 1);
1542  src_->GetFrame(prev_window_end, &tmp);
1543  BaseFloat accurate_pov = NccfToPov(tmp(0)),
1544  log_pitch = Log(tmp(1));
1545  this_stats.sum_pov += accurate_pov;
1546  this_stats.sum_log_pitch_pov += accurate_pov * log_pitch;
1547  }
1548  return;
1549  }
1550  }
1551  // The way we do it here is not the most efficient way to do it;
1552  // we'll see if it becomes a problem. The issue is we have to redo
1553  // this computation from scratch each time we process a new chunk, which
1554  // may be a little inefficient if the chunk-size is very small.
1555  this_stats.cur_num_frames = cur_num_frames;
1556  this_stats.input_finished = input_finished;
1557  this_stats.sum_pov = 0.0;
1558  this_stats.sum_log_pitch_pov = 0.0;
// Accumulate the POV weight and the POV-weighted log-pitch over every frame
// in this frame's window.
1560  for (int32 f = this_window_begin; f < this_window_end; f++) {
1561  src_->GetFrame(f, &tmp);
1562  BaseFloat accurate_pov = NccfToPov(tmp(0)),
1563  log_pitch = Log(tmp(1));
1564  this_stats.sum_pov += accurate_pov;
1565  this_stats.sum_log_pitch_pov += accurate_pov * log_pitch;
1566  }
1567 }
1568 
// NOTE(review): the line that opens this definition (listing line 1569) is
// missing from this listing; presumably this is
// OnlineProcessPitch::NumFramesReady -- confirm against the original file.
//
// Returns how many output frames can currently be requested:
//   - 0 if the source has no frames ready;
//   - src_frames_ready + opts_.delay if the source's last ready frame is its
//     final frame;
//   - otherwise a non-negative value derived from src_frames_ready (see the
//     note on the missing line below).
1570  int32 src_frames_ready = src_->NumFramesReady();
1571  if (src_frames_ready == 0) {
1572  return 0;
1573  } else if (src_->IsLastFrame(src_frames_ready - 1)) {
1574  return src_frames_ready + opts_.delay;
1575  } else {
// NOTE(review): listing line 1577, which supplies the amount subtracted from
// src_frames_ready, is missing from this listing.
1576  return std::max(0, src_frames_ready -
1578  }
1579 }
1580 
// NOTE(review): the line that opens this definition (listing line 1581) is
// missing from this listing; it presumably declares the "opts" parameter
// used below -- confirm against the original file.
//
// Post-processes an already computed matrix of raw pitch features: wraps
// "input" as an online feature source, runs OnlineProcessPitch on top of it
// and copies every frame it reports ready into "*output", one row per frame.
1582  const MatrixBase<BaseFloat> &input,
1583  Matrix<BaseFloat> *output) {
1584  OnlineMatrixFeature pitch_feat(input);
1585 
1586  OnlineProcessPitch online_process_pitch(opts, &pitch_feat);
1587 
1588  output->Resize(online_process_pitch.NumFramesReady(),
1589  online_process_pitch.Dim());
1590  for (int32 t = 0; t < online_process_pitch.NumFramesReady(); t++) {
1591  SubVector<BaseFloat> row(*output, t);
1592  online_process_pitch.GetFrame(t, &row);
1593  }
1594 }
1595 
1596 
// NOTE(review): the line that opens this definition (listing line 1597) is
// missing from this listing; presumably this is ComputeAndProcessKaldiPitch
// -- confirm against the original file.
//
// Runs pitch extraction and pitch post-processing together on "wave".  The
// waveform is fed to the extractor chunk by chunk and post-processed frames
// are fetched as soon as they become ready.  If
// pitch_opts.simulate_first_pass_online is true those early results are
// returned in "*output"; otherwise every frame is fetched again after all
// input has been consumed and those values are returned instead.
1598  const PitchExtractionOptions &pitch_opts,
1599  const ProcessPitchOptions &process_opts,
1600  const VectorBase<BaseFloat> &wave,
1601  Matrix<BaseFloat> *output) {
1602 
1603  OnlinePitchFeature pitch_extractor(pitch_opts);
1604 
1605  if (pitch_opts.simulate_first_pass_online) {
1606  KALDI_ASSERT(pitch_opts.frames_per_chunk > 0 &&
1607  "--simulate-first-pass-online option does not make sense "
1608  "unless you specify --frames-per-chunk");
1609  }
1610 
1611  OnlineProcessPitch post_process(process_opts, &pitch_extractor);
1612 
// Row capacity of "feats"; doubled on demand inside the loop below.
1613  int32 cur_rows = 100;
1614  Matrix<BaseFloat> feats(cur_rows, post_process.Dim());
1615 
// Chunk length in samples; the floating-point product is truncated when
// stored in the int32.
1616  int32 cur_offset = 0, cur_frame = 0,
1617  samp_per_chunk = pitch_opts.frames_per_chunk *
1618  pitch_opts.samp_freq * pitch_opts.frame_shift_ms / 1000.0f;
1619 
1620  // We request the first-pass features as soon as they are available,
1621  // regardless of whether opts.simulate_first_pass_online == true. If
1622  // opts.simulate_first_pass_online == true this should
1623  // not affect the features generated, but it helps us to test the code
1624  // in a way that's closer to what online decoding would see.
1625 
1626  while (cur_offset < wave.Dim()) {
1627  int32 num_samp;
1628  if (samp_per_chunk > 0)
1629  num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset);
1630  else // user left opts.frames_per_chunk at zero.
// In this branch the whole wave is consumed in a single iteration, since
// cur_offset is still 0 here.
1631  num_samp = wave.Dim();
1632  SubVector<BaseFloat> wave_chunk(wave, cur_offset, num_samp);
1633  pitch_extractor.AcceptWaveform(pitch_opts.samp_freq, wave_chunk);
1634  cur_offset += num_samp;
1635  if (cur_offset == wave.Dim())
1636  pitch_extractor.InputFinished();
1637 
1638  // Get each frame as soon as it is ready.
1639  for (; cur_frame < post_process.NumFramesReady(); cur_frame++) {
1640  if (cur_frame >= cur_rows) {
1641  cur_rows *= 2;
1642  feats.Resize(cur_rows, post_process.Dim(), kCopyData);
1643  }
1644  SubVector<BaseFloat> row(feats, cur_frame);
1645  post_process.GetFrame(cur_frame, &row);
1646  }
1647  }
1648 
1649  if (pitch_opts.simulate_first_pass_online) {
1650  if (cur_frame == 0) {
1651  KALDI_WARN << "No features output since wave file too short";
1652  output->Resize(0, 0);
1653  } else {
// Return only the rows that were actually filled during the first pass.
1654  *output = feats.RowRange(0, cur_frame);
1655  }
1656  } else {
1657  // want the "final" features for second pass, so get them again.
1658  output->Resize(post_process.NumFramesReady(), post_process.Dim());
1659  for (int32 frame = 0; frame < post_process.NumFramesReady(); frame++) {
1660  SubVector<BaseFloat> row(*output, frame);
1661  post_process.GetFrame(frame, &row);
1662  }
1663  }
1664 }
1665 
1666 
1667 } // namespace kaldi
std::vector< PitchFrameInfo * > frame_info_
virtual int32 NumFramesReady() const
returns the feature dimension.
int32 backpointer
The state index on the previous frame that is the best preceding state for this state.
void ComputeKaldiPitchFirstPass(const PitchExtractionOptions &opts, const VectorBase< BaseFloat > &wave, Matrix< BaseFloat > *output)
This function is called from ComputeKaldiPitch when the user specifies opts.simulate_first_pass_onlin...
This class takes a Matrix<BaseFloat> and wraps it as an OnlineFeatureInterface: this can be useful wh...
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
double Exp(double x)
Definition: kaldi-math.h:83
virtual BaseFloat FrameShiftInSeconds() const
void Resample(const VectorBase< BaseFloat > &input, bool flush, Vector< BaseFloat > *output)
This function does the resampling.
Definition: resample.cc:152
PitchFrameInfo(int32 num_states)
This constructor is used for frame -1; it sets the costs to be all zeros the pov_nccf's to zero and t...
virtual void GetFrame(int32 frame, VectorBase< BaseFloat > *feat)
Gets the feature vector for this frame.
OnlinePitchFeature(const PitchExtractionOptions &opts)
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
void Cleanup(PitchFrameInfo *prev_frame)
This function resizes the arrays for this object and updates the reference counts for the previous ob...
std::vector< BaseFloat > delta_feature_noise_
OnlineProcessPitch(const ProcessPitchOptions &opts, OnlineFeatureInterface *src)
Note on the implementation of OnlineProcessPitch: the OnlineFeatureInterface allows random access to ...
BaseFloat NccfToPovFeature(BaseFloat n)
This function processes the NCCF n to a POV feature f by applying the formula f = (1...
virtual int32 Dim() const
double signal_sum_
sum of previously processed parts of signal; used to do mean-subtraction when getting sum-squared...
void ComputeBacktraces(const PitchExtractionOptions &opts, const VectorBase< BaseFloat > &nccf_pitch, const VectorBase< BaseFloat > &lags, const VectorBase< BaseFloat > &prev_forward_cost, std::vector< std::pair< int32, int32 > > *index_info, VectorBase< BaseFloat > *this_forward_cost)
This constructor is used for frames apart from frame -1; the bulk of the Viterbi computation takes pl...
void AcceptWaveform(BaseFloat sampling_rate, const VectorBase< BaseFloat > &waveform)
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
virtual void InputFinished()
InputFinished() tells the class you won't be providing any more waveform.
NccfInfo(BaseFloat avg_norm_prod, BaseFloat mean_square_energy)
void ComputeKaldiPitch(const PitchExtractionOptions &opts, const VectorBase< BaseFloat > &wave, Matrix< BaseFloat > *output)
This function extracts (pitch, NCCF) per frame, using the pitch extraction method described in "A Pit...
virtual void AcceptWaveform(BaseFloat sampling_rate, const VectorBase< BaseFloat > &waveform)
This would be called from the application, when you get more wave data.
This online-feature class implements post processing of pitch features.
void ExtractFrame(const VectorBase< BaseFloat > &downsampled_wave_part, int64 frame_index, VectorBase< BaseFloat > *window)
This function extracts from the signal the samples numbered from "sample_index" (numbered in the full...
virtual void GetFrame(int32 frame, VectorBase< BaseFloat > *feat)=0
Gets the feature vector for this frame.
float RandGauss(struct RandomState *state=NULL)
Definition: kaldi-math.h:155
kaldi::int32 int32
ProcessPitchOptions opts_
BaseFloat pov_nccf
the version of the NCCF we keep for the POV computation (without the ballast term).
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
OnlinePitchFeatureImpl * impl_
Vector< BaseFloat > downsampled_signal_remainder_
This is a small remainder of the previous downsampled signal; it's used by ExtractFrame for frames ne...
BaseFloat GetPovFeature(int32 frame) const
Computes and returns the POV feature for this frame.
int32 NccfWindowShift() const
Returns the window-shift in samples, after resampling.
BaseFloat FrameShiftInSeconds() const
ArbitraryResample * nccf_resampler_
void AddVecVec(Real alpha, const VectorBase< Real > &v, const VectorBase< Real > &r, Real beta)
Add element-by-element product of vectors:
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
void RecomputeBacktraces()
This function is called after we reach frame "recompute_frame", or when InputFinished() is called...
bool IsLastFrame(int32 frame) const
float BaseFloat
Definition: kaldi-types.h:29
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
Vector< BaseFloat > forward_cost_
double Log(double x)
Definition: kaldi-math.h:100
Class ArbitraryResample allows you to resample a signal (assumed zero outside the sample region...
Definition: resample.h:95
virtual int32 NumFramesReady() const
returns the feature dimension.
void Resample(const MatrixBase< BaseFloat > &input, MatrixBase< BaseFloat > *output) const
This function does the resampling.
Definition: resample.cc:280
PitchExtractionOptions opts_
BaseFloat mean_square_energy
struct rnnlm::@11::@12 n
std::vector< NormalizationStats > normalization_stats_
virtual bool IsLastFrame(int32 frame) const =0
Returns true if this is the last frame.
void ComputeNccf(const VectorBase< BaseFloat > &inner_prod, const VectorBase< BaseFloat > &norm_prod, BaseFloat nccf_ballast, VectorBase< BaseFloat > *nccf_vec)
Computes the NCCF as a fraction of the numerator term (a dot product between two vectors) and a denom...
void ComputeAndProcessKaldiPitch(const PitchExtractionOptions &pitch_opts, const ProcessPitchOptions &process_opts, const VectorBase< BaseFloat > &wave, Matrix< BaseFloat > *output)
This function combines ComputeKaldiPitch and ProcessPitch.
#define KALDI_ERR
Definition: kaldi-error.h:147
int64 downsampled_samples_processed_
downsampled_samples_processed is the number of samples (after downsampling) that we got in previous c...
#define KALDI_WARN
Definition: kaldi-error.h:150
Real * Data()
Returns a pointer to the start of the vector's data.
Definition: kaldi-vector.h:70
BaseFloat avg_norm_prod
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
void ComputeLocalCost(const VectorBase< BaseFloat > &nccf_pitch, const VectorBase< BaseFloat > &lags, const PitchExtractionOptions &opts, VectorBase< BaseFloat > *local_cost)
This function computes the local-cost for the Viterbi computation, see eq.
int32 NumFramesAvailable(int64 num_downsampled_samples, bool snip_edges) const
This function works out from the signal how many frames are currently available to process (this is c...
std::vector< NccfInfo * > nccf_info_
void GetNormalizationWindow(int32 frame, int32 src_frames_ready, int32 *window_begin, int32 *window_end) const
Computes the normalization window sizes.
BaseFloat NccfToPov(BaseFloat n)
This function processes the NCCF n to a reasonably accurate probability of voicing p by applying the ...
bool pitch_use_naive_search
Real Sum() const
Returns sum of the elements.
void GetFrame(int32 frame, VectorBase< BaseFloat > *feat)
std::vector< std::pair< int32, BaseFloat > > lag_nccf_
BaseFloat GetNormalizedLogPitchFeature(int32 frame)
Computes and returns the mean-subtracted log-pitch feature for this frame.
void UpdateRemainder(const VectorBase< BaseFloat > &downsampled_wave_part)
This function updates downsampled_signal_remainder_, downsampled_samples_processed_, signal_sum_ and signal_sumsq_; it's called from AcceptWaveform().
void ComputeCorrelation(const VectorBase< BaseFloat > &wave, int32 first_lag, int32 last_lag, int32 nccf_window_size, VectorBase< BaseFloat > *inner_prod, VectorBase< BaseFloat > *norm_prod)
This function computes some dot products that are required while computing the NCCF.
std::vector< StateInfo > state_info_
SubMatrix< Real > RowRange(const MatrixIndexT row_offset, const MatrixIndexT num_rows) const
Definition: kaldi-matrix.h:209
A class representing a vector.
Definition: kaldi-vector.h:406
int32 NccfWindowSize() const
Returns the window-size in samples, after resampling.
int32 cur_best_state_
The current best state in the backtrace from the end.
LinearResample is a special case of ArbitraryResample, where we want to resample a signal at linearly...
Definition: resample.h:147
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts, const MatrixBase< BaseFloat > &input_features, Matrix< BaseFloat > *output_features)
void Set(Real f)
Set all members of a vector to a specified value.
PitchFrameInfo * prev_info_
The structure for the previous frame.
BaseFloat GetDeltaPitchFeature(int32 frame)
Computes and returns the delta-log-pitch feature for this frame.
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
void UpdateNormalizationStats(int32 frame)
Makes sure the entry in normalization_stats_ for this frame is up to date; called from GetNormalizedL...
void AppendVector(const VectorBase< Real > &src, Vector< Real > *dst)
int32 ComputeLatency(int32 max_latency)
This function may be called on the last (most recent) PitchFrameInfo object; it computes how many fra...
OnlinePitchFeatureImpl(const PitchExtractionOptions &opts)
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
OnlineFeatureInterface is an interface for online feature processing (it is also usable in the offlin...
int32 state_offset_
the state index of the first entry in "state_info"; this will initially be zero, but after cleanup mi...
bool UpdatePreviousBestState(PitchFrameInfo *prev_frame)
This function updates.
void ProcessPitch(const ProcessPitchOptions &opts, const MatrixBase< BaseFloat > &input, Matrix< BaseFloat > *output)
This function takes the raw (NCCF, pitch) quantities computed by ComputeKaldiPitch and processes them into features.
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
void Add(Real c)
Add a constant to each element of a vector.
void SetZero()
Set vector to all zeros.
Vector< BaseFloat > nccf_pitch_resampled
virtual void GetFrame(int32 frame, VectorBase< BaseFloat > *feat)
Outputs the two-dimensional feature consisting of (NCCF, pitch).
double signal_sumsq_
sum-squared of previously processed parts of signal; used to get NCCF ballast term.
void SetBestState(int32 best_state, std::vector< std::pair< int32, BaseFloat > > &lag_nccf)
This function may be called for the last (most recent) PitchFrameInfo object with the best state (obt...
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns the dot product of a and b.
Definition: kaldi-vector.cc:37
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector: *this = *this + alpha * v (with casting between floats and doubles).
OnlineFeatureInterface * src_
void SetNccfPov(const VectorBase< BaseFloat > &nccf_pov)
Record the nccf_pov value.
virtual int32 NumFramesReady() const =0
Returns the number of feature frames that are currently ready.
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
Returns true if abs(a - b) <= relative_tolerance * (abs(a) + abs(b)).
Definition: kaldi-math.h:265
void SelectLags(const PitchExtractionOptions &opts, Vector< BaseFloat > *lags)
This function selects the lags at which we measure the NCCF: we need to select lags from 1/max_f0 to ...
virtual int32 Dim() const =0
Returns the feature dimension.
BaseFloat GetRawLogPitchFeature(int32 frame) const
Computes and returns the raw log-pitch feature for this frame.
SubVector< Real > Range(const MatrixIndexT o, const MatrixIndexT l)
Returns a sub-vector of a vector (a range of elements).
Definition: kaldi-vector.h:94