online-feature.h
Go to the documentation of this file.
1 // feat/online-feature.h
2 
3 // Copyright 2013 Johns Hopkins University (author: Daniel Povey)
4 // 2014 Yanqing Sun, Junjie Wang,
5 // Daniel Povey, Korbinian Riedhammer
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABILITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 
23 #ifndef KALDI_FEAT_ONLINE_FEATURE_H_
24 #define KALDI_FEAT_ONLINE_FEATURE_H_
25 
26 #include <string>
27 #include <vector>
28 #include <deque>
29 
30 #include "matrix/matrix-lib.h"
31 #include "util/common-utils.h"
32 #include "base/kaldi-error.h"
33 #include "feat/feature-functions.h"
34 #include "feat/feature-mfcc.h"
35 #include "feat/feature-plp.h"
36 #include "feat/feature-fbank.h"
37 #include "itf/online-feature-itf.h"
38 
39 namespace kaldi {
42 
43 
51 public:
53  RecyclingVector(int items_to_hold = -1);
54 
56  Vector<BaseFloat> *At(int index) const;
57 
59  void PushBack(Vector<BaseFloat> *item);
60 
63  int Size() const;
64 
66 
67 private:
68  std::deque<Vector<BaseFloat>*> items_;
71 };
72 
73 
77 template<class C>
79  public:
80  //
81  // First, functions that are present in the interface:
82  //
83  virtual int32 Dim() const { return computer_.Dim(); }
84 
85  // Note: IsLastFrame() will only ever return true if you have called
86  // InputFinished() (and this frame is the last frame).
87  virtual bool IsLastFrame(int32 frame) const {
88  return input_finished_ && frame == NumFramesReady() - 1;
89  }
90  virtual BaseFloat FrameShiftInSeconds() const {
91  return computer_.GetFrameOptions().frame_shift_ms / 1000.0f;
92  }
93 
94  virtual int32 NumFramesReady() const { return features_.Size(); }
95 
96  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
97 
98  // Next, functions that are not in the interface.
99 
100 
101  // Constructor from options class
102  explicit OnlineGenericBaseFeature(const typename C::Options &opts);
103 
104  // This would be called from the application, when you get
105  // more wave data. Note: the sampling_rate is compared against the
106  // sampling rate expected in the options; resampler_ (below) exists
107  // for the case where the two are not equal.
108  virtual void AcceptWaveform(BaseFloat sampling_rate,
109  const VectorBase<BaseFloat> &waveform);
110 
111 
112  // InputFinished() tells the class you won't be providing any
113  // more waveform. This will help flush out the last frame or two
114  // of features, in the case where snip-edges == false; it also
115  // affects the return value of IsLastFrame().
116  virtual void InputFinished();
117 
118  private:
119  // This function computes any additional feature frames that it is possible to
120  // compute from 'waveform_remainder_', which at this point may contain more
121  // than just a remainder-sized quantity (because AcceptWaveform() appends to
122  // waveform_remainder_ before calling this function). It adds these feature
123  // frames to features_, and shifts off any now-unneeded samples of input from
124  // waveform_remainder_ while incrementing waveform_offset_ by the same amount.
125  void ComputeFeatures();
126 
127  void MaybeCreateResampler(BaseFloat sampling_rate);
128 
129  C computer_; // class that does the MFCC or PLP or filterbank computation
130 
131  // resampler in cases when the input sampling frequency is not equal to
132  // the expected sampling rate
133  std::unique_ptr<LinearResample> resampler_;
134 
136 
137  // features_ is the Mfcc or Plp or Fbank features that we have already computed.
138 
140 
141  // True if the user has called "InputFinished()"
143 
144  // The sampling frequency, extracted from the config. Should
145  // be identical to the waveform supplied.
147 
148  // waveform_offset_ is the number of samples of waveform that we have
149  // already discarded, i.e. that were prior to 'waveform_remainder_'.
151 
152  // waveform_remainder_ is a short piece of waveform that we may need to keep
153  // after extracting all the whole frames we can (whatever length of feature
154  // will be required for the next phase of computation).
156 };
157 
161 
162 
168  public:
171  explicit OnlineMatrixFeature(const MatrixBase<BaseFloat> &mat): mat_(mat) { }
172 
173  virtual int32 Dim() const { return mat_.NumCols(); }
174 
175  virtual BaseFloat FrameShiftInSeconds() const {
176  return 0.01f;
177  }
178 
179  virtual int32 NumFramesReady() const { return mat_.NumRows(); }
180 
181  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
182  feat->CopyFromVec(mat_.Row(frame));
183  }
184 
185  virtual bool IsLastFrame(int32 frame) const {
186  return (frame + 1 == mat_.NumRows());
187  }
188 
189 
190  private:
192 };
193 
194 
195 // Note the similarity with SlidingWindowCmnOptions, but there
196 // are also differences. One which doesn't appear in the config
197 // itself, because it's a difference between the setups, is that
198 // in OnlineCmvn, we carry over data from the previous utterance,
199 // or, if no previous utterance is available, from global stats,
200 // or, if previous utterances are available but the total amount
201 // of data is less than speaker_frames, we pad with up to "global_frames"
202 // frames from the global stats.
205  int32 speaker_frames; // must be <= cmn_window
206  int32 global_frames; // must be <= speaker_frames.
207  bool normalize_mean; // Must be true if normalize_variance==true.
209 
210  int32 modulus; // not configurable from command line, relates to how the
211  // class computes the cmvn internally. smaller->more
212  // time-efficient but less memory-efficient. Must be >= 1.
213  int32 ring_buffer_size; // not configurable from command line; size of ring
214  // buffer used for caching CMVN stats. Must be >=
215  // modulus.
216  std::string skip_dims; // Colon-separated list of dimensions to skip normalization
217  // of, e.g. 13:14:15.
218 
220  cmn_window(600),
221  speaker_frames(600),
222  global_frames(200),
223  normalize_mean(true),
224  normalize_variance(false),
225  modulus(20),
226  ring_buffer_size(20),
227  skip_dims("") { }
228 
229  void Check() const {
230  KALDI_ASSERT(speaker_frames <= cmn_window && global_frames <= speaker_frames
231  && modulus > 0);
232  }
233 
234  void Register(ParseOptions *po) {
235  po->Register("cmn-window", &cmn_window, "Number of frames of sliding "
236  "context for cepstral mean normalization.");
237  po->Register("global-frames", &global_frames, "Number of frames of "
238  "global-average cepstral mean normalization stats to use for "
239  "first utterance of a speaker");
240  po->Register("speaker-frames", &speaker_frames, "Number of frames of "
241  "previous utterance(s) from this speaker to use in cepstral "
242  "mean normalization");
243  // we name the config string "norm-vars" for compatibility with
244  // ../featbin/apply-cmvn.cc
245  po->Register("norm-vars", &normalize_variance, "If true, do "
246  "cepstral variance normalization in addition to cepstral mean "
247  "normalization ");
248  po->Register("norm-means", &normalize_mean, "If true, do mean normalization "
249  "(note: you cannot normalize the variance but not the mean)");
250  po->Register("skip-dims", &skip_dims, "Dimensions to skip normalization of "
251  "(colon-separated list of integers)");}
252 };
253 
254 
255 
267  // The following is the total CMVN stats for this speaker (up till now), in
268  // the same format.
270 
271  // The following is the global CMVN stats, in the usual
272  // format, of dimension 2 x (dim+1), as [ sum-stats count
273  // sum-squared-stats 0 ]
275 
276  // If nonempty, contains CMVN stats representing the "frozen" state
277  // of CMVN that reflects how we were normalizing the data when the
278  // user called the Freeze() function in class OnlineCmvn.
280 
282 
283  explicit OnlineCmvnState(const Matrix<double> &global_stats):
284  global_cmvn_stats(global_stats) { }
285 
286  // Copy constructor
287  OnlineCmvnState(const OnlineCmvnState &other);
288 
289  void Write(std::ostream &os, bool binary) const;
290  void Read(std::istream &is, bool binary);
291 
292  // Use the default assignment operator.
293 };
294 
322  public:
323 
324  //
325  // First, functions that are present in the interface:
326  //
327  virtual int32 Dim() const { return src_->Dim(); }
328 
329  virtual bool IsLastFrame(int32 frame) const {
330  return src_->IsLastFrame(frame);
331  }
332  virtual BaseFloat FrameShiftInSeconds() const {
333  return src_->FrameShiftInSeconds();
334  }
335 
336  // The online cmvn does not introduce any additional latency.
337  virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
338 
339  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
340 
341  //
342  // Next, functions that are not in the interface.
343  //
344 
353  OnlineCmvn(const OnlineCmvnOptions &opts,
354  const OnlineCmvnState &cmvn_state,
356 
359  OnlineCmvn(const OnlineCmvnOptions &opts,
361 
362  // Outputs any state information from this utterance to "cmvn_state".
363  // The value of "cmvn_state" before the call does not matter: the output
364  // depends on the value of OnlineCmvnState the class was initialized
365  // with, the input feature values up to cur_frame, and the effects
366  // of the user possibly having called Freeze().
367  // If cur_frame is -1, it will just output the unmodified original
368  // state that was supplied to this object.
369  void GetState(int32 cur_frame,
370  OnlineCmvnState *cmvn_state);
371 
372  // This function can be used to modify the state of the CMVN computation
373  // from outside, but must only be called before you have processed any data
374  // (otherwise it will crash). This "state" is really just the information
375  // that is propagated between utterances, not the state of the computation
376  // inside an utterance.
377  void SetState(const OnlineCmvnState &cmvn_state);
378 
379  // From this point it will freeze the CMN to what it would have been if
380  // measured at frame "cur_frame", and it will stop it from changing
381  // further. This also applies retroactively for this utterance, so if you
382  // call GetFrame() on previous frames, it will use the CMVN stats
383  // from cur_frame; and it applies in the future too if you then
384  // call GetState() and use this state to initialize the next
385  // utterance's CMVN object.
386  void Freeze(int32 cur_frame);
387 
388  virtual ~OnlineCmvn();
389  private:
390 
395  static void SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
396  const MatrixBase<double> &global_stats,
397  const OnlineCmvnOptions &opts,
398  MatrixBase<double> *stats);
399 
402  void GetMostRecentCachedFrame(int32 frame,
403  int32 *cached_frame,
404  MatrixBase<double> *stats);
405 
407  void CacheFrame(int32 frame, const MatrixBase<double> &stats);
408 
410  inline void InitRingBufferIfNeeded();
411 
415  void ComputeStatsForFrame(int32 frame,
416  MatrixBase<double> *stats);
417 
418 
420  std::vector<int32> skip_dims_; // Skip CMVN for these dimensions. Derived from opts_.
421  OnlineCmvnState orig_state_; // reflects the state before we saw this
422  // utterance.
423  Matrix<double> frozen_state_; // If the user called Freeze(), this variable
424  // will reflect the CMVN state that we froze
425  // at.
426 
427  // The variable below reflects the raw (count, x, x^2) statistics of the
428  // input, computed every opts_.modulus frames. cached_stats_modulo_[n / opts_.modulus]
429  // contains the (count, x, x^2) statistics for the frames from
430  // std::max(0, n - opts_.cmn_window) through n.
431  std::vector<Matrix<double>*> cached_stats_modulo_;
432  // the variable below is a ring-buffer of cached stats. the int32 is the
433  // frame index.
434  std::vector<std::pair<int32, Matrix<double> > > cached_stats_ring_;
435 
436  // Some temporary variables used inside functions of this class, which
437  // are put here to avoid reallocation.
441 
442  OnlineFeatureInterface *src_; // Not owned here
443 };
444 
445 
449  OnlineSpliceOptions(): left_context(4), right_context(4) { }
450  void Register(ParseOptions *po) {
451  po->Register("left-context", &left_context, "Left-context for frame "
452  "splicing prior to LDA");
453  po->Register("right-context", &right_context, "Right-context for frame "
454  "splicing prior to LDA");
455  }
456 };
457 
459  public:
460  //
461  // First, functions that are present in the interface:
462  //
463  virtual int32 Dim() const {
464  return src_->Dim() * (1 + left_context_ + right_context_);
465  }
466 
467  virtual bool IsLastFrame(int32 frame) const {
468  return src_->IsLastFrame(frame);
469  }
470  virtual BaseFloat FrameShiftInSeconds() const {
471  return src_->FrameShiftInSeconds();
472  }
473 
474  virtual int32 NumFramesReady() const;
475 
476  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
477 
478  //
479  // Next, functions that are not in the interface.
480  //
483  left_context_(opts.left_context), right_context_(opts.right_context),
484  src_(src) { }
485 
486  private:
489  OnlineFeatureInterface *src_; // Not owned here
490 };
491 
494  public:
495  //
496  // First, functions that are present in the interface:
497  //
498  virtual int32 Dim() const { return offset_.Dim(); }
499 
500  virtual bool IsLastFrame(int32 frame) const {
501  return src_->IsLastFrame(frame);
502  }
503  virtual BaseFloat FrameShiftInSeconds() const {
504  return src_->FrameShiftInSeconds();
505  }
506 
507  virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
508 
509  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
510 
511  virtual void GetFrames(const std::vector<int32> &frames,
512  MatrixBase<BaseFloat> *feats);
513 
514  //
515  // Next, functions that are not in the interface.
516  //
517 
520  OnlineTransform(const MatrixBase<BaseFloat> &transform,
522 
523 
524  private:
525  OnlineFeatureInterface *src_; // Not owned here
528 };
529 
531  public:
532  //
533  // First, functions that are present in the interface:
534  //
535  virtual int32 Dim() const;
536 
537  virtual bool IsLastFrame(int32 frame) const {
538  return src_->IsLastFrame(frame);
539  }
540  virtual BaseFloat FrameShiftInSeconds() const {
541  return src_->FrameShiftInSeconds();
542  }
543 
544  virtual int32 NumFramesReady() const;
545 
546  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
547 
548  //
549  // Next, functions that are not in the interface.
550  //
553 
554  private:
555  OnlineFeatureInterface *src_; // Not owned here
557  DeltaFeatures delta_features_; // This class contains just a few
558  // coefficients.
559 };
560 
561 
565  public:
566  virtual int32 Dim() const { return src_->Dim(); }
567 
568  virtual bool IsLastFrame(int32 frame) const {
569  return src_->IsLastFrame(frame);
570  }
571  virtual BaseFloat FrameShiftInSeconds() const {
572  return src_->FrameShiftInSeconds();
573  }
574 
575  virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
576 
577  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
578 
579  virtual void GetFrames(const std::vector<int32> &frames,
580  MatrixBase<BaseFloat> *feats);
581 
582  virtual ~OnlineCacheFeature() { ClearCache(); }
583 
584  // Things that are not in the shared interface:
585 
586  void ClearCache(); // this should be called if you change the underlying
587  // features in some way.
588 
589  explicit OnlineCacheFeature(OnlineFeatureInterface *src): src_(src) { }
590  private:
591 
592  OnlineFeatureInterface *src_; // Not owned here
593  std::vector<Vector<BaseFloat>* > cache_;
594 };
595 
596 
597 
598 
602  public:
603  virtual int32 Dim() const { return src1_->Dim() + src2_->Dim(); }
604 
605  virtual bool IsLastFrame(int32 frame) const {
606  return (src1_->IsLastFrame(frame) || src2_->IsLastFrame(frame));
607  }
608  // Hopefully sources have the same rate
609  virtual BaseFloat FrameShiftInSeconds() const {
610  return src1_->FrameShiftInSeconds();
611  }
612 
613  virtual int32 NumFramesReady() const {
614  return std::min(src1_->NumFramesReady(), src2_->NumFramesReady());
615  }
616 
617  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
618 
619  virtual ~OnlineAppendFeature() { }
620 
622  OnlineFeatureInterface *src2): src1_(src1), src2_(src2) { }
623  private:
624 
627 };
628 
630 } // namespace kaldi
631 
632 #endif // KALDI_FEAT_ONLINE_FEATURE_H_
This class takes a Matrix<BaseFloat> and wraps it as an OnlineFeatureInterface: this can be useful wh...
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
void Register(ParseOptions *po)
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
RecyclingVector(int items_to_hold=-1)
By default it does not remove any elements.
OnlineGenericBaseFeature< PlpComputer > OnlinePlp
virtual BaseFloat FrameShiftInSeconds() const
virtual BaseFloat FrameShiftInSeconds() const
virtual int32 NumFramesReady() const
Returns the number of feature frames that are currently ready.
Matrix< double > speaker_cmvn_stats
OnlineGenericBaseFeature< MfccComputer > OnlineMfcc
OnlineGenericBaseFeature< FbankComputer > OnlineFbank
virtual int32 Dim() const
virtual int32 Dim() const
OnlineCmvnState(const Matrix< double > &global_stats)
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
virtual int32 Dim() const
This class does an online version of the cepstral mean and [optionally] variance, but note that this ...
virtual int32 NumFramesReady() const
Returns the number of feature frames that are currently ready.
Vector< double > temp_feats_dbl_
DeltaFeaturesOptions opts_
OnlineSpliceFrames(const OnlineSpliceOptions &opts, OnlineFeatureInterface *src)
Matrix< double > frozen_state
kaldi::int32 int32
OnlineCacheFeature(OnlineFeatureInterface *src)
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
void PushBack(Vector< BaseFloat > *item)
The ownership of the item is passed to this collection - do not delete the item.
Matrix< double > frozen_state_
Vector< BaseFloat > waveform_remainder_
void Register(const std::string &name, bool *ptr, const std::string &doc)
int Size() const
This method returns the size as if no "recycling" had happened, i.e.
FeatureWindowFunction window_function_
OnlineFeatureInterface * src_
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
virtual int32 Dim() const
Matrix< double > temp_stats_
virtual BaseFloat FrameShiftInSeconds() const
Vector< BaseFloat > temp_feats_
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
This online-feature class implements combination of two feature streams (such as pitch, plp) into one stream.
OnlineFeatureInterface * src_
std::vector< std::pair< int32, Matrix< double > > > cached_stats_ring_
Matrix< BaseFloat > linear_term_
virtual BaseFloat FrameShiftInSeconds() const
virtual int32 NumFramesReady() const
Returns the number of feature frames that are currently ready.
Struct OnlineCmvnState stores the state of CMVN adaptation between utterances (but not the state of t...
virtual BaseFloat FrameShiftInSeconds() const
This online-feature class implements any affine or linear transform.
This class serves as a storage for feature vectors with an option to limit the memory usage by removi...
std::vector< int32 > skip_dims_
virtual int32 NumFramesReady() const
Returns the number of feature frames that are currently ready.
OnlineCmvnState orig_state_
OnlineCmvnOptions opts_
virtual BaseFloat FrameShiftInSeconds() const
virtual int32 NumFramesReady() const
Returns the number of feature frames that are currently ready.
virtual BaseFloat FrameShiftInSeconds() const
OnlineFeatureInterface * src2_
std::unique_ptr< LinearResample > resampler_
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
OnlineAppendFeature(OnlineFeatureInterface *src1, OnlineFeatureInterface *src2)
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
OnlineFeatureInterface * src_
void Register(ParseOptions *po)
Add a virtual class for "source" features such as MFCC or PLP or pitch features.
A class representing a vector.
Definition: kaldi-vector.h:406
std::deque< Vector< BaseFloat > * > items_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
virtual int32 Dim() const
OnlineMatrixFeature(const MatrixBase< BaseFloat > &mat)
Caution: this class maintains the const reference from the constructor, so don't let it go out of sco...
const MatrixBase< BaseFloat > & mat_
OnlineFeatureInterface * src1_
virtual void GetFrame(int32 frame, VectorBase< BaseFloat > *feat)
Gets the feature vector for this frame.
std::vector< Matrix< double > * > cached_stats_modulo_
Matrix< double > global_cmvn_stats
OnlineFeatureInterface is an interface for online feature processing (it is also usable in the offlin...
std::vector< Vector< BaseFloat > *> cache_
virtual int32 NumFramesReady() const
Returns the number of feature frames that are currently ready.
virtual int32 Dim() const
Vector< BaseFloat > offset_
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
This is a templated class for online feature extraction; it's templated on a class like MfccComputer ...
Vector< BaseFloat > * At(int index) const
The ownership is being retained by this collection - do not delete the item.
virtual int32 Dim() const
OnlineFeatureInterface * src_
This feature type can be used to cache its input, to avoid repetition of computation in a multi-pass ...
OnlineFeatureInterface * src_
virtual BaseFloat FrameShiftInSeconds() const