online-feat-input.h
Go to the documentation of this file.
1 // online/online-feat-input.h
2 
3 // Copyright 2012 Cisco Systems (author: Matthias Paulik)
4 // 2012-2013 Vassil Panayotov
5 // 2013 Johns Hopkins University (author: Daniel Povey)
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #ifndef KALDI_ONLINE_ONLINE_FEAT_INPUT_H_
23 #define KALDI_ONLINE_ONLINE_FEAT_INPUT_H_
24 
25 #if !defined(_MSC_VER)
26 #include <sys/types.h>
27 #include <sys/socket.h>
28 #include <netinet/in.h>
29 #include <arpa/inet.h>
30 #endif
31 
32 #include "online-audio-source.h"
33 #include "feat/feature-functions.h"
34 #include "feat/feature-window.h"
35 
36 namespace kaldi {
37 
38 // Interface specification
40  public:
41  // Produces feature vectors in some way.
42  // The features may be, e.g., extracted from audio samples, or received and/or
43  // transformed from another OnlineFeatInput class, etc.
44  //
45  // "output" - a matrix to store the extracted feature vectors in its rows.
46  // The number of rows (NumRows()) of "output" when the function is
47  // called, is treated as a hint of how many frames the user wants,
48  // but this function does not promise to produce exactly that many:
49  // it may be slightly more, less, or even zero, on a given call.
50  // Zero frames may be returned because we timed out or because
51  // we're at the beginning of the file and some buffering is going on.
52  // In that case you should try again. The function will return "false"
53  // when it knows the stream is finished, but if it returns nothing
54  // several times in a row you may want to terminate processing the
55  // stream.
56  //
57  // Note: similarly to OnlineAudioSourceItf::Read(), Compute() previously
58  // had a second argument - "timeout". Again we decided against including
59  // this parameter in the interface specification. Instead we consider
60  // timeout handling to be an implementation detail; if needed, it should
61  // be configured through the descendant class's constructor or by other
62  // means.
63  // For consistency, we recommend 'timeout' values greater than zero
64  // to mean that Compute() should not block for more than that number
65  // of milliseconds, and to return whatever data it has, when the timeout
66  // period is exceeded.
67  //
68  // Returns "false" if we know the underlying data source has no more data, and
69  // true if there may be more data.
70  virtual bool Compute(Matrix<BaseFloat> *output) = 0;
71 
72  virtual int32 Dim() const = 0; // Return the output dimension of these features.
73 
74  virtual ~OnlineFeatInputItf() {}
75 };
76 
77 
78 // Acts as a proxy to an underlying OnlineFeatInput.
79 // Applies cepstral mean normalization
81  public:
82  // "input" - the underlying (unnormalized) feature source
83  // "cmn_window" - the count of the preceding vectors over which the average is
84  // calculated
85  // "min_window" - the minimum count of frames for which it will compute the
86  // mean, at the start of the file. Adds latency but only at the
87  // start
88  OnlineCmnInput(OnlineFeatInputItf *input, int32 cmn_window, int32 min_window)
89  : input_(input), cmn_window_(cmn_window), min_window_(min_window),
90  history_(cmn_window + 1, input->Dim()), t_in_(0), t_out_(0),
91  sum_(input->Dim()) { KALDI_ASSERT(cmn_window >= min_window && min_window > 0); }
92 
93  virtual bool Compute(Matrix<BaseFloat> *output);
94 
95  virtual int32 Dim() const { return input_->Dim(); }
96 
97  private:
98  bool ComputeInternal(Matrix<BaseFloat> *output);
99 
100 
102  const int32 cmn_window_; // > 0
103  const int32 min_window_; // > 0, <= cmn_window_ (the constructor asserts cmn_window >= min_window).
104  Matrix<BaseFloat> history_; // circular-buffer history, of dim (cmn_window_ +
105  // 1, feat-dim). The + 1 is to serve as a place
106  // for the frame we're about to normalize.
107 
108  void AcceptFrame(const VectorBase<BaseFloat> &input); // Accept the next frame
109  // of input (read into the
110  // history buffer).
111  void OutputFrame(VectorBase<BaseFloat> *output); // Output the next frame.
112 
113  int32 NumOutputFrames(int32 num_new_frames,
114  bool more_data) const; // Tells the caller, assuming
115  // we get given "num_new_frames" of input (and given knowledge of whether
116  // there is more data coming), how many frames would we be able to
117  // output?
118 
119 
120  int64 t_in_; // Time-counter for what we've obtained from the input.
121  int64 t_out_; // Time-counter for what we've written to the output.
122 
123  Vector<double> sum_; // Sum of the frames from t_out_ - HistoryLength(t_out_),
124  // to t_out_ - 1.
125 
127 };
128 
129 
131  public:
132  OnlineCacheInput(OnlineFeatInputItf *input): input_(input) { }
133 
134  // The Compute function just forwards to the previous member of the
135  // chain, except that we locally accumulate the result, and
136  // GetCachedData() will return the entire input up to the current time.
137  virtual bool Compute(Matrix<BaseFloat> *output);
138 
139  void GetCachedData(Matrix<BaseFloat> *output);
140 
141  int32 Dim() const { return input_->Dim(); }
142 
143  void Deallocate();
144 
145  virtual ~OnlineCacheInput() { Deallocate(); }
146 
147  private:
149  // data_ is a list of all the outputs we produced in successive
150  // calls to Compute(). The memory is owned here.
151  std::vector<Matrix<BaseFloat>* > data_;
152 };
153 
154 
155 #if !defined(_MSC_VER)
156 
157 // Accepts features over a UDP socket.
158 // The current implementation doesn't support a "timeout" -
159 // the server waits for data indefinitely.
161  public:
162  OnlineUdpInput(int32 port, int32 feature_dim);
163 
164  virtual bool Compute(Matrix<BaseFloat> *output);
165 
166  virtual int32 Dim() const { return feature_dim_; }
167 
168  const sockaddr_in& client_addr() const { return client_addr_; }
169 
170  int32 descriptor() const { return sock_desc_; } // Returns the socket descriptor by value; a top-level const on the return type would be ignored.
171 
172  private:
174  // various BSD sockets-related data structures
175  int32 sock_desc_; // socket descriptor
176  sockaddr_in server_addr_;
177  sockaddr_in client_addr_;
178 };
179 
180 #endif
181 
182 
183 // Splices the input features and applies a transformation matrix.
184 // Note: the transformation matrix will usually be a linear transformation
185 // [output-dim x input-dim] but we accept an affine transformation too.
187  public:
189  const Matrix<BaseFloat> &transform,
190  int32 left_context,
191  int32 right_context);
192 
193  virtual bool Compute(Matrix<BaseFloat> *output);
194 
195  virtual int32 Dim() const { return linear_transform_.NumRows(); }
196 
197  private:
198  // The static function SpliceFrames splices together the features and
199  // puts them together in a matrix, so that each row of "output" contains
200  // a contiguous window of size "context_window" of input frames. The dimension
201  // of "output" will be feats.NumRows() - context_window + 1 by
202  // feats.NumCols() * context_window. The input features are
203  // treated as if the frames of input1, input2 and input3 have been appended
204  // together before applying the main operation.
205  static void SpliceFrames(const MatrixBase<BaseFloat> &input1,
206  const MatrixBase<BaseFloat> &input2,
207  const MatrixBase<BaseFloat> &input3,
208  int32 context_window,
209  Matrix<BaseFloat> *output);
210 
211  void TransformToOutput(const MatrixBase<BaseFloat> &spliced_feats,
212  Matrix<BaseFloat> *output);
213  void ComputeNextRemainder(const MatrixBase<BaseFloat> &input);
214 
215  OnlineFeatInputItf *input_; // underlying/inferior input object
216  const int32 input_dim_; // dimension of the feature vectors before xform
219  Matrix<BaseFloat> linear_transform_; // transform matrix (linear part only)
220  Vector<BaseFloat> offset_; // Offset, if present; else empty.
221  Matrix<BaseFloat> remainder_; // The last few frames of the input, that may
222  // be needed for context purposes.
223 
225 };
226 
227 
228 // Does the time-derivative computation (e.g., adding deltas and delta-deltas).
229 // This is standard in more "old-fashioned" feature extraction. Like an online
230 // version of the function ComputeDeltas in feat/feature-functions.h, where the
231 // class DeltaFeaturesOptions is also defined.
233  public:
234  OnlineDeltaInput(const DeltaFeaturesOptions &delta_opts,
235  OnlineFeatInputItf *input);
236 
237  virtual bool Compute(Matrix<BaseFloat> *output);
238 
239  virtual int32 Dim() const { return input_dim_ * (opts_.order + 1); }
240 
241  private:
242  // The static function AppendFrames appends together the three input matrices,
243  // some of which may be empty.
244  static void AppendFrames(const MatrixBase<BaseFloat> &input1,
245  const MatrixBase<BaseFloat> &input2,
246  const MatrixBase<BaseFloat> &input3,
247  Matrix<BaseFloat> *output);
248 
249  // Context() is the number of frames on each side of a given frame,
250  // that we need for context.
251  int32 Context() const { return opts_.order * opts_.window; }
252 
253  // Does the delta computation. Here, "output" will be resized to dimension
254  // (input.NumRows() - Context() * 2) by (input.NumCols() * (opts_.order + 1))
255  // "remainder" will be the last Context() rows of "input".
256  void DeltaComputation(const MatrixBase<BaseFloat> &input,
257  Matrix<BaseFloat> *output,
258  Matrix<BaseFloat> *remainder) const;
259 
260  OnlineFeatInputItf *input_; // underlying/inferior input object
263  Matrix<BaseFloat> remainder_; // The last few frames of the input, that may
264  // be needed for context purposes.
265 
267 };
268 
269 // Implementation that is meant to be used to read samples from an
270 // OnlineAudioSourceItf and to extract MFCC/PLP features in the usual way.
271 template <class E>
273  public:
274  // "au_src" - OnlineAudioSourceItf object
275  // "fe" - object implementing MFCC/PLP feature extraction
276  // "frame_size" - frame extraction window size in audio samples
277  // "frame_shift" - feature frame width in audio samples
278  OnlineFeInput(OnlineAudioSourceItf *au_src, E *fe,
279  const int32 frame_size, const int32 frame_shift,
280  const bool snip_edges = true);
281 
282  virtual int32 Dim() const { return extractor_->Dim(); }
283 
284  virtual bool Compute(Matrix<BaseFloat> *output);
285 
286  private:
287  OnlineAudioSourceItf *source_; // audio source
288  E *extractor_; // the actual feature extractor used
291  Vector<BaseFloat> wave_; // the samples to be passed for extraction
292  Vector<BaseFloat> wave_remainder_; // the samples remained from the previous
293  // feature batch
295 
297 };
298 
299 template<class E>
301  int32 frame_size, int32 frame_shift,
302  bool snip_edges)
303  : source_(au_src), extractor_(fe),
304  frame_size_(frame_size), frame_shift_(frame_shift) {
305  // we need a FrameExtractionOptions to call NumFrames()
306  // 1000 is just a fake sample rate which equates ms and samples
307  frame_opts_.samp_freq = 1000;
308  frame_opts_.frame_shift_ms = frame_shift;
309  frame_opts_.frame_length_ms = frame_size;
310  frame_opts_.snip_edges = snip_edges;
311 }
312 
313 template<class E> bool
315  MatrixIndexT nvec = output->NumRows(); // the number of output vectors
316  if (nvec <= 0) {
317  KALDI_WARN << "No feature vectors requested?!";
318  return true;
319  }
320 
321  // Prepare the input audio samples: one full window plus a shift per extra frame
322  int32 samples_req = frame_size_ + (nvec - 1) * frame_shift_;
323  Vector<BaseFloat> read_samples(samples_req);
324 
325  bool ans = source_->Read(&read_samples);
326 
327  Vector<BaseFloat> all_samples(wave_remainder_.Dim() + read_samples.Dim()); // leftover samples followed by the new ones
328  all_samples.Range(0, wave_remainder_.Dim()).CopyFromVec(wave_remainder_);
329  all_samples.Range(wave_remainder_.Dim(), read_samples.Dim()).
330  CopyFromVec(read_samples);
331 
332  // Extract the features if at least one full frame is buffered; otherwise keep everything for the next call
333  if (all_samples.Dim() >= frame_size_) {
334  // extract waveform remainder before calling Compute()
335  int32 num_frames = NumFrames(all_samples.Dim(), frame_opts_);
336  // offset is the amount at the start that has been extracted.
337  int32 offset = num_frames * frame_shift_;
338  int32 remaining_len = all_samples.Dim() - offset;
339  KALDI_ASSERT(remaining_len >= 0); // validate before the value is used as a size
340  wave_remainder_.Resize(remaining_len);
341  if (remaining_len > 0)
342  wave_remainder_.CopyFromVec(SubVector<BaseFloat>(all_samples, offset, remaining_len));
343  extractor_->Compute(all_samples, 1.0, output);
344  } else {
345  output->Resize(0, 0); // no frame can be produced yet
346  wave_remainder_ = all_samples;
347  }
348 
349  return ans; // whatever source_->Read() reported
350 }
351 
353  int32 batch_size; // number of frames to request each time.
354  int32 num_tries; // number of tries of getting no output and timing out,
355  // before we give up.
356  OnlineFeatureMatrixOptions(): batch_size(27),
357  num_tries(5) { }
358  void Register(OptionsItf *opts) {
359  opts->Register("batch-size", &batch_size,
360  "Number of feature vectors processed w/o interruption");
361  opts->Register("num-tries", &num_tries,
362  "Number of successive repetitions of timeout before we "
363  "terminate stream");
364  }
365 };
366 
367 // The class OnlineFeatureMatrix wraps something of type
368 // OnlineFeatInputItf in a manner that is convenient for
369 // a Decodable type to consume.
371  public:
373  OnlineFeatInputItf *input):
374  opts_(opts), input_(input), feat_dim_(input->Dim()),
375  feat_offset_(0), finished_(false) { }
376 
377  bool IsValidFrame (int32 frame);
378 
379  int32 Dim() const { return feat_dim_; }
380 
381  // GetFrame() will die if it's not a valid frame; you have to
382  // call IsValidFrame() for this frame, to see whether it
383  // is valid.
384  SubVector<BaseFloat> GetFrame(int32 frame);
385 
386  private:
387  void GetNextFeatures(); // called when we need more features. Guarantees
388  // to get at least one more frame, or set finished_ = true.
389 
394  int32 feat_offset_; // the offset of the first frame in the current batch
395  bool finished_; // True if there are no more frames to be got from the input.
396 };
397 
398 
399 } // namespace kaldi
400 
401 #endif // KALDI_ONLINE_ONLINE_FEAT_INPUT_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
OnlineFeatInputItf * input_
Matrix< BaseFloat > feat_matrix_
OnlineFeatureMatrix(const OnlineFeatureMatrixOptions &opts, OnlineFeatInputItf *input)
virtual bool Read(Vector< BaseFloat > *data)=0
virtual int32 Dim() const =0
std::vector< Matrix< BaseFloat > *> data_
Matrix< BaseFloat > remainder_
OnlineFeatInputItf * input_
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
FrameExtractionOptions frame_opts_
kaldi::int32 int32
OnlineCacheInput(OnlineFeatInputItf *input)
const sockaddr_in & client_addr() const
#define KALDI_DISALLOW_COPY_AND_ASSIGN(type)
Definition: kaldi-utils.h:121
Vector< BaseFloat > wave_
Matrix< BaseFloat > history_
OnlineFeatInputItf * input_
virtual int32 Dim() const
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
Matrix< BaseFloat > remainder_
Vector< BaseFloat > wave_remainder_
int32 MatrixIndexT
Definition: matrix-common.h:98
OnlineFeInput(OnlineAudioSourceItf *au_src, E *fe, const int32 frame_size, const int32 frame_shift, const bool snip_edges=true)
virtual bool Compute(Matrix< BaseFloat > *output)
virtual int32 Dim() const
OnlineCmnInput(OnlineFeatInputItf *input, int32 cmn_window, int32 min_window)
virtual int32 Dim() const
int32 NumFrames(int64 num_samples, const FrameExtractionOptions &opts, bool flush)
This function returns the number of frames that we can extract from a wave file with the given number...
OnlineAudioSourceItf * source_
#define KALDI_WARN
Definition: kaldi-error.h:150
OnlineFeatInputItf * input_
OnlineFeatInputItf * input_
Vector< BaseFloat > offset_
const OnlineFeatureMatrixOptions opts_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
void SpliceFrames(const MatrixBase< BaseFloat > &input_features, int32 left_context, int32 right_context, Matrix< BaseFloat > *output_features)
DeltaFeaturesOptions opts_
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
virtual bool Compute(Matrix< BaseFloat > *output)=0
Matrix< BaseFloat > linear_transform_
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
const int32 descriptor() const
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501
virtual int32 Dim() const
virtual int32 Dim() const
SubVector< Real > Range(const MatrixIndexT o, const MatrixIndexT l)
Returns a sub-vector of a vector (a range of elements).
Definition: kaldi-vector.h:94