decodable-simple-looped.h
Go to the documentation of this file.
1 // nnet3/decodable-simple-looped.h
2 
3 // Copyright 2016 Johns Hopkins University (author: Daniel Povey)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #ifndef KALDI_NNET3_DECODABLE_SIMPLE_LOOPED_H_
21 #define KALDI_NNET3_DECODABLE_SIMPLE_LOOPED_H_
22 
23 #include <vector>
24 #include "base/kaldi-common.h"
25 #include "gmm/am-diag-gmm.h"
26 #include "hmm/transition-model.h"
27 #include "itf/decodable-itf.h"
28 #include "nnet3/nnet-optimize.h"
29 #include "nnet3/nnet-compute.h"
30 #include "nnet3/am-nnet-simple.h"
31 
32 namespace kaldi {
33 namespace nnet3 {
34 
35 // See also nnet-am-decodable-simple.h, which is a decodable object that's based
36 // on breaking up the input into fixed chunks. The decodable object defined here is based on
37 // 'looped' computations, which naturally handle infinite left-context (but are
38 // only ideal for systems that have only recurrence in the forward direction,
39 // i.e. not BLSTMs... because there isn't a natural way to enforce extra right
40 // context for each chunk.)
41 
42 
43 // Note: the 'simple' in the name means it applies to networks for which
44 // IsSimpleNnet(nnet) would return true. 'looped' means we use looped
45 // computations, with a kGotoLabel statement at the end of it.
55  extra_left_context_initial(0),
56  frame_subsampling_factor(1),
57  frames_per_chunk(20),
58  acoustic_scale(0.1),
59  debug_computation(false) { }
60 
61  void Check() const {  // Asserts (via KALDI_ASSERT) that the option values are usable.
62  KALDI_ASSERT(extra_left_context_initial >= 0 &&  // zero extra left context is allowed
63  frame_subsampling_factor > 0 && frames_per_chunk > 0 &&  // both must be strictly positive
64  acoustic_scale > 0.0);  // a zero or negative acoustic scale is rejected
65  }
66 
67  void Register(OptionsItf *opts) {  // Registers each option of this struct, plus the nested optimize/compute configs, with 'opts'.
68  opts->Register("extra-left-context-initial", &extra_left_context_initial,
69  "Extra left context to use at the first frame of an utterance (note: "
70  "this will just consist of repeats of the first frame, and should not "
71  "usually be necessary).");
72  opts->Register("frame-subsampling-factor", &frame_subsampling_factor,
73  "Required if the frame-rate of the output (e.g. in 'chain' "
74  "models) is less than the frame-rate of the original "
75  "alignment.");
76  opts->Register("acoustic-scale", &acoustic_scale,
77  "Scaling factor for acoustic log-likelihoods");
78  opts->Register("frames-per-chunk", &frames_per_chunk,
79  "Number of frames in each chunk that is separately evaluated "
80  "by the neural net. Measured before any subsampling, if the "
81  "--frame-subsampling-factor option is used (i.e. counts "
82  "input frames). This is only advisory (may be rounded up "
83  "if needed).");
84  opts->Register("debug-computation", &debug_computation, "If true, turn on "
85  "debug for the actual computation (very verbose!)");
86 
87  // Register optimize_config's options through a ParseOptions built with the prefix "optimization".
88  ParseOptions optimization_opts("optimization", opts);
89  optimize_config.Register(&optimization_opts);
90 
91  // Register compute_config's options through a ParseOptions built with the prefix "computation".
92  ParseOptions compute_opts("computation", opts);
93  compute_config.Register(&compute_opts);
94  }
95 };
96 
97 
103  public:
104  // The constructor takes a non-const pointer to 'nnet' because it may have to
105  // modify it to be able to take multiple iVectors.
107  Nnet *nnet);
108 
109  // This constructor takes the priors from class AmNnetSimple (so it can divide by
110  // them).
112  AmNnetSimple *nnet);
113 
114  // this constructor is for use in testing.
116  const Vector<BaseFloat> &priors,
117  Nnet *nnet);
118 
119  void Init(const NnetSimpleLoopedComputationOptions &opts,
120  Nnet *nnet);
121 
123 
124  const Nnet &nnet;
125 
126  // the log priors (or the empty vector if the priors are not set in the model)
128 
129 
130  // frames_left_context equals the model left context plus the value of the
131  // --extra-left-context-initial option.
133  // frames_right_context is the same as the right-context of the model.
135  // The frames_per_chunk_ equals the number of input frames we need for each
136  // chunk (except for the first chunk). This divided by
137  // opts_.frame_subsampling_factor gives the number of output frames.
139 
140  // The output dimension of the neural network.
142 
143  // True if the neural net accepts iVectors. If so, the neural net will have been modified
144  // to accept the iVectors.
146 
147  // The 3 computation requests that are used to create the looped
148  // computation are stored in the class, as we need them to work out
149  // exactly which iVectors are needed.
150  ComputationRequest request1, request2, request3;
151 
152  // The compiled, 'looped' computation.
154 };
155 
156 /*
157  This class handles the neural net computation; it's mostly accessed
158  via other wrapper classes.
159 
160  It can accept just input features, or input features plus iVectors. */
162  public:
185  const MatrixBase<BaseFloat> &feats,
186  const VectorBase<BaseFloat> *ivector = NULL,
187  const MatrixBase<BaseFloat> *online_ivectors = NULL,
188  int32 online_ivector_period = 1);
189 
190 
191  // Returns num_subsampled_frames_, the number of frames of likelihoods. This is
192  // the same as feats_.NumRows() in the normal case, but may be less if
193  // opts_.frame_subsampling_factor != 1.
194  inline int32 NumFrames() const { return num_subsampled_frames_; }
195 
196  inline int32 OutputDim() const { return info_.output_dim; }  // Returns info_.output_dim; pdf_id arguments to GetOutput() are expected to be below this value.
197 
198  // Gets the output for a particular frame, with 0 <= frame < NumFrames().
199  // 'output' must be correctly sized (with dimension OutputDim()). Note:
200  // you're expected to call this, and GetOutput(), in an order of increasing
201  // frames. If you deviate from this, one of these calls may crash.
202  void GetOutputForFrame(int32 subsampled_frame,
203  VectorBase<BaseFloat> *output);
204 
205  // Returns the entry of current_log_post_ for the given frame and pdf_id, where
206  // 0 <= subsampled_frame < NumFrames() and 0 <= pdf_id < OutputDim() is expected.
207  // Calls AdvanceChunk() as often as needed; frames must be accessed in order.
208  inline BaseFloat GetOutput(int32 subsampled_frame, int32 pdf_id) {
209  KALDI_ASSERT(subsampled_frame >= current_log_post_subsampled_offset_ &&  // a frame before the current chunk's offset fails here
210  "Frames must be accessed in order.");
211  while (subsampled_frame >= current_log_post_subsampled_offset_ +  // requested frame lies past the rows currently held
212  current_log_post_.NumRows())
213  AdvanceChunk();  // declared below as doing the computation for the next chunk
214  return current_log_post_(subsampled_frame -  // row index is relative to the current chunk's offset
215  current_log_post_subsampled_offset_,
216  pdf_id);
217  }
218  private:
220 
221  // This function does the computation for the next chunk.
222  void AdvanceChunk();
223 
224  void AdvanceChunkInternal(const MatrixBase<BaseFloat> &input_feats,
225  const VectorBase<BaseFloat> &ivector);
226 
227  // Gets the iVector for the specified frame, if we are
228  // using iVectors (else does nothing).
229  void GetCurrentIvector(int32 input_frame,
230  Vector<BaseFloat> *ivector);
231 
232  // returns dimension of the provided iVectors if supplied, or 0 otherwise.
233  int32 GetIvectorDim() const;
234 
236 
238 
240  // note: num_subsampled_frames_ will equal feats_.NumRows() in the normal case
241  // when opts_.frame_subsampling_factor == 1.
243 
244  // ivector_ is the iVector if we're using iVectors that are estimated in batch
245  // mode.
247 
248  // online_ivector_feats_ is the iVectors if we're using online-estimated ones.
250  // online_ivector_period_ helps us interpret online_ivector_feats_; it's the
251  // number of frames the rows of online_ivector_feats_ are separated by.
253 
254  // The current log-posteriors that we got from the last time we
255  // ran the computation.
257 
258  // The number of chunks we have computed so far.
260 
261  // The time-offset of the current log-posteriors, equals
262  // (num_chunks_computed_ - 1) *
263  // (info_.frames_per_chunk_ / info_.opts_.frame_subsampling_factor).
265 };
266 
268  public:
298  const TransitionModel &trans_model,
299  const MatrixBase<BaseFloat> &feats,
300  const VectorBase<BaseFloat> *ivector = NULL,
301  const MatrixBase<BaseFloat> *online_ivectors = NULL,
302  int32 online_ivector_period = 1);
303 
304 
305  virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id);
306 
307  virtual inline int32 NumFramesReady() const {  // Forwards to decodable_nnet_.NumFrames().
308  return decodable_nnet_.NumFrames();
309  }
310 
311  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }  // Forwards to trans_model_.NumTransitionIds().
312 
313  virtual bool IsLastFrame(int32 frame) const {  // True iff 'frame' equals NumFramesReady() - 1.
314  KALDI_ASSERT(frame < NumFramesReady());  // a frame index at or beyond NumFramesReady() fails the assertion
315  return (frame == NumFramesReady() - 1);
316  }
317 
318  private:
322 };
323 
324 
325 
326 } // namespace nnet3
327 } // namespace kaldi
328 
329 #endif // KALDI_NNET3_DECODABLE_SIMPLE_LOOPED_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void Register(OptionsItf *opts)
Definition: nnet-optimize.h:84
const VectorBase< BaseFloat > * ivector_
DecodableInterface provides a link between the (acoustic-modeling and feature-processing) code and th...
Definition: decodable-itf.h:82
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
virtual int32 NumFramesReady() const
The call NumFramesReady() will return the number of frames currently available for this decodable obj...
const MatrixBase< BaseFloat > * online_ivector_feats_
kaldi::int32 int32
#define KALDI_DISALLOW_COPY_AND_ASSIGN(type)
Definition: kaldi-utils.h:121
virtual int32 NumIndices() const
Returns the number of states in the acoustic model (they will be indexed one-based, i.e.
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
const NnetSimpleLoopedComputationOptions & opts
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
void Register(OptionsItf *opts)
Definition: nnet-compute.h:42
A class representing a vector.
Definition: kaldi-vector.h:406
class NnetComputer is responsible for executing the computation described in the "computation" object...
Definition: nnet-compute.h:59
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
BaseFloat GetOutput(int32 subsampled_frame, int32 pdf_id)
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
When you instantiate class DecodableNnetSimpleLooped, you should give it a const reference to this cl...
const DecodableNnetSimpleLoopedInfo & info_