online-nnet2-decodable.h
Go to the documentation of this file.
1 // nnet2/online-nnet2-decodable.h
2 
3 // Copyright 2014 Johns Hopkins University (author: Daniel Povey)
4 
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABILITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #ifndef KALDI_NNET2_ONLINE_NNET2_DECODABLE_H_
22 #define KALDI_NNET2_ONLINE_NNET2_DECODABLE_H_
23 
24 #include "itf/online-feature-itf.h"
25 #include "itf/decodable-itf.h"
26 #include "nnet2/am-nnet.h"
27 #include "nnet2/nnet-compute.h"
28 #include "hmm/transition-model.h"
29 
30 namespace kaldi {
31 namespace nnet2 {
32 
33 // Note: see also nnet-compute-online.h, which provides a different
34 // (lower-level) interface that is more efficient for progressive evaluation of
35 // an nnet throughout an utterance, with re-use of already-computed activations.
36 
39  bool pad_input;
41 
43  acoustic_scale(0.1),
44  pad_input(true),
45  max_nnet_batch_size(256) { }
46 
47  void Register(OptionsItf *opts) {
    // Registers the three options of this struct with `opts`. Each call passes
    // the command-line option name, a pointer to the member that receives the
    // value, and the help text shown to the user.
48  opts->Register("acoustic-scale", &acoustic_scale,
49  "Scaling factor for acoustic likelihoods");
50  opts->Register("pad-input", &pad_input,
51  "If true, pad acoustic features with required acoustic context "
52  "past edges of file.");
53  opts->Register("max-nnet-batch-size", &max_nnet_batch_size,
54  "Maximum batch size we use in neural-network decodable object, "
55  "in cases where we are not constrained by currently available "
56  "frames (this will rarely make a difference)");
57 
58  }
59 };
60 
61 
69  public:
70  DecodableNnet2Online(const AmNnet &nnet,
71  const TransitionModel &trans_model,
72  const DecodableNnet2OnlineOptions &opts,
73  OnlineFeatureInterface *input_feats);
74 
75 
77  virtual BaseFloat LogLikelihood(int32 frame, int32 index);
78 
79  virtual bool IsLastFrame(int32 frame) const;
80 
81  virtual int32 NumFramesReady() const;
82 
// Returns the number of transition-ids in trans_model_. Indices are one-based,
// for compatibility with OpenFst.
84  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
85 
86  private:
87 
90  void ComputeForFrame(int32 frame);
91 
93  const AmNnet &nnet_;
96  CuVector<BaseFloat> log_priors_; // log-priors taken from the model.
97  int32 feat_dim_; // dimensionality of the input features.
98  int32 left_context_; // Left context of the network (cached here)
99  int32 right_context_; // Right context of the network (cached here)
100  int32 num_pdfs_; // Number of pdfs, equals output-dim of the network (cached
101  // here)
102 
103  int32 begin_frame_; // First frame for which scaled_loglikes_ is valid
104  // (i.e. the first frame of the batch of frames for
105  // which we've computed the output).
106 
107  // scaled_loglikes_ contains the neural network pseudo-likelihoods: the log of
108  // (prob divided by the prior), scaled by opts.acoustic_scale. We may
109  // compute this using the GPU, but we transfer it back to the system memory
110  // when we store it here. These scores are only kept for a subset of frames,
111  // starting at begin_frame_, whose length depends on how many frames were ready
112  // at the time we called LogLikelihood(), and will never exceed
113  // opts_.max_nnet_batch_size.
115 
117 };
118 
119 } // namespace nnet2
120 } // namespace kaldi
121 
122 #endif // KALDI_NNET2_ONLINE_NNET2_DECODABLE_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
DecodableInterface provides a link between the (acoustic-modeling and feature-processing) code and th...
Definition: decodable-itf.h:82
kaldi::int32 int32
#define KALDI_DISALLOW_COPY_AND_ASSIGN(type)
Definition: kaldi-utils.h:121
virtual int32 NumIndices() const
Indices are one-based! This is for compatibility with OpenFst.
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
This Decodable object for class nnet2::AmNnet takes feature input from class OnlineFeatureInterface, unlike, say, class DecodableAmNnet which takes feature input from a matrix.
OnlineFeatureInterface is an interface for online feature processing (it is also usable in the offlin...