decodable-am-nnet.h
Go to the documentation of this file.
1 // nnet2/decodable-am-nnet.h
2 
3 // Copyright 2012 Johns Hopkins University (author: Daniel Povey)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #ifndef KALDI_NNET2_DECODABLE_AM_NNET_H_
21 #define KALDI_NNET2_DECODABLE_AM_NNET_H_
22 
23 #include <vector>
24 #include "base/kaldi-common.h"
25 #include "gmm/am-diag-gmm.h"
26 #include "hmm/transition-model.h"
27 #include "itf/decodable-itf.h"
28 #include "nnet2/am-nnet.h"
29 #include "nnet2/nnet-compute.h"
30 
31 namespace kaldi {
32 namespace nnet2 {
33 
36 
38  public:
39  DecodableAmNnet(const TransitionModel &trans_model,
40  const AmNnet &am_nnet,
41  const CuMatrixBase<BaseFloat> &feats,
42  bool pad_input = true, // if !pad_input, the NumIndices()
43  // will be < feats.NumRows().
44  BaseFloat prob_scale = 1.0):
45  trans_model_(trans_model) {
46  // Note: we could make this more memory-efficient by doing the
47  // computation in smaller chunks than the whole utterance, and not
48  // storing the whole thing. We'll leave this for later.
49  int32 num_rows = feats.NumRows() -
50  (pad_input ? 0 : am_nnet.GetNnet().LeftContext() +
51  am_nnet.GetNnet().RightContext());
52  if (num_rows <= 0) {
53  KALDI_WARN << "Input with " << feats.NumRows() << " rows will produce "
54  << "empty output.";
55  return;
56  }
57  CuMatrix<BaseFloat> log_probs(num_rows, trans_model.NumPdfs());
58  // the following function is declared in nnet-compute.h
59  NnetComputation(am_nnet.GetNnet(), feats, pad_input, &log_probs);
60  log_probs.ApplyFloor(1.0e-20); // Avoid log of zero which leads to NaN.
61  log_probs.ApplyLog();
62  CuVector<BaseFloat> priors(am_nnet.Priors());
63  KALDI_ASSERT(priors.Dim() == trans_model.NumPdfs() &&
64  "Priors in neural network not set up.");
65  priors.ApplyLog();
66  // subtract log-prior (divide by prior)
67  log_probs.AddVecToRows(-1.0, priors);
68  // apply probability scale.
69  log_probs.Scale(prob_scale);
70  // Transfer the log-probs to the CPU for faster access by the
71  // decoding process.
72  log_probs_.Swap(&log_probs);
73  }
74 
75  // Note, frames are numbered from zero. But transition_id is numbered
76  // from one (this routine is called by FSTs).
77  virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id) {
78  return log_probs_(frame,
79  trans_model_.TransitionIdToPdfFast(transition_id));
80  }
81 
82  virtual int32 NumFramesReady() const { return log_probs_.NumRows(); }
83 
84  // Indices are one-based! This is for compatibility with OpenFst.
85  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
86 
87  virtual bool IsLastFrame(int32 frame) const {
88  KALDI_ASSERT(frame < NumFramesReady());
89  return (frame == NumFramesReady() - 1);
90  }
91 
92  protected:
94  Matrix<BaseFloat> log_probs_; // actually not really probabilities, since we divide
95  // by the prior -> they won't sum to one.
96 
98 };
99 
104 
106  public:
108  const TransitionModel &trans_model,
109  const AmNnet &am_nnet,
110  const CuMatrix<BaseFloat> *feats,
111  bool pad_input = true,
112  BaseFloat prob_scale = 1.0):
113  trans_model_(trans_model), am_nnet_(am_nnet), feats_(feats),
114  pad_input_(pad_input), prob_scale_(prob_scale) {
115  KALDI_ASSERT(feats_ != NULL);
116  }
117 
118  void Compute() {
119  log_probs_.Resize(feats_->NumRows(), trans_model_.NumPdfs());
120  // the following function is declared in nnet-compute.h
121  NnetComputation(am_nnet_.GetNnet(), *feats_,
122  pad_input_, &log_probs_);
123  log_probs_.ApplyFloor(1.0e-20); // Avoid log of zero which leads to NaN.
125  CuVector<BaseFloat> priors(am_nnet_.Priors());
126  KALDI_ASSERT(priors.Dim() == trans_model_.NumPdfs() &&
127  "Priors in neural network not set up.");
128  priors.ApplyLog();
129  // subtract log-prior (divide by prior)
130  log_probs_.AddVecToRows(-1.0, priors);
131  // apply probability scale.
132  log_probs_.Scale(prob_scale_);
133  delete feats_;
134  feats_ = NULL;
135  }
136 
137  // Note, frames are numbered from zero. But state_index is numbered
138  // from one (this routine is called by FSTs).
139  virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id) {
140  if (feats_) Compute(); // this function sets feats_ to NULL.
141  return log_probs_(frame,
142  trans_model_.TransitionIdToPdfFast(transition_id));
143  }
144 
146  if (feats_) {
147  if (pad_input_) return feats_->NumRows();
148  else {
149  int32 ans = feats_->NumRows() - am_nnet_.GetNnet().LeftContext() -
150  am_nnet_.GetNnet().RightContext();
151  if (ans < 0) ans = 0;
152  return ans;
153  }
154  } else {
155  return log_probs_.NumRows();
156  }
157  }
158 
159  // Indices are one-based! This is for compatibility with OpenFst.
160  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
161 
162  virtual bool IsLastFrame(int32 frame) const {
163  KALDI_ASSERT(frame < NumFramesReady());
164  return (frame == NumFramesReady() - 1);
165  }
167  delete feats_;
168  }
169  protected:
171  const AmNnet &am_nnet_;
172  CuMatrix<BaseFloat> log_probs_; // actually not really probabilities, since we divide
173  // by the prior -> they won't sum to one.
178 };
179 
180 
181 
182 
183 
184 } // namespace nnet2
185 } // namespace kaldi
186 
187 #endif // KALDI_NNET2_DECODABLE_AM_NNET_H_
This version of DecodableAmNnet is intended for a version of the decoder that processes different utt...
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
int32 LeftContext() const
Returns the left-context summed over all the Components...
Definition: nnet-nnet.cc:42
int32 NumFramesReady() const
The call NumFramesReady() will return the number of frames currently available for this decodable obj...
DecodableInterface provides a link between the (acoustic-modeling and feature-processing) code and th...
Definition: decodable-itf.h:82
const TransitionModel & trans_model_
int32 TransitionIdToPdfFast(int32 trans_id) const
DecodableAmNnet(const TransitionModel &trans_model, const AmNnet &am_nnet, const CuMatrixBase< BaseFloat > &feats, bool pad_input=true, BaseFloat prob_scale=1.0)
kaldi::int32 int32
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
void NnetComputation(const Nnet &nnet, const CuMatrixBase< BaseFloat > &input, bool pad_input, CuMatrixBase< BaseFloat > *output)
Does the basic neural net computation, on a sequence of data (e.g.
Matrix< BaseFloat > log_probs_
void Swap(Matrix< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
const CuMatrix< BaseFloat > * feats_
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id)
Returns the log likelihood, which will be negated in the decoder.
DecodableAmNnetParallel(const TransitionModel &trans_model, const AmNnet &am_nnet, const CuMatrix< BaseFloat > *feats, bool pad_input=true, BaseFloat prob_scale=1.0)
void Scale(Real alpha)
Multiply each element with a scalar value.
int32 NumTransitionIds() const
Returns the total number of transition-ids (note, these are one-based).
virtual bool IsLastFrame(int32 frame) const
Returns true if this is the last frame.
const VectorBase< BaseFloat > & Priors() const
Definition: am-nnet.h:67
void AddVecToRows(const Real alpha, const VectorBase< OtherReal > &v)
[each row of *this] += alpha * v
int32 RightContext() const
Returns the right-context summed over all the Components...
Definition: nnet-nnet.cc:56
virtual int32 NumIndices() const
Returns the number of states in the acoustic model (they will be indexed one-based, i.e.
#define KALDI_WARN
Definition: kaldi-error.h:150
virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id)
Returns the log likelihood, which will be negated in the decoder.
virtual int32 NumFramesReady() const
The call NumFramesReady() will return the number of frames currently available for this decodable obj...
KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmNnet)
Matrix for CUDA computing.
Definition: matrix-common.h:69
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
DecodableAmNnet is a decodable object that decodes with a neural net acoustic model of type AmNnet...
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215
void ApplyFloor(Real floor_val)
Definition: kaldi-matrix.h:354
virtual int32 NumIndices() const
Returns the number of states in the acoustic model (they will be indexed one-based, i.e.
const Nnet & GetNnet() const
Definition: am-nnet.h:61