compute-gop.cc
Go to the documentation of this file.
1 // bin/compute-gop.cc
2 
3 // Copyright 2019 Junbo Zhang
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
57 #include "base/kaldi-common.h"
58 #include "util/common-utils.h"
59 #include "hmm/transition-model.h"
60 #include "hmm/hmm-utils.h"
61 #include "hmm/tree-accu.h"
62 #include "hmm/posterior.h"
63 
64 namespace kaldi {
65 
76 void FrameLevelLpp(const SubVector<BaseFloat> &prob_row,
77  const std::vector<std::set<int32> > &pdf2phones,
78  const std::vector<int32> *phone_map,
79  Vector<BaseFloat> *out_frame_level_lpp) {
80  for (int32 i = 0; i < prob_row.Dim(); i++) {
81  std::set<int32> dest_idxs;
82  for (int32 ph : pdf2phones.at(i)) {
83  dest_idxs.insert((phone_map != NULL) ? (*phone_map)[ph] - 1 : ph - 1);
84  }
85 
86  for (int32 idx : dest_idxs) {
87  KALDI_ASSERT(idx < out_frame_level_lpp->Dim());
88  (*out_frame_level_lpp)(idx) += prob_row(i);
89  }
90  }
91  out_frame_level_lpp->ApplyLog();
92 }
93 
94 } // namespace kaldi
95 
96 int main(int argc, char *argv[]) {
97  using namespace kaldi;
98  typedef kaldi::int32 int32;
99  try {
100  const char *usage =
101  "Compute Goodness Of Pronunciation (GOP) from a matrix of "
102  "probabilities (e.g. from nnet3-compute).\n"
103  "Usage: compute-gop [options] <model> <alignments-rspecifier> "
104  "<prob-matrix-rspecifier> <gop-wspecifier> "
105  "[<phone-feature-wspecifier>]\n"
106  "e.g.:\n"
107  " nnet3-compute [args] | compute-gop 1.mdl ark:ali-phone.1 ark:-"
108  " ark:gop.1 ark:phone-feat.1\n";
109 
110  ParseOptions po(usage);
111 
112  bool log_applied = true;
113  std::string phone_map_rxfilename;
114 
115  po.Register("log-applied", &log_applied,
116  "If true, assume the input probabilities have been applied log.");
117  po.Register("phone-map", &phone_map_rxfilename,
118  "File name containing old->new phone mapping (each line is: "
119  "old-integer-id new-integer-id)");
120 
121  po.Read(argc, argv);
122 
123  if (po.NumArgs() != 4 && po.NumArgs() != 5) {
124  po.PrintUsage();
125  exit(1);
126  }
127 
128  std::string model_filename = po.GetArg(1),
129  alignments_rspecifier = po.GetArg(2),
130  prob_rspecifier = po.GetArg(3),
131  gop_wspecifier = po.GetArg(4),
132  feat_wspecifier = po.GetArg(5);
133 
134  TransitionModel trans_model;
135  {
136  bool binary;
137  Input ki(model_filename, &binary);
138  trans_model.Read(ki.Stream(), binary);
139  }
140  std::vector<std::set<int32> > pdf2phones;
141  GetPdfToPhonesMap(trans_model, &pdf2phones);
142  int32 phone_num = trans_model.NumPhones();
143 
144  std::vector<int32> phone_map;
145  if (phone_map_rxfilename != "") {
146  ReadPhoneMap(phone_map_rxfilename, &phone_map);
147  phone_num = phone_map[phone_map.size() - 1];
148  }
149 
150  RandomAccessInt32VectorReader alignment_reader(alignments_rspecifier);
151  SequentialBaseFloatMatrixReader prob_reader(prob_rspecifier);
152  PosteriorWriter gop_writer(gop_wspecifier);
153  BaseFloatMatrixWriter feat_writer(feat_wspecifier);
154 
155  int32 num_done = 0;
156  for (; !prob_reader.Done(); prob_reader.Next()) {
157  std::string key = prob_reader.Key();
158  if (!alignment_reader.HasKey(key)) {
159  KALDI_WARN << "No alignment for utterance " << key;
160  continue;
161  }
162  auto alignment = alignment_reader.Value(key);
163  Matrix<BaseFloat> &probs = prob_reader.Value();
164  if (log_applied) probs.ApplyExp();
165 
166  int32 frame_num = alignment.size();
167  if (alignment.size() != probs.NumRows()) {
168  KALDI_WARN << "The frame numbers of alignment and prob are not equal.";
169  if (frame_num > probs.NumRows()) frame_num = probs.NumRows();
170  }
171 
172  KALDI_ASSERT(frame_num > 0);
173  int32 cur_phone_id = alignment[0] - 1; // start by 0, skipping <eps>
174  int32 duration = 0;
175  Vector<BaseFloat> phone_level_feat(phone_num * 2); // LPPs and LPRs
176  SubVector<BaseFloat> lpp_part(phone_level_feat, 0, phone_num);
177  std::vector<Vector<BaseFloat> > phone_level_feat_stdvector;
178  Posterior posterior_gop;
179  for (int32 i = 0; i < frame_num; i++) {
180  // Calculate LPP and LPR for each pure-phone
181  Vector<BaseFloat> frame_level_lpp(phone_num);
182  FrameLevelLpp(probs.Row(i), pdf2phones,
183  (phone_map_rxfilename != "") ? &phone_map : NULL,
184  &frame_level_lpp);
185 
186  // LPP(p)=\frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t)
187  lpp_part.AddVec(1, frame_level_lpp);
188  duration++;
189 
190  int32 next_phone_id = (i < frame_num - 1) ? alignment[i + 1] - 1: -1;
191  if (next_phone_id != cur_phone_id) {
192  // The current phone's feature have been ready
193  lpp_part.Scale(1.0 / duration);
194 
195  // LPR(p_j|p_i)=\log p(p_j|\mathbf o; t_s, t_e)-\log p(p_i|\mathbf o; t_s, t_e)
196  for (int k = 0; k < phone_num; k++)
197  phone_level_feat(phone_num + k) = lpp_part(cur_phone_id) - lpp_part(k);
198  phone_level_feat_stdvector.push_back(phone_level_feat);
199 
200  // Compute GOP from LPP
201  // GOP(p)=\log \frac{LPP(p)}{\max_{q\in Q} LPP(q)}
202  BaseFloat gop = lpp_part(cur_phone_id) - lpp_part.Max();
203  std::vector<std::pair<int32, BaseFloat> > posterior_item;
204  posterior_item.push_back(std::make_pair(cur_phone_id + 1, gop));
205  posterior_gop.push_back(posterior_item);
206 
207  // Reset
208  phone_level_feat.Set(0);
209  duration = 0;
210  }
211  cur_phone_id = next_phone_id;
212  }
213 
214  // Write GOPs and the phone-level features
215  Matrix<BaseFloat> feats(phone_level_feat_stdvector.size(), phone_num * 2);
216  for (int32 i = 0; i < phone_level_feat_stdvector.size(); i++) {
217  SubVector<BaseFloat> row(feats, i);
218  row.AddVec(1.0, phone_level_feat_stdvector[i]);
219  }
220  feat_writer.Write(key, feats);
221  gop_writer.Write(key, posterior_gop);
222  num_done++;
223  }
224 
225  KALDI_LOG << "Processed " << num_done << " prob matrices.";
226  return (num_done != 0 ? 0 : 1);
227  } catch (const std::exception &e) {
228  std::cerr << e.what() << '\n';
229  return -1;
230  }
231 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void ApplyLog()
Apply natural log to all elements.
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
int main(int argc, char *argv[])
Definition: compute-gop.cc:96
void ReadPhoneMap(std::string phone_map_rxfilename, std::vector< int32 > *phone_map)
Definition: tree-accu.cc:106
std::istream & Stream()
Definition: kaldi-io.cc:826
float BaseFloat
Definition: kaldi-types.h:29
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
Definition: posterior.h:42
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
void GetPdfToPhonesMap(const TransitionModel &trans_model, std::vector< std::set< int32 > > *pdf2phones)
Definition: hmm-utils.cc:1292
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
const T & Value(const std::string &key)
void Read(std::istream &is, bool binary)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
Real Max() const
Returns the maximum value of any element, or -infinity for the empty vector.
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
void Scale(Real alpha)
Multiplies all elements by this constant.
bool HasKey(const std::string &key)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
void Set(Real f)
Set all members of a vector to a specified value.
void FrameLevelLpp(const SubVector< BaseFloat > &prob_row, const std::vector< std::set< int32 > > &pdf2phones, const std::vector< int32 > *phone_map, Vector< BaseFloat > *out_frame_level_lpp)
FrameLevelLpp computes a log posterior for pure-phones by summing the posteriors of the states belonging to...
Definition: compute-gop.cc:76
#define KALDI_LOG
Definition: kaldi-error.h:153
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501