compute-gop.cc File Reference
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "hmm/hmm-utils.h"
#include "hmm/tree-accu.h"
#include "hmm/posterior.h"
Include dependency graph for compute-gop.cc:

Go to the source code of this file.

Namespaces

 kaldi
 This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for mispronunciation detection tasks. Reference:
 

Functions

void FrameLevelLpp (const SubVector< BaseFloat > &prob_row, const std::vector< std::set< int32 > > &pdf2phones, const std::vector< int32 > *phone_map, Vector< BaseFloat > *out_frame_level_lpp)
 FrameLevelLpp computes a log posterior for pure-phones by summing the posteriors of the states belonging to those triphones whose current phone is the canonical phone. More...
 
int main (int argc, char *argv[])
 

Function Documentation

◆ main()

int main ( int  argc,
char *  argv[] 
)

Definition at line 96 of file compute-gop.cc.

References VectorBase< Real >::AddVec(), MatrixBase< Real >::ApplyExp(), SequentialTableReader< Holder >::Done(), kaldi::FrameLevelLpp(), ParseOptions::GetArg(), kaldi::GetPdfToPhonesMap(), RandomAccessTableReader< Holder >::HasKey(), rnnlm::i, KALDI_ASSERT, KALDI_LOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), VectorBase< Real >::Max(), SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), TransitionModel::NumPhones(), MatrixBase< Real >::NumRows(), ParseOptions::PrintUsage(), ParseOptions::Read(), TransitionModel::Read(), kaldi::ReadPhoneMap(), ParseOptions::Register(), MatrixBase< Real >::Row(), VectorBase< Real >::Scale(), VectorBase< Real >::Set(), Input::Stream(), RandomAccessTableReader< Holder >::Value(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

96  {
97  using namespace kaldi;
98  typedef kaldi::int32 int32;
99  try {
100  const char *usage =
101  "Compute Goodness Of Pronunciation (GOP) from a matrix of "
102  "probabilities (e.g. from nnet3-compute).\n"
103  "Usage: compute-gop [options] <model> <alignments-rspecifier> "
104  "<prob-matrix-rspecifier> <gop-wspecifier> "
105  "[<phone-feature-wspecifier>]\n"
106  "e.g.:\n"
107  " nnet3-compute [args] | compute-gop 1.mdl ark:ali-phone.1 ark:-"
108  " ark:gop.1 ark:phone-feat.1\n";
109 
110  ParseOptions po(usage);
111 
112  bool log_applied = true;
113  std::string phone_map_rxfilename;
114 
115  po.Register("log-applied", &log_applied,
116  "If true, assume the input probabilities have been applied log.");
117  po.Register("phone-map", &phone_map_rxfilename,
118  "File name containing old->new phone mapping (each line is: "
119  "old-integer-id new-integer-id)");
120 
121  po.Read(argc, argv);
122 
123  if (po.NumArgs() != 4 && po.NumArgs() != 5) {
124  po.PrintUsage();
125  exit(1);
126  }
127 
128  std::string model_filename = po.GetArg(1),
129  alignments_rspecifier = po.GetArg(2),
130  prob_rspecifier = po.GetArg(3),
131  gop_wspecifier = po.GetArg(4),
132  feat_wspecifier = po.GetArg(5);
133 
134  TransitionModel trans_model;
135  {
136  bool binary;
137  Input ki(model_filename, &binary);
138  trans_model.Read(ki.Stream(), binary);
139  }
140  std::vector<std::set<int32> > pdf2phones;
141  GetPdfToPhonesMap(trans_model, &pdf2phones);
142  int32 phone_num = trans_model.NumPhones();
143 
144  std::vector<int32> phone_map;
145  if (phone_map_rxfilename != "") {
146  ReadPhoneMap(phone_map_rxfilename, &phone_map);
147  phone_num = phone_map[phone_map.size() - 1];
148  }
149 
150  RandomAccessInt32VectorReader alignment_reader(alignments_rspecifier);
151  SequentialBaseFloatMatrixReader prob_reader(prob_rspecifier);
152  PosteriorWriter gop_writer(gop_wspecifier);
153  BaseFloatMatrixWriter feat_writer(feat_wspecifier);
154 
155  int32 num_done = 0;
156  for (; !prob_reader.Done(); prob_reader.Next()) {
157  std::string key = prob_reader.Key();
158  if (!alignment_reader.HasKey(key)) {
159  KALDI_WARN << "No alignment for utterance " << key;
160  continue;
161  }
162  auto alignment = alignment_reader.Value(key);
163  Matrix<BaseFloat> &probs = prob_reader.Value();
164  if (log_applied) probs.ApplyExp();
165 
166  int32 frame_num = alignment.size();
167  if (alignment.size() != probs.NumRows()) {
168  KALDI_WARN << "The frame numbers of alignment and prob are not equal.";
169  if (frame_num > probs.NumRows()) frame_num = probs.NumRows();
170  }
171 
172  KALDI_ASSERT(frame_num > 0);
173  int32 cur_phone_id = alignment[0] - 1; // start by 0, skipping <eps>
174  int32 duration = 0;
175  Vector<BaseFloat> phone_level_feat(phone_num * 2); // LPPs and LPRs
176  SubVector<BaseFloat> lpp_part(phone_level_feat, 0, phone_num);
177  std::vector<Vector<BaseFloat> > phone_level_feat_stdvector;
178  Posterior posterior_gop;
179  for (int32 i = 0; i < frame_num; i++) {
180  // Calculate LPP and LPR for each pure-phone
181  Vector<BaseFloat> frame_level_lpp(phone_num);
182  FrameLevelLpp(probs.Row(i), pdf2phones,
183  (phone_map_rxfilename != "") ? &phone_map : NULL,
184  &frame_level_lpp);
185 
186  // LPP(p)=\frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t)
187  lpp_part.AddVec(1, frame_level_lpp);
188  duration++;
189 
190  int32 next_phone_id = (i < frame_num - 1) ? alignment[i + 1] - 1: -1;
191  if (next_phone_id != cur_phone_id) {
192  // The current phone's feature have been ready
193  lpp_part.Scale(1.0 / duration);
194 
195  // LPR(p_j|p_i)=\log p(p_j|\mathbf o; t_s, t_e)-\log p(p_i|\mathbf o; t_s, t_e)
196  for (int k = 0; k < phone_num; k++)
197  phone_level_feat(phone_num + k) = lpp_part(cur_phone_id) - lpp_part(k);
198  phone_level_feat_stdvector.push_back(phone_level_feat);
199 
200  // Compute GOP from LPP
201  // GOP(p)=\log \frac{LPP(p)}{\max_{q\in Q} LPP(q)}
202  BaseFloat gop = lpp_part(cur_phone_id) - lpp_part.Max();
203  std::vector<std::pair<int32, BaseFloat> > posterior_item;
204  posterior_item.push_back(std::make_pair(cur_phone_id + 1, gop));
205  posterior_gop.push_back(posterior_item);
206 
207  // Reset
208  phone_level_feat.Set(0);
209  duration = 0;
210  }
211  cur_phone_id = next_phone_id;
212  }
213 
214  // Write GOPs and the phone-level features
215  Matrix<BaseFloat> feats(phone_level_feat_stdvector.size(), phone_num * 2);
216  for (int32 i = 0; i < phone_level_feat_stdvector.size(); i++) {
217  SubVector<BaseFloat> row(feats, i);
218  row.AddVec(1.0, phone_level_feat_stdvector[i]);
219  }
220  feat_writer.Write(key, feats);
221  gop_writer.Write(key, posterior_gop);
222  num_done++;
223  }
224 
225  KALDI_LOG << "Processed " << num_done << " prob matrices.";
226  return (num_done != 0 ? 0 : 1);
227  } catch (const std::exception &e) {
228  std::cerr << e.what() << '\n';
229  return -1;
230  }
231 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for...
Definition: chain.dox:20
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
void ReadPhoneMap(std::string phone_map_rxfilename, std::vector< int32 > *phone_map)
Definition: tree-accu.cc:106
float BaseFloat
Definition: kaldi-types.h:29
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
Definition: posterior.h:42
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
void GetPdfToPhonesMap(const TransitionModel &trans_model, std::vector< std::set< int32 > > *pdf2phones)
Definition: hmm-utils.cc:1292
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
void Read(std::istream &is, bool binary)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
#define KALDI_WARN
Definition: kaldi-error.h:150
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
void FrameLevelLpp(const SubVector< BaseFloat > &prob_row, const std::vector< std::set< int32 > > &pdf2phones, const std::vector< int32 > *phone_map, Vector< BaseFloat > *out_frame_level_lpp)
FrameLevelLpp computes a log posterior for pure-phones by summing the posteriors of the states belonging to...
Definition: compute-gop.cc:76
#define KALDI_LOG
Definition: kaldi-error.h:153
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501