gmm-global-est-lvtln-trans.cc
Go to the documentation of this file.
1 // gmmbin/gmm-global-est-lvtln-trans.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation; Saarland University
4 // 2014 Daniel Povey
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABILITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include <string>
22 using std::string;
23 #include <vector>
24 using std::vector;
25 
26 #include "base/kaldi-common.h"
27 #include "util/common-utils.h"
28 #include "gmm/am-diag-gmm.h"
29 #include "hmm/transition-model.h"
30 #include "transform/lvtln.h"
31 #include "hmm/posterior.h"
32 
33 namespace kaldi {
35  const Posterior &post,
36  const DiagGmm &gmm,
37  FmllrDiagGmmAccs *spk_stats) {
38  KALDI_ASSERT(static_cast<int32>(post.size()) == feats.NumRows());
39  for (size_t i = 0; i < post.size(); i++) {
40  std::vector<int32> gselect(post[i].size());
41  Vector<BaseFloat> this_post(post[i].size());
42  for (size_t j = 0; j < post[i].size(); j++) {
43  int32 g = post[i][j].first;
44  BaseFloat weight = post[i][j].second;
45  gselect[j] = g;
46  this_post(j) = weight;
47  }
48  spk_stats->AccumulateFromPosteriorsPreselect(gmm, gselect,
49  feats.Row(i),
50  this_post);
51  }
52 }
53 
54 
55 }
56 
57 int main(int argc, char *argv[]) {
58  try {
59  typedef kaldi::int32 int32;
60  using namespace kaldi;
61  const char *usage =
62  "Estimate linear-VTLN transforms, either per utterance or for "
63  "the supplied set of speakers (spk2utt option); this version\n"
64  "is for a global diagonal GMM (also known as a UBM). Reads posteriors\n"
65  "indicating Gaussian indexes in the UBM.\n"
66  "\n"
67  "Usage: gmm-global-est-lvtln-trans [options] <gmm-in> <lvtln-in> "
68  "<feature-rspecifier> <gpost-rspecifier> <lvtln-trans-wspecifier> [<warp-wspecifier>]\n"
69  "e.g.: gmm-global-est-lvtln-trans 0.ubm 0.lvtln '$feats' ark,s,cs:- ark:1.trans ark:1.warp\n"
70  "(where the <gpost-rspecifier> will likely come from gmm-global-get-post or\n"
71  "gmm-global-gselect-to-post\n";
72 
73  ParseOptions po(usage);
74  string spk2utt_rspecifier;
75  BaseFloat logdet_scale = 1.0;
76  std::string norm_type = "offset";
77  po.Register("norm-type", &norm_type, "type of fMLLR applied (\"offset\"|\"none\"|\"diag\")");
78  po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
79  "utterance-list map");
80  po.Register("logdet-scale", &logdet_scale, "Scale on log-determinant term in auxiliary function");
81 
82  po.Read(argc, argv);
83 
84  if (po.NumArgs() < 5 || po.NumArgs() > 6) {
85  po.PrintUsage();
86  exit(1);
87  }
88 
89  string
90  model_rxfilename = po.GetArg(1),
91  lvtln_rxfilename = po.GetArg(2),
92  feature_rspecifier = po.GetArg(3),
93  post_rspecifier = po.GetArg(4),
94  trans_wspecifier = po.GetArg(5),
95  warp_wspecifier = po.GetOptArg(6);
96 
97  DiagGmm gmm;
98  ReadKaldiObject(model_rxfilename, &gmm);
99  LinearVtln lvtln;
100  ReadKaldiObject(lvtln_rxfilename, &lvtln);
101 
102 
103  RandomAccessPosteriorReader post_reader(post_rspecifier);
104 
105  double tot_lvtln_impr = 0.0, tot_t = 0.0;
106 
107  BaseFloatMatrixWriter transform_writer(trans_wspecifier);
108 
109  BaseFloatWriter warp_writer(warp_wspecifier);
110 
111  std::vector<int32> class_counts(lvtln.NumClasses(), 0);
112  int32 num_done = 0, num_no_post = 0, num_other_error = 0;
113  if (spk2utt_rspecifier != "") { // per-speaker adaptation
114  SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
115  RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
116 
117  for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
118  FmllrDiagGmmAccs spk_stats(lvtln.Dim());
119  string spk = spk2utt_reader.Key();
120  const vector<string> &uttlist = spk2utt_reader.Value();
121  for (size_t i = 0; i < uttlist.size(); i++) {
122  std::string utt = uttlist[i];
123  if (!feature_reader.HasKey(utt)) {
124  KALDI_WARN << "Did not find features for utterance " << utt;
125  continue;
126  }
127  if (!post_reader.HasKey(utt)) {
128  KALDI_WARN << "Did not find posteriors for utterance " << utt;
129  num_no_post++;
130  continue;
131  }
132  const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
133  const Posterior &post = post_reader.Value(utt);
134  if (static_cast<int32>(post.size()) != feats.NumRows()) {
135  KALDI_WARN << "Posterior vector has wrong size " << post.size()
136  << " vs. " << feats.NumRows();
137  num_other_error++;
138  continue;
139  }
140 
141  AccumulateForUtterance(feats, post, gmm, &spk_stats);
142 
143  num_done++;
144  } // end looping over all utterances of the current speaker
145 
146  BaseFloat impr, spk_tot_t;
147  { // Compute the transform and write it out.
148  Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
149  int32 class_idx;
150  lvtln.ComputeTransform(spk_stats,
151  norm_type,
152  logdet_scale,
153  &transform,
154  &class_idx,
155  NULL,
156  &impr,
157  &spk_tot_t);
158  class_counts[class_idx]++;
159  transform_writer.Write(spk, transform);
160  if (warp_wspecifier != "")
161  warp_writer.Write(spk, lvtln.GetWarp(class_idx));
162  }
163  KALDI_LOG << "For speaker " << spk << ", auxf-impr from LVTLN is "
164  << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.";
165  tot_lvtln_impr += impr;
166  tot_t += spk_tot_t;
167  } // end looping over speakers
168  } else { // per-utterance adaptation
169  SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
170  for (; !feature_reader.Done(); feature_reader.Next()) {
171  string utt = feature_reader.Key();
172  if (!post_reader.HasKey(utt)) {
173  KALDI_WARN << "Did not find posterior for utterance "
174  << utt;
175  num_no_post++;
176  continue;
177  }
178  const Matrix<BaseFloat> &feats = feature_reader.Value();
179  const Posterior &post = post_reader.Value(utt);
180 
181  if (static_cast<int32>(post.size()) != feats.NumRows()) {
182  KALDI_WARN << "Posterior has wrong size " << post.size()
183  << " vs. " << feats.NumRows();
184  num_other_error++;
185  continue;
186  }
187  num_done++;
188 
189  FmllrDiagGmmAccs spk_stats(lvtln.Dim());
190 
191  AccumulateForUtterance(feats, post, gmm,
192  &spk_stats);
193  BaseFloat impr, utt_tot_t = spk_stats.beta_;
194  { // Compute the transform and write it out.
195  Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
196  int32 class_idx;
197  lvtln.ComputeTransform(spk_stats,
198  norm_type,
199  logdet_scale,
200  &transform,
201  &class_idx,
202  NULL,
203  &impr,
204  &utt_tot_t);
205  class_counts[class_idx]++;
206  transform_writer.Write(utt, transform);
207  if (warp_wspecifier != "")
208  warp_writer.Write(utt, lvtln.GetWarp(class_idx));
209  }
210 
211  KALDI_LOG << "For utterance " << utt << ", auxf-impr from LVTLN is "
212  << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames.";
213  tot_lvtln_impr += impr;
214  tot_t += utt_tot_t;
215  }
216  }
217 
218  {
219  std::ostringstream s;
220  for (size_t i = 0; i < class_counts.size(); i++)
221  s << ' ' << class_counts[i];
222  KALDI_LOG << "Distribution of classes is: " << s.str();
223  }
224 
225  KALDI_LOG << "Done " << num_done << " files, " << num_no_post
226  << " with no posteriors, " << num_other_error << " with other errors.";
227  KALDI_LOG << "Overall LVTLN auxf impr per frame is "
228  << (tot_lvtln_impr / tot_t) << " over " << tot_t << " frames.";
229  return (num_done == 0 ? 1 : 0);
230  } catch(const std::exception &e) {
231  std::cerr << e.what();
232  return -1;
233  }
234 }
235 
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
BaseFloat GetWarp(int32 i) const
Definition: lvtln.cc:180
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
This does not work with multiple feature transforms.
void AccumulateForUtterance(const Matrix< BaseFloat > &feats, const GaussPost &gpost, const TransitionModel &trans_model, const AmDiagGmm &am_gmm, FmllrDiagGmmAccs *spk_stats)
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
int32 NumClasses() const
Definition: lvtln.h:78
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
Definition: kaldi-io.cc:832
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
int32 Dim() const
Definition: lvtln.h:77
float BaseFloat
Definition: kaldi-types.h:29
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
Definition: posterior.h:42
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
const T & Value(const std::string &key)
void ComputeTransform(const FmllrDiagGmmAccs &accs, std::string norm_type, BaseFloat logdet_scale, MatrixBase< BaseFloat > *Ws, int32 *class_idx, BaseFloat *logdet_out, BaseFloat *objf_impr=NULL, BaseFloat *count=NULL)
Compute the transform for the speaker.
Definition: lvtln.cc:97
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
bool HasKey(const std::string &key)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
Definition for Gaussian Mixture Model with diagonal covariances.
Definition: diag-gmm.h:42
void AccumulateFromPosteriorsPreselect(const DiagGmm &gmm, const std::vector< int32 > &gselect, const VectorBase< BaseFloat > &data, const VectorBase< BaseFloat > &posteriors)
Accumulate stats for a GMM, given supplied posteriors.
#define KALDI_LOG
Definition: kaldi-error.h:153
int main(int argc, char *argv[])
std::string GetOptArg(int param) const