All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
gmm-global-est-lvtln-trans.cc File Reference
#include <string>
#include <vector>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "transform/lvtln.h"
#include "hmm/posterior.h"
Include dependency graph for gmm-global-est-lvtln-trans.cc:

Go to the source code of this file.

Namespaces

 kaldi
 Relabels neural network egs with the read pdf-id alignments.
 

Functions

void AccumulateForUtterance (const Matrix< BaseFloat > &feats, const Posterior &post, const DiagGmm &gmm, FmllrDiagGmmAccs *spk_stats)
 
int main (int argc, char *argv[])
 

Function Documentation

int main ( int  argc,
char *  argv[] 
)

Definition at line 57 of file gmm-global-est-lvtln-trans.cc.

References kaldi::AccumulateForUtterance(), LinearVtln::ComputeTransform(), LinearVtln::Dim(), SequentialTableReader< Holder >::Done(), ParseOptions::GetArg(), ParseOptions::GetOptArg(), LinearVtln::GetWarp(), RandomAccessTableReader< Holder >::HasKey(), rnnlm::i, KALDI_LOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), LinearVtln::NumClasses(), MatrixBase< Real >::NumRows(), ParseOptions::PrintUsage(), ParseOptions::Read(), kaldi::ReadKaldiObject(), ParseOptions::Register(), RandomAccessTableReader< Holder >::Value(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

// Body of main(): parses the command line, loads a global DiagGmm and a
// LinearVtln object, then computes one LVTLN transform per speaker (when the
// --spk2utt option is given) or per utterance, writing each transform and,
// optionally, the warp value of the chosen class.
// Returns 1 when no utterance was processed, 0 otherwise, and -1 when a
// std::exception is caught.
57  {
58  try {
59  typedef kaldi::int32 int32;
60  using namespace kaldi;
// NOTE(review): the parenthesis opened at "(where the" in the last two lines
// of this usage message is never closed.
61  const char *usage =
62  "Estimate linear-VTLN transforms, either per utterance or for "
63  "the supplied set of speakers (spk2utt option); this version\n"
64  "is for a global diagonal GMM (also known as a UBM). Reads posteriors\n"
65  "indicating Gaussian indexes in the UBM.\n"
66  "\n"
67  "Usage: gmm-global-est-lvtln-trans [options] <gmm-in> <lvtln-in> "
68  "<feature-rspecifier> <gpost-rspecifier> <lvtln-trans-wspecifier> [<warp-wspecifier>]\n"
69  "e.g.: gmm-global-est-lvtln-trans 0.ubm 0.lvtln '$feats' ark,s,cs:- ark:1.trans ark:1.warp\n"
70  "(where the <gpost-rspecifier> will likely come from gmm-global-get-post or\n"
71  "gmm-global-gselect-to-post\n";
72 
// Command-line options and their defaults.
73  ParseOptions po(usage);
74  string spk2utt_rspecifier;
75  BaseFloat logdet_scale = 1.0;
76  std::string norm_type = "offset";
77  po.Register("norm-type", &norm_type, "type of fMLLR applied (\"offset\"|\"none\"|\"diag\")");
78  po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to "
79  "utterance-list map");
80  po.Register("logdet-scale", &logdet_scale, "Scale on log-determinant term in auxiliary function");
81 
82  po.Read(argc, argv);
83 
// Five positional arguments are mandatory; the sixth (warp wspecifier) is
// optional.
84  if (po.NumArgs() < 5 || po.NumArgs() > 6) {
85  po.PrintUsage();
86  exit(1);
87  }
88 
// NOTE(review): GetOptArg(6) presumably yields an empty string when the sixth
// argument is absent; the code below tests warp_wspecifier against "".
89  string
90  model_rxfilename = po.GetArg(1),
91  lvtln_rxfilename = po.GetArg(2),
92  feature_rspecifier = po.GetArg(3),
93  post_rspecifier = po.GetArg(4),
94  trans_wspecifier = po.GetArg(5),
95  warp_wspecifier = po.GetOptArg(6);
96 
// Load the GMM and the LVTLN object from their input files.
97  DiagGmm gmm;
98  ReadKaldiObject(model_rxfilename, &gmm);
99  LinearVtln lvtln;
100  ReadKaldiObject(lvtln_rxfilename, &lvtln);
101 
102 
// Posteriors are looked up by utterance key in both adaptation modes.
103  RandomAccessPosteriorReader post_reader(post_rspecifier);
104 
// Running totals used for the final per-frame improvement report.
105  double tot_lvtln_impr = 0.0, tot_t = 0.0;
106 
107  BaseFloatMatrixWriter transform_writer(trans_wspecifier);
108 
// NOTE(review): the warp writer is constructed even when warp_wspecifier is
// empty; presumably the writer tolerates an empty wspecifier -- confirm.
109  BaseFloatWriter warp_writer(warp_wspecifier);
110 
// class_counts[c] counts how many times class index c was returned by
// ComputeTransform; it is logged at the end.
111  std::vector<int32> class_counts(lvtln.NumClasses(), 0);
112  int32 num_done = 0, num_no_post = 0, num_other_error = 0;
113  if (spk2utt_rspecifier != "") { // per-speaker adaptation
114  SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
115  RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
116 
117  for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
// Fresh statistics for each speaker, accumulated over that speaker's
// utterances.
118  FmllrDiagGmmAccs spk_stats(lvtln.Dim());
119  string spk = spk2utt_reader.Key();
120  const vector<string> &uttlist = spk2utt_reader.Value();
121  for (size_t i = 0; i < uttlist.size(); i++) {
122  std::string utt = uttlist[i];
// NOTE(review): a missing-features utterance is warned about but, unlike the
// missing-posterior case below, no counter is incremented.
123  if (!feature_reader.HasKey(utt)) {
124  KALDI_WARN << "Did not find features for utterance " << utt;
125  continue;
126  }
127  if (!post_reader.HasKey(utt)) {
128  KALDI_WARN << "Did not find posteriors for utterance " << utt;
129  num_no_post++;
130  continue;
131  }
132  const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
133  const Posterior &post = post_reader.Value(utt);
// The posterior must have exactly one entry per feature row; otherwise the
// utterance is skipped and counted as an "other error".
134  if (static_cast<int32>(post.size()) != feats.NumRows()) {
135  KALDI_WARN << "Posterior vector has wrong size " << post.size()
136  << " vs. " << feats.NumRows();
137  num_other_error++;
138  continue;
139  }
140 
141  AccumulateForUtterance(feats, post, gmm, &spk_stats);
142 
143  num_done++;
144  } // end looping over all utterances of the current speaker
145 
// impr and spk_tot_t are passed to ComputeTransform as output pointers.
146  BaseFloat impr, spk_tot_t;
147  { // Compute the transform and write it out.
148  Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
149  int32 class_idx;
// NULL is passed for the sixth argument, so no value is requested through
// that output pointer.
150  lvtln.ComputeTransform(spk_stats,
151  norm_type,
152  logdet_scale,
153  &transform,
154  &class_idx,
155  NULL,
156  &impr,
157  &spk_tot_t);
158  class_counts[class_idx]++;
159  transform_writer.Write(spk, transform);
160  if (warp_wspecifier != "")
161  warp_writer.Write(spk, lvtln.GetWarp(class_idx));
162  }
// NOTE(review): the transform is computed even if every utterance of this
// speaker was skipped; spk_tot_t may then be zero and impr/spk_tot_t would
// not be a finite number -- confirm against ComputeTransform.
163  KALDI_LOG << "For speaker " << spk << ", auxf-impr from LVTLN is "
164  << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.";
165  tot_lvtln_impr += impr;
166  tot_t += spk_tot_t;
167  } // end looping over speakers
168  } else { // per-utterance adaptation
// Features are read sequentially here; posteriors are still fetched by key.
169  SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
170  for (; !feature_reader.Done(); feature_reader.Next()) {
171  string utt = feature_reader.Key();
172  if (!post_reader.HasKey(utt)) {
173  KALDI_WARN << "Did not find posterior for utterance "
174  << utt;
175  num_no_post++;
176  continue;
177  }
178  const Matrix<BaseFloat> &feats = feature_reader.Value();
179  const Posterior &post = post_reader.Value(utt);
180 
// Same size check as in the per-speaker branch.
181  if (static_cast<int32>(post.size()) != feats.NumRows()) {
182  KALDI_WARN << "Posterior has wrong size " << post.size()
183  << " vs. " << feats.NumRows();
184  num_other_error++;
185  continue;
186  }
// In this branch the utterance is counted before accumulation (the
// per-speaker branch counts it afterwards).
187  num_done++;
188 
// Statistics hold a single utterance in this mode.
189  FmllrDiagGmmAccs spk_stats(lvtln.Dim());
190 
191  AccumulateForUtterance(feats, post, gmm,
192  &spk_stats);
// utt_tot_t starts from spk_stats.beta_ and is also handed to
// ComputeTransform as its last output pointer.
193  BaseFloat impr, utt_tot_t = spk_stats.beta_;
194  { // Compute the transform and write it out.
195  Matrix<BaseFloat> transform(lvtln.Dim(), lvtln.Dim()+1);
196  int32 class_idx;
197  lvtln.ComputeTransform(spk_stats,
198  norm_type,
199  logdet_scale,
200  &transform,
201  &class_idx,
202  NULL,
203  &impr,
204  &utt_tot_t);
205  class_counts[class_idx]++;
206  transform_writer.Write(utt, transform);
207  if (warp_wspecifier != "")
208  warp_writer.Write(utt, lvtln.GetWarp(class_idx));
209  }
210 
211  KALDI_LOG << "For utterance " << utt << ", auxf-impr from LVTLN is "
212  << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames.";
213  tot_lvtln_impr += impr;
214  tot_t += utt_tot_t;
215  }
216  }
217 
// Log how many times each class index was selected.
218  {
219  std::ostringstream s;
220  for (size_t i = 0; i < class_counts.size(); i++)
221  s << ' ' << class_counts[i];
222  KALDI_LOG << "Distribution of classes is: " << s.str();
223  }
224 
225  KALDI_LOG << "Done " << num_done << " files, " << num_no_post
226  << " with no posteriors, " << num_other_error << " with other errors.";
// NOTE(review): tot_t is still 0.0 when nothing was processed, in which case
// this ratio is 0.0/0.0.
227  KALDI_LOG << "Overall LVTLN auxf impr per frame is "
228  << (tot_lvtln_impr / tot_t) << " over " << tot_t << " frames.";
// Exit status is non-zero when no utterance was processed.
229  return (num_done == 0 ? 1 : 0);
230  } catch(const std::exception &e) {
// The exception message is written to stderr without a trailing newline.
231  std::cerr << e.what();
232  return -1;
233  }
234 }
Relabels neural network egs with the read pdf-id alignments.
Definition: chain.dox:20
This does not work with multiple feature transforms.
void AccumulateForUtterance(const Matrix< BaseFloat > &feats, const GaussPost &gpost, const TransitionModel &trans_model, const AmDiagGmm &am_gmm, FmllrDiagGmmAccs *spk_stats)
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:366
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
Definition: kaldi-io.cc:818
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
int32 Dim() const
Definition: lvtln.h:77
float BaseFloat
Definition: kaldi-types.h:29
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an utterance...
Definition: posterior.h:43
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
void ComputeTransform(const FmllrDiagGmmAccs &accs, std::string norm_type, BaseFloat logdet_scale, MatrixBase< BaseFloat > *Ws, int32 *class_idx, BaseFloat *logdet_out, BaseFloat *objf_impr=NULL, BaseFloat *count=NULL)
Compute the transform for the speaker.
Definition: lvtln.cc:97
BaseFloat GetWarp(int32 i) const
Definition: lvtln.cc:180
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
#define KALDI_WARN
Definition: kaldi-error.h:130
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:58
Definition for Gaussian Mixture Model with diagonal covariances.
Definition: diag-gmm.h:42
int32 NumClasses() const
Definition: lvtln.h:78
#define KALDI_LOG
Definition: kaldi-error.h:133