estimate-am-sgmm2.h
Go to the documentation of this file.
1 // sgmm2/estimate-am-sgmm2.h
2 
3 // Copyright 2009-2011 Microsoft Corporation; Lukas Burget;
4 // Saarland University (Author: Arnab Ghoshal);
5 // Ondrej Glembek; Yanmin Qian;
6 // Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey)
7 // Liang Lu; Arnab Ghoshal
8 
9 // See ../../COPYING for clarification regarding multiple authors
10 //
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 //
15 // http://www.apache.org/licenses/LICENSE-2.0
16 //
17 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
19 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
20 // MERCHANTABILITY OR NON-INFRINGEMENT.
21 // See the Apache 2 License for the specific language governing permissions and
22 // limitations under the License.
23 
24 #ifndef KALDI_SGMM2_ESTIMATE_AM_SGMM2_H_
25 #define KALDI_SGMM2_ESTIMATE_AM_SGMM2_H_ 1
26 
27 #include <string>
28 #include <vector>
29 
30 #include "sgmm2/am-sgmm2.h"
31 #include "gmm/model-common.h"
32 #include "itf/options-itf.h"
33 #include "util/kaldi-thread.h"
34 
35 namespace kaldi {
36 
51 
52  bool renormalize_V; // Renormalize the phonetic space.
53  bool renormalize_N; // Renormalize the speaker space.
54 
57 
60 
63  bool full_row_cov;
64  bool full_col_cov;
65 
67  cov_floor = 0.025;
68  tau_c = 2.0;
69  cov_diag_ratio = 2.0; // set this to very large to get diagonal-cov models.
70  max_cond = 1.0e+05;
71  epsilon = 1.0e-40;
72  renormalize_V = true;
73  renormalize_N = false; // default to false since will invalidate spk vectors
74  // on disk.
75  weight_projections_iters = 3;
76  max_impr_u = 0.25;
77 
78  map_M_prior_iters = 5;
79  tau_map_M = 0.0; // No MAP update by default (~500-1000 depending on prior)
80  full_row_cov = false;
81  full_col_cov = false;
82  }
83 
// Registers every configuration field of this options struct with `opts`.
// Each call binds a command-line option name to the address of the member
// that receives the parsed value, together with a help string prefixed by
// "MleAmSgmm2Options: ". `opts` must be non-null; it is dereferenced
// unconditionally.
// Help-text fixes relative to the previous revision: "Number for iterations"
// -> "Number of iterations", and the unbalanced "(" in the max-impr-u
// description is now closed.
84  void Register(OptionsItf *opts) {
85  std::string module = "MleAmSgmm2Options: ";
86  opts->Register("tau-c", &tau_c, module+
87  "Count for smoothing weight update.");
88  opts->Register("cov-floor", &cov_floor, module+
89  "Covariance floor (fraction of average covariance).");
90  opts->Register("cov-diag-ratio", &cov_diag_ratio, module+
91  "Minimum occ/dim ratio below which use diagonal covariances.");
92  opts->Register("max-cond", &max_cond, module+"Maximum condition number used to "
93  "regularize the solution of certain quadratic auxiliary functions.");
94  opts->Register("weight-projections-iters", &weight_projections_iters, module+
95  "Number of iterations for weight projection estimation.");
96  opts->Register("renormalize-v", &renormalize_V, module+"If true, renormalize "
97  "the phonetic-subspace vectors to have meaningful sizes.");
98  opts->Register("renormalize-n", &renormalize_N, module+"If true, renormalize "
99  "the speaker subspace to have meaningful sizes.");
100  opts->Register("max-impr-u", &max_impr_u, module+"Maximum objective function "
101  "improvement per frame allowed in update of u (to "
102  "maintain stability).");
103 
// Options controlling the MAP update of M and its prior covariances.
104  opts->Register("tau-map-M", &tau_map_M, module+"Smoothing for MAP estimate "
105  "of M (0 means ML update).");
106  opts->Register("map-M-prior-iters", &map_M_prior_iters, module+
107  "Number of iterations to estimate prior covariances for M.");
108  opts->Register("full-row-cov", &full_row_cov, module+
109  "Estimate row covariance instead of using I.");
110  opts->Register("full-col-cov", &full_col_cov, module+
111  "Estimate column covariance instead of using I.");
112  }
113 };
114 
120  public:
// Default constructor. Zero-initializes the frame/likelihood totals and all
// dimension and count members, and stores `rand_prune` in rand_prune_ (a
// small constant used to randomly prune tiny posteriors). The body is empty,
// so no accumulator storage is sized here; the model-taking constructor is
// the one that calls ResizeAccumulators().
// `explicit` prevents implicit conversion from a bare BaseFloat.
121  explicit MleAmSgmm2Accs(BaseFloat rand_prune = 1.0e-05)
122  : total_frames_(0.0), total_like_(0.0), feature_dim_(0),
123  phn_space_dim_(0), spk_space_dim_(0), num_gaussians_(0),
124  num_pdfs_(0), num_groups_(0), rand_prune_(rand_prune) {}
125 
127  bool have_spk_vecs,
128  BaseFloat rand_prune = 1.0e-05)
129  : total_frames_(0.0), total_like_(0.0), rand_prune_(rand_prune) {
130  ResizeAccumulators(model, flags, have_spk_vecs);
131  }
132 
133  ~MleAmSgmm2Accs();
134 
// Stream (de)serialization of the accumulators. `binary` selects binary vs.
// text mode for both calls.
// NOTE(review): `add` presumably means "sum into the existing stats instead
// of overwriting them" -- confirm against the implementation file.
135  void Read(std::istream &in_stream, bool binary, bool add);
136  void Write(std::ostream &out_stream, bool binary) const;
137 
// Checks this object against `model`. Declared const, so it does not modify
// the accumulators.
// NOTE(review): `show_properties` presumably enables printing a summary of
// the accumulators -- confirm against the implementation file.
142  void Check(const AmSgmm2 &model, bool show_properties = true) const;
143 
// Sizes the accumulators for `model`. `flags` is a bitwise OR of SGMM update
// flags (SgmmUpdateFlagsType). The model-taking constructor calls this with
// the same three arguments it receives.
146  void ResizeAccumulators(const AmSgmm2 &model, SgmmUpdateFlagsType flags,
147  bool have_spk_vecs);
148 
// Accumulates statistics for one frame and one pdf, scaled by `weight`.
// `frame_vars` holds the per-frame precomputed quantities; `spk_vars` is
// passed by non-const pointer, so the callee may modify it.
// NOTE(review): the BaseFloat return is presumably the frame likelihood --
// confirm against the implementation file.
150  BaseFloat Accumulate(const AmSgmm2 &model,
151  const Sgmm2PerFrameDerivedVars &frame_vars,
152  int32 pdf_index, // == j2.
153  BaseFloat weight,
154  Sgmm2PerSpkDerivedVars *spk_vars);
155 
// Variant of Accumulate() that takes explicit `posteriors` instead of a
// single weight.
// NOTE(review): the layout of `posteriors` and the meaning of the returned
// BaseFloat (presumably the accumulated count) are not visible here --
// confirm against the implementation file.
158  BaseFloat AccumulateFromPosteriors(const AmSgmm2 &model,
159  const Sgmm2PerFrameDerivedVars &frame_vars,
160  const Matrix<BaseFloat> &posteriors,
161  int32 pdf_index, // == j2.
162  Sgmm2PerSpkDerivedVars *spk_vars);
163 
// Folds the statistics held for the current speaker into the global ones.
// NOTE(review): presumably must be called after each speaker's data when
// speaker-dependent parameters are being updated -- confirm the exact
// conditions against the implementation file.
168  void CommitStatsForSpk(const AmSgmm2 &model,
169  const Sgmm2PerSpkDerivedVars &spk_vars);
170 
// Writes state occupancies into `*occs` (output parameter).
172  void GetStateOccupancies(Vector<BaseFloat> *occs) const;
// Trivial read-only accessors for the dimensions/counts stored in this
// object; each returns the corresponding private member unchanged.
173  int32 FeatureDim() const { return feature_dim_; }
174  int32 PhoneSpaceDim() const { return phn_space_dim_; }
175  int32 NumPdfs() const { return num_pdfs_; } // returns J2
176  int32 NumGroups() const { return num_groups_; } // returns J1
177  int32 NumGauss() const { return num_gaussians_; }
178 
179  private:
// --- Statistics that are not tied to any state ---
// Y_: the stats which are not tied to any state.
182  std::vector< Matrix<double> > Y_;
// Z_{i}: stats for the speaker-subspace projections N. Dim is [I][D][T].
184  std::vector< Matrix<double> > Z_;
// R_{i}: quadratic term for speaker subspace estimation. Dim is [I][T][T].
186  std::vector< SpMatrix<double> > R_;
// S_{i}^{-}: scatter of adapted feature vectors x_{i}(t). Dim is [I][D][D].
188  std::vector< SpMatrix<double> > S_;
189 
// --- SGMM state-specific statistics ---
// y_: the SGMM state-specific stats.
192  std::vector< Matrix<double> > y_;
// gamma_{jmi}: Gaussian occupancies for each substate and Gaussian index,
// pooled over groups.
195  std::vector< Matrix<double> > gamma_;
196 
// [SSGMM] a_{jmi} quantities; dimensionally the same as the gamma
// quantities.
200  std::vector< Matrix<double> > a_;
201 
206 
214 
// U_i quantities from the less-exact version of the SSGMM update for the
// speaker weight projections.
217  std::vector<SpMatrix<double> > U_;
218 
// gamma_{jm}^{(c)}: sub-state occupancies, one entry per sub-state.
223  std::vector< Vector<double> > gamma_c_;
224 
229 
// Running totals; both are zeroed by the constructors.
230  double total_frames_, total_like_;
231 
// Dimensions and counts exposed through the public accessors
// (FeatureDim(), PhoneSpaceDim(), NumGauss(), NumPdfs(), NumGroups()).
233  int32 feature_dim_, phn_space_dim_, spk_space_dim_;
234  int32 num_gaussians_, num_pdfs_, num_groups_;
235 
237 
// The updater classes read the private statistics above directly.
239  friend class MleAmSgmm2Updater;
240  friend class EbwAmSgmm2Updater;
241 };
242 
247  public:
// Constructs an updater holding its own copy of `options` (options_ is
// stored by value). `explicit` prevents implicit conversion from an options
// struct.
248  explicit MleAmSgmm2Updater(const MleAmSgmm2Options &options)
249  : options_(options) {}
// Replaces the stored options with a copy of `options`.
250  void Reconfigure(const MleAmSgmm2Options &options) {
251  options_ = options;
252  }
253 
// Main entry point: updates `*model` in place from the statistics in `accs`.
// `flags` is a bitwise OR of SGMM update flags.
// NOTE(review): `flags` presumably selects which parameter types get
// re-estimated -- confirm against the implementation file.
254  void Update(const MleAmSgmm2Accs &accs,
255  AmSgmm2 *model,
256  SgmmUpdateFlagsType flags);
257 
258  private:
// UpdateWClass needs access to the private static UpdateWGetStats(), which
// its operator() calls.
259  friend class UpdateWClass;
261  friend class EbwEstimateAmSgmm2;
262 
// Computes the Q quantities from `accs` and `model` into `*Q` (output).
// Static: uses no updater state. The result is the `Q` argument that
// UpdateM()/MapUpdateM() take.
264  static void ComputeQ(const MleAmSgmm2Accs &accs,
265  const AmSgmm2 &model,
266  std::vector< SpMatrix<double> > *Q);
267 
// Computes the S_means quantities from `accs` and `model` into `*S_means`
// (output). Static: uses no updater state. The result is the `S_means`
// argument that UpdateVars() takes.
269  static void ComputeSMeans(const MleAmSgmm2Accs &accs,
270  const AmSgmm2 &model,
271  std::vector< SpMatrix<double> > *S_means);
272  friend class EbwAmSgmm2Updater;
273 
275 
276  // Called from UpdatePhoneVectors; updates a subset of states
277  // (relates to multi-threading).
// `auxf_impr` is an output pointer for this call's auxiliary-function
// improvement. NOTE(review): `num_threads`/`thread_id` presumably select
// which subset of states this call handles -- confirm against the
// implementation file.
278  void UpdatePhoneVectorsInternal(const MleAmSgmm2Accs &accs,
279  const std::vector<SpMatrix<double> > &H,
280  const std::vector<Matrix<double> > &log_a,
281  AmSgmm2 *model,
282  double *auxf_impr,
283  int32 num_threads,
284  int32 thread_id) const;
285 
// Updates the phone vectors in `*model`. Const member: does not modify the
// updater itself. NOTE(review): the double return is presumably the
// objective-function improvement -- confirm.
286  double UpdatePhoneVectors(const MleAmSgmm2Accs &accs,
287  const std::vector<SpMatrix<double> > &H,
288  const std::vector<Matrix<double> > &log_a,
289  AmSgmm2 *model) const;
290 
// Per-parameter update routines. Each takes the accumulated statistics
// `accs`, modifies `*model` in place, and (except the Renormalize* pair)
// returns a double. NOTE(review): that return value is presumably the
// objective-function improvement for the update -- confirm against the
// implementation file. `gamma_i` is a per-Gaussian vector supplied by the
// caller.

// Update of M; `Q` has the type produced by ComputeQ().
291  double UpdateM(const MleAmSgmm2Accs &accs,
292  const std::vector< SpMatrix<double> > &Q,
293  const Vector<double> &gamma_i,
294  AmSgmm2 *model);
295 
// Renormalization associated with the renormalize_V option (phonetic space).
296  void RenormalizeV(const MleAmSgmm2Accs &accs, AmSgmm2 *model,
297  const Vector<double> &gamma_i,
298  const std::vector<SpMatrix<double> > &H);
299 
// Update of N.
300  double UpdateN(const MleAmSgmm2Accs &accs, const Vector<double> &gamma_i,
301  AmSgmm2 *model);
// Renormalization associated with the renormalize_N option (speaker space).
302  void RenormalizeN(const MleAmSgmm2Accs &accs, const Vector<double> &gamma_i,
303  AmSgmm2 *model);
// Variance update; `S_means` has the type produced by ComputeSMeans().
304  double UpdateVars(const MleAmSgmm2Accs &accs,
305  const std::vector< SpMatrix<double> > &S_means,
306  const Vector<double> &gamma_i,
307  AmSgmm2 *model);
308  // Update for the phonetic-subspace weight projections w_i
309  double UpdateW(const MleAmSgmm2Accs &accs,
310  const std::vector<Matrix<double> > &log_a,
311  const Vector<double> &gamma_i,
312  AmSgmm2 *model);
313  // Update for the speaker-subspace weight projections u_i [SSGMM]
314  double UpdateU(const MleAmSgmm2Accs &accs, const Vector<double> &gamma_i,
315  AmSgmm2 *model);
316 
// Called, multithreaded, inside UpdateW. Static so that
// UpdateWClass::operator() can invoke it without an updater instance; that
// caller passes its own thread-local F_i, g_i and tot_like buffers as the
// three output pointers, along with its num_threads_/thread_id_.
// `w` is the current weight-projection matrix and `log_a` has the type
// produced by ComputeLogA().
318  static
319  void UpdateWGetStats(const MleAmSgmm2Accs &accs,
320  const AmSgmm2 &model,
321  const Matrix<double> &w,
322  const std::vector<Matrix<double> > &log_a,
323  Matrix<double> *F_i,
324  Matrix<double> *g_i,
325  double *tot_like,
326  int32 num_threads,
327  int32 thread_id);
328 
// Updates the sub-state weights in `*model` from `accs`.
// NOTE(review): the double return is presumably the objective-function
// improvement -- confirm.
329  double UpdateSubstateWeights(const MleAmSgmm2Accs &accs,
330  AmSgmm2 *model);
331 
// Fills `*log_a` (output) from the accumulators. Static: uses no updater
// state. The result has the type of the `log_a` argument taken by
// UpdatePhoneVectors(), UpdateW() and UpdateWGetStats().
332  static void ComputeLogA(const MleAmSgmm2Accs &accs,
333  std::vector<Matrix<double> > *log_a); // [SSGMM]
334 
// Computes the prior on M; takes a non-const model pointer, so it may modify
// `*model`.
335  void ComputeMPrior(AmSgmm2 *model); // TODO(arnab): Maybe make this static?
// MAP counterpart of UpdateM() with the same parameter list.
// NOTE(review): presumably selected when tau_map_M > 0 -- confirm against
// the implementation file.
336  double MapUpdateM(const MleAmSgmm2Accs &accs,
337  const std::vector< SpMatrix<double> > &Q,
338  const Vector<double> &gamma_i, AmSgmm2 *model);
339 
// Default constructor kept in the private section so outside code cannot
// create an updater without options; the public constructor taking
// MleAmSgmm2Options must be used instead.
341  MleAmSgmm2Updater() {} // Prevent unconfigured updater.
342 };
343 
344 
355  public:
// Public interface of the accumulators required to update the speaker
// vectors v_s.

// Constructs the per-speaker accumulators for `model`. `rand_prune_` is the
// small constant used to randomly prune tiny posteriors.
357  MleSgmm2SpeakerAccs(const AmSgmm2 &model,
358  BaseFloat rand_prune_ = 1.0e-05);
359 
// Resets the accumulated statistics.
// NOTE(review): presumably called between speakers -- confirm.
361  void Clear();
362 
// Accumulates statistics for one frame and one pdf, scaled by `weight`.
// `spk_vars` is passed by non-const pointer, so the callee may modify it.
// NOTE(review): the meaning of the BaseFloat return is not visible here --
// confirm against the implementation file.
364  BaseFloat Accumulate(const AmSgmm2 &model,
365  const Sgmm2PerFrameDerivedVars &frame_vars,
366  int32 pdf_index,
367  BaseFloat weight,
368  Sgmm2PerSpkDerivedVars *spk_vars);
369 
// Variant of Accumulate() that takes explicit `posteriors` instead of a
// single weight.
373  BaseFloat AccumulateFromPosteriors(const AmSgmm2 &model,
374  const Sgmm2PerFrameDerivedVars &frame_vars,
375  const Matrix<BaseFloat> &posteriors,
376  int32 pdf_index,
377  Sgmm2PerSpkDerivedVars *spk_vars);
378 
// Updates the speaker vector `*v_s` (in/out). `objf_impr_out` and
// `count_out` are output pointers.
// NOTE(review): `min_count` is presumably the minimum count below which no
// update is done, and the outputs may be accumulated into rather than
// overwritten -- confirm against the implementation file.
381  void Update(const AmSgmm2 &model,
382  BaseFloat min_count, // e.g. 100
383  Vector<BaseFloat> *v_s,
384  BaseFloat *objf_impr_out,
385  BaseFloat *count_out);
386 
387  private:
// Private helpers with the same output parameters as Update().
// NOTE(review): presumably Update() dispatches to one of them depending on
// whether the model has speaker-dependent weights -- confirm.
388  // Update without speaker-dependent weights (vectors u_i),
389  // i.e. not symmetric SGMM (SSGMM)
390  void UpdateNoU(Vector<BaseFloat> *v_s,
391  BaseFloat *objf_impr_out,
392  BaseFloat *count_out);
393  // Update for SSGMM
// Unlike UpdateNoU(), this variant also needs read access to `model`.
394  void UpdateWithU(const AmSgmm2 &model,
395  Vector<BaseFloat> *v_s,
396  BaseFloat *objf_impr_out,
397  BaseFloat *count_out);
398 
399 
407 
// Does not change per speaker; relates to the speaker subspace.
411  std::vector< SpMatrix<double> > H_spk_;
412 
// Needed for y^{(s)}. NOTE(review): going by the member name this holds
// N_i^T Sigma_i^{-1} for each Gaussian i -- confirm.
414  std::vector< Matrix<double> > NtransSigmaInv_;
415 
418 };
419 
420 // This class, used in multi-core implementation of the updates of the "w_i"
421 // quantities, was previously in estimate-am-sgmm.cc, but is being moved to the
422 // header so it can be used in estimate-am-sgmm-ebw.cc. It is responsible for
423 // computing, in parallel, the F_i and g_i quantities used in the updates of
424 // w_i.
426  public:
428  const AmSgmm2 &model,
429  const Matrix<double> &w,
430  const std::vector<Matrix<double> > &log_a,
431  Matrix<double> *F_i,
432  Matrix<double> *g_i,
433  double *tot_like):
434  accs_(accs), model_(model), w_(w), log_a_(log_a),
435  F_i_ptr_(F_i), g_i_ptr_(g_i), tot_like_ptr_(tot_like) {
436  tot_like_ = 0.0;
437  F_i_.Resize(F_i->NumRows(), F_i->NumCols());
438  g_i_.Resize(g_i->NumRows(), g_i->NumCols());
439  }
440 
// Copy constructor. The copy shares the same inputs (accs_, model_, w_,
// log_a_) and the same output pointers (F_i_ptr_, g_i_ptr_, tot_like_ptr_)
// as `other`, but gets its own F_i_/g_i_ buffers (copied from `other`) and
// its own likelihood sum, which is reset to 0.0 rather than copied.
// The MultiThreadable base is copy-constructed from `other` as well.
441  UpdateWClass(const UpdateWClass &other) :
442  MultiThreadable(other),
443  accs_(other.accs_), model_(other.model_), w_(other.w_),
444  log_a_(other.log_a_), F_i_ptr_(other.F_i_ptr_), g_i_ptr_(other.g_i_ptr_),
445  F_i_(other.F_i_), g_i_(other.g_i_), tot_like_ptr_(other.tot_like_ptr_),
446  tot_like_(0.0) { }
447 
449  F_i_ptr_->AddMat(1.0, F_i_, kNoTrans);
450  g_i_ptr_->AddMat(1.0, g_i_, kNoTrans);
451  *tot_like_ptr_ += tot_like_;
452  }
453 
// Thread body: runs MleAmSgmm2Updater::UpdateWGetStats() for this object's
// slice of the work, identified by num_threads_ and thread_id_. Results go
// into this object's local F_i_, g_i_ and tot_like_, not into the shared
// outputs.
454  inline void operator() () {
455  // Note: give them local copy of the sums we're computing,
456  // which will be propagated to the total sums in the destructor.
457  MleAmSgmm2Updater::UpdateWGetStats(accs_, model_, w_, log_a_,
458  &F_i_, &g_i_, &tot_like_,
459  num_threads_, thread_id_);
460  }
461  private:
// Inputs are held by reference, so the referenced objects must outlive this
// object and all of its copies.
463  const AmSgmm2 &model_;
465  const std::vector<Matrix<double> > &log_a_;
// Shared output: the destructor adds this object's tot_like_ into it.
470  double *tot_like_ptr_;
// Local likelihood sum filled by operator(); starts at 0.0.
471  double tot_like_;
472 };
473 
474 
475 } // namespace kaldi
476 
477 
478 #endif // KALDI_SGMM2_ESTIMATE_AM_SGMM2_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
Matrix< double > t_
[SSGMM] each row is one of the t_i quantities in the less-exact version of the SSGMM update for the s...
std::vector< Vector< double > > gamma_c_
Sub-state occupancies gamma_{jm}^{(c)} for each sub-state.
Matrix< double > g_i_
Class for definition of the subspace Gmm acoustic model.
Definition: am-sgmm2.h:231
BaseFloat cov_floor
Floor covariance matrices Sigma_i to this times average cov.
int map_M_prior_iters
num of iterations to update the prior of M
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
UpdateWClass(const UpdateWClass &other)
static void UpdateWGetStats(const MleAmSgmm2Accs &accs, const AmSgmm2 &model, const Matrix< double > &w, const std::vector< Matrix< double > > &log_a, Matrix< double > *F_i, Matrix< double > *g_i, double *tot_like, int32 num_threads, int32 thread_id)
Called, multithreaded, inside UpdateW.
BaseFloat tau_c
Smoothing constant for sub-state weights [count to add to each one].
Class for the accumulators required to update the speaker vectors v_s.
MleAmSgmm2Updater(const MleAmSgmm2Options &options)
Vector< double > a_s_
a_i^{(s)}. For SSGMM.
kaldi::int32 int32
BaseFloat epsilon
very small value used to prevent SVD crashing.
BaseFloat rand_prune_
small constant to randomly prune tiny posteriors
std::vector< SpMatrix< double > > S_
S_{i}^{-}, scatter of adapted feature vectors x_{i}(t). Dim is [I][D][D].
std::vector< Matrix< double > > gamma_
Gaussian occupancies gamma_{jmi} for each substate and Gaussian index, pooled over groups...
#define KALDI_DISALLOW_COPY_AND_ASSIGN(type)
Definition: kaldi-utils.h:121
bool full_row_cov
Estimate row covariance instead of using I.
const AmSgmm2 & model_
Matrix< double > * F_i_ptr_
MleAmSgmm2Options options_
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
BaseFloat cov_diag_ratio
ratio to dim below which we use diagonal. default 2, set to inf for diag.
std::vector< SpMatrix< double > > U_
the U_i quantities from the less-exact version of the SSGMM update for the speaker weight projections...
void Register(OptionsItf *opts)
void Reconfigure(const MleAmSgmm2Options &options)
int weight_projections_iters
Number of iters when re-estimating weight projections "w".
const std::vector< Matrix< double > > & log_a_
std::vector< Matrix< double > > Y_
The stats which are not tied to any state.
uint16 SgmmUpdateFlagsType
Bitwise OR of the above flags.
Definition: model-common.h:59
bool full_col_cov
Estimate col covariance instead of using I.
Matrix< double > F_i_
std::vector< Matrix< double > > y_
The SGMM state specific stats.
MleAmSgmm2Accs(BaseFloat rand_prune=1.0e-05)
BaseFloat max_impr_u
max improvement per frame allowed in update of u.
std::vector< Matrix< double > > NtransSigmaInv_
N_i^T \Sigma_{i}^{-1}. Needed for y^{(s)}.
Matrix< double > * g_i_ptr_
UpdateWClass(const MleAmSgmm2Accs &accs, const AmSgmm2 &model, const Matrix< double > &w, const std::vector< Matrix< double > > &log_a, Matrix< double > *F_i, Matrix< double > *g_i, double *tot_like)
std::vector< Matrix< double > > a_
[SSGMM] These a_{jmi} quantities are dimensionally the same as the gamma quantities.
Vector< double > y_s_
Statistics for speaker adaptation (vectors), stored per-speaker.
std::vector< SpMatrix< double > > H_spk_
The following variable does not change per speaker, it just relates to the speaker subspace...
Vector< double > a_s_
[SSGMM], this is a per-speaker variable storing the a_i^{(s)} quantities that we will use in order to...
BaseFloat max_cond
Max on condition of matrices in update beyond which we do not update.
A class representing a vector.
Definition: kaldi-vector.h:406
std::vector< SpMatrix< double > > R_
R_{i}, quadratic term for speaker subspace estimation. Dim is [I][T][T].
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
const Matrix< double > & w_
std::vector< Matrix< double > > Z_
Stats Z_{i} for speaker-subspace projections N. Dim is [I][D][T].
BaseFloat tau_map_M
For MAP update of the phonetic subspace M.
const MleAmSgmm2Accs & accs_
Configuration variables needed in the SGMM estimation process.
Class for the accumulators associated with the phonetic-subspace model parameters.
Vector< double > gamma_s_
gamma_{i}^{(s)}. Per-speaker counts for each Gaussian. Dimension is [I]
Holds the per-frame precomputed quantities x(t), x_{i}(t), z_{i}(t), and n_{i}(t) (cf...
Definition: am-sgmm2.h:142
Vector< double > gamma_s_
gamma_{i}^{(s)}.
MleAmSgmm2Accs(const AmSgmm2 &model, SgmmUpdateFlagsType flags, bool have_spk_vecs, BaseFloat rand_prune=1.0e-05)