gmm-train-lvtln-special.cc File Reference
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "transform/lvtln.h"
#include "hmm/posterior.h"
Include dependency graph for gmm-train-lvtln-special.cc:

Go to the source code of this file.

Functions

int main (int argc, char *argv[])
 

Function Documentation

◆ main()

int main ( int argc, char * argv[] )

Definition at line 27 of file gmm-train-lvtln-special.cc.

References MatrixBase< Real >::AddMatMat(), VectorBase< Real >::AddSpVec(), VectorBase< Real >::AddVec(), SpMatrix< Real >::AddVec2(), MatrixBase< Real >::AddVecVec(), TpMatrix< Real >::Cholesky(), kaldi::ConvertStringToInteger(), LinearVtln::Dim(), SequentialTableReader< Holder >::Done(), ParseOptions::GetArg(), ParseOptions::GetOptArg(), RandomAccessTableReader< Holder >::HasKey(), rnnlm::i, TpMatrix< Real >::Invert(), SpMatrix< Real >::Invert(), rnnlm::j, KALDI_ASSERT, KALDI_ERR, KALDI_LOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), kaldi::kNoTrans, kaldi::kTrans, SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), ParseOptions::PrintUsage(), ParseOptions::Read(), kaldi::ReadKaldiObject(), ParseOptions::Register(), MatrixBase< Real >::Row(), PackedMatrix< Real >::Scale(), VectorBase< Real >::Scale(), LinearVtln::SetTransform(), LinearVtln::SetWarp(), Output::Stream(), RandomAccessTableReader< Holder >::Value(), SequentialTableReader< Holder >::Value(), kaldi::VecSpVec(), kaldi::VecVec(), and LinearVtln::Write().

27  {
28  try {
29  using namespace kaldi;
30  using kaldi::int32;
31 
32  const char *usage =
33  "Set one of the transforms in lvtln to the minimum-squared-error solution\n"
34  "to mapping feats-untransformed to feats-transformed; posteriors may\n"
35  "optionally be used to downweight/remove silence.\n"
36  "Usage: gmm-train-lvtln-special [options] class-index <lvtln-in> <lvtln-out> "
37  " <feats-untransformed-rspecifier> <feats-transformed-rspecifier> [<posteriors-rspecifier>]\n"
38  "e.g.: \n"
39  " gmm-train-lvtln-special 5 5.lvtln 6.lvtln scp:train.scp scp:train_warp095.scp ark:nosil.post\n";
40 
41  BaseFloat warp = -1.0;
42  bool binary = true;
43  bool normalize_var = false;
44  bool normalize_covar = false;
45  std::string weights_rspecifier;
46 
47  ParseOptions po(usage);
48  po.Register("binary", &binary, "Write output in binary mode");
49  po.Register("warp", &warp, "If supplied, can be used to set warp factor"
50  "for this transform");
51  po.Register("normalize-var", &normalize_var, "Normalize diagonal of variance "
52  "to be the same before and after transform.");
53  po.Register("normalize-covar", &normalize_covar, "Normalize (matrix-valued) "
54  "covariance to be the same before and after transform.");
55  po.Register("weights-in", &weights_rspecifier,
56  "Can be used to take posteriors as an scp or ark file of weights "
57  "instead of giving <posteriors-rspecfier>");
58 
59  po.Read(argc, argv);
60 
61  if (po.NumArgs() < 5 || po.NumArgs() > 6) {
62  po.PrintUsage();
63  exit(1);
64  }
65 
66  std::string class_idx_str = po.GetArg(1);
67  int32 class_idx;
68  if (!ConvertStringToInteger(class_idx_str, &class_idx))
69  KALDI_ERR << "Expected integer first argument: got " << class_idx_str;
70 
71  std::string lvtln_rxfilename = po.GetArg(2),
72  lvtln_wxfilename = po.GetArg(3),
73  feats_orig_rspecifier = po.GetArg(4),
74  feats_transformed_rspecifier = po.GetArg(5),
75  posteriors_rspecifier = po.GetOptArg(6);
76 
77  // Get lvtln object.
78  LinearVtln lvtln;
79  ReadKaldiObject(lvtln_rxfilename, &lvtln);
80  int32 dim = lvtln.Dim(); // feature dimension [we hope!].
81 
82 
83  if (!normalize_covar) {
84  // Below is the computation if we are not normalizing the full covariance.
85 
86  // Ignoring weighting (which is a straightforward extension), the problem is this:
87  // we have original features x(t) and transformed features y(t) [both x(t) and y(t)
88  // are vectors of size D]. We are training an affine transform to minimize the sum-squared
89  // error between A x(t) + b and y(t). Let x(t)^+ be x(t) with a 1 appended, and let
90  // w_i be the i'th row of the matrix [ A; b ], as in CMLLR.
91  // We are minimizing
92  // \sum_{t = 1}^T \sum_{i = 1}^D (w_i^T x(t)^+ - y_i(t))^2,
93  // We can express this in terms of sufficient statistics as:
94  // \sum_{i = 1}^D w_i^T Q w_i - 2 w_i^T l_i + c_i,
95  // where
96  // Q = \sum_{t = 1}^T x(t)^+ x(t)^+^T
97  // l_i = \sum_{t = 1}^T x(t)^+ y_i(t)
98  // c_i = \sum_{t = 1}^T y_i(t)^2
99  // The solution for row i is: w_i = Q^{-1} l_i
100  // and the sum-square error for index i is:
101  // w_i^T Q w_i - 2 w_i^T l_i + c_i .
102  // Note that for lvtln purposes we throw away the "offset" element (i.e. the last
103  // element of each row w_i).
104 
105  // Declare statistics we use to estimate transform.
106  SpMatrix<double> Q(dim+1); // quadratic stats == outer product of x^+.
107  Matrix<double> l(dim, dim+1); // i'th row of l is l_i
108  Vector<double> c(dim);
109  double beta = 0.0;
110  Vector<double> sum_xplus(dim+1); // sum of x^+
111  Vector<double> sumsq_x(dim); // sumsq of x_i
112  Vector<double> sumsq_diff(dim); // sumsq of (x_i - y_i)
113 
114  SequentialBaseFloatMatrixReader x_reader(feats_orig_rspecifier);
115  RandomAccessBaseFloatMatrixReader y_reader(feats_transformed_rspecifier);
116 
117  RandomAccessPosteriorReader post_reader(posteriors_rspecifier);
118  RandomAccessBaseFloatVectorReader weights_reader(weights_rspecifier);
119 
120  for (; !x_reader.Done(); x_reader.Next()) {
121  std::string utt = x_reader.Key();
122  if (!y_reader.HasKey(utt)) {
123  KALDI_WARN << "No transformed features for key " << utt;
124  continue;
125  }
126  const Matrix<BaseFloat> &x_feats = x_reader.Value();
127  const Matrix<BaseFloat> &y_feats = y_reader.Value(utt);
128  if (x_feats.NumRows() != y_feats.NumRows() ||
129  x_feats.NumCols() != y_feats.NumCols() ||
130  x_feats.NumCols() != dim) {
131  KALDI_ERR << "Number of rows and/or columns differs in features, or features have different dim from lvtln object";
132  }
133 
134  Vector<BaseFloat> weights(x_feats.NumRows());
135  if (weights_rspecifier == "" && posteriors_rspecifier != "") {
136  if (!post_reader.HasKey(utt)) {
137  KALDI_WARN << "No posteriors for utterance " << utt;
138  continue;
139  }
140  const Posterior &post = post_reader.Value(utt);
141  if (static_cast<int32>(post.size()) != x_feats.NumRows())
142  KALDI_ERR << "Mismatch in size of posterior";
143  for (size_t i = 0; i < post.size(); i++)
144  for (size_t j = 0; j < post[i].size(); j++)
145  weights(i) += post[i][j].second;
146  } else if (weights_rspecifier != "") {
147  if (!weights_reader.HasKey(utt)) {
148  KALDI_WARN << "No weights for utterance " << utt;
149  continue;
150  }
151  weights.CopyFromVec(weights_reader.Value(utt));
152  } else {
153  weights.Add(1.0);
154  }
155 
156  // Now get stats.
157 
158  for (int32 i = 0; i < x_feats.NumRows(); i++) {
159  BaseFloat weight = weights(i);
160  SubVector<BaseFloat> x_row(x_feats, i);
161  SubVector<BaseFloat> y_row(y_feats, i);
162  Vector<double> xplus_row_dbl(dim+1);
163  for (int32 j = 0; j < dim; j++)
164  xplus_row_dbl(j) = x_row(j);
165  xplus_row_dbl(dim) = 1.0;
166  Vector<double> y_row_dbl(y_row);
167  Q.AddVec2(weight, xplus_row_dbl);
168  l.AddVecVec(weight, y_row_dbl, xplus_row_dbl);
169  beta += weight;
170  sum_xplus(dim) += weight;
171  for (int32 j = 0; j < dim; j++) {
172  sum_xplus(j) += weight * x_row(j);
173  sumsq_x(j) += weight * x_row(j)*x_row(j);
174  sumsq_diff(j) += weight * (x_row(j)-y_row(j)) * (x_row(j)-y_row(j));
175  c(j) += weight * y_row(j)*y_row(j);
176  }
177  }
178  }
179 
180  Matrix<BaseFloat> A(dim, dim); // will give this to LVTLN object
181  // as transform matrix.
182  SpMatrix<double> Qinv(Q);
183  Qinv.Invert();
184  for (int32 i = 0; i < dim; i++) {
185  Vector<double> w_i(dim+1);
186  SubVector<double> l_i(l, i);
187  w_i.AddSpVec(1.0, Qinv, l_i, 0.0); // w_i = Q^{-1} l_i
188  SubVector<double> a_i(w_i, 0, dim);
189  A.Row(i).CopyFromVec(a_i);
190 
191  BaseFloat error = (VecSpVec(w_i, Q, w_i) - 2.0*VecVec(w_i, l_i) + c(i)) / beta,
192  sqdiff = sumsq_diff(i) / beta,
193  scatter = sumsq_x(i) / beta;
194 
195  KALDI_LOG << "For dimension " << i << ", sum-squared error in linear approximation is "
196  << error << ", versus feature-difference " << sqdiff << ", orig-sumsq is "
197  << scatter;
198  if (normalize_var) { // add a scaling to normalize the variance.
199  double x_var = scatter - pow(sum_xplus(i) / beta, 2.0);
200  double y_var = VecSpVec(w_i, Q, w_i)/beta
201  - pow(VecVec(w_i, sum_xplus)/beta, 2.0);
202  double scale = sqrt(x_var / y_var);
203  KALDI_LOG << "For dimension " << i
204  << ", variance of original and transformed data is " << x_var
205  << " and " << y_var << " respectively; scaling matrix row by "
206  << scale << " to make them equal.";
207  A.Row(i).Scale(scale);
208  }
209  }
210  lvtln.SetTransform(class_idx, A);
211  } else {
212  // Here is the computation if we normalize the full covariance.
213  // see the document "Notes for affine-transform-based VTLN" for explanation,
214  // here: http://www.danielpovey.com/files/2010_vtln_notes.pdf
215 
216  double T = 0.0;
217  SpMatrix<double> XX(dim); // sum of x x^t
218  Vector<double> x(dim); // sum of x.
219  Vector<double> y(dim); // sum of y.
220  Matrix<double> XY(dim, dim); // sum of x y^t
221 
222  SequentialBaseFloatMatrixReader x_reader(feats_orig_rspecifier);
223  RandomAccessBaseFloatMatrixReader y_reader(feats_transformed_rspecifier);
224 
225  RandomAccessPosteriorReader post_reader(posteriors_rspecifier);
226 
227  for (; !x_reader.Done(); x_reader.Next()) {
228  std::string utt = x_reader.Key();
229  if (!y_reader.HasKey(utt)) {
230  KALDI_WARN << "No transformed features for key " << utt;
231  continue;
232  }
233  const Matrix<BaseFloat> &x_feats = x_reader.Value();
234  const Matrix<BaseFloat> &y_feats = y_reader.Value(utt);
235  if (x_feats.NumRows() != y_feats.NumRows() ||
236  x_feats.NumCols() != y_feats.NumCols() ||
237  x_feats.NumCols() != dim) {
238  KALDI_ERR << "Number of rows and/or columns differs in features, or features have different dim from lvtln object";
239  }
240 
241  Vector<BaseFloat> weights(x_feats.NumRows());
242  if (posteriors_rspecifier != "") {
243  if (!post_reader.HasKey(utt)) {
244  KALDI_WARN << "No posteriors for utterance " << utt;
245  continue;
246  }
247  const Posterior &post = post_reader.Value(utt);
248  if (static_cast<int32>(post.size()) != x_feats.NumRows())
249  KALDI_ERR << "Mismatch in size of posterior";
250  for (size_t i = 0; i < post.size(); i++)
251  for (size_t j = 0; j < post[i].size(); j++)
252  weights(i) += post[i][j].second;
253  } else weights.Add(1.0);
254  // Now get stats.
255  for (int32 i = 0; i < x_feats.NumRows(); i++) {
256  BaseFloat weight = weights(i);
257  SubVector<BaseFloat> x_row(x_feats, i);
258  SubVector<BaseFloat> y_row(y_feats, i);
259  Vector<double> x_dbl(x_row);
260  Vector<double> y_dbl(y_row);
261  T += weight;
262  XX.AddVec2(weight, x_dbl);
263  x.AddVec(weight, x_row);
264  y.AddVec(weight, y_row);
265  XY.AddVecVec(weight, x_dbl, y_dbl);
266  }
267  }
268  KALDI_ASSERT(T > 0.0);
269  Vector<double> xbar(x); xbar.Scale(1.0/T);
270 
271  SpMatrix<double> S(XX); S.Scale(1.0/T);
272  S.AddVec2(-1.0, xbar);
273  TpMatrix<double> C_tp(dim);
274  C_tp.Cholesky(S); // get cholesky factor.
275  TpMatrix<double> Cinv_tp(C_tp);
276  Cinv_tp.Invert();
277  Matrix<double> C(C_tp); // use regular matrix as more stuff is implemented for this case.
278  Matrix<double> Cinv(Cinv_tp);
279  Matrix<double> P0(XY);
280  P0.AddVecVec(-1.0, xbar, y);
281  Matrix<double> P(dim, dim), tmp(dim, dim);
282  tmp.AddMatMat(1.0, P0, kNoTrans, Cinv, kTrans, 0.0); // tmp := P0 * C^{-T}
283  P.AddMatMat(1.0, Cinv, kNoTrans, tmp, kNoTrans, 0.0); // P := C^{-1} * tmp = C^{-1} * P0 * C^{-T}
284  Vector<double> l(dim);
285  Matrix<double> U(dim, dim), Vt(dim, dim);
286  P.Svd(&l, &U, &Vt);
287  l.Scale(1.0/T); // normalize for diagnostic purposes.
288  KALDI_LOG << "Singular values of P are: " << l;
289  Matrix<double> N(dim, dim);
290  N.AddMatMat(1.0, Vt, kTrans, U, kTrans, 0.0); // N := V * U^T.
291  Matrix<double> M(dim, dim);
292  tmp.AddMatMat(1.0, N, kNoTrans, Cinv, kNoTrans, 0.0); // tmp := N * C^{-1}
293  M.AddMatMat(1.0, C, kNoTrans, tmp, kNoTrans, 0.0); // M := C * tmp = C * N * C^{-1}
294  Matrix<BaseFloat> Mf(M);
295  lvtln.SetTransform(class_idx, Mf); // in this setup we don't
296  // need the offset, v.
297  }
298 
299  if (warp >= 0.0)
300  lvtln.SetWarp(class_idx, warp);
301 
302  { // Write lvtln object.
303  Output ko(lvtln_wxfilename, binary);
304  lvtln.Write(ko.Stream(), binary);
305  }
306  return 0;
307  } catch(const std::exception &e) {
308  std::cerr << e.what();
309  return -1;
310  }
311 }
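In the default branch (without --normalize-covar), the tool solves the row-wise weighted least-squares problem sketched in the comment block of the listing. Restated in display form, with x(t)^+ denoting x(t) with a 1 appended (as in the comments) and \gamma_t a symbol introduced here for the per-frame weight taken from the posteriors or --weights-in:

\[
\hat{w}_i \;=\; \arg\min_{w_i} \sum_{t=1}^{T} \gamma_t \bigl(w_i^\top x(t)^{+} - y_i(t)\bigr)^2 \;=\; Q^{-1} l_i,
\]
\[
Q = \sum_{t=1}^{T} \gamma_t\, x(t)^{+} x(t)^{+\top}, \qquad
l_i = \sum_{t=1}^{T} \gamma_t\, x(t)^{+} y_i(t), \qquad
c_i = \sum_{t=1}^{T} \gamma_t\, y_i(t)^2,
\]
and the per-dimension error logged by the tool is
\[
\frac{1}{\beta}\bigl(\hat{w}_i^\top Q\, \hat{w}_i - 2\, \hat{w}_i^\top l_i + c_i\bigr), \qquad \beta = \sum_{t=1}^{T} \gamma_t.
\]

The first dim entries of each \hat{w}_i form row i of the matrix A passed to LinearVtln::SetTransform(); the trailing offset element is discarded. With --normalize-var, row i is additionally scaled by sqrt(x_var / y_var) so that the i'th diagonal element of the variance is the same before and after the transform.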
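With --normalize-covar, the transform is instead built as in the cited notes (http://www.danielpovey.com/files/2010_vtln_notes.pdf). Writing T = \sum_t \gamma_t for the total weight and using \bar{x}, S, C, P_0 for the quantities the code calls xbar, S, C_tp and P0 (\gamma_t and \Lambda are notation introduced here), the steps in the listing amount to:

\[
\bar{x} = \frac{1}{T}\sum_t \gamma_t\, x(t), \qquad
S = \frac{1}{T}\sum_t \gamma_t\, x(t) x(t)^\top - \bar{x}\bar{x}^\top, \qquad
S = C C^\top \ \text{(Cholesky)},
\]
\[
P_0 = \sum_t \gamma_t\, \bigl(x(t) - \bar{x}\bigr)\, y(t)^\top, \qquad
P = C^{-1} P_0\, C^{-\top} = U \Lambda V^\top \ \text{(SVD)},
\]
\[
N = V U^\top, \qquad M = C\, N\, C^{-1}.
\]

Because N is orthogonal, M S M^\top = C N N^\top C^\top = S: the full covariance of the features is unchanged by M, which is the property --normalize-covar refers to. M is the matrix set on the lvtln object via LinearVtln::SetTransform(); no offset is needed in this case.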