24 #ifndef KALDI_SGMM2_AM_SGMM2_H_ 25 #define KALDI_SGMM2_AM_SGMM2_H_ 103 opts->
Register(
"split-substates", &split_substates,
"Increase number of " 104 "substates to this overall target.");
105 opts->
Register(
"max-cond-split", &max_cond,
"Max condition number of smoothing " 106 "matrix used in substate splitting.");
107 opts->
Register(
"perturb-factor", &perturb_factor,
"Perturbation factor for " 108 "state vectors while splitting substates.");
109 opts->
Register(
"power", &power,
"Exponent for substate occupancies used while " 110 "splitting substates.");
111 opts->
Register(
"min-count", &min_count,
"Minimum allowed count, used in allocating " 112 "sub-states to state in mixture splitting.");
130 opts->
Register(
"full-gmm-nbest", &full_gmm_nbest,
"Number of highest-scoring" 131 " full-covariance Gaussians selected per frame.");
132 opts->
Register(
"diag-gmm-nbest", &diag_gmm_nbest,
"Number of highest-scoring" 133 " diagonal-covariance Gaussians selected per frame.");
153 if (xt.
Dim() != feat_dim) xt.
Resize(feat_dim);
155 xti.
Resize(ngauss, feat_dim);
157 zti.
Resize(ngauss, phn_dim);
158 if (nti.
Dim() != ngauss)
175 bool Empty() {
return v_s.Dim() == 0; }
181 v_s.Resize(v_s_in.
Dim());
182 v_s.CopyFromVec(v_s_in);
203 substate_cache(num_groups), pdf_cache(num_pdfs), t(1) { }
234 void Read(std::istream &is,
bool binary);
235 void Write(std::ostream &os,
bool binary,
241 void Check(
bool show_properties =
true);
247 void InitializeFromFullGmm(
const FullGmm &gmm,
248 const std::vector<int32> &pdf2group,
249 int32 phn_subspace_dim,
250 int32 spk_subspace_dim,
251 bool speaker_dependent_weights,
259 void CopyGlobalsInitVecs(
const AmSgmm2 &other,
260 const std::vector<int32> &pdf2group,
264 void CopyFromSgmm2(
const AmSgmm2 &other,
265 bool copy_normalizers,
274 std::vector<int32> *gselect)
const;
279 const std::vector<int32> &gselect,
317 void IncreasePhoneSpaceDim(
int32 target_dim,
324 void IncreaseSpkSpaceDim(
int32 target_dim,
326 bool speaker_dependent_weights);
332 void ComputeDerivedVars();
336 void ComputeNormalizers();
340 void ComputeWeights();
358 KALDI_ASSERT(j1 < NumGroups());
return v_[j1].NumRows();
383 template<
typename Real>
386 template<
typename Real>
390 template<
typename Real>
391 void GetNtransSigmaInv(std::vector<
Matrix<Real> > *out)
const;
393 template<
typename Real>
398 template<
typename Real>
399 void GetVarScaledSubstateSpeakerMean(
int32 j1,
int32 substate,
425 std::vector< Matrix<BaseFloat> >
M_;
427 std::vector< Matrix<BaseFloat> >
N_;
436 std::vector< Matrix<BaseFloat> >
v_;
438 std::vector< Vector<BaseFloat> >
c_;
440 std::vector< Matrix<BaseFloat> >
n_;
466 void ComputeNormalizersInternal(
int32 num_threads,
int32 thread,
467 int32 *entropy_count,
double *entropy_sum);
480 void InitializeMw(
int32 phn_subspace_dim,
483 void InitializeNu(
int32 spk_subspace_dim,
485 bool speaker_dependent_weights);
486 void InitializeVecsAndSubstateWeights(
BaseFloat self_weight);
487 void InitializeCovars();
489 void ComputeHsmFromModel(
495 void ComputePdfMappings();
506 friend class AmSgmm2Functions;
507 friend class Sgmm2Feature;
510 template<
typename Real>
518 template<
typename Real>
522 KALDI_ASSERT(j1 < NumGroups() && m < NumSubstatesForGroup(j1)
526 mean_tmp.AddMatVec(1.0, M_[i],
kNoTrans, v_[j1].Row(m), 0.0);
531 template<
typename Real>
535 GetSubstateMean(j1, m, i, mean_out);
536 if (spk.
v_s.Dim() != 0)
540 template<
typename Real>
545 GetSubstateSpeakerMean(j1, m, i, spk, &tmp_mean);
546 tmp_mean2.AddSpVec(1.0, SigmaInv_[i], tmp_mean, 0.0);
574 void Write(std::ostream &os,
bool binary)
const;
575 void Read(std::istream &is,
bool binary);
586 #endif // KALDI_SGMM2_AM_SGMM2_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
uint16 SgmmWriteFlagsType
Bitwise OR of the above flags.
Matrix< BaseFloat > u_
[SSGMM] Speaker-subspace weight projection vectors. Dimension is [I][T]
Class for definition of the subspace Gmm acoustic model.
Packed symmetric matrix class.
std::vector< int32 > pdf2group_
KaldiObjectHolder works for Kaldi objects that have the "standard" Read and Write functions...
Vector< BaseFloat > xt
x'(t), FMLLR-adapted, dim = [D], eq.(33)
void GetSubstateMean(int32 j1, int32 m, int32 i, VectorBase< Real > *mean_out) const
bool HasSpeakerSpace() const
std::vector< Vector< BaseFloat > > c_
c_{jm}, mixture weights. Dimension is [J2][#mix]
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Matrix< BaseFloat > w_
Phonetic-subspace weight projection vectors. Dimension is [I][S].
Vector< BaseFloat > v_s
Speaker adaptation vector v^{(s)}. Dim is [T].
Definition for Gaussian Mixture Model with full covariances.
Class for the accumulators required to update the speaker vectors v_s.
const Vector< BaseFloat > & GetSpeakerVector()
void GetVarScaledSubstateSpeakerMean(int32 j1, int32 substate, int32 gauss, const Sgmm2PerSpkDerivedVars &spk, VectorBase< Real > *mean_out) const
KaldiObjectHolder< Sgmm2GauPost > Sgmm2GauPostHolder
void GetInvCovars(int32 gauss_index, SpMatrix< Real > *out) const
Templated accessors (used to accumulate in different precision)
A templated class for writing objects to an archive or script file; see The Table concept...
DiagGmm diag_ubm_
These contain the "background" model associated with the subspace GMM.
std::vector< Matrix< BaseFloat > > posteriors
std::vector< Matrix< BaseFloat > > n_
n_{jim}, per-Gaussian normalizer. Dimension is [J1][I][#mix]
std::vector< Matrix< BaseFloat > > N_
Speaker-subspace projections. Dimension is [I][D][T].
SpMatrix< BaseFloat > col_cov_inv_
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
#define KALDI_DISALLOW_COPY_AND_ASSIGN(type)
BaseFloat remaining_log_like
const FullGmm & full_ubm() const
Accessors.
int32 PhoneSpaceDim() const
std::vector< Matrix< BaseFloat > > v_
The parameters in a particular SGMM state.
void CopyFromSp(const SpMatrix< Real > &other)
Sgmm2LikelihoodCache(int32 num_groups, int32 num_pdfs)
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Matrix< BaseFloat > zti
z_{i}(t), dim = [I][S], eq.(35)
std::vector< Matrix< BaseFloat > > M_
Phonetic-subspace projections. Dimension is [I][D][S].
std::vector< int32 > tids
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
std::vector< Vector< BaseFloat > > log_d_jms
< [SSGMM] log of the above (more efficient to store both).
SequentialTableReader< Sgmm2GauPostHolder > SequentialSgmm2GauPostReader
int32 NumSubstatesForPdf(int32 j2) const
void GetSubstateSpeakerMean(int32 j1, int32 substate, int32 gauss, const Sgmm2PerSpkDerivedVars &spk, VectorBase< Real > *mean_out) const
Matrix< BaseFloat > xti
x_{i}(t) = x'(t) - o_i(s): dim = [I][D], eq.(34)
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
std::vector< SpMatrix< BaseFloat > > SigmaInv_
Globally shared parameters of the subspace GMM.
Vector< BaseFloat > likes
std::vector< Matrix< BaseFloat > > M_prior_
Vector< BaseFloat > nti
n_{i}(t), dim = [I], eq.(36)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
std::vector< PdfCacheElement > pdf_cache
This is the entry for a single time.
int32 NumPdfs() const
Various model dimensions.
int32 full_gmm_nbest
Number of highest-scoring full-covariance Gaussians per frame.
void Resize(int32 ngauss, int32 feat_dim, int32 phn_dim)
MatrixIndexT Dim() const
Returns the dimension of the vector.
std::vector< int32 > gselect
const DiagGmm & diag_ubm() const
SpMatrix< BaseFloat > row_cov_inv_
TableWriter< Sgmm2GauPostHolder > Sgmm2GauPostWriter
std::vector< int32 > gselect
int32 diag_gmm_nbest
Number of highest-scoring diagonal-covariance Gaussians per frame.
std::vector< std::vector< int32 > > group2pdf_
A class representing a vector.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
void RemoveSpeakerSpace()
Definition for Gaussian Mixture Model with diagonal covariances.
Sgmm2LikelihoodCache caches SGMM likelihoods at two levels: the final pdf likelihoods, and the sub-state level likelihoods, which means that with the SCTM system we can avoid redundant computation.
std::vector< SubstateCacheElement > substate_cache
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
void SetSpeakerVector(const Vector< BaseFloat > &v_s_in)
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type=kSetZero)
bool HasSpeakerDependentWeights() const
True if doing SSGMM.
Provides a vector abstraction class.
void Register(OptionsItf *opts)
Class for the accumulators associated with the phonetic-subspace model parameters.
int32 SpkSpaceDim() const
void ComputeFeatureNormalizingTransform(const FullGmm &gmm, Matrix< BaseFloat > *xform)
Computes the inverse of an LDA transform (without dimensionality reduction). The computed transform is...
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
Vector< BaseFloat > log_b_is
< [SSGMM]: Eq. (22) in techreport, b_i^{(s)} = \exp(u_i^T v^{(s)})
Holds the per-frame precomputed quantities x(t), x_{i}(t), z_{i}(t), and n_{i}(t) (cf...
int32 NumSubstatesForGroup(int32 j1) const
void Register(OptionsItf *opts)
RandomAccessTableReader< Sgmm2GauPostHolder > RandomAccessSgmm2GauPostReader
std::vector< Matrix< BaseFloat > > w_jmi_
[SSGMM] w_{jmi}, dimension is [J1][#mix][I]. Computed from w_ and v_.
Sgmm2SplitSubstatesConfig()
Matrix< BaseFloat > o_s
Per-speaker offsets o_{i}. Dimension is [I][D].