31 using namespace kaldi;
34 "Accumulate stats for SGMM training.\n" 35 "Usage: sgmm2-acc-stats [options] <model-in> <feature-rspecifier> " 36 "<posteriors-rspecifier> <stats-out>\n" 37 "e.g.: sgmm2-acc-stats --gselect=ark:gselect.ark 1.mdl 1.ali scp:train.scp 'ark:ali-to-post 1.ali ark:-|' 1.acc\n" 38 "(note: gselect option is mandatory)\n";
42 std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
43 std::string update_flags_str =
"vMNwcSt";
46 po.Register(
"binary", &binary,
"Write output in binary mode");
47 po.Register(
"gselect", &gselect_rspecifier,
"Precomputed Gaussian indices (rspecifier)");
48 po.Register(
"spk-vecs", &spkvecs_rspecifier,
"Speaker vectors (rspecifier)");
49 po.Register(
"utt2spk", &utt2spk_rspecifier,
50 "rspecifier for utterance to speaker map");
51 po.Register(
"rand-prune", &rand_prune,
"Pruning threshold for posteriors");
52 po.Register(
"update-flags", &update_flags_str,
"Which SGMM parameters to accumulate " 53 "stats for: subset of vMNwcS.");
59 if (po.NumArgs() != 4) {
63 if (gselect_rspecifier ==
"")
64 KALDI_ERR <<
"--gselect option is mandatory.";
66 std::string model_filename = po.GetArg(1),
67 feature_rspecifier = po.GetArg(2),
68 posteriors_rspecifier = po.GetArg(3),
69 accs_wxfilename = po.GetArg(4);
71 using namespace kaldi;
74 int32 num_done = 0, num_err = 0;
96 Input ki(model_filename, &binary);
97 trans_model.
Read(ki.Stream(), binary);
98 am_sgmm.
Read(ki.Stream(), binary);
103 sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags, (spkvecs_rspecifier!=
""));
105 double tot_like = 0.0;
112 for (; !feature_reader.Done(); feature_reader.Next()) {
113 std::string utt = feature_reader.Key();
114 std::string spk = utt;
115 if (!utt2spk_rspecifier.empty()) {
116 if (!utt2spk_map.HasKey(utt)) {
117 KALDI_WARN <<
"utt2spk map does not have value for " << utt
118 <<
", ignoring this utterance.";
120 }
else { spk = utt2spk_map.Value(utt); }
123 if (spk != cur_spk && cur_spk !=
"")
124 sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
126 if (spk != cur_spk || spk_vars.
Empty()) {
128 if (spkvecs_reader.IsOpen()) {
129 if (spkvecs_reader.HasKey(utt)) {
133 KALDI_WARN <<
"Cannot find speaker vector for " << utt;
143 if (!posteriors_reader.HasKey(utt) ||
144 posteriors_reader.Value(utt).size() != features.
NumRows()) {
145 KALDI_WARN <<
"No posterior info available for utterance " 146 << utt <<
" (or wrong size)";
150 const Posterior &posterior = posteriors_reader.Value(utt);
152 if (!gselect_reader.HasKey(utt)
153 && gselect_reader.Value(utt).size() != features.
NumRows()) {
154 KALDI_WARN <<
"No Gaussian-selection info available for utterance " 155 << utt <<
" (or wrong size)";
158 const std::vector<std::vector<int32> > &gselect =
159 gselect_reader.Value(utt);
163 BaseFloat tot_like_this_file = 0.0, tot_weight = 0.0;
167 for (
size_t i = 0;
i < posterior.size();
i++) {
171 for (
size_t j = 0;
j < pdf_posterior[
i].size();
j++) {
172 int32 pdf_id = pdf_posterior[
i][
j].first;
174 tot_like_this_file += sgmm_accs.Accumulate(am_sgmm, per_frame_vars,
175 pdf_id, weight, &spk_vars)
177 tot_weight += weight;
181 for (
size_t j = 0;
j < posterior[
i].size();
j++) {
182 int32 tid = posterior[
i][
j].first;
184 trans_model.
Accumulate(weight, tid, &transition_accs);
188 KALDI_VLOG(2) <<
"Average like for this file is " 189 << (tot_like_this_file/tot_weight) <<
" over " 190 << tot_weight <<
" frames.";
191 tot_like += tot_like_this_file;
193 if (num_done % 50 == 0) {
194 KALDI_LOG <<
"Processed " << num_done <<
" utterances; for utterance " 195 << utt <<
" avg. like is " 196 << (tot_like_this_file/tot_weight)
197 <<
" over " << tot_weight <<
" frames.";
200 sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars);
203 KALDI_LOG <<
"Overall like per frame (Gaussian only) = " 204 << (tot_like/tot_t) <<
" over " << tot_t <<
" frames.";
206 KALDI_LOG <<
"Done " << num_done <<
" files, " << num_err
211 Output ko(accs_wxfilename, binary);
212 transition_accs.
Write(ko.Stream(), binary);
213 sgmm_accs.Write(ko.Stream(), binary);
216 return (num_done != 0 ? 0 : 1);
217 }
catch(
const std::exception &e) {
218 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for...
Class for definition of the subspace Gmm acoustic model.
This class is for when you are reading something in random access, but it may actually be stored per-...
void Write(std::ostream &Out, bool binary) const
Writes to C++ stream (option to write in binary).
void Read(std::istream &is, bool binary)
SgmmUpdateFlagsType StringToSgmmUpdateFlags(std::string str)
void ComputePerSpkDerivedVars(Sgmm2PerSpkDerivedVars *vars) const
Computes the per-speaker derived vars; assumes vars->v_s is already set up.
Allows random access to a collection of objects in an archive or script file; see The Table concept...
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an utterance...
void InitStats(Vector< double > *stats) const
uint16 SgmmUpdateFlagsType
Bitwise OR of the above flags.
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
void Read(std::istream &is, bool binary)
void Accumulate(BaseFloat prob, int32 trans_id, Vector< double > *stats) const
A templated class for reading objects sequentially from an archive or script file; see The Table concept...
void ComputePerFrameVars(const VectorBase< BaseFloat > &data, const std::vector< int32 > &gselect, const Sgmm2PerSpkDerivedVars &spk_vars, Sgmm2PerFrameDerivedVars *per_frame_vars) const
This needs to be called with each new frame of data, prior to accumulation or likelihood evaluation: ...
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
void SetSpeakerVector(const Vector< BaseFloat > &v_s_in)
void ConvertPosteriorToPdfs(const TransitionModel &tmodel, const Posterior &post_in, Posterior *post_out)
Converts a posterior over transition-ids to be a posterior over pdf-ids.
Class for the accumulators associated with the phonetic-subspace model parameters.
Holds the per-frame precomputed quantities x(t), x_{i}(t), z_{i}(t), and n_{i}(t) (cf...