30 int main(
int argc,
char *argv[]) {
31 using namespace kaldi;
34 "Accumulate stats for SGMM training.\n" 35 "Usage: sgmm2-acc-stats [options] <model-in> <feature-rspecifier> " 36 "<posteriors-rspecifier> <stats-out>\n" 37 "e.g.: sgmm2-acc-stats --gselect=ark:gselect.ark 1.mdl 1.ali scp:train.scp 'ark:ali-to-post 1.ali ark:-|' 1.acc\n" 38 "(note: gselect option is mandatory)\n";
42 std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
43 std::string update_flags_str =
"vMNwcSt";
46 po.
Register(
"binary", &binary,
"Write output in binary mode");
47 po.
Register(
"gselect", &gselect_rspecifier,
"Precomputed Gaussian indices (rspecifier)");
48 po.
Register(
"spk-vecs", &spkvecs_rspecifier,
"Speaker vectors (rspecifier)");
49 po.
Register(
"utt2spk", &utt2spk_rspecifier,
50 "rspecifier for utterance to speaker map");
51 po.
Register(
"rand-prune", &rand_prune,
"Pruning threshold for posteriors");
52 po.
Register(
"update-flags", &update_flags_str,
"Which SGMM parameters to accumulate " 53 "stats for: subset of vMNwcS.");
63 if (gselect_rspecifier ==
"")
64 KALDI_ERR <<
"--gselect option is mandatory.";
66 std::string model_filename = po.
GetArg(1),
67 feature_rspecifier = po.
GetArg(2),
68 posteriors_rspecifier = po.
GetArg(3),
69 accs_wxfilename = po.
GetArg(4);
71 using namespace kaldi;
74 int32 num_done = 0, num_err = 0;
96 Input ki(model_filename, &binary);
105 double tot_like = 0.0;
112 for (; !feature_reader.
Done(); feature_reader.
Next()) {
113 std::string utt = feature_reader.
Key();
114 std::string spk = utt;
115 if (!utt2spk_rspecifier.empty()) {
116 if (!utt2spk_map.
HasKey(utt)) {
117 KALDI_WARN <<
"utt2spk map does not have value for " << utt
118 <<
", ignoring this utterance.";
120 }
else { spk = utt2spk_map.
Value(utt); }
123 if (spk != cur_spk && cur_spk !=
"")
126 if (spk != cur_spk || spk_vars.
Empty()) {
128 if (spkvecs_reader.
IsOpen()) {
129 if (spkvecs_reader.
HasKey(utt)) {
133 KALDI_WARN <<
"Cannot find speaker vector for " << utt;
143 if (!posteriors_reader.
HasKey(utt) ||
144 posteriors_reader.
Value(utt).size() != features.
NumRows()) {
145 KALDI_WARN <<
"No posterior info available for utterance " 146 << utt <<
" (or wrong size)";
152 if (!gselect_reader.
HasKey(utt)
153 && gselect_reader.
Value(utt).size() != features.
NumRows()) {
154 KALDI_WARN <<
"No Gaussian-selection info available for utterance " 155 << utt <<
" (or wrong size)";
158 const std::vector<std::vector<int32> > &gselect =
159 gselect_reader.
Value(utt);
163 BaseFloat tot_like_this_file = 0.0, tot_weight = 0.0;
167 for (
size_t i = 0;
i < posterior.size();
i++) {
171 for (
size_t j = 0;
j < pdf_posterior[
i].size();
j++) {
172 int32 pdf_id = pdf_posterior[
i][
j].first;
174 tot_like_this_file += sgmm_accs.
Accumulate(am_sgmm, per_frame_vars,
175 pdf_id, weight, &spk_vars)
177 tot_weight += weight;
181 for (
size_t j = 0;
j < posterior[
i].size();
j++) {
182 int32 tid = posterior[
i][
j].first;
184 trans_model.
Accumulate(weight, tid, &transition_accs);
188 KALDI_VLOG(2) <<
"Average like for this file is " 189 << (tot_like_this_file/tot_weight) <<
" over " 190 << tot_weight <<
" frames.";
191 tot_like += tot_like_this_file;
193 if (num_done % 50 == 0) {
194 KALDI_LOG <<
"Processed " << num_done <<
" utterances; for utterance " 195 << utt <<
" avg. like is " 196 << (tot_like_this_file/tot_weight)
197 <<
" over " << tot_weight <<
" frames.";
203 KALDI_LOG <<
"Overall like per frame (Gaussian only) = " 204 << (tot_like/tot_t) <<
" over " << tot_t <<
" frames.";
206 KALDI_LOG <<
"Done " << num_done <<
" files, " << num_err
211 Output ko(accs_wxfilename, binary);
216 return (num_done != 0 ? 0 : 1);
217 }
catch(
const std::exception &e) {
218 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Class for definition of the subspace Gmm acoustic model.
BaseFloat Accumulate(const AmSgmm2 &model, const Sgmm2PerFrameDerivedVars &frame_vars, int32 pdf_index, BaseFloat weight, Sgmm2PerSpkDerivedVars *spk_vars)
Returns likelihood.
int main(int argc, char *argv[])
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
This class is for when you are reading something in random access, but it may actually be stored per-...
void Write(std::ostream &Out, bool binary) const
Writes to C++ stream (option to write in binary).
void Read(std::istream &is, bool binary)
SgmmUpdateFlagsType StringToSgmmUpdateFlags(std::string str)
void ComputePerSpkDerivedVars(Sgmm2PerSpkDerivedVars *vars) const
Computes the per-speaker derived vars; assumes vars->v_s is already set up.
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
void CommitStatsForSpk(const AmSgmm2 &model, const Sgmm2PerSpkDerivedVars &spk_vars)
Accumulates global stats for the current speaker (if applicable).
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
void InitStats(Vector< double > *stats) const
uint16 SgmmUpdateFlagsType
Bitwise OR of the above flags.
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
const T & Value(const std::string &key)
void Read(std::istream &is, bool binary)
void Accumulate(BaseFloat prob, int32 trans_id, Vector< double > *stats) const
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
void ComputePerFrameVars(const VectorBase< BaseFloat > &data, const std::vector< int32 > &gselect, const Sgmm2PerSpkDerivedVars &spk_vars, Sgmm2PerFrameDerivedVars *per_frame_vars) const
This needs to be called with each new frame of data, prior to accumulation or likelihood evaluation: ...
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
bool HasKey(const std::string &key)
void Write(std::ostream &out_stream, bool binary) const
int NumArgs() const
Number of positional parameters (c.f. argc-1).
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
const T & Value(const std::string &key)
void SetSpeakerVector(const Vector< BaseFloat > &v_s_in)
void ConvertPosteriorToPdfs(const TransitionModel &tmodel, const Posterior &post_in, Posterior *post_out)
Converts a posterior over transition-ids to be a posterior over pdf-ids.
Class for the accumulators associated with the phonetic-subspace model parameters.
Holds the per-frame precomputed quantities x(t), x_{i}(t), z_{i}(t), and n_{i}(t) (cf...
void ResizeAccumulators(const AmSgmm2 &model, SgmmUpdateFlagsType flags, bool have_spk_vecs)
Resizes the accumulators to the correct sizes given the model.