35 using namespace kaldi;
37 using fst::SymbolTable;
42 "Align features given [SGMM-based] models.\n" 43 "Usage: sgmm2-align-compiled [options] <model-in> <graphs-rspecifier> " 44 "<feature-rspecifier> <alignments-wspecifier>\n" 45 "e.g.: sgmm2-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n";
54 std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
55 std::string per_frame_acwt_wspecifier;
58 po.Register(
"binary", &binary,
"Write output in binary mode");
59 po.Register(
"log-prune", &log_prune,
"Pruning beam used to reduce number " 60 "of exp() evaluations.");
61 po.Register(
"spk-vecs", &spkvecs_rspecifier,
"Speaker vectors (rspecifier)");
62 po.Register(
"utt2spk", &utt2spk_rspecifier,
63 "rspecifier for utterance to speaker map");
64 po.Register(
"acoustic-scale", &acoustic_scale,
"Scaling factor for acoustic " 66 po.Register(
"transition-scale", &transition_scale,
"Scaling factor for " 67 "some transition probabilities [see also self-loop-scale].");
68 po.Register(
"self-loop-scale", &self_loop_scale,
"Scaling factor for " 69 "self-loop versus non-self-loop probability mass [controls " 70 "most transition probabilities.]");
71 po.Register(
"write-per-frame-acoustic-loglikes", &per_frame_acwt_wspecifier,
72 "Wspecifier for table of vectors containing the acoustic log-likelihoods " 73 "per frame for each utterance. E.g. ark:foo/per_frame_logprobs.1.ark");
74 po.Register(
"gselect", &gselect_rspecifier,
"Precomputed Gaussian indices " 79 if (po.NumArgs() != 4) {
84 if (gselect_rspecifier ==
"")
85 KALDI_ERR <<
"--gselect option is mandatory.";
87 std::string model_in_filename = po.GetArg(1),
88 fst_rspecifier = po.GetArg(2),
89 feature_rspecifier = po.GetArg(3),
90 alignment_wspecifier = po.GetArg(4);
96 Input ki(model_in_filename, &binary);
97 trans_model.
Read(ki.Stream(), binary);
98 am_sgmm.
Read(ki.Stream(), binary);
111 int num_done = 0, num_err = 0, num_retry = 0;
112 double tot_like = 0.0;
113 kaldi::int64 frame_count = 0;
115 for (; !fst_reader.Done(); fst_reader.Next()) {
116 std::string utt = fst_reader.Key();
117 if (!feature_reader.HasKey(utt)) {
118 KALDI_WARN <<
"No feature found for utterance " << utt;
122 VectorFst<StdArc> decode_fst(fst_reader.Value());
125 fst_reader.FreeCurrent();
129 KALDI_WARN <<
"Zero-length utterance: " << utt;
135 if (spkvecs_reader.IsOpen()) {
136 if (spkvecs_reader.HasKey(utt)) {
140 KALDI_WARN <<
"Cannot find speaker vector for " << utt;
146 if (!gselect_reader.HasKey(utt)
147 && gselect_reader.Value(utt).size() != features.
NumRows()) {
148 KALDI_WARN <<
"No Gaussian-selection info available for utterance " 149 << utt <<
" (or wrong size)";
152 const std::vector<std::vector<int32> > &gselect =
153 gselect_reader.Value(utt);
156 std::vector<int32> disambig_syms;
158 transition_scale, self_loop_scale,
163 log_prune, acoustic_scale, &spk_vars);
166 acoustic_scale, &decode_fst, &sgmm_decodable,
167 &alignment_writer, NULL,
168 &num_done, &num_err, &num_retry,
169 &tot_like, &frame_count, &per_frame_acwt_writer);
173 KALDI_LOG <<
"Overall log-likelihood per frame is " << (tot_like/frame_count)
174 <<
" over " << frame_count<<
" frames.";
175 KALDI_LOG <<
"Retried " << num_retry <<
" out of " 176 << (num_done + num_err) <<
" utterances.";
177 KALDI_LOG <<
"Done " << num_done <<
", errors on " << num_err;
178 return (num_done != 0 ? 0 : 1);
179 }
catch(
const std::exception &e) {
180 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Class for definition of the subspace Gmm acoustic model.
void Register(OptionsItf *opts)
This class is for when you are reading something in random access, but it may actually be stored per-...
void Read(std::istream &is, bool binary)
A templated class for writing objects to an archive or script file; see The Table concept...
void ComputePerSpkDerivedVars(Sgmm2PerSpkDerivedVars *vars) const
Computes the per-speaker derived vars; assumes vars->v_s is already set up.
Allows random access to a collection of objects in an archive or script file; see The Table concept...
void AddTransitionProbs(const TransitionModel &trans_model, const std::vector< int32 > &disambig_syms, BaseFloat transition_scale, BaseFloat self_loop_scale, fst::VectorFst< fst::StdArc > *fst)
Adds transition-probs, with the supplied scales (see Scaling of transition and acoustic probabilities...
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
void Read(std::istream &is, bool binary)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
void SetSpeakerVector(const Vector< BaseFloat > &v_s_in)
void AlignUtteranceWrapper(const AlignConfig &config, const std::string &utt, BaseFloat acoustic_scale, fst::VectorFst< fst::StdArc > *fst, DecodableInterface *decodable, Int32VectorWriter *alignment_writer, BaseFloatWriter *scores_writer, int32 *num_done, int32 *num_error, int32 *num_retried, double *tot_like, int64 *frame_count, BaseFloatVectorWriter *per_frame_acwt_writer)
AlignUtteranceWrapper is a wrapper for alignment code used in training, that is called from many diffe...