30 using namespace kaldi;
36 "Caution: this program relates to older scripts and is deprecated,\n" 37 "for modern scripts see egs/wsj/s5/steps/{get_ctm,get_train_ctm}.sh\n" 38 "Given per-utterance pronunciation information as output by \n" 39 "words-to-prons, and per-utterance phone alignment information\n" 40 "as output by ali-to-phones --write-lengths, output word alignment\n" 41 "information that can be turned into the ctm format.\n" 42 "Outputs is pairs of (word, #frames), or if --per-frame is given,\n" 43 "just the word for each frame.\n" 44 "Note: zero word-id usually means optional silence.\n" 45 "Format is standard format for archives of vector<pair<int32, int32> >\n" 47 "utt-id 600 22 ; 1028 32 ; 0 41\n" 48 "where 600, 1028 and 0 are the word-ids, and 22, 32 and 41 are the\n" 51 "Usage: prons-to-wordali [options] <prons-rspecifier>" 52 " <phone-lengths-rspecifier> <wordali-wspecifier>\n" 54 " ali-to-phones 1.mdl ark:1.ali ark:- | \\\n" 55 " phones-to-prons L_align.fst 46 47 ark:- 'ark:sym2int.pl -f 2- words.txt text|' \\\n" 56 " ark:- | prons-to-wordali ark:- \\\n" 57 " \"ark:ali-to-phones --write-lengths 1.mdl ark:1.ali ark:-|\" ark:1.wali\n";
60 bool per_frame =
false;
61 po.Register(
"per-frame", &per_frame,
"If true, write out the frame-level word alignment (else word sequence)");
64 if (po.NumArgs() != 3) {
69 std::string prons_rspecifier = po.GetArg(1),
70 phone_lengths_rspecifier = po.GetArg(2),
71 wordali_wspecifier = po.GetArg(3);
81 int32 n_done = 0, n_err = 0;
83 for (; !prons_reader.Done(); prons_reader.Next()) {
84 std::string key = prons_reader.Key();
85 const std::vector<std::vector<int32> > &prons = prons_reader.Value();
86 if (!phones_reader.HasKey(key)) {
87 KALDI_WARN <<
"Not processing utterance " << key <<
" because no phone " 88 <<
"alignment found.";
94 const std::vector<std::pair<int32, int32> > &phones =
95 phones_reader.Value(key);
97 std::vector<std::pair<int32, int32> > word_alignment;
100 for (
size_t i = 0;
i < prons.size();
i++) {
101 if (!(prons[
i].size() >= 1)) {
102 KALDI_WARN <<
"Invalid, empty pronunciation.";
106 int32 word = prons[
i][0], word_len = 0;
107 for (
size_t j = 1;
j < prons[
i].size();
j++, p++) {
108 if (!(static_cast<size_t>(p) < phones.size() &&
109 prons[
i][
j] == phones[p].first) ) {
110 KALDI_WARN <<
"For key " << key <<
", mismatch between prons and phones.";
114 word_len += phones[p].second;
116 word_alignment.push_back(std::make_pair(word, word_len));
118 if (static_cast<size_t>(p) != phones.size()) {
119 KALDI_WARN <<
"For key " << key <<
", mismatch between prons and phones (wrong #phones)";
125 pair_writer.Write(key, word_alignment);
127 std::vector<int32> word_per_frame;
128 for (
size_t i = 0;
i < word_alignment.size();
i++) {
129 int32 word = word_alignment[
i].first,
130 len = word_alignment[
i].second;
131 for (int32
j = 0;
j < len;
j++)
132 word_per_frame.push_back(word);
134 frame_writer.Write(key, word_per_frame);
138 KALDI_LOG <<
"Done " << n_done <<
" utterances; " << n_err <<
" had errors.";
139 }
catch(
const std::exception &e) {
140 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for...
A templated class for writing objects to an archive or script file; see The Table concept.
Allows random access to a collection of objects in an archive or script file; see The Table concept.
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more documentation.
A templated class for reading objects sequentially from an archive or script file; see The Table concept.