prons-to-wordali.cc
Go to the documentation of this file.
1 // bin/prons-to-wordali.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 
21 #include "base/kaldi-common.h"
22 #include "hmm/transition-model.h"
23 #include "hmm/hmm-utils.h"
24 #include "util/common-utils.h"
25 #include "fst/fstlib.h"
26 #include "fstext/fstext-utils.h"
27 
28 
29 int main(int argc, char *argv[]) {
30  using namespace kaldi;
31  using fst::VectorFst;
32  using fst::StdArc;
33  typedef kaldi::int32 int32;
34  try {
35  const char *usage =
36  "Caution: this program relates to older scripts and is deprecated,\n"
37  "for modern scripts see egs/wsj/s5/steps/{get_ctm,get_train_ctm}.sh\n"
38  "Given per-utterance pronunciation information as output by \n"
39  "words-to-prons, and per-utterance phone alignment information\n"
40  "as output by ali-to-phones --write-lengths, output word alignment\n"
41  "information that can be turned into the ctm format.\n"
42  "Outputs is pairs of (word, #frames), or if --per-frame is given,\n"
43  "just the word for each frame.\n"
44  "Note: zero word-id usually means optional silence.\n"
45  "Format is standard format for archives of vector<pair<int32, int32> >\n"
46  "i.e. :\n"
47  "utt-id 600 22 ; 1028 32 ; 0 41\n"
48  "where 600, 1028 and 0 are the word-ids, and 22, 32 and 41 are the\n"
49  "lengths.\n"
50  "\n"
51  "Usage: prons-to-wordali [options] <prons-rspecifier>"
52  " <phone-lengths-rspecifier> <wordali-wspecifier>\n"
53  "e.g.: \n"
54  " ali-to-phones 1.mdl ark:1.ali ark:- | \\\n"
55  " phones-to-prons L_align.fst 46 47 ark:- 'ark:sym2int.pl -f 2- words.txt text|' \\\n"
56  " ark:- | prons-to-wordali ark:- \\\n"
57  " \"ark:ali-to-phones --write-lengths 1.mdl ark:1.ali ark:-|\" ark:1.wali\n";
58 
59  ParseOptions po(usage);
60  bool per_frame = false;
61  po.Register("per-frame", &per_frame, "If true, write out the frame-level word alignment (else word sequence)");
62  po.Read(argc, argv);
63 
64  if (po.NumArgs() != 3) {
65  po.PrintUsage();
66  exit(1);
67  }
68 
69  std::string prons_rspecifier = po.GetArg(1),
70  phone_lengths_rspecifier = po.GetArg(2),
71  wordali_wspecifier = po.GetArg(3);
72 
73 
74  SequentialInt32VectorVectorReader prons_reader(prons_rspecifier);
75  RandomAccessInt32PairVectorReader phones_reader(phone_lengths_rspecifier);
76 
77  std::string empty;
78  Int32PairVectorWriter pair_writer(per_frame ? empty : wordali_wspecifier);
79  Int32VectorWriter frame_writer(per_frame ? wordali_wspecifier : empty);
80 
81  int32 n_done = 0, n_err = 0;
82 
83  for (; !prons_reader.Done(); prons_reader.Next()) {
84  std::string key = prons_reader.Key();
85  const std::vector<std::vector<int32> > &prons = prons_reader.Value();
86  if (!phones_reader.HasKey(key)) {
87  KALDI_WARN << "Not processing utterance " << key << " because no phone "
88  << "alignment found.";
89  n_err++;
90  continue;
91  }
92  // first member of each pair is phone; second is length in
93  // frames.
94  const std::vector<std::pair<int32, int32> > &phones =
95  phones_reader.Value(key);
96 
97  std::vector<std::pair<int32, int32> > word_alignment;
98 
99  size_t p = 0; // index into "phones".
100  for (size_t i = 0; i < prons.size(); i++) {
101  if (!(prons[i].size() >= 1)) {
102  KALDI_WARN << "Invalid, empty pronunciation.";
103  n_err++;
104  continue;
105  }
106  int32 word = prons[i][0], word_len = 0;
107  for (size_t j = 1; j < prons[i].size(); j++, p++) {
108  if (!(static_cast<size_t>(p) < phones.size() &&
109  prons[i][j] == phones[p].first) ) {
110  KALDI_WARN << "For key " << key << ", mismatch between prons and phones.";
111  n_err++;
112  continue;
113  }
114  word_len += phones[p].second;
115  }
116  word_alignment.push_back(std::make_pair(word, word_len));
117  }
118  if (static_cast<size_t>(p) != phones.size()) {
119  KALDI_WARN << "For key " << key << ", mismatch between prons and phones (wrong #phones)";
120  n_err++;
121  continue;
122  }
123 
124  if (!per_frame) {
125  pair_writer.Write(key, word_alignment);
126  } else {
127  std::vector<int32> word_per_frame;
128  for (size_t i = 0; i < word_alignment.size(); i++) {
129  int32 word = word_alignment[i].first,
130  len = word_alignment[i].second;
131  for (int32 j = 0; j < len; j++)
132  word_per_frame.push_back(word);
133  }
134  frame_writer.Write(key, word_per_frame);
135  }
136  n_done++;
137  }
138  KALDI_LOG << "Done " << n_done << " utterances; " << n_err << " had errors.";
139  } catch(const std::exception &e) {
140  std::cerr << e.what();
141  return -1;
142  }
143 }
144 
145 
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
fst::StdArc StdArc
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const T & Value(const std::string &key)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
bool HasKey(const std::string &key)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
int main(int argc, char *argv[])
#define KALDI_LOG
Definition: kaldi-error.h:153