phones-to-prons.cc
Go to the documentation of this file.
1 // bin/phones-to-prons.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 // 2013 Johns Hopkins University (author: Daniel Povey)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 
22 #include "base/kaldi-common.h"
23 #include "hmm/transition-model.h"
24 #include "hmm/hmm-utils.h"
25 #include "util/common-utils.h"
26 #include "fst/fstlib.h"
27 #include "fstext/fstext-lib.h"
28 
29 // Create FST that accepts the phone sequence, with any number
30 // of word-start and word-end symbol in between each phone.
31 void CreatePhonesAltFst(const std::vector<int32> &phones,
32  int32 word_start_sym,
33  int32 word_end_sym,
34  fst::VectorFst<fst::StdArc> *ofst) {
35  using fst::StdArc;
38 
39  ofst->DeleteStates();
40  StateId cur_s = ofst->AddState();
41  ofst->SetStart(cur_s); // will be 0.
42  for (size_t i = 0; i < phones.size(); i++) {
43  StateId next_s = ofst->AddState();
44  // add arc to next state.
45  ofst->AddArc(cur_s, StdArc(phones[i], phones[i], Weight::One(),
46  next_s));
47  cur_s = next_s;
48  }
49  for (StateId s = 0; s <= cur_s; s++) {
50  ofst->AddArc(s, StdArc(word_end_sym, word_end_sym,
51  Weight::One(), s));
52  ofst->AddArc(s, StdArc(word_start_sym, word_start_sym,
53  Weight::One(), s));
54  }
55  ofst->SetFinal(cur_s, Weight::One());
56  {
57  fst::OLabelCompare<StdArc> olabel_comp;
58  ArcSort(ofst, olabel_comp);
59  }
60 }
61 
62 int main(int argc, char *argv[]) {
63  using namespace kaldi;
64  using fst::VectorFst;
65  using fst::StdArc;
66  typedef kaldi::int32 int32;
67  try {
68  const char *usage =
69  "Convert pairs of (phone-level, word-level) transcriptions to\n"
70  "output that indicates the phones assigned to each word.\n"
71  "Format is standard format for archives of vector<vector<int32> >\n"
72  "i.e. :\n"
73  "utt-id 600 4 7 19 ; 512 4 18 ; 0 1\n"
74  "where 600, 512 and 0 are the word-ids (0 for non-word phones, e.g.\n"
75  "optional-silence introduced by the lexicon), and the phone-ids\n"
76  "follow the word-ids.\n"
77  "Note: L_align.fst must have word-start and word-end symbols in it\n"
78  "\n"
79  "Usage: phones-to-prons [options] <L_align.fst> <word-start-sym> "
80  "<word-end-sym> <phones-rspecifier> <words-rspecifier> <prons-wspecifier>\n"
81  "e.g.: \n"
82  " ali-to-phones 1.mdl ark:1.ali ark:- | \\\n"
83  " phones-to-prons L_align.fst 46 47 ark:- "
84  "'ark:sym2int.pl -f 2- words.txt text|' ark:1.prons\n";
85 
86  ParseOptions po(usage);
87  po.Read(argc, argv);
88 
89  if (po.NumArgs() != 6) {
90  po.PrintUsage();
91  exit(1);
92  }
93 
94 
95  std::string lex_fst_filename = po.GetArg(1),
96  word_start_sym_str = po.GetArg(2),
97  word_end_sym_str = po.GetArg(3),
98  phones_rspecifier = po.GetArg(4),
99  words_rspecifier = po.GetArg(5),
100  prons_wspecifier = po.GetArg(6);
101 
102  int32 word_start_sym, word_end_sym;
103 
104  if (!ConvertStringToInteger(word_start_sym_str, &word_start_sym)
105  || word_start_sym <= 0)
106  KALDI_ERR << "Invalid word start symbol (expecting integer >= 0): "
107  << word_start_sym_str;
108  if (!ConvertStringToInteger(word_end_sym_str, &word_end_sym)
109  || word_end_sym <= 0 || word_end_sym == word_start_sym)
110  KALDI_ERR << "Invalid word end symbol (expecting integer >= 0"
111  << ", different from word start symbol): "
112  << word_end_sym_str;
113 
114  // L should be lexicon with word start and end symbols marked.
115  VectorFst<StdArc> *L = fst::ReadFstKaldi(lex_fst_filename);
116  {
117  // Make sure that L is sorted on output symbol (words).
118  fst::OLabelCompare<StdArc> olabel_comp;
119  ArcSort(L, olabel_comp);
120  }
121 
122  SequentialInt32VectorReader phones_reader(phones_rspecifier);
123  RandomAccessInt32VectorReader words_reader(words_rspecifier);
124 
125  int32 n_done = 0, n_err = 0;
126 
127  std::string empty;
128  Int32VectorVectorWriter prons_writer(prons_wspecifier);
129 
130  for (; !phones_reader.Done(); phones_reader.Next()) {
131  std::string key = phones_reader.Key();
132  const std::vector<int32> &phones = phones_reader.Value();
133  if (!words_reader.HasKey(key)) {
134  KALDI_WARN << "Not processing utterance " << key << " because no word "
135  << "transcription found.";
136  n_err++;
137  continue;
138  }
139  const std::vector<int32> &words = words_reader.Value(key);
140 
141  // convert word alignment to acceptor and compose it with lexicon.
142  // phn2word will have phones (and word start/end symbols) on its
143  // input, and words on its output. It will enode the alternative
144  // pronunciations of this word-sequence, with word start and end
145  // symbols at the appropriate places.
146  VectorFst<StdArc> phn2word;
147  {
148  VectorFst<StdArc> words_acceptor;
149  MakeLinearAcceptor(words, &words_acceptor);
150  Compose(*L, words_acceptor, &phn2word);
151  }
152  if (phn2word.Start() == fst::kNoStateId) {
153  KALDI_WARN << "Phone to word FST for utterance " << key
154  << "is empty (either decoding for this utterance did "
155  << "not reach end-state, or mismatched lexicon.)";
156  n_err++;
157  continue;
158  }
159 
160  VectorFst<StdArc> phones_alt_fst;
161  CreatePhonesAltFst(phones, word_start_sym, word_end_sym, &phones_alt_fst);
162 
163  // phnx2word will have phones and word-start and word-end symbols
164  // on the input side, and words on the output side.
165  VectorFst<StdArc> phnx2word;
166  Compose(phones_alt_fst, phn2word, &phnx2word);
167 
168  if (phnx2word.Start() == fst::kNoStateId) {
169  KALDI_WARN << "phnx2word FST for utterance " << key
170  << "is empty (either decoding for this utterance did "
171  << "not reach end-state, or mismatched lexicon.)";
172  if (g_kaldi_verbose_level >= 2) {
173  KALDI_LOG << "phn2word FST is below:";
174  fst::FstPrinter<StdArc> fstprinter(phn2word, NULL, NULL, NULL, false, true, "\t");
175  fstprinter.Print(&std::cerr, "standard error");
176  KALDI_LOG << "phone sequence is: ";
177  for (size_t i = 0; i < phones.size(); i++)
178  std::cerr << phones[i] << ' ';
179  std::cerr << '\n';
180  }
181  continue;
182  }
183 
184  // Now get the best path in phnx2word.
185  VectorFst<StdArc> phnx2word_best;
186  ShortestPath(phnx2word, &phnx2word_best);
187 
188  // Now get seqs of phones and words.
189  std::vector<int32> phnx, words2;
190  StdArc::Weight garbage;
191  if (!fst::GetLinearSymbolSequence(phnx2word_best,
192  &phnx, &words2, &garbage))
193  KALDI_ERR << "phnx2word is not a linear transducer (code error?)";
194  if (words2 != words)
195  KALDI_ERR << "words have changed! (code error?)";
196 
197  // Now, "phnx" should be the phone sequence with start and end
198  // symbols included. At this point we break it up into segments,
199  // and try to match it up with words.
200  std::vector<std::vector<int32> > prons;
201  if (!ConvertPhnxToProns(phnx, words,
202  word_start_sym, word_end_sym,
203  &prons)) {
204  KALDI_WARN << "Error converting phones and words to prons "
205  << " (mismatched or non-marked lexicon or partial "
206  << " alignment?)";
207  n_err++;
208  continue;
209  }
210  prons_writer.Write(key, prons);
211  n_done++;
212  }
213  KALDI_LOG << "Done " << n_done << " utterances; " << n_err << " had errors.";
214  } catch(const std::exception &e) {
215  std::cerr << e.what();
216  return -1;
217  }
218 }
int32 words[kMaxOrder]
fst::StdArc::StateId StateId
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool ConvertStringToInteger(const std::string &str, Int *out)
Converts a string into an integer via strtoll and returns false if there was any kind of problem (i...
Definition: text-utils.h:118
bool ConvertPhnxToProns(const std::vector< int32 > &phnx, const std::vector< int32 > &words, int32 word_start_sym, int32 word_end_sym, std::vector< std::vector< int32 > > *prons)
Definition: hmm-utils.cc:1161
int main(int argc, char *argv[])
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
fst::StdArc StdArc
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
bool GetLinearSymbolSequence(const Fst< Arc > &fst, std::vector< I > *isymbols_out, std::vector< I > *osymbols_out, typename Arc::Weight *tot_weight_out)
GetLinearSymbolSequence gets the symbol sequence from a linear FST.
void Write(const std::string &key, const T &value) const
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
void MakeLinearAcceptor(const std::vector< I > &labels, MutableFst< Arc > *ofst)
Creates unweighted linear acceptor from symbol sequence.
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const T & Value(const std::string &key)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
void CreatePhonesAltFst(const std::vector< int32 > &phones, int32 word_start_sym, int32 word_end_sym, fst::VectorFst< fst::StdArc > *ofst)
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
bool HasKey(const std::string &key)
fst::StdArc::Weight Weight
int NumArgs() const
Number of positional parameters (c.f. argc-1).
int32 g_kaldi_verbose_level
This is set by util/parse-options.
Definition: kaldi-error.cc:46
void ReadFstKaldi(std::istream &is, bool binary, VectorFst< Arc > *fst)
#define KALDI_LOG
Definition: kaldi-error.h:153