arpa-file-parser.cc
Go to the documentation of this file.
1 // lm/arpa-file-parser.cc
2 
3 // Copyright 2014 Guoguo Chen
4 // Copyright 2016 Smart Action Company LLC (kkm)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABILITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include <fst/fstlib.h>
22 
23 #include <sstream>
24 
25 #include "base/kaldi-error.h"
26 #include "base/kaldi-math.h"
27 #include "lm/arpa-file-parser.h"
28 #include "util/text-utils.h"
29 
30 namespace kaldi {
31 
33  fst::SymbolTable* symbols)
34  : options_(options), symbols_(symbols),
35  line_number_(0), warning_count_(0) {
36 }
37 
39 }
40 
// Removes any run of trailing spaces, tabs, carriage returns and newlines
// from *str, modifying the string in place. A string consisting solely of
// such characters (or an empty one) ends up empty.
void TrimTrailingWhitespace(std::string *str) {
  const std::string::size_type last_kept = str->find_last_not_of(" \n\r\t");
  if (last_kept == std::string::npos) {
    // Nothing but whitespace (or nothing at all): drop everything.
    str->clear();
    return;
  }
  str->erase(last_kept + 1);
}
44 
// Reads an ARPA-format language model from the stream `is`.
//
// Steps, in order: validate the BOS/EOS/UNK options against the optional
// symbol table, reset the per-read state, parse the \data\ header, then parse
// each \N-grams: section until \end\ is reached. Hooks are invoked along the
// way: ReadStarted() before parsing, HeaderAvailable() once the n-gram counts
// are known, ConsumeNGram() for every n-gram that is not skipped, and
// ReadComplete() at the very end. Fatal problems are reported through
// KALDI_ERR (usually via the local PARSE_ERR macro); recoverable ones through
// KALDI_WARN, rate-limited by ShouldWarn().
//
// NOTE(review): the embedded numbering of this listing skips 48, 54, 56-57,
// 89, 121, 208, 214 and 217 inside this function, so some original statements
// are presumably not shown here -- confirm against the upstream file before
// changing any logic.
45 void ArpaFileParser::Read(std::istream &is) {
46  // Argument sanity checks.
47  if (options_.bos_symbol <= 0 || options_.eos_symbol <= 0 ||
49  KALDI_ERR << "BOS and EOS symbols are required, must not be epsilons, and "
50  << "differ from each other. Given:"
51  << " BOS=" << options_.bos_symbol
52  << " EOS=" << options_.eos_symbol;
53  if (symbols_ != NULL &&
55  (options_.unk_symbol <= 0 ||
58  KALDI_ERR << "When symbol table is given and OOV mode is kReplaceWithUnk, "
59  << "UNK symbol is required, must not be epsilon, and "
60  << "differ from both BOS and EOS symbols. Given:"
61  << " UNK=" << options_.unk_symbol
62  << " BOS=" << options_.bos_symbol
63  << " EOS=" << options_.eos_symbol;
  // With a symbol table, every configured special symbol id must map to a
  // non-empty string in that table.
64  if (symbols_ != NULL && symbols_->Find(options_.bos_symbol).empty())
65  KALDI_ERR << "BOS symbol must exist in symbol table";
66  if (symbols_ != NULL && symbols_->Find(options_.eos_symbol).empty())
67  KALDI_ERR << "EOS symbol must exist in symbol table";
68  if (symbols_ != NULL && options_.unk_symbol > 0 &&
69  symbols_->Find(options_.unk_symbol).empty())
70  KALDI_ERR << "UNK symbol must exist in symbol table";
71 
  // Reset per-read state so the parser object can be reused.
72  ngram_counts_.clear();
73  line_number_ = 0;
74  warning_count_ = 0;
75  current_line_.clear();
76 
  // PARSE_ERR prefixes a KALDI_ERR message with LineReference(); it is
  // #undef'd again at the end of this function.
77 #define PARSE_ERR KALDI_ERR << LineReference() << ": "
78 
79  // Give derived class an opportunity to prepare its state.
80  ReadStarted();
81 
82  // Processes "\data\" section.
83  bool keyword_found = false;
  // The comma operator bumps line_number_ before every read attempt. The loop
  // ends when getline() fails or when the read left the stream at EOF; in the
  // latter case the text just read stays in current_line_ without being
  // processed by the loop body.
84  while (++line_number_, getline(is, current_line_) && !is.eof()) {
  // Blank / whitespace-only lines are ignored.
85  if (current_line_.find_first_not_of(" \t\n\r") == std::string::npos) {
86  continue;
87  }
88 
90 
91  // Continue skipping lines until the \data\ marker alone on a line is found.
92  if (!keyword_found) {
93  if (current_line_ == "\\data\\") {
94  KALDI_LOG << "Reading \\data\\ section.";
95  keyword_found = true;
96  }
97  continue;
98  }
99 
  // Any other backslash-led line ends the header; it is left in
  // current_line_ and checked against "\1-grams:" by the section loop below.
100  if (current_line_[0] == '\\') break;
101 
102  // Enters "\data\" section, and looks for patterns like "ngram 1=1000",
103  // which means there are 1000 unigrams.
104  std::size_t equal_symbol_pos = current_line_.find("=");
105  if (equal_symbol_pos != std::string::npos)
106  // Guaranteed spaces around the "=".
107  current_line_.replace(equal_symbol_pos, 1, " = ");
108  std::vector<std::string> col;
109  SplitStringToVector(current_line_, " \t", true, &col);
110  if (col.size() == 4 && col[0] == "ngram" && col[2] == "=") {
111  int32 order, ngram_count = 0;
112  if (!ConvertStringToInteger(col[1], &order) ||
113  !ConvertStringToInteger(col[3], &ngram_count)) {
114  PARSE_ERR << "cannot parse ngram count";
115  }
  // NOTE(review): `order` is not range-checked here. A header line such as
  // "ngram 0=5" (or a negative order) would make the index `order - 1`
  // below fall outside ngram_counts_ -- consider rejecting order < 1.
116  if (ngram_counts_.size() <= order) {
117  ngram_counts_.resize(order);
118  }
119  ngram_counts_[order - 1] = ngram_count;
120  } else {
122  << ": uninterpretable line in \\data\\ section";
123  }
124  }
125 
126  if (ngram_counts_.size() == 0)
127  PARSE_ERR << "\\data\\ section missing or empty.";
128 
129  // Signal that grammar order and n-gram counts are known.
130  HeaderAvailable();
131 
  // A single NGram object is reused for every line; its word vector is
  // pre-sized for the highest order announced in the header.
132  NGram ngram;
133  ngram.words.reserve(ngram_counts_.size());
134 
135  // Processes "\N-grams:" section.
136  for (int32 cur_order = 1; cur_order <= ngram_counts_.size(); ++cur_order) {
137  // Warns about (but does not skip) orders whose header count is zero.
138  if (ngram_counts_[cur_order - 1] == 0)
139  KALDI_WARN << "Zero ngram count in ngram order " << cur_order
140  << "(look for 'ngram " << cur_order << "=0' in the \\data\\ "
141  << " section). There is possibly a problem with the file.";
142 
143  // Must be looking at a \k-grams: directive at this point.
144  std::ostringstream keyword;
145  keyword << "\\" << cur_order << "-grams:";
146  if (current_line_ != keyword.str()) {
147  PARSE_ERR << "invalid directive, expecting '" << keyword.str() << "'";
148  }
149  KALDI_LOG << "Reading " << current_line_ << " section.";
150 
  // Number of data lines seen in this section (counted before any OOV-based
  // skipping).
151  int32 ngram_count = 0;
152  while (++line_number_, getline(is, current_line_) && !is.eof()) {
153  if (current_line_.find_first_not_of(" \n\t\r") == std::string::npos) {
154  continue;
155  }
  // A leading backslash may be the next section's directive or \end\, in
  // which case this section is finished. Anything else starting with a
  // backslash is warned about and then parsed as a data line below.
156  if (current_line_[0] == '\\') {
157  TrimTrailingWhitespace(&current_line_);
158  std::ostringstream next_keyword;
159  next_keyword << "\\" << cur_order + 1 << "-grams:";
160  if ((current_line_ != next_keyword.str()) &&
161  (current_line_ != "\\end\\")) {
162  if (ShouldWarn()) {
163  KALDI_WARN << "ignoring possible directive '" << current_line_
164  << "' expecting '" << next_keyword.str() << "'";
165 
  // NOTE(review): this summary names --max-arpa-warnings, while the
  // end-of-read summary further down names --max_warnings; confirm
  // which flag name is correct and make the two messages agree.
166  if (warning_count_ > 0 &&
167  warning_count_ > static_cast<uint32>(options_.max_warnings)) {
168  KALDI_WARN << "Of " << warning_count_ << " parse warnings, "
169  << options_.max_warnings << " were reported. "
170  << "Run program with --max-arpa-warnings=-1 "
171  << "to see all warnings";
172  }
173  }
174  } else {
175  break;
176  }
177  }
178 
179  std::vector<std::string> col;
180  SplitStringToVector(current_line_, " \t", true, &col);
181 
  // A data line is: logprob, cur_order words, and optionally a backoff
  // weight. Lines of the highest order must not carry a backoff weight.
182  if (col.size() < 1 + cur_order ||
183  col.size() > 2 + cur_order ||
184  (cur_order == ngram_counts_.size() && col.size() != 1 + cur_order)) {
185  PARSE_ERR << "Invalid n-gram data line";
186  }
187  ++ngram_count;
188 
189  // Parse out n-gram logprob and, if present, backoff weight.
190  if (!ConvertStringToReal(col[0], &ngram.logprob)) {
191  PARSE_ERR << "invalid n-gram logprob '" << col[0] << "'";
192  }
193  ngram.backoff = 0.0;
194  if (col.size() > cur_order + 1) {
195  if (!ConvertStringToReal(col[cur_order + 1], &ngram.backoff))
196  PARSE_ERR << "invalid backoff weight '" << col[cur_order + 1] << "'";
197  }
198  // Convert to natural log.
199  ngram.logprob *= M_LN10;
200  ngram.backoff *= M_LN10;
201 
  // Map each word column (col[1] .. col[cur_order]) to an integer id.
202  ngram.words.resize(cur_order);
203  bool skip_ngram = false;
204  for (int32 index = 0; !skip_ngram && index < cur_order; ++index) {
205  int32 word;
206  if (symbols_) {
207  // Symbol table provided, so symbol labels are expected.
209  word = symbols_->AddSymbol(col[1 + index]);
210  } else {
211  word = symbols_->Find(col[1 + index]);
212  if (word == -1) { // fst::kNoSymbol
213  switch (options_.oov_handling) {
215  word = options_.unk_symbol;
216  break;
218  if (ShouldWarn())
219  KALDI_WARN << LineReference() << " skipped: word '"
220  << col[1 + index] << "' not in symbol table";
221  skip_ngram = true;
222  break;
223  default:
224  PARSE_ERR << "word '" << col[1 + index]
225  << "' not in symbol table";
226  }
227  }
228  }
229  } else {
230  // Symbols not provided, LM file should contain integers.
231  if (!ConvertStringToInteger(col[1 + index], &word) || word < 0) {
232  PARSE_ERR << "invalid symbol '" << col[1 + index] << "'";
233  }
234  }
235  // Whichever way we got it, an epsilon is invalid.
236  if (word == 0) {
237  PARSE_ERR << "epsilon symbol '" << col[1 + index]
238  << "' is illegal in ARPA LM";
239  }
240  ngram.words[index] = word;
241  }
242  if (!skip_ngram) {
243  ConsumeNGram(ngram);
244  }
245  }
  // Only an excess over the header count is treated as an error here; seeing
  // fewer n-grams than announced is not checked.
246  if (ngram_count > ngram_counts_[cur_order - 1]) {
247  PARSE_ERR << "header said there would be " << ngram_counts_[cur_order - 1]
248  << " n-grams of order " << cur_order
249  << ", but we saw more already.";
250  }
251  }
252 
  // After the last section the line left in current_line_ must be \end\.
253  if (current_line_ != "\\end\\") {
254  PARSE_ERR << "invalid or unexpected directive line, expecting \\end\\";
255  }
256 
  // Summarize suppressed warnings once the count exceeded the configured
  // maximum.
257  if (warning_count_ > 0 &&
258  warning_count_ > static_cast<uint32>(options_.max_warnings)) {
259  KALDI_WARN << "Of " << warning_count_ << " parse warnings, "
260  << options_.max_warnings << " were reported. Run program with "
261  << "--max_warnings=-1 to see all warnings";
262  }
263 
264  current_line_.clear();
265  ReadComplete();
266 
267 #undef PARSE_ERR
268 }
269 
270 std::string ArpaFileParser::LineReference() const {
271  std::ostringstream ss;
272  ss << "line " << line_number_ << " [" << current_line_ << "]";
273  return ss.str();
274 }
275 
277  return (warning_count_ != -1) &&
278  (++warning_count_ <= static_cast<uint32>(options_.max_warnings));
279 }
280 
281 } // namespace kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
ArpaParseOptions options_
bool ConvertStringToInteger(const std::string &str, Int *out)
Converts a string into an integer via strtoll and returns false if there was any kind of problem (i...
Definition: text-utils.h:118
ArpaFileParser(const ArpaParseOptions &options, fst::SymbolTable *symbols)
Constructs the parser with the given options and optional symbol table.
virtual void ReadStarted()
Override called before reading starts.
Options that control ArpaFileParser.
virtual void ReadComplete()
Override function called after the last n-gram has been consumed.
int32 unk_symbol
Symbol for <unk>, Required for kReplaceWithUnk.
float logprob
Log-prob of the n-gram.
kaldi::int32 int32
virtual void HeaderAvailable()
Override function called to signal that ARPA header with the expected number of n-grams has been read...
Add novel words to the symbol table.
int32 eos_symbol
Symbol for </s>, Required non-epsilon.
float backoff
log-backoff weight of the n-gram.
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
std::string LineReference() const
Inside ConsumeNGram(), returns a formatted reference to the line being compiled, to print out as part...
std::vector< int32 > ngram_counts_
std::vector< int32 > words
Symbols in left to right order.
#define KALDI_ERR
Definition: kaldi-error.h:147
bool ConvertStringToReal(const std::string &str, T *out)
ConvertStringToReal converts a string into either float or double and returns false if there was any ...
Definition: text-utils.cc:238
#define KALDI_WARN
Definition: kaldi-error.h:150
Skip n-gram with OOV word and continue.
void Read(std::istream &is)
Read ARPA LM file from a stream.
int32 bos_symbol
Symbol for <s>, Required non-epsilon.
#define PARSE_ERR
#define M_LN10
Definition: kaldi-math.h:68
bool ShouldWarn()
Increments warning count, and returns true if a warning should be printed or false if the count has e...
int32 max_warnings
Maximum warnings to report, <0 unlimited.
fst::SymbolTable * symbols_
OovHandling oov_handling
How to handle OOV words in the file.
void TrimTrailingWhitespace(std::string *str)
#define KALDI_LOG
Definition: kaldi-error.h:153
virtual void ConsumeNGram(const NGram &)=0
Pure override that must be implemented to process current n-gram.
Replace OOV words with <unk>.
A parsed n-gram from ARPA LM file.