21 #include <fst/fstlib.h> 33 fst::SymbolTable* symbols)
34 : options_(options), symbols_(symbols),
35 line_number_(0), warning_count_(0) {
42 str->erase(str->find_last_not_of(
" \n\r\t") + 1);
49 KALDI_ERR <<
"BOS and EOS symbols are required, must not be epsilons, and " 50 <<
"differ from each other. Given:" 58 KALDI_ERR <<
"When symbol table is given and OOV mode is kReplaceWithUnk, " 59 <<
"UNK symbol is required, must not be epsilon, and " 60 <<
"differ from both BOS and EOS symbols. Given:" 65 KALDI_ERR <<
"BOS symbol must exist in symbol table";
67 KALDI_ERR <<
"EOS symbol must exist in symbol table";
70 KALDI_ERR <<
"UNK symbol must exist in symbol table";
77 #define PARSE_ERR KALDI_ERR << LineReference() << ": " 83 bool keyword_found =
false;
85 if (
current_line_.find_first_not_of(
" \t\n\r") == std::string::npos) {
105 if (equal_symbol_pos != std::string::npos)
108 std::vector<std::string> col;
110 if (col.size() == 4 && col[0] ==
"ngram" && col[2] ==
"=") {
111 int32 order, ngram_count = 0;
122 <<
": uninterpretable line in \\data\\ section";
127 PARSE_ERR <<
"\\data\\ section missing or empty.";
139 KALDI_WARN <<
"Zero ngram count in ngram order " << cur_order
140 <<
"(look for 'ngram " << cur_order <<
"=0' in the \\data\\ " 141 <<
" section). There is possibly a problem with the file.";
144 std::ostringstream keyword;
145 keyword <<
"\\" << cur_order <<
"-grams:";
147 PARSE_ERR <<
"invalid directive, expecting '" << keyword.str() <<
"'";
151 int32 ngram_count = 0;
152 while (++
line_number_, getline(is, current_line_) && !is.eof()) {
153 if (current_line_.find_first_not_of(
" \n\t\r") == std::string::npos) {
156 if (current_line_[0] ==
'\\') {
158 std::ostringstream next_keyword;
159 next_keyword <<
"\\" << cur_order + 1 <<
"-grams:";
160 if ((current_line_ != next_keyword.str()) &&
161 (current_line_ !=
"\\end\\")) {
163 KALDI_WARN <<
"ignoring possible directive '" << current_line_
164 <<
"' expecting '" << next_keyword.str() <<
"'";
170 <<
"Run program with --max-arpa-warnings=-1 " 171 <<
"to see all warnings";
179 std::vector<std::string> col;
182 if (col.size() < 1 + cur_order ||
183 col.size() > 2 + cur_order ||
184 (cur_order ==
ngram_counts_.size() && col.size() != 1 + cur_order)) {
191 PARSE_ERR <<
"invalid n-gram logprob '" << col[0] <<
"'";
194 if (col.size() > cur_order + 1) {
196 PARSE_ERR <<
"invalid backoff weight '" << col[cur_order + 1] <<
"'";
202 ngram.
words.resize(cur_order);
203 bool skip_ngram =
false;
204 for (
int32 index = 0; !skip_ngram && index < cur_order; ++index) {
209 word =
symbols_->AddSymbol(col[1 + index]);
211 word =
symbols_->Find(col[1 + index]);
220 << col[1 + index] <<
"' not in symbol table";
225 <<
"' not in symbol table";
232 PARSE_ERR <<
"invalid symbol '" << col[1 + index] <<
"'";
237 PARSE_ERR <<
"epsilon symbol '" << col[1 + index]
238 <<
"' is illegal in ARPA LM";
240 ngram.
words[index] = word;
248 <<
" n-grams of order " << cur_order
249 <<
", but we saw more already.";
254 PARSE_ERR <<
"invalid or unexpected directive line, expecting \\end\\";
261 <<
"--max_warnings=-1 to see all warnings";
271 std::ostringstream ss;
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
ArpaParseOptions options_
bool ConvertStringToInteger(const std::string &str, Int *out)
Converts a string into an integer via strtoll and returns false if there was any kind of problem (i...
ArpaFileParser(const ArpaParseOptions &options, fst::SymbolTable *symbols)
Constructs the parser with the given options and optional symbol table.
virtual ~ArpaFileParser()
virtual void ReadStarted()
Override called before reading starts.
Options that control ArpaFileParser.
virtual void ReadComplete()
Override function called after the last n-gram has been consumed.
int32 unk_symbol
Symbol for <unk>; required for kReplaceWithUnk.
float logprob
Log-prob of the n-gram.
virtual void HeaderAvailable()
Override function called to signal that ARPA header with the expected number of n-grams has been read...
Add novel words to the symbol table.
int32 eos_symbol
Symbol for </s>; required, must be non-epsilon.
float backoff
Log-backoff weight of the n-gram.
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
std::string LineReference() const
Inside ConsumeNGram(), returns a formatted reference to the line being compiled, to print out as part...
std::vector< int32 > ngram_counts_
std::vector< int32 > words
Symbols in left to right order.
bool ConvertStringToReal(const std::string &str, T *out)
ConvertStringToReal converts a string into either float or double and returns false if there was any ...
std::string current_line_
Skip n-gram with OOV word and continue.
void Read(std::istream &is)
Read ARPA LM file from a stream.
int32 bos_symbol
Symbol for <s>; required, must be non-epsilon.
bool ShouldWarn()
Increments warning count, and returns true if a warning should be printed or false if the count has e...
int32 max_warnings
Maximum number of warnings to report; a negative value means unlimited.
fst::SymbolTable * symbols_
OovHandling oov_handling
How to handle OOV words in the file.
void TrimTrailingWhitespace(std::string *str)
virtual void ConsumeNGram(const NGram &)=0
Pure virtual function that must be overridden to process the current n-gram.
Replace OOV words with <unk>.
A parsed n-gram from ARPA LM file.