21 #include <fst/fstlib.h>    33                                fst::SymbolTable* symbols)
    34     : options_(options), symbols_(symbols),
    35       line_number_(0), warning_count_(0) {
    42   str->erase(str->find_last_not_of(
" \n\r\t") + 1);
    49     KALDI_ERR << 
"BOS and EOS symbols are required, must not be epsilons, and "    50               << 
"differ from each other. Given:"    58     KALDI_ERR << 
"When symbol table is given and OOV mode is kReplaceWithUnk, "    59               << 
"UNK symbol is required, must not be epsilon, and "    60               << 
"differ from both BOS and EOS symbols. Given:"    65     KALDI_ERR << 
"BOS symbol must exist in symbol table";
    67     KALDI_ERR << 
"EOS symbol must exist in symbol table";
    70     KALDI_ERR << 
"UNK symbol must exist in symbol table";
    77 #define PARSE_ERR KALDI_ERR << LineReference() << ": "    83   bool keyword_found = 
false;
    85     if (
current_line_.find_first_not_of(
" \t\n\r") == std::string::npos) {
   105     if (equal_symbol_pos != std::string::npos)
   108     std::vector<std::string> col;
   110     if (col.size() == 4 && col[0] == 
"ngram" && col[2] == 
"=") {
   111       int32 order, ngram_count = 0;
   122                  << 
": uninterpretable line in \\data\\ section";
   127     PARSE_ERR << 
"\\data\\ section missing or empty.";
   139       KALDI_WARN << 
"Zero ngram count in ngram order " << cur_order
   140                  << 
"(look for 'ngram " << cur_order << 
"=0' in the \\data\\ "   141                  << 
" section). There is possibly a problem with the file.";
   144     std::ostringstream keyword;
   145     keyword << 
"\\" << cur_order << 
"-grams:";
   147       PARSE_ERR << 
"invalid directive, expecting '" << keyword.str() << 
"'";
   151     int32 ngram_count = 0;
   152     while (++
line_number_, getline(is, current_line_) && !is.eof()) {
   153       if (current_line_.find_first_not_of(
" \n\t\r") == std::string::npos) {
   156       if (current_line_[0] == 
'\\') {
   158         std::ostringstream next_keyword;
   159         next_keyword << 
"\\" << cur_order + 1 << 
"-grams:";
   160         if ((current_line_ != next_keyword.str()) &&
   161             (current_line_ != 
"\\end\\")) {
   163             KALDI_WARN << 
"ignoring possible directive '" << current_line_
   164                        << 
"' expecting '" << next_keyword.str() << 
"'";
   170                          << 
"Run program with --max-arpa-warnings=-1 "   171                          << 
"to see all warnings";
   179       std::vector<std::string> col;
   182       if (col.size() < 1 + cur_order ||
   183           col.size() > 2 + cur_order ||
   184           (cur_order == 
ngram_counts_.size() && col.size() != 1 + cur_order)) {
   191         PARSE_ERR << 
"invalid n-gram logprob '" << col[0] << 
"'";
   194       if (col.size() > cur_order + 1) {
   196           PARSE_ERR << 
"invalid backoff weight '" << col[cur_order + 1] << 
"'";
   202       ngram.
words.resize(cur_order);
   203       bool skip_ngram = 
false;
   204       for (
int32 index = 0; !skip_ngram && index < cur_order; ++index) {
   209             word = 
symbols_->AddSymbol(col[1 + index]);
   211             word = 
symbols_->Find(col[1 + index]);
   220                                << col[1 + index] << 
"' not in symbol table";
   225                             << 
"' not in symbol table";
   232             PARSE_ERR << 
"invalid symbol '" << col[1 + index] << 
"'";
   237           PARSE_ERR << 
"epsilon symbol '" << col[1 + index]
   238                     << 
"' is illegal in ARPA LM";
   240         ngram.
words[index] = word;
   248                 << 
" n-grams of order " << cur_order
   249                 << 
", but we saw more already.";
   254     PARSE_ERR << 
"invalid or unexpected directive line, expecting \\end\\";
   261                << 
"--max_warnings=-1 to see all warnings";
   271   std::ostringstream ss;
 This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
 
ArpaParseOptions options_
 
bool ConvertStringToInteger(const std::string &str, Int *out)
Converts a string into an integer via strtoll and returns false if there was any kind of problem (i...
 
ArpaFileParser(const ArpaParseOptions &options, fst::SymbolTable *symbols)
Constructs the parser with the given options and optional symbol table. 
 
virtual ~ArpaFileParser()
 
virtual void ReadStarted()
Override called before reading starts. 
 
Options that control ArpaFileParser. 
 
virtual void ReadComplete()
Override function called after the last n-gram has been consumed. 
 
int32 unk_symbol
Symbol for <unk>; required when the OOV mode is kReplaceWithUnk.
 
float logprob
Log-prob of the n-gram. 
 
virtual void HeaderAvailable()
Override function called to signal that ARPA header with the expected number of n-grams has been read...
 
Add novel words to the symbol table. 
 
int32 eos_symbol
Symbol for </s>; required, and must not be epsilon.
 
float backoff
Log-backoff weight of the n-gram.
 
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters. 
 
std::string LineReference() const
Inside ConsumeNGram(), returns a formatted reference to the line being compiled, to print out as part...
 
std::vector< int32 > ngram_counts_
 
std::vector< int32 > words
Symbols in left to right order. 
 
bool ConvertStringToReal(const std::string &str, T *out)
ConvertStringToReal converts a string into either float or double and returns false if there was any ...
 
std::string current_line_
 
Skip n-gram with OOV word and continue. 
 
void Read(std::istream &is)
Read ARPA LM file from a stream. 
 
int32 bos_symbol
Symbol for <s>; required, and must not be epsilon.
 
bool ShouldWarn()
Increments warning count, and returns true if a warning should be printed or false if the count has e...
 
int32 max_warnings
Maximum number of warnings to report; a negative value means unlimited.
 
fst::SymbolTable * symbols_
 
OovHandling oov_handling
How to handle OOV words in the file. 
 
void TrimTrailingWhitespace(std::string *str)
 
virtual void ConsumeNGram(const NGram &)=0
Pure virtual function that must be overridden to process the current n-gram.
 
Replace OOV words with <unk>. 
 
A parsed n-gram from ARPA LM file.