arpa-file-parser.h
Go to the documentation of this file.
1 // lm/arpa-file-parser.h
2 
3 // Copyright 2014 Guoguo Chen
4 // Copyright 2016 Smart Action Company LLC (kkm)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABILITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #ifndef KALDI_LM_ARPA_FILE_PARSER_H_
22 #define KALDI_LM_ARPA_FILE_PARSER_H_
23 
24 #include <fst/fst-decl.h>
25 
26 #include <string>
27 #include <vector>
28 
29 #include "base/kaldi-types.h"
30 #include "itf/options-itf.h"
31 
32 namespace kaldi {
33 
38  enum OovHandling {
43  };
44 
46  bos_symbol(-1), eos_symbol(-1), unk_symbol(-1),
48 
49  void Register(OptionsItf *opts) {
50  // Registering only the max_warnings count, since other options are
51  // treated differently by client programs: some want integer symbols,
52  // while other are passed words in their command line.
53  opts->Register("max-arpa-warnings", &max_warnings,
54  "Maximum warnings to report on ARPA parsing, "
55  "0 to disable, -1 to show all");
56  }
57 
63 };
64 
68 struct NGram {
69  NGram() : logprob(0.0), backoff(0.0) { }
70  std::vector<int32> words;
71  float logprob;
72  float backoff;
73 };
75 
82  public:
92  ArpaFileParser(const ArpaParseOptions& options, fst::SymbolTable* symbols);
93  virtual ~ArpaFileParser();
94 
96  void Read(std::istream &is);
97 
/// Parser options. Returns a read-only reference to the options_ member.
99  const ArpaParseOptions& Options() const { return options_; }
100 
101  protected:
/// Override called before reading starts. The default implementation does
/// nothing.
104  virtual void ReadStarted() { }
105 
/// Override function called to signal that the ARPA header with the expected
/// number of n-grams has been read. The default implementation does nothing.
108  virtual void HeaderAvailable() { }
109 
113  virtual void ConsumeNGram(const NGram&) = 0;
114 
/// Override function called after the last n-gram has been consumed. The
/// default implementation does nothing.
116  virtual void ReadComplete() { }
117 
/// Read-only access to symbol table. Not owned, do not make public.
119  const fst::SymbolTable* Symbols() const { return symbols_; }
120 
/// Inside ConsumeNGram(), provides the current line number.
122  int32 LineNumber() const { return line_number_; }
123 
126  std::string LineReference() const;
127 
130  bool ShouldWarn();
131 
/// N-gram counts. Valid from the point when HeaderAvailable() is called.
133  const std::vector<int32>& NgramCounts() const { return ngram_counts_; }
134 
135  private:
137  fst::SymbolTable* symbols_; // the pointer is not owned here.
140  std::string current_line_;
141  std::vector<int32> ngram_counts_;
142 };
143 
144 } // namespace kaldi
145 
146 #endif // KALDI_LM_ARPA_FILE_PARSER_H_
ArpaFileParser is an abstract base class for ARPA LM file conversion.
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
ArpaParseOptions options_
const fst::SymbolTable * Symbols() const
Read-only access to symbol table. Not owned, do not make public.
virtual void ReadStarted()
Override called before reading starts.
Options that control ArpaFileParser.
virtual void ReadComplete()
Override function called after the last n-gram has been consumed.
const ArpaParseOptions & Options() const
Parser options.
int32 unk_symbol
Symbol for <unk>, Required for kReplaceWithUnk.
float logprob
Log-prob of the n-gram.
void Register(OptionsItf *opts)
float logprob
kaldi::int32 int32
int32 LineNumber() const
Inside ConsumeNGram(), provides the current line number.
virtual void HeaderAvailable()
Override function called to signal that ARPA header with the expected number of n-grams has been read...
Add novel words to the symbol table.
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
int32 eos_symbol
Symbol for </s>, Required non-epsilon.
float backoff
Log-backoff weight of the n-gram.
float backoff
std::vector< int32 > ngram_counts_
std::vector< int32 > words
Symbols in left to right order.
Skip n-gram with OOV word and continue.
int32 bos_symbol
Symbol for <s>, Required non-epsilon.
int32 max_warnings
Maximum warnings to report, <0 unlimited.
fst::SymbolTable * symbols_
const std::vector< int32 > & NgramCounts() const
N-gram counts. Valid from the point when HeaderAvailable() is called.
OovHandling oov_handling
How to handle OOV words in the file.
Replace OOV words with <unk>.
A parsed n-gram from ARPA LM file.