class GeneralHistKey {
 public:
  // Construct the key from a range of symbols.
  template <class InputIt>
  GeneralHistKey(InputIt begin, InputIt end) : vector_(begin, end) { }
  // Construct an empty history key.
  GeneralHistKey() : vector_() { }
  // Tails of the key: the history with its leftmost symbol dropped.
  GeneralHistKey Tails() const {
    return GeneralHistKey(vector_.begin() + 1, vector_.end());
  }
  // Keys are equal iff they represent the same history.
  friend bool operator==(const GeneralHistKey& a, const GeneralHistKey& b) {
    return a.vector_ == b.vector_;
  }
  // Hash functor over keys.
  struct HashType : public std::unary_function<GeneralHistKey, size_t> {
    size_t operator()(const GeneralHistKey& key) const {
      return VectorHasher<Symbol>().operator()(key.vector_);
    }
  };

 private:
  std::vector<Symbol> vector_;
};
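// Illustrative usage sketch, not part of the original file: how an n-gram
// "A B C" maps onto GeneralHistKey values. The symbol ids below are made up,
// and Symbol is assumed to be the integral id type used by the class above.
static void GeneralHistKeyUsageSketch() {
  std::vector<Symbol> words = {101, 102, 103};             // ids for "A B C"
  GeneralHistKey heads(words.begin(), words.end() - 1);    // history "A B"
  GeneralHistKey full(words.begin(), words.end());         // history "A B C"
  // Backing off from "A B C" leads to the "B C" history.
  KALDI_ASSERT(full.Tails() == GeneralHistKey(words.begin() + 1, words.end()));
  KALDI_ASSERT(!(heads == full));
}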
class OptimizedHistKey {
 public:
  enum {
    kShift = 21,  // 21 bits per symbol, 3 symbols per 64-bit word.
    kMaxData = (1 << kShift) - 1
  };
  template <class InputIt>
  OptimizedHistKey(InputIt begin, InputIt end) : data_(0) {
    for (uint32 shift = 0; begin != end; ++begin, shift += kShift) {
      data_ |= static_cast<uint64>(*begin) << shift;
    }
  }
  OptimizedHistKey() : data_(0) { }
  OptimizedHistKey Tails() const {
    return OptimizedHistKey(data_ >> kShift);
  }
  friend bool operator==(const OptimizedHistKey& a,
                         const OptimizedHistKey& b) {
    return a.data_ == b.data_;
  }
  struct HashType : public std::unary_function<OptimizedHistKey, size_t> {
    size_t operator()(const OptimizedHistKey& key) const { return key.data_; }
  };

 private:
  explicit OptimizedHistKey(uint64 data) : data_(data) { }
  uint64 data_;
};
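// Illustrative sketch, not part of the original file: OptimizedHistKey packs
// up to three 21-bit symbol ids into a single 64-bit word, so taking the
// Tails() of a history is just a right shift by kShift. The ids are made up.
static void OptimizedHistKeyPackingSketch() {
  std::vector<Symbol> words = {5, 6};  // two-symbol history, data_ = 5 | 6<<21
  OptimizedHistKey key(words.begin(), words.end());
  std::vector<Symbol> tail = {6};      // dropping the leftmost symbol leaves 6
  KALDI_ASSERT(key.Tails() == OptimizedHistKey(tail.begin(), tail.end()));
}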
template <class HistKey>
class ArpaLmCompilerImpl : public ArpaLmCompilerImplInterface {
 public:
  ArpaLmCompilerImpl(ArpaLmCompiler* parent, fst::StdVectorFst* fst,
                     Symbol sub_eps);
  virtual void ConsumeNGram(const NGram& ngram, bool is_highest);

 private:
  StateId AddStateWithBackoff(HistKey key, float backoff);
  void CreateBackoff(HistKey key, StateId state, float weight);
  // ...
  typedef unordered_map<HistKey, StateId,
                        typename HistKey::HashType> HistoryMap;
  HistoryMap history_;
};

template <class HistKey>
ArpaLmCompilerImpl<HistKey>::ArpaLmCompilerImpl(
    ArpaLmCompiler* parent, fst::StdVectorFst* fst, Symbol sub_eps)
    : parent_(parent), fst_(fst), bos_symbol_(parent->Options().bos_symbol),
      eos_symbol_(parent->Options().eos_symbol), sub_eps_(sub_eps) {
  // ...
}
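// Illustrative sketch, not part of the original file: the compiler keeps one
// FST state per distinct history, looked up through the HistoryMap declared
// above. Registering the empty history up front (as the elided constructor
// body is expected to do for the 0-gram state) guarantees that any walk down
// Tails() histories eventually hits a registered state.
template <class HistKey>
static StateId RegisterEmptyHistorySketch(
    fst::StdVectorFst* fst,
    unordered_map<HistKey, StateId, typename HistKey::HashType>* history) {
  StateId zerogram = fst->AddState();  // state for the empty (0-gram) history
  (*history)[HistKey()] = zerogram;    // every unigram backs off here
  return zerogram;
}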
template <class HistKey>
void ArpaLmCompilerImpl<HistKey>::ConsumeNGram(const NGram& ngram,
                                               bool is_highest) {
  // ...
  HistKey heads(ngram.words.begin(), ngram.words.end() - 1);
  typename HistoryMap::iterator source_it = history_.find(heads);
  if (source_it == history_.end()) {
    // No parent (n-1)-gram state exists: warn and discard this n-gram.
    if (parent_->ShouldWarn())
      KALDI_WARN << parent_->LineReference()
                 << " skipped: no parent (n-1)-gram exists";
    return;
  }

  StateId source = source_it->second;
  StateId dest;
  Symbol sym = ngram.words.back();
  float weight = -ngram.logprob;
  if (sym == sub_eps_ || sym == 0) {
    KALDI_ERR << "<eps> or disambiguation symbol " << sym
              << " found in the ARPA file.";
  }
  if (sym == eos_symbol_) {
    // ...
    fst_->SetFinal(source, weight);
    // ...
  } else {
    dest = AddStateWithBackoff(
        HistKey(ngram.words.begin() + (is_highest ? 1 : 0),
                ngram.words.end()),
        -ngram.backoff);
  }
  if (sym == bos_symbol_) {
    // ...
    source = fst_->AddState();
    fst_->SetStart(source);
    // ...
    fst_->SetStart(dest);
    // ...
  }
  // ...
}
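// Illustrative sketch, not part of the original file: for a regular (not
// highest-order, not sentence-final) trigram "A B C" with log-probability p,
// the net effect of ConsumeNGram is a single acceptor arc from the "A B"
// state to the "A B C" state. The tropical-weight convention assumed here is
// weight = -p, matching `weight = -ngram.logprob` above.
static void AddNGramArcSketch(fst::StdVectorFst* fst,
                              fst::StdArc::StateId source,  // state of "A B"
                              fst::StdArc::StateId dest,    // state of "A B C"
                              Symbol c, float logprob) {
  fst->AddArc(source, fst::StdArc(c, c, -logprob, dest));
}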
template <class HistKey>
StateId ArpaLmCompilerImpl<HistKey>::AddStateWithBackoff(HistKey key,
                                                         float backoff) {
  typename HistoryMap::iterator dest_it = history_.find(key);
  if (dest_it != history_.end()) {
    // The state is already known; its backoff arc is already in the FST.
    return dest_it->second;
  }
  // ...
}
template <class HistKey>
void ArpaLmCompilerImpl<HistKey>::CreateBackoff(
    HistKey key, StateId state, float weight) {
  typename HistoryMap::iterator dest_it = history_.find(key);
  // ...
}
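// Illustrative sketch, not part of the original file: when the exact backoff
// history is not yet in the map, the lookup is assumed to keep taking Tails()
// until a registered history is found; since the empty history is always
// registered, the walk terminates at the 0-gram state in the worst case.
template <class HistKey>
static StateId FindBackoffStateSketch(
    const unordered_map<HistKey, StateId, typename HistKey::HashType>& history,
    HistKey key) {
  typename unordered_map<HistKey, StateId,
                         typename HistKey::HashType>::const_iterator it =
      history.find(key);
  while (it == history.end()) {
    key = key.Tails();          // fall back to the next lower-order history
    it = history.find(key);
  }
  return it->second;
}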
void ArpaLmCompiler::HeaderAvailable() {
  // ...
  int64 max_symbol = 0;
  if (Symbols() != NULL)
    max_symbol = Symbols()->AvailableKey() - 1;
  // ...
  max_symbol += NgramCounts()[0];

  if (NgramCounts().size() <= 4 && max_symbol < OptimizedHistKey::kMaxData) {
    impl_ = new ArpaLmCompilerImpl<OptimizedHistKey>(this, &fst_, sub_eps_);
  } else {
    impl_ = new ArpaLmCompilerImpl<GeneralHistKey>(this, &fst_, sub_eps_);
    KALDI_LOG << "Reverting to slower state tracking because model is large: "
              << NgramCounts().size() << "-gram with symbols up to "
              << max_symbol;
  }
}

void ArpaLmCompiler::ConsumeNGram(const NGram& ngram) {
  // <s> is only valid as the first word, </s> only as the last.
  for (int i = 0; i < ngram.words.size(); ++i) {
    if ((i > 0 && ngram.words[i] == Options().bos_symbol) ||
        (i + 1 < ngram.words.size()
         && ngram.words[i] == Options().eos_symbol)) {
      if (ShouldWarn())
        KALDI_WARN << LineReference()
                   << " skipped: n-gram has invalid BOS/EOS placement";
      return;
    }
  }

  bool is_highest = ngram.words.size() == NgramCounts().size();
  impl_->ConsumeNGram(ngram, is_highest);
}
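// Illustrative sketch, not part of the original file: the optimized key packs
// floor(64 / 21) = 3 symbols, which covers the history of a 4-gram model, and
// each symbol id must stay strictly below kMaxData = 2^21 - 1 = 2097151. This
// mirrors the selection condition in HeaderAvailable() above.
static bool CanUseOptimizedKeySketch(size_t ngram_order, int64 max_symbol) {
  return ngram_order <= 4 && max_symbol < OptimizedHistKey::kMaxData;
}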
void ArpaLmCompiler::RemoveRedundantStates() {
  fst::StdArc::Label backoff_symbol = sub_eps_;
  if (backoff_symbol == 0) {
    // ...
    return;
  }
  fst::StdArc::StateId num_states = fst_.NumStates();
  // Relabel the lone backoff arc out of each redundant state to <eps>.
  for (fst::StdArc::StateId state = 0; state < num_states; state++) {
    if (fst_.NumArcs(state) == 1 &&
        fst_.Final(state) == fst::TropicalWeight::Zero()) {
      fst::MutableArcIterator<fst::StdVectorFst> iter(&fst_, state);
      fst::StdArc arc = iter.Value();
      if (arc.ilabel == backoff_symbol) {
        arc.ilabel = 0;
        iter.SetValue(arc);
      }
    }
  }
  fst::RemoveEpsLocal(&fst_);
  KALDI_LOG << "Reduced num-states from " << num_states << " to "
            << fst_.NumStates();
}

// ... (sanity check that the compiled FST received a start state)
  if (fst_.Start() == fst::kNoStateId) {
    KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol "
              << Symbols()->Find(Options().bos_symbol) << ".";
  }
// ...

void ArpaLmCompiler::ReadComplete() {
  fst_.SetInputSymbols(Symbols());
  fst_.SetOutputSymbols(Symbols());
  RemoveRedundantStates();
}
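// Illustrative sketch, not part of the original file: a state is "redundant"
// in the sense used by RemoveRedundantStates() if it is non-final and its only
// outgoing arc is the backoff arc (input label equal to the backoff symbol,
// e.g. #0). Relabeling that input to <eps> lets fst::RemoveEpsLocal contract
// the state away.
static bool IsRedundantStateSketch(const fst::StdVectorFst& fst,
                                   fst::StdArc::StateId state,
                                   fst::StdArc::Label backoff_symbol) {
  if (fst.NumArcs(state) != 1 ||
      fst.Final(state) != fst::TropicalWeight::Zero())
    return false;
  fst::ArcIterator<fst::StdVectorFst> iter(fst, state);
  return iter.Value().ilabel == backoff_symbol;
}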