ConstArpaLmBuilder Class Reference
Inheritance diagram for ConstArpaLmBuilder:
Collaboration diagram for ConstArpaLmBuilder:

Classes

struct  WordsAndLmStatePairLessThan
 

Public Member Functions

 ConstArpaLmBuilder (ArpaParseOptions options)
 
 ~ConstArpaLmBuilder ()
 
void Write (std::ostream &os, bool binary) const
 
void SetMaxAddressOffset (const int32 max_address_offset)
 
- Public Member Functions inherited from ArpaFileParser
 ArpaFileParser (const ArpaParseOptions &options, fst::SymbolTable *symbols)
 Constructs the parser with the given options and optional symbol table. More...
 
virtual ~ArpaFileParser ()
 
void Read (std::istream &is)
 Read ARPA LM file from a stream. More...
 
const ArpaParseOptions & Options () const
 Parser options. More...
 

Protected Member Functions

virtual void HeaderAvailable ()
 Override function called to signal that ARPA header with the expected number of n-grams has been read, and ngram_counts() is now valid. More...
 
virtual void ConsumeNGram (const NGram &ngram)
 Pure override that must be implemented to process current n-gram. More...
 
virtual void ReadComplete ()
 Override function called after the last n-gram has been consumed. More...
 
- Protected Member Functions inherited from ArpaFileParser
virtual void ReadStarted ()
 Override called before reading starts. More...
 
const fst::SymbolTable * Symbols () const
 Read-only access to symbol table. Not owned, do not make public. More...
 
int32 LineNumber () const
 Inside ConsumeNGram(), provides the current line number. More...
 
std::string LineReference () const
 Inside ConsumeNGram(), returns a formatted reference to the line being compiled, to print out as part of diagnostics. More...
 
bool ShouldWarn ()
 Increments warning count, and returns true if a warning should be printed or false if the count has exceeded the set maximum. More...
 
const std::vector< int32 > & NgramCounts () const
 N-gram counts. Valid from the point when HeaderAvailable() is called. More...
 

Private Attributes

bool is_built_
 
int32 max_address_offset_
 
int32 ngram_order_
 
int32 num_words_
 
int32 overflow_buffer_size_
 
int64 lm_states_size_
 
int32 * lm_states_
 
int32 ** unigram_states_
 
int32 ** overflow_buffer_
 
unordered_map< std::vector< int32 >, LmState *, VectorHasher< int32 > > seq_to_state_
 

Detailed Description

Definition at line 177 of file const-arpa-lm.cc.

Constructor & Destructor Documentation

◆ ConstArpaLmBuilder()

ConstArpaLmBuilder ( ArpaParseOptions  options)
inline explicit

Definition at line 179 of file const-arpa-lm.cc.

180  : ArpaFileParser(options, NULL) {
181  ngram_order_ = 0;
182  num_words_ = 0;
183  overflow_buffer_size_ = 0;
184  lm_states_size_ = 0;
185  max_address_offset_ = pow(2, 30) - 1;
186  is_built_ = false;
187  lm_states_ = NULL;
188  unigram_states_ = NULL;
189  overflow_buffer_ = NULL;
190  }
ArpaFileParser(const ArpaParseOptions &options, fst::SymbolTable *symbols)
Constructs the parser with the given options and optional symbol table.

◆ ~ConstArpaLmBuilder()

~ConstArpaLmBuilder ( )
inline

Definition at line 192 of file const-arpa-lm.cc.

192  {
193  unordered_map<std::vector<int32>,
194  LmState*, VectorHasher<int32> >::iterator iter;
195  for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
196  delete iter->second;
197  }
198  if (is_built_) {
199  delete[] lm_states_;
200  delete[] unigram_states_;
201  delete[] overflow_buffer_;
202  }
203  }
unordered_map< std::vector< int32 >, LmState *, VectorHasher< int32 > > seq_to_state_

Member Function Documentation

◆ ConsumeNGram()

void ConsumeNGram ( const NGram &  ngram)
protected virtual

Pure override that must be implemented to process current n-gram.

The n-grams are sent in the file order, which guarantees that all (k-1)-grams are processed before the first k-gram is.

Implements ArpaFileParser.

Definition at line 271 of file const-arpa-lm.cc.

References NGram::backoff, rnnlm::i, KALDI_ASSERT, KALDI_ERR, NGram::logprob, and NGram::words.

271  {
272  int32 cur_order = ngram.words.size();
273  // If <ngram_order_> is larger than 1, then we do not create LmState for
274  // the final order entry. We only keep the log probability for it.
275  LmState *lm_state = NULL;
276  if (cur_order != ngram_order_ || ngram_order_ == 1) {
277  lm_state = new LmState(cur_order == 1,
278  cur_order == ngram_order_ - 1,
279  ngram.logprob, ngram.backoff);
280 
281  if (seq_to_state_.find(ngram.words) != seq_to_state_.end()) {
282  std::ostringstream os;
283  os << "[ ";
284  for (size_t i = 0; i < ngram.words.size(); i++) {
285  os << ngram.words[i] << " ";
286  }
287  os <<"]";
288 
289  KALDI_ERR << "N-gram " << os.str() << " appears twice in the arpa file";
290  }
291  seq_to_state_[ngram.words] = lm_state;
292  }
293 
294  // If n-gram order is larger than 1, we have to add possible child to
295  // existing LmStates. We have the following two assumptions:
296  // 1. N-grams are processed from small order to larger ones, i.e., from
297  // 1, 2, ... to the highest order.
298  // 2. If a n-gram exists in the Arpa format language model, then the
299  // "history" n-gram also exists. For example, if "A B C" is a valid
300  // n-gram, then "A B" is also a valid n-gram.
301  int32 last_word = ngram.words[cur_order - 1];
302  if (cur_order > 1) {
303  std::vector<int32> hist(ngram.words.begin(), ngram.words.end() - 1);
304  unordered_map<std::vector<int32>,
305  LmState*, VectorHasher<int32> >::iterator hist_iter;
306  hist_iter = seq_to_state_.find(hist);
307  if (hist_iter == seq_to_state_.end()) {
308  std::ostringstream ss;
309  for (int i = 0; i < cur_order; ++i)
310  ss << (i == 0 ? '[' : ' ') << ngram.words[i];
311  KALDI_ERR << "In line " << LineNumber() << ": "
312  << cur_order << "-gram " << ss.str() << "] does not have "
313  << "a parent model " << cur_order << "-gram.";
314  }
315  if (cur_order != ngram_order_ || ngram_order_ == 1) {
316  KALDI_ASSERT(lm_state != NULL);
317  KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder());
318  hist_iter->second->AddChild(last_word, lm_state);
319  } else {
320  KALDI_ASSERT(lm_state == NULL);
321  KALDI_ASSERT(hist_iter->second->IsChildFinalOrder());
322  hist_iter->second->AddChild(last_word, ngram.logprob);
323  }
324  } else {
325  // Figures out <max_word_id>.
326  num_words_ = std::max(num_words_, last_word + 1);
327  }
328 }
unordered_map< std::vector< int32 >, LmState *, VectorHasher< int32 > > seq_to_state_
kaldi::int32 int32
int32 LineNumber() const
Inside ConsumeNGram(), provides the current line number.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ HeaderAvailable()

void HeaderAvailable ( )
protected virtual

Override function called to signal that ARPA header with the expected number of n-grams has been read, and ngram_counts() is now valid.

Reimplemented from ArpaFileParser.

Definition at line 267 of file const-arpa-lm.cc.

267  {
268  ngram_order_ = NgramCounts().size();
269 }
const std::vector< int32 > & NgramCounts() const
N-gram counts. Valid from the point when HeaderAvailable() is called.

◆ ReadComplete()

void ReadComplete ( )
protected virtual

Override function called after the last n-gram has been consumed.

Reimplemented from ArpaFileParser.

Definition at line 356 of file const-arpa-lm.cc.

References Int32AndFloat::f, rnnlm::i, Int32AndFloat::i, rnnlm::j, KALDI_ASSERT, and KALDI_ERR.

356  {
357  // STEP 1: sorting LmStates lexicographically.
358  // Vector for holding the sorted LmStates.
359  std::vector<std::pair<std::vector<int32>*, LmState*> > sorted_vec;
360  unordered_map<std::vector<int32>,
361  LmState*, VectorHasher<int32> >::iterator iter;
362  for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
363  if (iter->second->MemSize() > 0) {
364  sorted_vec.push_back(
365  std::make_pair(const_cast<std::vector<int32>*>(&(iter->first)),
366  iter->second));
367  }
368  }
369 
370  std::sort(sorted_vec.begin(), sorted_vec.end(),
371  WordsAndLmStatePairLessThan());
372 
373  // STEP 2: updating <my_address> in LmState.
374  for (int32 i = 0; i < sorted_vec.size(); ++i) {
375  lm_states_size_ += sorted_vec[i].second->MemSize();
376  if (i == 0) {
377  sorted_vec[i].second->SetMyAddress(0);
378  } else {
379  sorted_vec[i].second->SetMyAddress(sorted_vec[i - 1].second->MyAddress()
380  + sorted_vec[i - 1].second->MemSize());
381  }
382  }
383 
384  // STEP 3: creating memory block to store LmStates.
385  // Reserves a memory block for LmStates.
386  int64 lm_states_index = 0;
387  try {
388  lm_states_ = new int32[lm_states_size_];
389  } catch(const std::exception &e) {
390  KALDI_ERR << e.what();
391  }
392 
393  // Puts data into memory block.
394  unigram_states_ = new int32*[num_words_];
395  std::vector<int32*> overflow_buffer_vec;
396  for (int32 i = 0; i < num_words_; ++i) {
397  unigram_states_[i] = NULL;
398  }
399  for (int32 i = 0; i < sorted_vec.size(); ++i) {
400  // Current address.
401  int32* parent_address = lm_states_ + lm_states_index;
402 
403  // Adds logprob.
404  Int32AndFloat logprob_f(sorted_vec[i].second->Logprob());
405  lm_states_[lm_states_index++] = logprob_f.i;
406 
407  // Adds backoff_logprob.
408  Int32AndFloat backoff_logprob_f(sorted_vec[i].second->BackoffLogprob());
409  lm_states_[lm_states_index++] = backoff_logprob_f.i;
410 
411  // Adds num_children.
412  lm_states_[lm_states_index++] = sorted_vec[i].second->NumChildren();
413 
414  // Adds children, there are 3 cases:
415  // 1. Child is a leaf and not unigram
416  // 2. Child is not a leaf or is unigram
417  // 2.1 Relative address can be represented by 30 bits
418  // 2.2 Relative address cannot be represented by 30 bits
419  sorted_vec[i].second->SortChildren();
420  for (int32 j = 0; j < sorted_vec[i].second->NumChildren(); ++j) {
421  int32 child_info;
422  if (sorted_vec[i].second->IsChildFinalOrder() ||
423  sorted_vec[i].second->GetChild(j).second.state->MemSize() == 0) {
424  // Child is a leaf and not unigram. In this case we will not create an
425  // entry in <lm_states_>; instead, we put the logprob in the place where
426  // we normally store the pointer.
427  Int32AndFloat child_logprob_f;
428  if (sorted_vec[i].second->IsChildFinalOrder()) {
429  child_logprob_f.f = sorted_vec[i].second->GetChild(j).second.prob;
430  } else {
431  child_logprob_f.f =
432  sorted_vec[i].second->GetChild(j).second.state->Logprob();
433  }
434  child_info = child_logprob_f.i;
435  child_info &= ~1; // Sets the last bit to 0 so <child_info> is even.
436  } else {
437  // Child is not a leaf or is unigram.
438  int64 offset =
439  sorted_vec[i].second->GetChild(j).second.state->MyAddress()
440  - sorted_vec[i].second->MyAddress();
441  KALDI_ASSERT(offset > 0);
442  if (offset <= max_address_offset_) {
443  // Relative address can be represented by 30 bits.
444  child_info = offset * 2;
445  child_info |= 1;
446  } else {
447  // Relative address cannot be represented by 30 bits, we have to put
448  // the child address into <overflow_buffer_>.
449  int32* abs_address = parent_address + offset;
450  overflow_buffer_vec.push_back(abs_address);
451  int32 overflow_buffer_index = overflow_buffer_vec.size() - 1;
452  child_info = overflow_buffer_index * 2;
453  child_info |= 1;
454  child_info *= -1;
455  }
456  }
457  // Child word.
458  lm_states_[lm_states_index++] = sorted_vec[i].second->GetChild(j).first;
459  // Child info.
460  lm_states_[lm_states_index++] = child_info;
461  }
462 
463  // If the current state corresponds to a unigram, then create a separate
464  // lookup table to improve efficiency, since those will be looked up pretty
465  // frequently.
466  if (sorted_vec[i].second->IsUnigram()) {
467  KALDI_ASSERT(sorted_vec[i].first->size() == 1);
468  unigram_states_[(*sorted_vec[i].first)[0]] = parent_address;
469  }
470  }
471  KALDI_ASSERT(lm_states_size_ == lm_states_index);
472 
473  // Move <overflow_buffer_> from vector holder to array.
474  overflow_buffer_size_ = overflow_buffer_vec.size();
475  overflow_buffer_ = new int32*[overflow_buffer_size_];
476  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
477  overflow_buffer_[i] = overflow_buffer_vec[i];
478  }
479 
480  is_built_ = true;
481 }
unordered_map< std::vector< int32 >, LmState *, VectorHasher< int32 > > seq_to_state_
kaldi::int32 int32
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ SetMaxAddressOffset()

void SetMaxAddressOffset ( const int32  max_address_offset)
inline

Definition at line 208 of file const-arpa-lm.cc.

References KALDI_WARN.

208  {
209  KALDI_WARN << "You are changing <max_address_offset_>; the default should "
210  << "not be changed unless you are in testing mode.";
211  max_address_offset_ = max_address_offset;
212  }
#define KALDI_WARN
Definition: kaldi-error.h:150

◆ Write()

void Write ( std::ostream &  os,
bool  binary 
) const

Definition at line 483 of file const-arpa-lm.cc.

References KALDI_ASSERT, KALDI_ERR, and ConstArpaLm::Write().

483  {
484  if (!binary) {
485  KALDI_ERR << "text-mode writing is not implemented for ConstArpaLmBuilder.";
486  }
487  KALDI_ASSERT(is_built_);
488 
489  // Creates ConstArpaLm.
490  ConstArpaLm const_arpa_lm(
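491  Options().bos_symbol, Options().eos_symbol, Options().unk_symbol,
492  ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_,
493  unigram_states_, overflow_buffer_, lm_states_);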
494  const_arpa_lm.Write(os, binary);
495 }
const ArpaParseOptions & Options() const
Parser options.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

Member Data Documentation

◆ is_built_

bool is_built_
private

Definition at line 231 of file const-arpa-lm.cc.

◆ lm_states_

int32* lm_states_
private

Definition at line 253 of file const-arpa-lm.cc.

◆ lm_states_size_

int64 lm_states_size_
private

Definition at line 250 of file const-arpa-lm.cc.

◆ max_address_offset_

int32 max_address_offset_
private

Definition at line 235 of file const-arpa-lm.cc.

◆ ngram_order_

int32 ngram_order_
private

Definition at line 239 of file const-arpa-lm.cc.

◆ num_words_

int32 num_words_
private

Definition at line 243 of file const-arpa-lm.cc.

◆ overflow_buffer_

int32** overflow_buffer_
private

Definition at line 260 of file const-arpa-lm.cc.

◆ overflow_buffer_size_

int32 overflow_buffer_size_
private

Definition at line 247 of file const-arpa-lm.cc.

◆ seq_to_state_

unordered_map<std::vector<int32>, LmState*, VectorHasher<int32> > seq_to_state_
private

Definition at line 264 of file const-arpa-lm.cc.

◆ unigram_states_

int32** unigram_states_
private

Definition at line 256 of file const-arpa-lm.cc.


The documentation for this class was generated from the following file: