Inheritance diagram for ConstArpaLmBuilder:

Collaboration diagram for ConstArpaLmBuilder:

Classes
struct	WordsAndLmStatePairLessThan

Public Member Functions
	ConstArpaLmBuilder (ArpaParseOptions options)

	~ConstArpaLmBuilder ()

void	Write (std::ostream &os, bool binary) const

void	SetMaxAddressOffset (const int32 max_address_offset)

Public Member Functions inherited from ArpaFileParser
	ArpaFileParser (const ArpaParseOptions &options, fst::SymbolTable *symbols)
	Constructs the parser with the given options and optional symbol table. More...

virtual	~ArpaFileParser ()

void	Read (std::istream &is)
	Read ARPA LM file from a stream. More...

const ArpaParseOptions &	Options () const
	Parser options. More...

Protected Member Functions
virtual void	HeaderAvailable ()
	Override function called to signal that the ARPA header with the expected number of n-grams has been read, and NgramCounts() is now valid. More...

virtual void	ConsumeNGram (const NGram &ngram)
	Pure override that must be implemented to process current n-gram. More...

virtual void	ReadComplete ()
	Override function called after the last n-gram has been consumed. More...

Protected Member Functions inherited from ArpaFileParser
virtual void	ReadStarted ()
	Override called before reading starts. More...

const fst::SymbolTable *	Symbols () const
	Read-only access to symbol table. Not owned, do not make public. More...

int32	LineNumber () const
	Inside ConsumeNGram(), provides the current line number. More...

std::string	LineReference () const
	Inside ConsumeNGram(), returns a formatted reference to the line being compiled, to print out as part of diagnostics. More...

bool	ShouldWarn ()
	Increments warning count, and returns true if a warning should be printed or false if the count has exceeded the set maximum. More...

const std::vector< int32 > &	NgramCounts () const
	N-gram counts. Valid from the point when HeaderAvailable() is called. More...

Private Attributes
bool	is_built_

int32	max_address_offset_

int32	ngram_order_

int32	num_words_

int32	overflow_buffer_size_

int64	lm_states_size_

int32 *	lm_states_

int32 **	unigram_states_

int32 **	overflow_buffer_

unordered_map< std::vector< int32 >, LmState *, VectorHasher< int32 > >	seq_to_state_

Detailed Description

Definition at line 177 of file const-arpa-lm.cc.

Constructor & Destructor Documentation

◆ ConstArpaLmBuilder()

ConstArpaLmBuilder ( ArpaParseOptions options )

inline explicit

Definition at line 179 of file const-arpa-lm.cc.

       : ArpaFileParser(options, NULL) {
     ngram_order_ = 0;
     num_words_ = 0;
     overflow_buffer_size_ = 0;
     lm_states_size_ = 0;
     max_address_offset_ = pow(2, 30) - 1;
     is_built_ = false;
     lm_states_ = NULL;
     unigram_states_ = NULL;
     overflow_buffer_ = NULL;
   }

◆ ~ConstArpaLmBuilder()

~ConstArpaLmBuilder ( )

inline

Definition at line 192 of file const-arpa-lm.cc.

                         {
     unordered_map<std::vector<int32>,
                   LmState*, VectorHasher<int32> >::iterator iter;
     for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
       delete iter->second;
     }
     if (is_built_) {
       delete[] lm_states_;
       delete[] unigram_states_;
       delete[] overflow_buffer_;
     }
   }

Member Function Documentation

◆ ConsumeNGram()

void ConsumeNGram ( const NGram & )

protected virtual

Pure override that must be implemented to process current n-gram.

The n-grams are sent in the file order, which guarantees that all (k-1)-grams are processed before the first k-gram is.

Implements ArpaFileParser.

Definition at line 271 of file const-arpa-lm.cc.

References NGram::backoff, rnnlm::i, KALDI_ASSERT, KALDI_ERR, NGram::logprob, and NGram::words.

                                                         {
   int32 cur_order = ngram.words.size();
   // If <ngram_order_> is larger than 1, then we do not create LmState for
   // the final order entry. We only keep the log probability for it.
   LmState *lm_state = NULL;
   if (cur_order != ngram_order_ || ngram_order_ == 1) {
     lm_state = new LmState(cur_order == 1,
                            cur_order == ngram_order_ - 1,
                            ngram.logprob, ngram.backoff);
 
     if (seq_to_state_.find(ngram.words) != seq_to_state_.end()) {
       std::ostringstream os;
       os << "[ ";
       for (size_t i = 0; i < ngram.words.size(); i++) {
         os << ngram.words[i] << " ";
       }
       os <<"]";
 
       KALDI_ERR << "N-gram " << os.str() << " appears twice in the arpa file";
     }
     seq_to_state_[ngram.words] = lm_state;
   }
 
   // If n-gram order is larger than 1, we have to add possible child to
   // existing LmStates. We have the following two assumptions:
   // 1. N-grams are processed from small order to larger ones, i.e., from
   //    1, 2, ... to the highest order.
   // 2. If a n-gram exists in the Arpa format language model, then the
   //    "history" n-gram also exists. For example, if "A B C" is a valid
   //    n-gram, then "A B" is also a valid n-gram.
   int32 last_word = ngram.words[cur_order - 1];
   if (cur_order > 1) {
     std::vector<int32> hist(ngram.words.begin(), ngram.words.end() - 1);
     unordered_map<std::vector<int32>,
                   LmState*, VectorHasher<int32> >::iterator hist_iter;
     hist_iter = seq_to_state_.find(hist);
     if (hist_iter == seq_to_state_.end()) {
       std::ostringstream ss;
       for (int i = 0; i < cur_order; ++i)
         ss << (i == 0 ? '[' : ' ') << ngram.words[i];
       KALDI_ERR << "In line " << LineNumber() << ": "
                 << cur_order << "-gram " << ss.str() << "] does not have "
                 << "a parent model " << cur_order << "-gram.";
     }
     if (cur_order != ngram_order_ || ngram_order_ == 1) {
       KALDI_ASSERT(lm_state != NULL);
       KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder());
       hist_iter->second->AddChild(last_word, lm_state);
     } else {
       KALDI_ASSERT(lm_state == NULL);
       KALDI_ASSERT(hist_iter->second->IsChildFinalOrder());
       hist_iter->second->AddChild(last_word, ngram.logprob);
     }
   } else {
     // Figures out <max_word_id>.
     num_words_ = std::max(num_words_, last_word + 1);
   }
 }

◆ HeaderAvailable()

void HeaderAvailable ( )

protected virtual

Override function called to signal that the ARPA header with the expected number of n-grams has been read, and NgramCounts() is now valid.

Reimplemented from ArpaFileParser.

Definition at line 267 of file const-arpa-lm.cc.

                                          {
   ngram_order_ = NgramCounts().size();
 }

◆ ReadComplete()

void ReadComplete ( )

protected virtual

Override function called after the last n-gram has been consumed.

Reimplemented from ArpaFileParser.

Definition at line 356 of file const-arpa-lm.cc.

References Int32AndFloat::f, rnnlm::i, Int32AndFloat::i, rnnlm::j, KALDI_ASSERT, and KALDI_ERR.

                                       {
   // Called once all n-grams have been consumed. Packs the LmStates collected
   // in <seq_to_state_> into the flat int32 array <lm_states_>, fills the
   // per-word lookup table <unigram_states_> and the <overflow_buffer_>, and
   // finally marks the model as built.
   //
   // Each state is written as: logprob, backoff logprob, number of children,
   // followed by one (word, child_info) pair per child.

   // STEP 1: sorting LmStates lexicographically.
   // Vector for holding the sorted LmStates.
   std::vector<std::pair<std::vector<int32>*, LmState*> > sorted_vec;
   unordered_map<std::vector<int32>,
                 LmState*, VectorHasher<int32> >::iterator iter;
   for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
     // States whose MemSize() is 0 get no entry of their own; only their
     // logprob is stored, inside the parent (see the child loop below).
     if (iter->second->MemSize() > 0) {
       sorted_vec.push_back(
           std::make_pair(const_cast<std::vector<int32>*>(&(iter->first)),
                          iter->second));
     }
   }
 
   std::sort(sorted_vec.begin(), sorted_vec.end(),
             WordsAndLmStatePairLessThan());
 
   // STEP 2: updating <my_address> in LmState.
   // Addresses are running offsets in sorted order: each state starts where
   // the previous one ends. <lm_states_size_> accumulates the total size.
   for (int32 i = 0; i < sorted_vec.size(); ++i) {
     lm_states_size_ += sorted_vec[i].second->MemSize();
     if (i == 0) {
       sorted_vec[i].second->SetMyAddress(0);
     } else {
       sorted_vec[i].second->SetMyAddress(sorted_vec[i - 1].second->MyAddress()
           + sorted_vec[i - 1].second->MemSize());
     }
   }
 
   // STEP 3: creating memory block to store LmStates.
   // Reserves a memory block for LmStates.
   int64 lm_states_index = 0;
   try {
     lm_states_ = new int32[lm_states_size_];
   } catch(const std::exception &e) {
     // Allocation failures are reported through the Kaldi error mechanism.
     KALDI_ERR << e.what();
   }
 
   // Puts data into memory block.
   // <unigram_states_> has one slot per word id; slots stay NULL for ids
   // that have no unigram state written below.
   unigram_states_ = new int32*[num_words_];
   std::vector<int32*> overflow_buffer_vec;
   for (int32 i = 0; i < num_words_; ++i) {
     unigram_states_[i] = NULL;
   }
   for (int32 i = 0; i < sorted_vec.size(); ++i) {
     // Current address.
     int32* parent_address = lm_states_ + lm_states_index;
 
     // Adds logprob.
     // The float is stored through its int32 view (Int32AndFloat::i).
     Int32AndFloat logprob_f(sorted_vec[i].second->Logprob());
     lm_states_[lm_states_index++] = logprob_f.i;
 
     // Adds backoff_logprob.
     Int32AndFloat backoff_logprob_f(sorted_vec[i].second->BackoffLogprob());
     lm_states_[lm_states_index++] = backoff_logprob_f.i;
 
     // Adds num_children.
     lm_states_[lm_states_index++] = sorted_vec[i].second->NumChildren();
 
     // Adds children, there are 3 cases:
     // 1. Child is a leaf and not unigram
     // 2. Child is not a leaf or is unigram
     //    2.1 Relative address can be represented by 30 bits
     //    2.2 Relative address cannot be represented by 30 bits
     // Resulting <child_info> encoding:
     //    case 1:   even value  -> logprob bits with the lowest bit cleared
     //    case 2.1: odd, > 0    -> offset * 2 + 1
     //    case 2.2: odd, < 0    -> -(overflow buffer index * 2 + 1)
     sorted_vec[i].second->SortChildren();
     for (int32 j = 0; j < sorted_vec[i].second->NumChildren(); ++j) {
       int32 child_info;
       if (sorted_vec[i].second->IsChildFinalOrder() ||
           sorted_vec[i].second->GetChild(j).second.state->MemSize() == 0) {
         // Child is a leaf and not unigram. In this case we will not create an
         // entry in <lm_states_>; instead, we put the logprob in the place where
         // we normally store the pointer.
         Int32AndFloat child_logprob_f;
         if (sorted_vec[i].second->IsChildFinalOrder()) {
           // Highest-order children are stored as a bare probability.
           child_logprob_f.f = sorted_vec[i].second->GetChild(j).second.prob;
         } else {
           // Lower-order leaf: take the logprob from the child's LmState.
           child_logprob_f.f =
               sorted_vec[i].second->GetChild(j).second.state->Logprob();
         }
         child_info = child_logprob_f.i;
         child_info &= ~1;   // Sets the last bit to 0 so <child_info> is even.
       } else {
         // Child is not a leaf or is unigram.
         // The offset is measured in int32 units from the parent's address;
         // the assertion requires the child to come after its parent.
         int64 offset =
             sorted_vec[i].second->GetChild(j).second.state->MyAddress()
             - sorted_vec[i].second->MyAddress();
         KALDI_ASSERT(offset > 0);
         if (offset <= max_address_offset_) {
           // Relative address can be represented by 30 bits.
           child_info = offset * 2;
           child_info |= 1;
         } else {
           // Relative address cannot be represented by 30 bits, we have to put
           // the child address into <overflow_buffer_>.
           int32* abs_address = parent_address + offset;
           overflow_buffer_vec.push_back(abs_address);
           int32 overflow_buffer_index = overflow_buffer_vec.size() - 1;
           child_info = overflow_buffer_index * 2;
           child_info |= 1;
           // The negative sign distinguishes an overflow buffer index from a
           // relative address.
           child_info *= -1;
         }
       }
       // Child word.
       lm_states_[lm_states_index++] = sorted_vec[i].second->GetChild(j).first;
       // Child info.
       lm_states_[lm_states_index++] = child_info;
     }
 
     // If the current state corresponds to an unigram, then create a separate
     // lookup table to improve efficiency, since those will be looked up pretty
     // frequently.
     if (sorted_vec[i].second->IsUnigram()) {
       KALDI_ASSERT(sorted_vec[i].first->size() == 1);
       unigram_states_[(*sorted_vec[i].first)[0]] = parent_address;
     }
   }
   // Everything written must add up to the total of MemSize() from STEP 2.
   KALDI_ASSERT(lm_states_size_ == lm_states_index);
 
   // Move <overflow_buffer_> from vector holder to array.
   overflow_buffer_size_ = overflow_buffer_vec.size();
   overflow_buffer_ = new int32*[overflow_buffer_size_];
   for (int32 i = 0; i < overflow_buffer_size_; ++i) {
     overflow_buffer_[i] = overflow_buffer_vec[i];
   }
 
   // From here on Write() may be called.
   is_built_ = true;
 }

◆ SetMaxAddressOffset()

void SetMaxAddressOffset ( const int32 max_address_offset )

inline

Definition at line 208 of file const-arpa-lm.cc.

References KALDI_WARN.

                                                            {
     KALDI_WARN << "You are changing <max_address_offset_>; the default should "
         << "not be changed unless you are in testing mode.";
     max_address_offset_ = max_address_offset;
   }

◆ Write()

void Write	(	std::ostream &	os,
		bool	binary
	)		const

Definition at line 483 of file const-arpa-lm.cc.

References KALDI_ASSERT, KALDI_ERR, and ConstArpaLm::Write().

                                                                 {
   if (!binary) {
     KALDI_ERR << "text-mode writing is not implemented for ConstArpaLmBuilder.";
   }
   KALDI_ASSERT(is_built_);
 
   // Creates ConstArpaLm.
   ConstArpaLm const_arpa_lm(
       Options().bos_symbol, Options().eos_symbol, Options().unk_symbol,
       ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_,
       unigram_states_, overflow_buffer_, lm_states_);
   const_arpa_lm.Write(os, binary);
 }

Member Data Documentation

◆ is_built_

bool is_built_

private

Definition at line 231 of file const-arpa-lm.cc.

◆ lm_states_

int32* lm_states_

private

Definition at line 253 of file const-arpa-lm.cc.

◆ lm_states_size_

int64 lm_states_size_

private

Definition at line 250 of file const-arpa-lm.cc.

◆ max_address_offset_

int32 max_address_offset_

private

Definition at line 235 of file const-arpa-lm.cc.

◆ ngram_order_

int32 ngram_order_

private

Definition at line 239 of file const-arpa-lm.cc.

◆ num_words_

int32 num_words_

private

Definition at line 243 of file const-arpa-lm.cc.

◆ overflow_buffer_

int32** overflow_buffer_

private

Definition at line 260 of file const-arpa-lm.cc.

◆ overflow_buffer_size_

int32 overflow_buffer_size_

private

Definition at line 247 of file const-arpa-lm.cc.

◆ seq_to_state_

unordered_map<std::vector<int32>, LmState*, VectorHasher<int32> > seq_to_state_

private

Definition at line 264 of file const-arpa-lm.cc.

◆ unigram_states_

int32** unigram_states_

private

Definition at line 256 of file const-arpa-lm.cc.

The documentation for this class was generated from the following file:

lm/const-arpa-lm.cc

Classes

Public Member Functions

Protected Member Functions

Private Attributes

Detailed Description

Constructor & Destructor Documentation

◆ ConstArpaLmBuilder()

◆ ~ConstArpaLmBuilder()

Member Function Documentation

◆ ConsumeNGram()

◆ HeaderAvailable()

◆ ReadComplete()

◆ SetMaxAddressOffset()

◆ Write()

Member Data Documentation

◆ is_built_

◆ lm_states_

◆ lm_states_size_

◆ max_address_offset_

◆ ngram_order_

◆ num_words_

◆ overflow_buffer_

◆ overflow_buffer_size_

◆ seq_to_state_

◆ unigram_states_