42 if (words.size() < other.
words.size()) {
44 }
else if (words.size() > other.
words.size()) {
47 return words < other.
words;
67 const std::pair<int32, union ChildType>& lhs,
68 const std::pair<int32, union ChildType>& rhs)
const {
69 return lhs.first < rhs.first;
73 LmState(
const bool is_unigram,
const bool is_child_final_order,
75 is_unigram_(is_unigram), is_child_final_order_(is_child_final_order),
76 logprob_(logprob), backoff_logprob_(backoff_logprob) {}
79 my_address_ = address;
85 child.
state = child_state;
86 children_.push_back(std::make_pair(word, child));
92 child.
prob = child_prob;
93 children_.push_back(std::make_pair(word, child));
105 return is_child_final_order_;
113 return backoff_logprob_;
117 return children_.size();
123 return children_[index];
132 return (backoff_logprob_ == 0.0 && children_.empty());
138 if (IsLeaf() && !is_unigram_) {
145 return (3 + 2 * children_.size());
172 std::vector<std::pair<int32, union ChildType> >
children_;
183 overflow_buffer_size_ = 0;
185 max_address_offset_ = pow(2, 30) - 1;
188 unigram_states_ = NULL;
189 overflow_buffer_ = NULL;
193 unordered_map<std::vector<int32>,
195 for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
200 delete[] unigram_states_;
201 delete[] overflow_buffer_;
206 void Write(std::ostream &os,
bool binary)
const;
209 KALDI_WARN <<
"You are changing <max_address_offset_>; the default should " 210 <<
"not be changed unless you are in testing mode.";
211 max_address_offset_ = max_address_offset;
216 virtual void HeaderAvailable();
217 virtual void ConsumeNGram(
const NGram& ngram);
218 virtual void ReadComplete();
223 const std::pair<std::vector<int32>*,
LmState*>& lhs,
224 const std::pair<std::vector<int32>*,
LmState*>& rhs)
const {
225 return *(lhs.first) < *(rhs.first);
263 unordered_map<std::vector<int32>,
268 ngram_order_ = NgramCounts().size();
276 if (cur_order != ngram_order_ || ngram_order_ == 1) {
277 lm_state =
new LmState(cur_order == 1,
278 cur_order == ngram_order_ - 1,
281 if (seq_to_state_.find(ngram.
words) != seq_to_state_.end()) {
282 std::ostringstream os;
284 for (
size_t i = 0;
i < ngram.
words.size();
i++) {
285 os << ngram.
words[
i] <<
" ";
289 KALDI_ERR <<
"N-gram " << os.str() <<
" appears twice in the arpa file";
291 seq_to_state_[ngram.
words] = lm_state;
303 std::vector<int32> hist(ngram.
words.begin(), ngram.
words.end() - 1);
304 unordered_map<std::vector<int32>,
306 hist_iter = seq_to_state_.find(hist);
307 if (hist_iter == seq_to_state_.end()) {
308 std::ostringstream ss;
309 for (
int i = 0;
i < cur_order; ++
i)
310 ss << (
i == 0 ?
'[' :
' ') << ngram.
words[
i];
311 KALDI_ERR <<
"In line " << LineNumber() <<
": " 312 << cur_order <<
"-gram " << ss.str() <<
"] does not have " 313 <<
"a parent model " << cur_order <<
"-gram.";
315 if (cur_order != ngram_order_ || ngram_order_ == 1) {
318 hist_iter->second->AddChild(last_word, lm_state);
322 hist_iter->second->AddChild(last_word, ngram.
logprob);
326 num_words_ = std::max(num_words_, last_word + 1);
359 std::vector<std::pair<std::vector<int32>*,
LmState*> > sorted_vec;
360 unordered_map<std::vector<int32>,
362 for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
363 if (iter->second->MemSize() > 0) {
364 sorted_vec.push_back(
365 std::make_pair(
const_cast<std::vector<int32>*
>(&(iter->first)),
370 std::sort(sorted_vec.begin(), sorted_vec.end(),
374 for (
int32 i = 0;
i < sorted_vec.size(); ++
i) {
375 lm_states_size_ += sorted_vec[
i].second->MemSize();
377 sorted_vec[
i].second->SetMyAddress(0);
379 sorted_vec[
i].second->SetMyAddress(sorted_vec[
i - 1].second->MyAddress()
380 + sorted_vec[
i - 1].second->MemSize());
386 int64 lm_states_index = 0;
388 lm_states_ =
new int32[lm_states_size_];
389 }
catch(
const std::exception &e) {
394 unigram_states_ =
new int32*[num_words_];
395 std::vector<int32*> overflow_buffer_vec;
396 for (
int32 i = 0;
i < num_words_; ++
i) {
397 unigram_states_[
i] = NULL;
399 for (
int32 i = 0;
i < sorted_vec.size(); ++
i) {
401 int32* parent_address = lm_states_ + lm_states_index;
405 lm_states_[lm_states_index++] = logprob_f.
i;
408 Int32AndFloat backoff_logprob_f(sorted_vec[
i].second->BackoffLogprob());
409 lm_states_[lm_states_index++] = backoff_logprob_f.
i;
412 lm_states_[lm_states_index++] = sorted_vec[
i].second->NumChildren();
419 sorted_vec[
i].second->SortChildren();
420 for (
int32 j = 0;
j < sorted_vec[
i].second->NumChildren(); ++
j) {
422 if (sorted_vec[
i].second->IsChildFinalOrder() ||
423 sorted_vec[
i].second->GetChild(
j).second.state->MemSize() == 0) {
428 if (sorted_vec[
i].second->IsChildFinalOrder()) {
429 child_logprob_f.
f = sorted_vec[
i].second->GetChild(
j).second.prob;
432 sorted_vec[
i].second->GetChild(
j).second.state->Logprob();
434 child_info = child_logprob_f.
i;
439 sorted_vec[
i].second->GetChild(
j).second.state->MyAddress()
440 - sorted_vec[
i].second->MyAddress();
442 if (offset <= max_address_offset_) {
444 child_info = offset * 2;
449 int32* abs_address = parent_address + offset;
450 overflow_buffer_vec.push_back(abs_address);
451 int32 overflow_buffer_index = overflow_buffer_vec.size() - 1;
452 child_info = overflow_buffer_index * 2;
458 lm_states_[lm_states_index++] = sorted_vec[
i].second->GetChild(
j).first;
460 lm_states_[lm_states_index++] = child_info;
466 if (sorted_vec[
i].second->IsUnigram()) {
468 unigram_states_[(*sorted_vec[
i].first)[0]] = parent_address;
474 overflow_buffer_size_ = overflow_buffer_vec.size();
475 overflow_buffer_ =
new int32*[overflow_buffer_size_];
476 for (
int32 i = 0;
i < overflow_buffer_size_; ++
i) {
477 overflow_buffer_[
i] = overflow_buffer_vec[
i];
485 KALDI_ERR <<
"text-mode writing is not implemented for ConstArpaLmBuilder.";
491 Options().bos_symbol, Options().eos_symbol, Options().unk_symbol,
492 ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_,
493 unigram_states_, overflow_buffer_, lm_states_);
494 const_arpa_lm.
Write(os, binary);
500 KALDI_ERR <<
"text-mode writing is not implemented for ConstArpaLm.";
516 os.write(reinterpret_cast<char *>(lm_states_),
517 sizeof(
int32) * lm_states_size_);
519 KALDI_ERR <<
"ConstArpaLm <LmStates> section writing failed.";
527 int64* tmp_unigram_address =
new int64[num_words_];
528 for (
int32 i = 0;
i < num_words_; ++
i) {
535 tmp_unigram_address[
i] = (unigram_states_[
i] == NULL) ? 0 :
536 unigram_states_[
i] - lm_states_ + 1;
538 os.write(reinterpret_cast<char *>(tmp_unigram_address),
539 sizeof(int64) * num_words_);
541 KALDI_ERR <<
"ConstArpaLm <LmUnigram> section writing failed.";
543 delete[] tmp_unigram_address;
544 tmp_unigram_address = NULL;
551 int64* tmp_overflow_address =
new int64[overflow_buffer_size_];
552 for (
int32 i = 0;
i < overflow_buffer_size_; ++
i) {
559 tmp_overflow_address[
i] = (overflow_buffer_[
i] == NULL) ? 0 :
560 overflow_buffer_[
i] - lm_states_ + 1;
562 os.write(reinterpret_cast<char *>(tmp_overflow_address),
563 sizeof(int64) * overflow_buffer_size_);
565 KALDI_ERR <<
"ConstArpaLm <LmOverflow> section writing failed.";
567 delete[] tmp_overflow_address;
568 tmp_overflow_address = NULL;
576 KALDI_ERR <<
"text-mode reading is not implemented for ConstArpaLm.";
579 int first_char = is.peek();
580 if (first_char == 4) {
581 ReadInternalOldFormat(is, binary);
583 ReadInternal(is, binary);
590 KALDI_ERR <<
"text-mode reading is not implemented for ConstArpaLm.";
606 lm_states_ =
new int32[lm_states_size_];
607 is.read(reinterpret_cast<char *>(lm_states_),
608 sizeof(
int32) * lm_states_size_);
610 KALDI_ERR <<
"ConstArpaLm <LmStates> section reading failed.";
618 unigram_states_ =
new int32*[num_words_];
619 int64* tmp_unigram_address =
new int64[num_words_];
620 is.read(reinterpret_cast<char *>(tmp_unigram_address),
621 sizeof(int64) * num_words_);
623 KALDI_ERR <<
"ConstArpaLm <LmUnigram> section reading failed.";
625 for (
int32 i = 0;
i < num_words_; ++
i) {
627 unigram_states_[
i] = (tmp_unigram_address[
i] == 0) ? NULL
628 : lm_states_ + tmp_unigram_address[
i] - 1;
630 delete[] tmp_unigram_address;
631 tmp_unigram_address = NULL;
638 overflow_buffer_ =
new int32*[overflow_buffer_size_];
639 int64* tmp_overflow_address =
new int64[overflow_buffer_size_];
640 is.read(reinterpret_cast<char *>(tmp_overflow_address),
641 sizeof(int64) * overflow_buffer_size_);
643 KALDI_ERR <<
"ConstArpaLm <LmOverflow> section reading failed.";
645 for (
int32 i = 0;
i < overflow_buffer_size_; ++
i) {
647 overflow_buffer_[
i] = (tmp_overflow_address[
i] == 0) ? NULL
648 : lm_states_ + tmp_overflow_address[
i] - 1;
650 delete[] tmp_overflow_address;
651 tmp_overflow_address = NULL;
656 KALDI_ASSERT(bos_symbol_ < num_words_ && bos_symbol_ > 0);
657 KALDI_ASSERT(eos_symbol_ < num_words_ && eos_symbol_ > 0);
659 (unk_symbol_ > 0 || unk_symbol_ == -1));
660 lm_states_end_ = lm_states_ + lm_states_size_ - 1;
661 memory_assigned_ =
true;
668 KALDI_ERR <<
"text-mode reading is not implemented for ConstArpaLm.";
680 int32 lm_states_size_int32;
682 lm_states_size_ =
static_cast<int64
>(lm_states_size_int32);
683 lm_states_ =
new int32[lm_states_size_];
684 for (int64
i = 0;
i < lm_states_size_; ++
i) {
691 unigram_states_ =
new int32*[num_words_];
692 for (
int32 i = 0;
i < num_words_; ++
i) {
697 (tmp_address == 0) ? NULL : lm_states_ + tmp_address - 1;
703 overflow_buffer_ =
new int32*[overflow_buffer_size_];
704 for (
int32 i = 0;
i < overflow_buffer_size_; ++
i) {
708 overflow_buffer_[
i] =
709 (tmp_address == 0) ? NULL : lm_states_ + tmp_address - 1;
712 KALDI_ASSERT(bos_symbol_ < num_words_ && bos_symbol_ > 0);
713 KALDI_ASSERT(eos_symbol_ < num_words_ && eos_symbol_ > 0);
715 (unk_symbol_ > 0 || unk_symbol_ == -1));
716 lm_states_end_ = lm_states_ + lm_states_size_ - 1;
717 memory_assigned_ =
true;
724 if (hist.size() == 0) {
729 int32* lm_state = GetLmState(hist);
730 if (lm_state == NULL) {
739 if (*(lm_state + 2) > 0) {
749 const std::vector<int32>& hist)
const {
754 std::vector<int32> mapped_hist(hist);
755 while (mapped_hist.size() >= ngram_order_) {
756 mapped_hist.erase(mapped_hist.begin(), mapped_hist.begin() + 1);
764 int32 mapped_word = word;
765 if (unk_symbol_ != -1) {
767 if (mapped_word >= num_words_ || unigram_states_[mapped_word] == NULL) {
768 mapped_word = unk_symbol_;
770 for (
int32 i = 0;
i < mapped_hist.size(); ++
i) {
772 if (mapped_hist[
i] >= num_words_ ||
773 unigram_states_[mapped_hist[
i]] == NULL) {
774 mapped_hist[
i] = unk_symbol_;
780 return GetNgramLogprobRecurse(mapped_word, mapped_hist);
784 const int32 word,
const std::vector<int32>& hist)
const {
789 if (hist.size() == 0) {
790 if (word >= num_words_ || unigram_states_[word] == NULL) {
794 return std::numeric_limits<float>::min();
805 if ((state = GetLmState(hist)) != NULL) {
807 int32* child_lm_state = NULL;
808 if (GetChildInfo(word, state, &child_info)) {
809 DecodeChildInfo(child_info, state, &child_lm_state, &logprob);
813 backoff_logprob = backoff_logprob_i.
f;
816 std::vector<int32> new_hist(hist);
817 new_hist.erase(new_hist.begin(), new_hist.begin() + 1);
818 return backoff_logprob + GetNgramLogprobRecurse(word, new_hist);
825 if (seq.size() == 0)
return NULL;
829 if (seq[0] >= num_words_ || unigram_states_[seq[0]] == NULL)
return NULL;
830 int32* parent = unigram_states_[seq[0]];
833 int32* child_lm_state = NULL;
835 for (
int32 i = 1;
i < seq.size(); ++
i) {
836 if (!GetChildInfo(seq[
i], parent, &child_info)) {
839 DecodeChildInfo(child_info, parent, &child_lm_state, &logprob);
840 if (child_lm_state == NULL) {
843 parent = child_lm_state;
858 int32 num_children = *(parent + 2);
859 KALDI_ASSERT(parent + 2 + 2 * num_children <= lm_states_end_);
861 if (num_children == 0)
return false;
864 int32 start_index = 1;
865 int32 end_index = num_children;
866 while (start_index <= end_index) {
867 int32 mid_index = round((start_index + end_index) / 2);
868 int32 mid_word = *(parent + 1 + 2 * mid_index);
869 if (mid_word == word) {
870 *child_info = *(parent + 2 + 2 * mid_index);
872 }
else if (mid_word < word) {
873 start_index = mid_index + 1;
875 end_index = mid_index - 1;
884 int32** child_lm_state,
889 if (child_info % 2 == 0) {
891 *child_lm_state = NULL;
893 *logprob = logprob_i.
f;
895 int32 child_offset = child_info / 2;
896 if (child_offset > 0) {
897 *child_lm_state = parent + child_offset;
899 *logprob = logprob_i.
f;
902 *child_lm_state = overflow_buffer_[-child_offset];
904 *logprob = logprob_i.
f;
912 const std::vector<int32>& seq,
913 std::vector<ArpaLine> *output)
const {
914 if (lm_state == NULL)
return;
921 arpa_line.
words = seq;
926 output->push_back(arpa_line);
929 int32 num_children = *(lm_state + 2);
930 KALDI_ASSERT(lm_state + 2 + 2 * num_children <= lm_states_end_);
931 for (
int32 i = 0;
i < num_children; ++
i) {
932 std::vector<int32> new_seq(seq);
933 new_seq.push_back(*(lm_state + 3 + 2 *
i));
934 int32 child_info = *(lm_state + 4 + 2 *
i);
936 int32* child_lm_state = NULL;
937 DecodeChildInfo(child_info, lm_state, &child_lm_state, &logprob);
939 if (child_lm_state == NULL) {
942 child_arpa_line.
words = new_seq;
945 output->push_back(child_arpa_line);
947 WriteArpaRecurse(child_lm_state, new_seq, output);
955 std::vector<ArpaLine> tmp_output;
956 for (
int32 i = 0;
i < num_words_; ++
i) {
957 if (unigram_states_[
i] != NULL) {
958 std::vector<int32> seq(1,
i);
959 WriteArpaRecurse(unigram_states_[
i], seq, &tmp_output);
964 std::sort(tmp_output.begin(), tmp_output.end());
965 std::vector<int32> ngram_count(1, 0);
966 for (
int32 i = 0;
i < tmp_output.size(); ++
i) {
967 if (tmp_output[
i].
words.size() >= ngram_count.size()) {
968 ngram_count.resize(tmp_output[
i].
words.size() + 1);
969 ngram_count[tmp_output[
i].words.size()] = 1;
971 ngram_count[tmp_output[
i].words.size()] += 1;
977 os <<
"\\data\\" << std::endl;
978 for (
int32 i = 1;
i < ngram_count.size(); ++
i) {
979 os <<
"ngram " <<
i <<
"=" << ngram_count[
i] << std::endl;
983 int32 current_order = 0;
984 for (
int32 i = 0;
i < tmp_output.size(); ++
i) {
986 if (tmp_output[
i].
words.size() != current_order) {
987 current_order = tmp_output[
i].words.size();
989 os <<
"\\" << current_order <<
"-grams:" << std::endl;
993 os << tmp_output[
i].logprob <<
'\t';
996 for (
int32 j = 0;
j < tmp_output[
i].words.size(); ++
j) {
997 os << tmp_output[
i].words[
j];
998 if (
j != tmp_output[
i].
words.size() - 1) {
1005 os <<
'\t' << tmp_output[
i].backoff_logprob;
1010 os << std::endl <<
"\\end\\" << std::endl;
1037 if (logprob == std::numeric_limits<float>::min()) {
1043 wseq.push_back(ilabel);
1046 wseq.erase(wseq.begin(), wseq.begin() + 1);
1050 wseq.erase(wseq.begin(), wseq.begin() + 1);
1053 std::pair<const std::vector<Label>,
StateId> wseq_state_pair(
1058 typedef MapType::iterator IterType;
1059 std::pair<IterType, bool> result =
wseq_to_state_.insert(wseq_state_pair);
1062 if (result.second ==
true)
1066 oarc->ilabel = ilabel;
1067 oarc->olabel = ilabel;
1068 oarc->nextstate = result.first->second;
1069 oarc->weight =
Weight(-logprob);
1075 const std::string& arpa_rxfilename,
1076 const std::string& const_arpa_wxfilename) {
1078 KALDI_LOG <<
"Reading " << arpa_rxfilename;
1079 Input ki(arpa_rxfilename);
ArpaFileParser is an abstract base class for ARPA LM file conversion.
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for...
fst::StdArc::Weight Weight
bool is_child_final_order_
std::vector< std::vector< Label > > state_to_wseq_
void Write(std::ostream &os, bool binary) const
A hashing function-object for vectors.
void Read(std::istream &is, bool binary)
unordered_map< std::vector< int32 >, LmState *, VectorHasher< int32 > > seq_to_state_
bool operator()(const std::pair< std::vector< int32 > *, LmState *> &lhs, const std::pair< std::vector< int32 > *, LmState *> &rhs) const
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
virtual void ReadComplete()
Override function called after the last n-gram has been consumed.
Options that control ArpaFileParser.
float logprob
Log-prob of the n-gram.
void ReadInternalOldFormat(std::istream &is, bool binary)
LmState(const bool is_unigram, const bool is_child_final_order, const float logprob, const float backoff_logprob)
void SetMaxAddressOffset(const int32 max_address_offset)
virtual bool GetArc(StateId s, Label ilabel, fst::StdArc *oarc)
ConstArpaLmDeterministicFst(const ConstArpaLm &lm)
void AddChild(const int32 word, LmState *child_state)
void Write(std::ostream &os, bool binary) const
int32 overflow_buffer_size_
virtual void HeaderAvailable()
Override function called to signal that the ARPA header with the expected number of n-grams has been read...
float backoff
log-backoff weight of the n-gram.
int32 ** overflow_buffer_
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
void WriteArpaRecurse(int32 *lm_state, const std::vector< int32 > &seq, std::vector< ArpaLine > *output) const
void DecodeChildInfo(const int32 child_info, int32 *parent, int32 **child_lm_state, float *logprob) const
std::vector< int32 > words
Symbols in left to right order.
void SetMyAddress(const int64 address)
std::pair< int32, union ChildType > GetChild(const int32 index)
void Read(std::istream &is)
Read ARPA LM file from a stream.
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
int32 * GetLmState(const std::vector< int32 > &seq) const
fst::StdArc::Weight Weight
std::vector< std::pair< int32, union ChildType > > children_
void AddChild(const int32 word, const float child_prob)
fst::StdArc::StateId StateId
#define KALDI_ASSERT(cond)
float GetNgramLogprob(const int32 word, const std::vector< int32 > &hist) const
bool operator<(const ArpaLine &other) const
bool operator()(const std::pair< int32, union ChildType > &lhs, const std::pair< int32, union ChildType > &rhs) const
void WriteKaldiObject(const C &c, const std::string &filename, bool binary)
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
float GetNgramLogprobRecurse(const int32 word, const std::vector< int32 > &hist) const
bool BuildConstArpaLm(const ArpaParseOptions &options, const std::string &arpa_rxfilename, const std::string &const_arpa_wxfilename)
virtual void ConsumeNGram(const NGram &ngram)
Pure override that must be implemented to process the current n-gram.
virtual Weight Final(StateId s)
std::vector< int32 > words
int32 NumChildren() const
A parsed n-gram from ARPA LM file.
int32 max_address_offset_
ConstArpaLmBuilder(ArpaParseOptions options)
bool IsChildFinalOrder() const
void ReadInternal(std::istream &is, bool binary)
bool HistoryStateExists(const std::vector< int32 > &hist) const
float BackoffLogprob() const
void WriteArpa(std::ostream &os) const
bool GetChildInfo(const int32 word, int32 *parent, int32 *child_info) const