ConstArpaLm Class Reference

#include <const-arpa-lm.h>

Collaboration diagram for ConstArpaLm:

Public Member Functions

 ConstArpaLm ()
 
 ConstArpaLm (const int32 bos_symbol, const int32 eos_symbol, const int32 unk_symbol, const int32 ngram_order, const int32 num_words, const int32 overflow_buffer_size, const int64 lm_states_size, int32 **unigram_states, int32 **overflow_buffer, int32 *lm_states)
 
 ~ConstArpaLm ()
 
void Read (std::istream &is, bool binary)
 
void Write (std::ostream &os, bool binary) const
 
void WriteArpa (std::ostream &os) const
 
float GetNgramLogprob (const int32 word, const std::vector< int32 > &hist) const
 
bool HistoryStateExists (const std::vector< int32 > &hist) const
 
int32 BosSymbol () const
 
int32 EosSymbol () const
 
int32 UnkSymbol () const
 
int32 NgramOrder () const
 

Private Member Functions

void ReadInternal (std::istream &is, bool binary)
 
void ReadInternalOldFormat (std::istream &is, bool binary)
 
float GetNgramLogprobRecurse (const int32 word, const std::vector< int32 > &hist) const
 
int32 * GetLmState (const std::vector< int32 > &seq) const
 
bool GetChildInfo (const int32 word, int32 *parent, int32 *child_info) const
 
void DecodeChildInfo (const int32 child_info, int32 *parent, int32 **child_lm_state, float *logprob) const
 
void WriteArpaRecurse (int32 *lm_state, const std::vector< int32 > &seq, std::vector< ArpaLine > *output) const
 

Private Attributes

bool memory_assigned_
 
bool initialized_
 
int32 bos_symbol_
 
int32 eos_symbol_
 
int32 unk_symbol_
 
int32 ngram_order_
 
int32 num_words_
 
int32 overflow_buffer_size_
 
int64 lm_states_size_
 
int32 * lm_states_end_
 
int32 ** unigram_states_
 
int32 ** overflow_buffer_
 
int32 * lm_states_
 

Detailed Description

Definition at line 211 of file const-arpa-lm.h.

Constructor & Destructor Documentation

◆ ConstArpaLm() [1/2]

ConstArpaLm ( )
inline

Definition at line 216 of file const-arpa-lm.h.

216  {
217  lm_states_ = NULL;
218  unigram_states_ = NULL;
219  overflow_buffer_ = NULL;
220  memory_assigned_ = false;
221  initialized_ = false;
222  }
int32 ** unigram_states_
int32 ** overflow_buffer_

◆ ConstArpaLm() [2/2]

ConstArpaLm ( const int32  bos_symbol,
const int32  eos_symbol,
const int32  unk_symbol,
const int32  ngram_order,
const int32  num_words,
const int32  overflow_buffer_size,
const int64  lm_states_size,
int32 **  unigram_states,
int32 **  overflow_buffer,
int32 *  lm_states 
)
inline

Definition at line 226 of file const-arpa-lm.h.

References KALDI_ASSERT.

230  :
231  bos_symbol_(bos_symbol), eos_symbol_(eos_symbol),
232  unk_symbol_(unk_symbol), ngram_order_(ngram_order),
233  num_words_(num_words), overflow_buffer_size_(overflow_buffer_size),
234  lm_states_size_(lm_states_size), unigram_states_(unigram_states),
235  overflow_buffer_(overflow_buffer), lm_states_(lm_states) {
236  KALDI_ASSERT(unigram_states_ != NULL);
238  KALDI_ASSERT(lm_states_ != NULL);
240  KALDI_ASSERT(bos_symbol_ < num_words_ && bos_symbol_ > 0);
241  KALDI_ASSERT(eos_symbol_ < num_words_ && eos_symbol_ > 0);
242  KALDI_ASSERT(unk_symbol_ < num_words_ &&
243  (unk_symbol_ > 0 || unk_symbol_ == -1));
245  memory_assigned_ = false;
246  initialized_ = true;
247  }
int32 ** unigram_states_
int32 ** overflow_buffer_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ ~ConstArpaLm()

~ConstArpaLm ( )
inline

Definition at line 249 of file const-arpa-lm.h.

249  {
250  if (memory_assigned_) {
251  delete[] lm_states_;
252  delete[] unigram_states_;
253  delete[] overflow_buffer_;
254  }
255  }
int32 ** unigram_states_
int32 ** overflow_buffer_

Member Function Documentation

◆ BosSymbol()

int32 BosSymbol ( ) const
inline

Definition at line 276 of file const-arpa-lm.h.

Referenced by ConstArpaLmDeterministicFst::ConstArpaLmDeterministicFst().

276 { return bos_symbol_; }

◆ DecodeChildInfo()

void DecodeChildInfo ( const int32  child_info,
int32 *  parent,
int32 **  child_lm_state,
float *  logprob 
) const
private

Definition at line 882 of file const-arpa-lm.cc.

References Int32AndFloat::f, and KALDI_ASSERT.

885  {
887 
888  KALDI_ASSERT(logprob != NULL);
889  if (child_info % 2 == 0) {
890  // Child is a leaf, only returns the log probability.
891  *child_lm_state = NULL;
892  Int32AndFloat logprob_i(child_info);
893  *logprob = logprob_i.f;
894  } else {
895  int32 child_offset = child_info / 2;
896  if (child_offset > 0) {
897  *child_lm_state = parent + child_offset;
898  Int32AndFloat logprob_i(**child_lm_state);
899  *logprob = logprob_i.f;
900  } else {
901  KALDI_ASSERT(-child_offset < overflow_buffer_size_);
902  *child_lm_state = overflow_buffer_[-child_offset];
903  Int32AndFloat logprob_i(**child_lm_state);
904  *logprob = logprob_i.f;
905  }
906  KALDI_ASSERT(*child_lm_state >= lm_states_);
907  KALDI_ASSERT(*child_lm_state <= lm_states_end_);
908  }
909 }
kaldi::int32 int32
int32 ** overflow_buffer_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ EosSymbol()

int32 EosSymbol ( ) const
inline

Definition at line 277 of file const-arpa-lm.h.

Referenced by ConstArpaLmDeterministicFst::Final().

277 { return eos_symbol_; }

◆ GetChildInfo()

bool GetChildInfo ( const int32  word,
int32 *  parent,
int32 *  child_info 
) const
private

Definition at line 849 of file const-arpa-lm.cc.

References KALDI_ASSERT.

850  {
852 
853  KALDI_ASSERT(parent != NULL);
854  KALDI_ASSERT(parent >= lm_states_);
855  KALDI_ASSERT(child_info != NULL);
856 
857  KALDI_ASSERT(parent + 2 <= lm_states_end_);
858  int32 num_children = *(parent + 2);
859  KALDI_ASSERT(parent + 2 + 2 * num_children <= lm_states_end_);
860 
861  if (num_children == 0) return false;
862 
863  // A binary search into the children memory block.
864  int32 start_index = 1;
865  int32 end_index = num_children;
866  while (start_index <= end_index) {
867  int32 mid_index = round((start_index + end_index) / 2);
868  int32 mid_word = *(parent + 1 + 2 * mid_index);
869  if (mid_word == word) {
870  *child_info = *(parent + 2 + 2 * mid_index);
871  return true;
872  } else if (mid_word < word) {
873  start_index = mid_index + 1;
874  } else {
875  end_index = mid_index - 1;
876  }
877  }
878 
879  return false;
880 }
kaldi::int32 int32
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ GetLmState()

int32 * GetLmState ( const std::vector< int32 > &  seq) const
private

Definition at line 821 of file const-arpa-lm.cc.

References rnnlm::i, KALDI_ASSERT, and ArpaLine::logprob.

821  {
823 
824  // No LmState exists for empty word sequence.
825  if (seq.size() == 0) return NULL;
826 
827  // If <unk> is defined, then the word sequence should have already been mapped
828  // to <unk> is necessary; this is for the case where <unk> is not defined.
829  if (seq[0] >= num_words_ || unigram_states_[seq[0]] == NULL) return NULL;
830  int32* parent = unigram_states_[seq[0]];
831 
832  int32 child_info;
833  int32* child_lm_state = NULL;
834  float logprob;
835  for (int32 i = 1; i < seq.size(); ++i) {
836  if (!GetChildInfo(seq[i], parent, &child_info)) {
837  return NULL;
838  }
839  DecodeChildInfo(child_info, parent, &child_lm_state, &logprob);
840  if (child_lm_state == NULL) {
841  return NULL;
842  } else {
843  parent = child_lm_state;
844  }
845  }
846  return parent;
847 }
float logprob
kaldi::int32 int32
int32 ** unigram_states_
void DecodeChildInfo(const int32 child_info, int32 *parent, int32 **child_lm_state, float *logprob) const
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
bool GetChildInfo(const int32 word, int32 *parent, int32 *child_info) const

◆ GetNgramLogprob()

float GetNgramLogprob ( const int32  word,
const std::vector< int32 > &  hist 
) const

Definition at line 748 of file const-arpa-lm.cc.

References rnnlm::i, and KALDI_ASSERT.

Referenced by ConstArpaLmDeterministicFst::Final(), and ConstArpaLmDeterministicFst::GetArc().

749  {
751 
752  // If the history size plus one is larger than <ngram_order_>, remove the old
753  // words.
754  std::vector<int32> mapped_hist(hist);
755  while (mapped_hist.size() >= ngram_order_) {
756  mapped_hist.erase(mapped_hist.begin(), mapped_hist.begin() + 1);
757  }
758  KALDI_ASSERT(mapped_hist.size() + 1 <= ngram_order_);
759 
760  // TODO(guoguo): check with Dan if this is reasonable.
761  // Maps possible out-of-vocabulary words to <unk>. If a word does not have a
762  // corresponding LmState, we treat it as <unk>. We map it to <unk> if <unk> is
763  // specified.
764  int32 mapped_word = word;
765  if (unk_symbol_ != -1) {
766  KALDI_ASSERT(mapped_word >= 0);
767  if (mapped_word >= num_words_ || unigram_states_[mapped_word] == NULL) {
768  mapped_word = unk_symbol_;
769  }
770  for (int32 i = 0; i < mapped_hist.size(); ++i) {
771  KALDI_ASSERT(mapped_hist[i] >= 0);
772  if (mapped_hist[i] >= num_words_ ||
773  unigram_states_[mapped_hist[i]] == NULL) {
774  mapped_hist[i] = unk_symbol_;
775  }
776  }
777  }
778 
779  // Loops up n-gram probability.
780  return GetNgramLogprobRecurse(mapped_word, mapped_hist);
781 }
kaldi::int32 int32
int32 ** unigram_states_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
float GetNgramLogprobRecurse(const int32 word, const std::vector< int32 > &hist) const

◆ GetNgramLogprobRecurse()

float GetNgramLogprobRecurse ( const int32  word,
const std::vector< int32 > &  hist 
) const
private

Definition at line 783 of file const-arpa-lm.cc.

References ArpaLine::backoff_logprob, Int32AndFloat::f, KALDI_ASSERT, and ArpaLine::logprob.

784  {
786  KALDI_ASSERT(hist.size() + 1 <= ngram_order_);
787 
788  // Unigram case.
789  if (hist.size() == 0) {
790  if (word >= num_words_ || unigram_states_[word] == NULL) {
791  // If <unk> is defined, then the word sequence should have already been
792  // mapped to <unk> is necessary; this is for the case where <unk> is not
793  // defined.
794  return std::numeric_limits<float>::min();
795  } else {
796  Int32AndFloat logprob_i(*unigram_states_[word]);
797  return logprob_i.f;
798  }
799  }
800 
801  // High n-gram orders.
802  float logprob = 0.0;
803  float backoff_logprob = 0.0;
804  int32* state;
805  if ((state = GetLmState(hist)) != NULL) {
806  int32 child_info;
807  int32* child_lm_state = NULL;
808  if (GetChildInfo(word, state, &child_info)) {
809  DecodeChildInfo(child_info, state, &child_lm_state, &logprob);
810  return logprob;
811  } else {
812  Int32AndFloat backoff_logprob_i(*(state + 1));
813  backoff_logprob = backoff_logprob_i.f;
814  }
815  }
816  std::vector<int32> new_hist(hist);
817  new_hist.erase(new_hist.begin(), new_hist.begin() + 1);
818  return backoff_logprob + GetNgramLogprobRecurse(word, new_hist);
819 }
float logprob
kaldi::int32 int32
int32 ** unigram_states_
void DecodeChildInfo(const int32 child_info, int32 *parent, int32 **child_lm_state, float *logprob) const
int32 * GetLmState(const std::vector< int32 > &seq) const
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
float GetNgramLogprobRecurse(const int32 word, const std::vector< int32 > &hist) const
bool GetChildInfo(const int32 word, int32 *parent, int32 *child_info) const

◆ HistoryStateExists()

bool HistoryStateExists ( const std::vector< int32 > &  hist) const

Definition at line 721 of file const-arpa-lm.cc.

References KALDI_ASSERT.

Referenced by ConstArpaLmDeterministicFst::GetArc().

721  {
722  // We do not create LmState for empty word sequence, but technically it is the
723  // history state of all unigrams.
724  if (hist.size() == 0) {
725  return true;
726  }
727 
728  // Tries to locate the LmState of the given word sequence.
729  int32* lm_state = GetLmState(hist);
730  if (lm_state == NULL) {
731  // <lm_state> does not exist means <hist> has no child.
732  return false;
733  } else {
734  // Note that we always create LmState for unigrams, so even if <lm_state> is
735  // not NULL, we still have to check if it has child.
736  KALDI_ASSERT(lm_state >= lm_states_);
737  KALDI_ASSERT(lm_state + 2 <= lm_states_end_);
738  // <lm_state + 2> points to <num_children>.
739  if (*(lm_state + 2) > 0) {
740  return true;
741  } else {
742  return false;
743  }
744  }
745  return true;
746 }
kaldi::int32 int32
int32 * GetLmState(const std::vector< int32 > &seq) const
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ NgramOrder()

int32 NgramOrder ( ) const
inline

Definition at line 279 of file const-arpa-lm.h.

References logprob.

Referenced by ConstArpaLmDeterministicFst::GetArc().

279 { return ngram_order_; }

◆ Read()

void Read ( std::istream &  is,
bool  binary 
)

Definition at line 573 of file const-arpa-lm.cc.

References KALDI_ASSERT, and KALDI_ERR.

573  {
575  if (!binary) {
576  KALDI_ERR << "text-mode reading is not implemented for ConstArpaLm.";
577  }
578 
579  int first_char = is.peek();
580  if (first_char == 4) { // Old on-disk format starts with length of int32.
581  ReadInternalOldFormat(is, binary);
582  } else { // New on-disk format starts with token <ConstArpaLm>.
583  ReadInternal(is, binary);
584  }
585 }
void ReadInternalOldFormat(std::istream &is, bool binary)
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void ReadInternal(std::istream &is, bool binary)

◆ ReadInternal()

void ReadInternal ( std::istream &  is,
bool  binary 
)
private

Definition at line 587 of file const-arpa-lm.cc.

References kaldi::ExpectToken(), rnnlm::i, KALDI_ASSERT, KALDI_ERR, and kaldi::ReadBasicType().

587  {
589  if (!binary) {
590  KALDI_ERR << "text-mode reading is not implemented for ConstArpaLm.";
591  }
592 
593  ExpectToken(is, binary, "<ConstArpaLm>");
594 
595  // Misc info.
596  ExpectToken(is, binary, "<LmInfo>");
597  ReadBasicType(is, binary, &bos_symbol_);
598  ReadBasicType(is, binary, &eos_symbol_);
599  ReadBasicType(is, binary, &unk_symbol_);
600  ReadBasicType(is, binary, &ngram_order_);
601  ExpectToken(is, binary, "</LmInfo>");
602 
603  // LmStates section.
604  ExpectToken(is, binary, "<LmStates>");
605  ReadBasicType(is, binary, &lm_states_size_);
607  is.read(reinterpret_cast<char *>(lm_states_),
608  sizeof(int32) * lm_states_size_);
609  if (!is.good()) {
610  KALDI_ERR << "ConstArpaLm <LmStates> section reading failed.";
611  }
612  ExpectToken(is, binary, "</LmStates>");
613 
614  // Unigram section. We write memory offset to disk instead of the absolute
615  // pointers.
616  ExpectToken(is, binary, "<LmUnigram>");
617  ReadBasicType(is, binary, &num_words_);
619  int64* tmp_unigram_address = new int64[num_words_];
620  is.read(reinterpret_cast<char *>(tmp_unigram_address),
621  sizeof(int64) * num_words_);
622  if (!is.good()) {
623  KALDI_ERR << "ConstArpaLm <LmUnigram> section reading failed.";
624  }
625  for (int32 i = 0; i < num_words_; ++i) {
626  // Check out how we compute the relative address in ConstArpaLm::Write().
627  unigram_states_[i] = (tmp_unigram_address[i] == 0) ? NULL
628  : lm_states_ + tmp_unigram_address[i] - 1;
629  }
630  delete[] tmp_unigram_address;
631  tmp_unigram_address = NULL;
632  ExpectToken(is, binary, "</LmUnigram>");
633 
634  // Overflow section. We write memory offset to disk instead of the absolute
635  // pointers.
636  ExpectToken(is, binary, "<LmOverflow>");
637  ReadBasicType(is, binary, &overflow_buffer_size_);
639  int64* tmp_overflow_address = new int64[overflow_buffer_size_];
640  is.read(reinterpret_cast<char *>(tmp_overflow_address),
641  sizeof(int64) * overflow_buffer_size_);
642  if (!is.good()) {
643  KALDI_ERR << "ConstArpaLm <LmOverflow> section reading failed.";
644  }
645  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
646  // Check out how we compute the relative address in ConstArpaLm::Write().
647  overflow_buffer_[i] = (tmp_overflow_address[i] == 0) ? NULL
648  : lm_states_ + tmp_overflow_address[i] - 1;
649  }
650  delete[] tmp_overflow_address;
651  tmp_overflow_address = NULL;
652  ExpectToken(is, binary, "</LmOverflow>");
653  ExpectToken(is, binary, "</ConstArpaLm>");
654 
656  KALDI_ASSERT(bos_symbol_ < num_words_ && bos_symbol_ > 0);
657  KALDI_ASSERT(eos_symbol_ < num_words_ && eos_symbol_ > 0);
658  KALDI_ASSERT(unk_symbol_ < num_words_ &&
659  (unk_symbol_ > 0 || unk_symbol_ == -1));
660  lm_states_end_ = lm_states_ + lm_states_size_ - 1;
661  memory_assigned_ = true;
662  initialized_ = true;
663 }
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:55
kaldi::int32 int32
int32 ** unigram_states_
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
Definition: io-funcs.cc:191
#define KALDI_ERR
Definition: kaldi-error.h:147
int32 ** overflow_buffer_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ ReadInternalOldFormat()

void ReadInternalOldFormat ( std::istream &  is,
bool  binary 
)
private

Definition at line 665 of file const-arpa-lm.cc.

References rnnlm::i, KALDI_ASSERT, KALDI_ERR, and kaldi::ReadBasicType().

665  {
667  if (!binary) {
668  KALDI_ERR << "text-mode reading is not implemented for ConstArpaLm.";
669  }
670 
671  // Misc info.
672  ReadBasicType(is, binary, &bos_symbol_);
673  ReadBasicType(is, binary, &eos_symbol_);
674  ReadBasicType(is, binary, &unk_symbol_);
675  ReadBasicType(is, binary, &ngram_order_);
676 
677  // LmStates section.
678  // In the deprecated version, <lm_states_size_> used to be type of int32,
679  // which was a bug. We therefore use int32 for read for back-compatibility.
680  int32 lm_states_size_int32;
681  ReadBasicType(is, binary, &lm_states_size_int32);
682  lm_states_size_ = static_cast<int64>(lm_states_size_int32);
684  for (int64 i = 0; i < lm_states_size_; ++i) {
685  ReadBasicType(is, binary, &lm_states_[i]);
686  }
687 
688  // Unigram section. We write memory offset to disk instead of the absolute
689  // pointers.
690  ReadBasicType(is, binary, &num_words_);
692  for (int32 i = 0; i < num_words_; ++i) {
693  int64 tmp_address;
694  ReadBasicType(is, binary, &tmp_address);
695  // Check out how we compute the relative address in ConstArpaLm::Write().
696  unigram_states_[i] =
697  (tmp_address == 0) ? NULL : lm_states_ + tmp_address - 1;
698  }
699 
700  // Overflow section. We write memory offset to disk instead of the absolute
701  // pointers.
702  ReadBasicType(is, binary, &overflow_buffer_size_);
704  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
705  int64 tmp_address;
706  ReadBasicType(is, binary, &tmp_address);
707  // Check out how we compute the relative address in ConstArpaLm::Write().
708  overflow_buffer_[i] =
709  (tmp_address == 0) ? NULL : lm_states_ + tmp_address - 1;
710  }
712  KALDI_ASSERT(bos_symbol_ < num_words_ && bos_symbol_ > 0);
713  KALDI_ASSERT(eos_symbol_ < num_words_ && eos_symbol_ > 0);
714  KALDI_ASSERT(unk_symbol_ < num_words_ &&
715  (unk_symbol_ > 0 || unk_symbol_ == -1));
716  lm_states_end_ = lm_states_ + lm_states_size_ - 1;
717  memory_assigned_ = true;
718  initialized_ = true;
719 }
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:55
kaldi::int32 int32
int32 ** unigram_states_
#define KALDI_ERR
Definition: kaldi-error.h:147
int32 ** overflow_buffer_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ UnkSymbol()

int32 UnkSymbol ( ) const
inline

Definition at line 278 of file const-arpa-lm.h.

278 { return unk_symbol_; }

◆ Write()

void Write ( std::ostream &  os,
bool  binary 
) const

Definition at line 497 of file const-arpa-lm.cc.

References rnnlm::i, KALDI_ASSERT, KALDI_ERR, kaldi::WriteBasicType(), and kaldi::WriteToken().

Referenced by ConstArpaLmBuilder::Write().

497  {
499  if (!binary) {
500  KALDI_ERR << "text-mode writing is not implemented for ConstArpaLm.";
501  }
502 
503  WriteToken(os, binary, "<ConstArpaLm>");
504 
505  // Misc info.
506  WriteToken(os, binary, "<LmInfo>");
507  WriteBasicType(os, binary, bos_symbol_);
508  WriteBasicType(os, binary, eos_symbol_);
509  WriteBasicType(os, binary, unk_symbol_);
510  WriteBasicType(os, binary, ngram_order_);
511  WriteToken(os, binary, "</LmInfo>");
512 
513  // LmStates section.
514  WriteToken(os, binary, "<LmStates>");
515  WriteBasicType(os, binary, lm_states_size_);
516  os.write(reinterpret_cast<char *>(lm_states_),
517  sizeof(int32) * lm_states_size_);
518  if (!os.good()) {
519  KALDI_ERR << "ConstArpaLm <LmStates> section writing failed.";
520  }
521  WriteToken(os, binary, "</LmStates>");
522 
523  // Unigram section. We write memory offset to disk instead of the absolute
524  // pointers.
525  WriteToken(os, binary, "<LmUnigram>");
526  WriteBasicType(os, binary, num_words_);
527  int64* tmp_unigram_address = new int64[num_words_];
528  for (int32 i = 0; i < num_words_; ++i) {
529  // The relative address here is a little bit tricky:
530  // 1. If the original address is NULL, then we set the relative address to
531  // zero.
532  // 2. If the original address is not NULL, we set it to the following:
533  // unigram_states_[i] - lm_states_ + 1
534  // we plus 1 to ensure that the above value is positive.
535  tmp_unigram_address[i] = (unigram_states_[i] == NULL) ? 0 :
536  unigram_states_[i] - lm_states_ + 1;
537  }
538  os.write(reinterpret_cast<char *>(tmp_unigram_address),
539  sizeof(int64) * num_words_);
540  if (!os.good()) {
541  KALDI_ERR << "ConstArpaLm <LmUnigram> section writing failed.";
542  }
543  delete[] tmp_unigram_address; // Releases the memory.
544  tmp_unigram_address = NULL;
545  WriteToken(os, binary, "</LmUnigram>");
546 
547  // Overflow section. We write memory offset to disk instead of the absolute
548  // pointers.
549  WriteToken(os, binary, "<LmOverflow>");
551  int64* tmp_overflow_address = new int64[overflow_buffer_size_];
552  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
553  // The relative address here is a little bit tricky:
554  // 1. If the original address is NULL, then we set the relative address to
555  // zero.
556  // 2. If the original address is not NULL, we set it to the following:
557  // overflow_buffer_[i] - lm_states_ + 1
558  // we plus 1 to ensure that the above value is positive.
559  tmp_overflow_address[i] = (overflow_buffer_[i] == NULL) ? 0 :
560  overflow_buffer_[i] - lm_states_ + 1;
561  }
562  os.write(reinterpret_cast<char *>(tmp_overflow_address),
563  sizeof(int64) * overflow_buffer_size_);
564  if (!os.good()) {
565  KALDI_ERR << "ConstArpaLm <LmOverflow> section writing failed.";
566  }
567  delete[] tmp_overflow_address;
568  tmp_overflow_address = NULL;
569  WriteToken(os, binary, "</LmOverflow>");
570  WriteToken(os, binary, "</ConstArpaLm>");
571 }
kaldi::int32 int32
int32 ** unigram_states_
#define KALDI_ERR
Definition: kaldi-error.h:147
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
Definition: io-funcs.cc:134
int32 ** overflow_buffer_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:34

◆ WriteArpa()

void WriteArpa ( std::ostream &  os) const

Definition at line 952 of file const-arpa-lm.cc.

References ArpaLine::backoff_logprob, rnnlm::i, rnnlm::j, KALDI_ASSERT, and ArpaLine::words.

952  {
954 
955  std::vector<ArpaLine> tmp_output;
956  for (int32 i = 0; i < num_words_; ++i) {
957  if (unigram_states_[i] != NULL) {
958  std::vector<int32> seq(1, i);
959  WriteArpaRecurse(unigram_states_[i], seq, &tmp_output);
960  }
961  }
962 
963  // Sorts ArpaLines and collects head information.
964  std::sort(tmp_output.begin(), tmp_output.end());
965  std::vector<int32> ngram_count(1, 0);
966  for (int32 i = 0; i < tmp_output.size(); ++i) {
967  if (tmp_output[i].words.size() >= ngram_count.size()) {
968  ngram_count.resize(tmp_output[i].words.size() + 1);
969  ngram_count[tmp_output[i].words.size()] = 1;
970  } else {
971  ngram_count[tmp_output[i].words.size()] += 1;
972  }
973  }
974 
975  // Writes the header.
976  os << std::endl;
977  os << "\\data\\" << std::endl;
978  for (int32 i = 1; i < ngram_count.size(); ++i) {
979  os << "ngram " << i << "=" << ngram_count[i] << std::endl;
980  }
981 
982  // Writes n-grams.
983  int32 current_order = 0;
984  for (int32 i = 0; i < tmp_output.size(); ++i) {
985  // Beginning of a n-gram section.
986  if (tmp_output[i].words.size() != current_order) {
987  current_order = tmp_output[i].words.size();
988  os << std::endl;
989  os << "\\" << current_order << "-grams:" << std::endl;
990  }
991 
992  // Writes logprob.
993  os << tmp_output[i].logprob << '\t';
994 
995  // Writes word sequence.
996  for (int32 j = 0; j < tmp_output[i].words.size(); ++j) {
997  os << tmp_output[i].words[j];
998  if (j != tmp_output[i].words.size() - 1) {
999  os << " ";
1000  }
1001  }
1002 
1003  // Writes backoff_logprob if it is not zero.
1004  if (tmp_output[i].backoff_logprob != 0.0) {
1005  os << '\t' << tmp_output[i].backoff_logprob;
1006  }
1007  os << std::endl;
1008  }
1009 
1010  os << std::endl << "\\end\\" << std::endl;
1011 }
int32 words[kMaxOrder]
kaldi::int32 int32
int32 ** unigram_states_
void WriteArpaRecurse(int32 *lm_state, const std::vector< int32 > &seq, std::vector< ArpaLine > *output) const
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ WriteArpaRecurse()

void WriteArpaRecurse ( int32 *  lm_state,
const std::vector< int32 > &  seq,
std::vector< ArpaLine > *  output 
) const
private

Definition at line 911 of file const-arpa-lm.cc.

References ArpaLine::backoff_logprob, Int32AndFloat::f, rnnlm::i, KALDI_ASSERT, ArpaLine::logprob, and ArpaLine::words.

913  {
914  if (lm_state == NULL) return;
915 
916  KALDI_ASSERT(lm_state >= lm_states_);
917  KALDI_ASSERT(lm_state + 2 <= lm_states_end_);
918 
919  // Inserts the current LmState to <output>.
920  ArpaLine arpa_line;
921  arpa_line.words = seq;
922  Int32AndFloat logprob_i(*lm_state);
923  arpa_line.logprob = logprob_i.f;
924  Int32AndFloat backoff_logprob_i(*(lm_state + 1));
925  arpa_line.backoff_logprob = backoff_logprob_i.f;
926  output->push_back(arpa_line);
927 
928  // Scans for possible children, and recursively adds child to <output>.
929  int32 num_children = *(lm_state + 2);
930  KALDI_ASSERT(lm_state + 2 + 2 * num_children <= lm_states_end_);
931  for (int32 i = 0; i < num_children; ++i) {
932  std::vector<int32> new_seq(seq);
933  new_seq.push_back(*(lm_state + 3 + 2 * i));
934  int32 child_info = *(lm_state + 4 + 2 * i);
935  float logprob;
936  int32* child_lm_state = NULL;
937  DecodeChildInfo(child_info, lm_state, &child_lm_state, &logprob);
938 
939  if (child_lm_state == NULL) {
940  // Leaf case.
941  ArpaLine child_arpa_line;
942  child_arpa_line.words = new_seq;
943  child_arpa_line.logprob = logprob;
944  child_arpa_line.backoff_logprob = 0.0;
945  output->push_back(child_arpa_line);
946  } else {
947  WriteArpaRecurse(child_lm_state, new_seq, output);
948  }
949  }
950 }
float logprob
kaldi::int32 int32
void WriteArpaRecurse(int32 *lm_state, const std::vector< int32 > &seq, std::vector< ArpaLine > *output) const
void DecodeChildInfo(const int32 child_info, int32 *parent, int32 **child_lm_state, float *logprob) const
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

Member Data Documentation

◆ bos_symbol_

int32 bos_symbol_
private

Definition at line 332 of file const-arpa-lm.h.

◆ eos_symbol_

int32 eos_symbol_
private

Definition at line 335 of file const-arpa-lm.h.

◆ initialized_

bool initialized_
private

Definition at line 329 of file const-arpa-lm.h.

◆ lm_states_

int32* lm_states_
private

Definition at line 384 of file const-arpa-lm.h.

◆ lm_states_end_

int32* lm_states_end_
private

Definition at line 357 of file const-arpa-lm.h.

◆ lm_states_size_

int64 lm_states_size_
private

Definition at line 353 of file const-arpa-lm.h.

◆ memory_assigned_

bool memory_assigned_
private

Definition at line 326 of file const-arpa-lm.h.

◆ ngram_order_

int32 ngram_order_
private

Definition at line 342 of file const-arpa-lm.h.

◆ num_words_

int32 num_words_
private

Definition at line 346 of file const-arpa-lm.h.

◆ overflow_buffer_

int32** overflow_buffer_
private

Definition at line 368 of file const-arpa-lm.h.

◆ overflow_buffer_size_

int32 overflow_buffer_size_
private

Definition at line 350 of file const-arpa-lm.h.

◆ unigram_states_

int32** unigram_states_
private

Definition at line 362 of file const-arpa-lm.h.

◆ unk_symbol_

int32 unk_symbol_
private

Definition at line 339 of file const-arpa-lm.h.


The documentation for this class was generated from the following files: