All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
LatticeDeterminizer< Weight, IntType > Class Template Reference

#include <determinize-lattice-inl.h>

Collaboration diagram for LatticeDeterminizer< Weight, IntType >:

Classes

struct  Element
 
class  PairComparator
 
class  SubsetEqual
 
class  SubsetEqualStates
 
class  SubsetKey
 
struct  TempArc
 

Public Types

typedef
CompactLatticeWeightTpl
< Weight, IntType > 
CompactWeight
 
typedef ArcTpl< CompactWeightCompactArc
 
typedef ArcTpl< WeightArc
 

Public Member Functions

void Output (MutableFst< CompactArc > *ofst, bool destroy=true)
 
void Output (MutableFst< Arc > *ofst, bool destroy=true)
 
 LatticeDeterminizer (const Fst< Arc > &ifst, DeterminizeLatticeOptions opts)
 
void FreeMostMemory ()
 
 ~LatticeDeterminizer ()
 
void RebuildRepository ()
 
bool CheckMemoryUsage ()
 
bool Determinize (bool *debug_ptr)
 

Private Types

enum  IsymbolOrFinal { OSF_UNKNOWN = 0, OSF_NO = 1, OSF_YES = 2 }
 
typedef Arc::Label Label
 
typedef Arc::StateId StateId
 
typedef Arc::StateId InputStateId
 
typedef Arc::StateId OutputStateId
 
typedef
LatticeStringRepository
< IntType > 
StringRepositoryType
 
typedef const
StringRepositoryType::Entry * 
StringId
 
typedef unordered_map< const
vector< Element >
*, OutputStateId, SubsetKey,
SubsetEqual
MinimalSubsetHash
 
typedef unordered_map< const
vector< Element > *, Element,
SubsetKey, SubsetEqual
InitialSubsetHash
 

Private Member Functions

void ConvertToMinimal (vector< Element > *subset)
 
OutputStateId MinimalToStateId (const vector< Element > &subset)
 
OutputStateId InitialToStateId (const vector< Element > &subset_in, Weight *remaining_weight, StringId *common_prefix)
 
int Compare (const Weight &a_w, StringId a_str, const Weight &b_w, StringId b_str) const
 
void EpsilonClosure (vector< Element > *subset)
 
void ProcessFinal (OutputStateId output_state)
 
void NormalizeSubset (vector< Element > *elems, Weight *tot_weight, StringId *common_str)
 
void MakeSubsetUnique (vector< Element > *subset)
 
void ProcessTransition (OutputStateId state, Label ilabel, vector< Element > *subset)
 
void ProcessTransitions (OutputStateId output_state)
 
void ProcessState (OutputStateId output_state)
 
void Debug ()
 
bool IsIsymbolOrFinal (InputStateId state)
 
void InitializeDeterminization ()
 
 KALDI_DISALLOW_COPY_AND_ASSIGN (LatticeDeterminizer)
 

Private Attributes

vector< vector< Element > * > output_states_
 
vector< vector< TempArc > > output_arcs_
 
int num_arcs_
 
int num_elems_
 
const Fst< Arc > * ifst_
 
DeterminizeLatticeOptions opts_
 
SubsetKey hasher_
 
SubsetEqual equal_
 
bool determinized_
 
MinimalSubsetHash minimal_hash_
 
InitialSubsetHash initial_hash_
 
vector< OutputStateIdqueue_
 
vector< pair< Label, Element > > all_elems_tmp_
 
vector< char > isymbol_or_final_
 
LatticeStringRepository< IntType > repository_
 

Detailed Description

template<class Weight, class IntType>
class fst::LatticeDeterminizer< Weight, IntType >

Definition at line 258 of file determinize-lattice-inl.h.

Member Typedef Documentation

typedef ArcTpl<Weight> Arc

Definition at line 266 of file determinize-lattice-inl.h.

typedef ArcTpl<CompactWeight> CompactArc

Definition at line 265 of file determinize-lattice-inl.h.

Definition at line 264 of file determinize-lattice-inl.h.

typedef unordered_map<const vector<Element>*, Element, SubsetKey, SubsetEqual> InitialSubsetHash
private

Definition at line 641 of file determinize-lattice-inl.h.

typedef Arc::StateId InputStateId
private

Definition at line 532 of file determinize-lattice-inl.h.

typedef Arc::Label Label
private

Definition at line 530 of file determinize-lattice-inl.h.

typedef unordered_map<const vector<Element>*, OutputStateId, SubsetKey, SubsetEqual> MinimalSubsetHash
private

Definition at line 633 of file determinize-lattice-inl.h.

typedef Arc::StateId OutputStateId
private

Definition at line 533 of file determinize-lattice-inl.h.

typedef Arc::StateId StateId
private

Definition at line 531 of file determinize-lattice-inl.h.

typedef const StringRepositoryType::Entry* StringId
private

Definition at line 538 of file determinize-lattice-inl.h.

Definition at line 537 of file determinize-lattice-inl.h.

Member Enumeration Documentation

Constructor & Destructor Documentation

LatticeDeterminizer ( const Fst< Arc > &  ifst,
DeterminizeLatticeOptions  opts 
)
inline

Definition at line 400 of file determinize-lattice-inl.h.

References KALDI_ASSERT.

401  :
402  num_arcs_(0), num_elems_(0), ifst_(ifst.Copy()), opts_(opts),
403  equal_(opts_.delta), determinized_(false),
405  KALDI_ASSERT(Weight::Properties() & kIdempotent); // this algorithm won't
406  // work correctly otherwise.
407  }
DeterminizeLatticeOptions opts_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
~LatticeDeterminizer ( )
inline

Definition at line 431 of file determinize-lattice-inl.h.

431  {
432  FreeMostMemory(); // rest is deleted by destructors.
433  }

Member Function Documentation

bool CheckMemoryUsage ( )
inline

Definition at line 468 of file determinize-lattice-inl.h.

References KALDI_VLOG, and KALDI_WARN.

468  {
469  int32 repo_size = repository_.MemSize(),
470  arcs_size = num_arcs_ * sizeof(TempArc),
471  elems_size = num_elems_ * sizeof(Element),
472  total_size = repo_size + arcs_size + elems_size;
473  if (opts_.max_mem > 0 && total_size > opts_.max_mem) { // We passed the memory threshold.
474  // This is usually due to the repository getting large, so we
475  // clean this out.
477  int32 new_repo_size = repository_.MemSize(),
478  new_total_size = new_repo_size + arcs_size + elems_size;
479 
480  KALDI_VLOG(2) << "Rebuilt repository in determinize-lattice: repository shrank from "
481  << repo_size << " to " << new_repo_size << " bytes (approximately)";
482 
483  if (new_total_size > static_cast<int32>(opts_.max_mem * 0.8)) {
484  // Rebuilding didn't help enough-- we need a margin to stop
485  // having to rebuild too often.
486  KALDI_WARN << "Failure in determinize-lattice: size exceeds maximum "
487  << opts_.max_mem << " bytes; (repo,arcs,elems) = ("
488  << repo_size << "," << arcs_size << "," << elems_size
489  << "), after rebuilding, repo size was " << new_repo_size;
490  return false;
491  }
492  }
493  return true;
494  }
LatticeStringRepository< IntType > repository_
#define KALDI_WARN
Definition: kaldi-error.h:130
DeterminizeLatticeOptions opts_
#define KALDI_VLOG(v)
Definition: kaldi-error.h:136
int Compare ( const Weight a_w,
StringId  a_str,
const Weight b_w,
StringId  b_str 
) const
inlineprivate

Definition at line 733 of file determinize-lattice-inl.h.

References fst::Compare(), and rnnlm::i.

734  {
735  int weight_comp = fst::Compare(a_w, b_w);
736  if (weight_comp != 0) return weight_comp;
737  // now comparing strings.
738  if (a_str == b_str) return 0;
739  vector<IntType> a_vec, b_vec;
740  repository_.ConvertToVector(a_str, &a_vec);
741  repository_.ConvertToVector(b_str, &b_vec);
742  // First compare their lengths.
743  int a_len = a_vec.size(), b_len = b_vec.size();
744  // use opposite order on the string lengths (c.f. Compare in
745  // lattice-weight.h)
746  if (a_len > b_len) return -1;
747  else if (a_len < b_len) return 1;
748  for(int i = 0; i < a_len; i++) {
749  if (a_vec[i] < b_vec[i]) return -1;
750  else if (a_vec[i] > b_vec[i]) return 1;
751  }
752  assert(0); // because we checked if a_str == b_str above, shouldn't reach here
753  return 0;
754  }
LatticeStringRepository< IntType > repository_
int Compare(const LatticeWeightTpl< FloatType > &w1, const LatticeWeightTpl< FloatType > &w2)
Compare returns -1 if w1 < w2, +1 if w1 > w2, and 0 if w1 == w2.
void ConvertToMinimal ( vector< Element > *  subset)
inlineprivate

Definition at line 647 of file determinize-lattice-inl.h.

647  {
648  assert(!subset->empty());
649  typename vector<Element>::iterator cur_in = subset->begin(),
650  cur_out = subset->begin(), end = subset->end();
651  while (cur_in != end) {
652  if(IsIsymbolOrFinal(cur_in->state)) { // keep it...
653  *cur_out = *cur_in;
654  cur_out++;
655  }
656  cur_in++;
657  }
658  subset->resize(cur_out - subset->begin());
659  }
bool IsIsymbolOrFinal(InputStateId state)
void Debug ( )
inlineprivate

Definition at line 1080 of file determinize-lattice-inl.h.

References rnnlm::i, rnnlm::j, KALDI_ASSERT, KALDI_ERR, and KALDI_WARN.

1080  { // this function called if you send a signal
1081  // SIGUSR1 to the process (and it's caught by the handler in
1082  // fstdeterminizestar). It prints out some traceback
1083  // info and exits.
1084 
1085  KALDI_WARN << "Debug function called (probably SIGUSR1 caught)";
1086  // free up memory from the hash as we need a little memory
1087  { MinimalSubsetHash hash_tmp; hash_tmp.swap(minimal_hash_); }
1088 
1089  if (output_arcs_.size() <= 2) {
1090  KALDI_ERR << "Nothing to trace back";
1091  }
1092  size_t max_state = output_arcs_.size() - 2; // Don't take the last
1093  // one as we might be halfway into constructing it.
1094 
1095  vector<OutputStateId> predecessor(max_state+1, kNoStateId);
1096  for (size_t i = 0; i < max_state; i++) {
1097  for (size_t j = 0; j < output_arcs_[i].size(); j++) {
1098  OutputStateId nextstate = output_arcs_[i][j].nextstate;
1099  // Always find an earlier-numbered predecessor; this
1100  // is always possible because of the way the algorithm
1101  // works.
1102  if (nextstate <= max_state && nextstate > i)
1103  predecessor[nextstate] = i;
1104  }
1105  }
1106  vector<pair<Label, StringId> > traceback;
1107  // 'traceback' is a pair of (ilabel, olabel-seq).
1108  OutputStateId cur_state = max_state; // A recently constructed state.
1109 
1110  while (cur_state != 0 && cur_state != kNoStateId) {
1111  OutputStateId last_state = predecessor[cur_state];
1112  pair<Label, StringId> p;
1113  size_t i;
1114  for (i = 0; i < output_arcs_[last_state].size(); i++) {
1115  if (output_arcs_[last_state][i].nextstate == cur_state) {
1116  p.first = output_arcs_[last_state][i].ilabel;
1117  p.second = output_arcs_[last_state][i].string;
1118  traceback.push_back(p);
1119  break;
1120  }
1121  }
1122  KALDI_ASSERT(i != output_arcs_[last_state].size()); // Or fell off loop.
1123  cur_state = last_state;
1124  }
1125  if (cur_state == kNoStateId)
1126  KALDI_WARN << "Traceback did not reach start state "
1127  << "(possibly debug-code error)";
1128 
1129  std::stringstream ss;
1130  ss << "Traceback follows in format "
1131  << "ilabel (olabel olabel) ilabel (olabel) ... :";
1132  for (ssize_t i = traceback.size() - 1; i >= 0; i--) {
1133  ss << ' ' << traceback[i].first << " ( ";
1134  vector<Label> seq;
1135  repository_.ConvertToVector(traceback[i].second, &seq);
1136  for (size_t j = 0; j < seq.size(); j++)
1137  ss << seq[j] << ' ';
1138  ss << ')';
1139  }
1140  KALDI_ERR << ss.str();
1141  }
LatticeStringRepository< IntType > repository_
#define KALDI_ERR
Definition: kaldi-error.h:127
#define KALDI_WARN
Definition: kaldi-error.h:130
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:169
unordered_map< const vector< Element > *, OutputStateId, SubsetKey, SubsetEqual > MinimalSubsetHash
vector< vector< TempArc > > output_arcs_
bool Determinize ( bool *  debug_ptr)
inline

Definition at line 498 of file determinize-lattice-inl.h.

References KALDI_WARN.

Referenced by fst::DeterminizeLattice().

498  {
499  assert(!determinized_);
500  // This determinizes the input fst but leaves it in the "special format"
501  // in "output_arcs_". Must be called after Initialize(). To get the
502  // output, call one of the Output routines.
503  try {
504  InitializeDeterminization(); // some start-up tasks.
505  while (!queue_.empty()) {
506  OutputStateId out_state = queue_.back();
507  queue_.pop_back();
508  ProcessState(out_state);
509  if (debug_ptr && *debug_ptr) Debug(); // will exit.
510  if (!CheckMemoryUsage()) return false;
511  }
512  return (determinized_ = true);
513  } catch (std::bad_alloc) {
514  int32 repo_size = repository_.MemSize(),
515  arcs_size = num_arcs_ * sizeof(TempArc),
516  elems_size = num_elems_ * sizeof(Element),
517  total_size = repo_size + arcs_size + elems_size;
518  KALDI_WARN << "Memory allocation error doing lattice determinization; using "
519  << total_size << " bytes (max = " << opts_.max_mem
520  << " (repo,arcs,elems) = ("
521  << repo_size << "," << arcs_size << "," << elems_size << ")";
522  return (determinized_ = false);
523  } catch (std::runtime_error) {
524  KALDI_WARN << "Caught exception doing lattice determinization";
525  return (determinized_ = false);
526  }
527  }
LatticeStringRepository< IntType > repository_
#define KALDI_WARN
Definition: kaldi-error.h:130
vector< OutputStateId > queue_
DeterminizeLatticeOptions opts_
void ProcessState(OutputStateId output_state)
void EpsilonClosure ( vector< Element > *  subset)
inlineprivate

Definition at line 762 of file determinize-lattice-inl.h.

References fst::Compare(), KALDI_ERR, LatticeDeterminizer< Weight, IntType >::Element::state, LatticeDeterminizer< Weight, IntType >::Element::string, fst::Times(), and LatticeDeterminizer< Weight, IntType >::Element::weight.

762  {
763  // at input, subset must have only one example of each StateId. [will still
764  // be so at output]. This function follows input-epsilons, and augments the
765  // subset accordingly.
766 
767  std::deque<Element> queue;
768  unordered_map<InputStateId, Element> cur_subset;
769  typedef typename unordered_map<InputStateId, Element>::iterator MapIter;
770  typedef typename vector<Element>::const_iterator VecIter;
771 
772  for (VecIter iter = subset->begin(); iter != subset->end(); ++iter) {
773  queue.push_back(*iter);
774  cur_subset[iter->state] = *iter;
775  }
776 
777  // find whether input fst is known to be sorted on input label.
778  bool sorted = ((ifst_->Properties(kILabelSorted, false) & kILabelSorted) != 0);
779  bool replaced_elems = false; // relates to an optimization, see below.
780  int counter = 0; // stops infinite loops here for non-lattice-determinizable input;
781  // useful in testing.
782  while (queue.size() != 0) {
783  Element elem = queue.front();
784  queue.pop_front();
785 
786  // The next if-statement is a kind of optimization. It's to prevent us
787  // unnecessarily repeating the processing of a state. "cur_subset" always
788  // contains only one Element with a particular state. The issue is that
789  // whenever we modify the Element corresponding to that state in "cur_subset",
790  // both the new (optimal) and old (less-optimal) Element will still be in
791  // "queue". The next if-statement stops us from wasting compute by
792  // processing the old Element.
793  if (replaced_elems && cur_subset[elem.state] != elem)
794  continue;
795  if (opts_.max_loop > 0 && counter++ > opts_.max_loop) {
796  KALDI_ERR << "Lattice determinization aborted since looped more than "
797  << opts_.max_loop << " times during epsilon closure";
798  }
799  for (ArcIterator<Fst<Arc> > aiter(*ifst_, elem.state); !aiter.Done(); aiter.Next()) {
800  const Arc &arc = aiter.Value();
801  if (sorted && arc.ilabel != 0) break; // Break from the loop: due to sorting there will be no
802  // more transitions with epsilons as input labels.
803  if (arc.ilabel == 0
804  && arc.weight != Weight::Zero()) { // Epsilon transition.
805  Element next_elem;
806  next_elem.state = arc.nextstate;
807  next_elem.weight = Times(elem.weight, arc.weight);
808  // now must append strings
809  if (arc.olabel == 0)
810  next_elem.string = elem.string;
811  else
812  next_elem.string = repository_.Successor(elem.string, arc.olabel);
813 
814  MapIter iter = cur_subset.find(next_elem.state);
815  if (iter == cur_subset.end()) {
816  // was no such StateId: insert and add to queue.
817  cur_subset[next_elem.state] = next_elem;
818  queue.push_back(next_elem);
819  } else {
820  // was not inserted because one already there. In normal determinization we'd
821  // add the weights. Here, we find which one has the better weight, and
822  // keep its corresponding string.
823  int comp = Compare(next_elem.weight, next_elem.string,
824  iter->second.weight, iter->second.string);
825  if(comp == 1) { // next_elem is better, so use its (weight, string)
826  iter->second.string = next_elem.string;
827  iter->second.weight = next_elem.weight;
828  queue.push_back(next_elem);
829  replaced_elems = true;
830  }
831  // else it is the same or worse, so use original one.
832  }
833  }
834  }
835  }
836 
837  { // copy cur_subset to subset.
838  subset->clear();
839  subset->reserve(cur_subset.size());
840  MapIter iter = cur_subset.begin(), end = cur_subset.end();
841  for (; iter != end; ++iter) subset->push_back(iter->second);
842  // sort by state ID, because the subset hash function is order-dependent(see SubsetKey)
843  std::sort(subset->begin(), subset->end());
844  }
845  }
LatticeStringRepository< IntType > repository_
LatticeWeightTpl< FloatType > Times(const LatticeWeightTpl< FloatType > &w1, const LatticeWeightTpl< FloatType > &w2)
#define KALDI_ERR
Definition: kaldi-error.h:127
DeterminizeLatticeOptions opts_
int Compare(const Weight &a_w, StringId a_str, const Weight &b_w, StringId b_str) const
void FreeMostMemory ( )
inline

Definition at line 411 of file determinize-lattice-inl.h.

References LatticeDeterminizer< Weight, IntType >::ifst_, and LatticeDeterminizer< Weight, IntType >::minimal_hash_.

Referenced by LatticeDeterminizer< Weight, IntType >::Output().

411  {
412  if (ifst_) {
413  delete ifst_;
414  ifst_ = NULL;
415  }
416  for (typename MinimalSubsetHash::iterator iter = minimal_hash_.begin();
417  iter != minimal_hash_.end(); ++iter)
418  delete iter->first;
419  { MinimalSubsetHash tmp; tmp.swap(minimal_hash_); }
420  for (typename InitialSubsetHash::iterator iter = initial_hash_.begin();
421  iter != initial_hash_.end(); ++iter)
422  delete iter->first;
423  { InitialSubsetHash tmp; tmp.swap(initial_hash_); }
424  { vector<vector<Element>* > output_states_tmp;
425  output_states_tmp.swap(output_states_); }
426  { vector<char> tmp; tmp.swap(isymbol_or_final_); }
427  { vector<OutputStateId> tmp; tmp.swap(queue_); }
428  { vector<pair<Label, Element> > tmp; tmp.swap(all_elems_tmp_); }
429  }
unordered_map< const vector< Element > *, Element, SubsetKey, SubsetEqual > InitialSubsetHash
vector< OutputStateId > queue_
unordered_map< const vector< Element > *, OutputStateId, SubsetKey, SubsetEqual > MinimalSubsetHash
vector< pair< Label, Element > > all_elems_tmp_
vector< vector< Element > * > output_states_
void InitializeDeterminization ( )
inlineprivate

Definition at line 1169 of file determinize-lattice-inl.h.

References LatticeDeterminizer< Weight, IntType >::Element::state, LatticeDeterminizer< Weight, IntType >::Element::string, and LatticeDeterminizer< Weight, IntType >::Element::weight.

1169  {
1170  if(ifst_->Properties(kExpanded, false) != 0) { // if we know the number of
1171  // states in ifst_, it might be a bit more efficient
1172  // to pre-size the hashes so we're not constantly rebuilding them.
1173 #if !(__GNUC__ == 4 && __GNUC_MINOR__ == 0)
1174  StateId num_states =
1175  down_cast<const ExpandedFst<Arc>*, const Fst<Arc> >(ifst_)->NumStates();
1176  minimal_hash_.rehash(num_states/2 + 3);
1177  initial_hash_.rehash(num_states/2 + 3);
1178 #endif
1179  }
1180  InputStateId start_id = ifst_->Start();
1181  if (start_id != kNoStateId) {
1182  /* Insert determinized-state corresponding to the start state into hash and
1183  queue. Unlike all the other states, we don't "normalize" the representation
1184  of this determinized-state before we put it into minimal_hash_. This is actually
1185  what we want, as otherwise we'd have problems dealing with any extra weight
1186  and string and might have to create a "super-initial" state which would make
1187  the output nondeterministic. Normalization is only needed to make the
1188  determinized output more minimal anyway, it's not needed for correctness.
1189  Note, we don't put anything in the initial_hash_. The initial_hash_ is only
1190  a lookaside buffer anyway, so this isn't a problem-- it will get populated
1191  later if it needs to be.
1192  */
1193  Element elem;
1194  elem.state = start_id;
1195  elem.weight = Weight::One();
1196  elem.string = repository_.EmptyString(); // Id of empty sequence.
1197  vector<Element> subset;
1198  subset.push_back(elem);
1199  EpsilonClosure(&subset); // follow through epsilon-inputs links
1200  ConvertToMinimal(&subset); // remove all but final states and
1201  // states with input-labels on arcs out of them.
1202  vector<Element> *subset_ptr = new vector<Element>(subset);
1203  assert(output_arcs_.empty() && output_states_.empty());
1204  // add the new state...
1205  output_states_.push_back(subset_ptr);
1206  output_arcs_.push_back(vector<TempArc>());
1207  OutputStateId initial_state = 0;
1208  minimal_hash_[subset_ptr] = initial_state;
1209  queue_.push_back(initial_state);
1210  }
1211  }
void EpsilonClosure(vector< Element > *subset)
LatticeStringRepository< IntType > repository_
vector< OutputStateId > queue_
void ConvertToMinimal(vector< Element > *subset)
vector< vector< TempArc > > output_arcs_
vector< vector< Element > * > output_states_
OutputStateId InitialToStateId ( const vector< Element > &  subset_in,
Weight remaining_weight,
StringId common_prefix 
)
inlineprivate

Definition at line 682 of file determinize-lattice-inl.h.

References KALDI_WARN, LatticeDeterminizer< Weight, IntType >::Element::state, LatticeDeterminizer< Weight, IntType >::Element::string, and LatticeDeterminizer< Weight, IntType >::Element::weight.

684  {
685  typename InitialSubsetHash::const_iterator iter
686  = initial_hash_.find(&subset_in);
687  if (iter != initial_hash_.end()) { // Found a matching subset.
688  const Element &elem = iter->second;
689  *remaining_weight = elem.weight;
690  *common_prefix = elem.string;
691  if (elem.weight == Weight::Zero())
692  KALDI_WARN << "Zero weight!"; // TEMP
693  return elem.state;
694  }
695  // else no matching subset-- have to work it out.
696  vector<Element> subset(subset_in);
697  // Follow through epsilons. Will add no duplicate states. note: after
698  // EpsilonClosure, it is the same as "canonical" subset, except not
699  // normalized (actually we never compute the normalized canonical subset,
700  // only the normalized minimal one).
701  EpsilonClosure(&subset); // follow epsilons.
702  ConvertToMinimal(&subset); // remove all but emitting and final states.
703 
704  Element elem; // will be used to store remaining weight and string, and
705  // OutputStateId, in initial_hash_;
706  NormalizeSubset(&subset, &elem.weight, &elem.string); // normalize subset; put
707  // common string and weight in "elem". The subset is now a minimal,
708  // normalized subset.
709 
710  OutputStateId ans = MinimalToStateId(subset);
711  *remaining_weight = elem.weight;
712  *common_prefix = elem.string;
713  if (elem.weight == Weight::Zero())
714  KALDI_WARN << "Zero weight!"; // TEMP
715 
716  // Before returning "ans", add the initial subset to the hash,
717  // so that we can bypass the epsilon-closure etc., next time
718  // we process the same initial subset.
719  vector<Element> *initial_subset_ptr = new vector<Element>(subset_in);
720  elem.state = ans;
721  initial_hash_[initial_subset_ptr] = elem;
722  num_elems_ += initial_subset_ptr->size(); // keep track of memory usage.
723  return ans;
724  }
void EpsilonClosure(vector< Element > *subset)
#define KALDI_WARN
Definition: kaldi-error.h:130
OutputStateId MinimalToStateId(const vector< Element > &subset)
void NormalizeSubset(vector< Element > *elems, Weight *tot_weight, StringId *common_str)
void ConvertToMinimal(vector< Element > *subset)
bool IsIsymbolOrFinal ( InputStateId  state)
inlineprivate

Definition at line 1143 of file determinize-lattice-inl.h.

1143  { // returns true if this state
1144  // of the input FST either is final or has an osymbol on an arc out of it.
1145  // Uses the vector isymbol_or_final_ as a cache for this info.
1146  assert(state >= 0);
1147  if (isymbol_or_final_.size() <= state)
1148  isymbol_or_final_.resize(state+1, static_cast<char>(OSF_UNKNOWN));
1149  if (isymbol_or_final_[state] == static_cast<char>(OSF_NO))
1150  return false;
1151  else if (isymbol_or_final_[state] == static_cast<char>(OSF_YES))
1152  return true;
1153  // else work it out...
1154  isymbol_or_final_[state] = static_cast<char>(OSF_NO);
1155  if (ifst_->Final(state) != Weight::Zero())
1156  isymbol_or_final_[state] = static_cast<char>(OSF_YES);
1157  for (ArcIterator<Fst<Arc> > aiter(*ifst_, state);
1158  !aiter.Done();
1159  aiter.Next()) {
1160  const Arc &arc = aiter.Value();
1161  if (arc.ilabel != 0 && arc.weight != Weight::Zero()) {
1162  isymbol_or_final_[state] = static_cast<char>(OSF_YES);
1163  return true;
1164  }
1165  }
1166  return IsIsymbolOrFinal(state); // will only recurse once.
1167  }
bool IsIsymbolOrFinal(InputStateId state)
KALDI_DISALLOW_COPY_AND_ASSIGN ( LatticeDeterminizer< Weight, IntType >  )
private
void MakeSubsetUnique ( vector< Element > *  subset)
inlineprivate

Definition at line 924 of file determinize-lattice-inl.h.

References fst::Compare().

924  {
925  typedef typename vector<Element>::iterator IterType;
926 
927  // This assert is designed to fail (usually) if the subset is not sorted on
928  // state.
929  assert(subset->size() < 2 || (*subset)[0].state <= (*subset)[1].state);
930 
931  IterType cur_in = subset->begin(), cur_out = cur_in, end = subset->end();
932  size_t num_out = 0;
933  // Merge elements with same state-id
934  while (cur_in != end) { // while we have more elements to process.
935  // At this point, cur_out points to location of next place we want to put an element,
936  // cur_in points to location of next element we want to process.
937  if (cur_in != cur_out) *cur_out = *cur_in;
938  cur_in++;
939  while (cur_in != end && cur_in->state == cur_out->state) {
940  if (Compare(cur_in->weight, cur_in->string,
941  cur_out->weight, cur_out->string) == 1) {
942  // if *cur_in > *cur_out in semiring, then take *cur_in.
943  cur_out->string = cur_in->string;
944  cur_out->weight = cur_in->weight;
945  }
946  cur_in++;
947  }
948  cur_out++;
949  num_out++;
950  }
951  subset->resize(num_out);
952  }
int Compare(const Weight &a_w, StringId a_str, const Weight &b_w, StringId b_str) const
OutputStateId MinimalToStateId ( const vector< Element > &  subset)
inlineprivate

Definition at line 664 of file determinize-lattice-inl.h.

664  {
665  typename MinimalSubsetHash::const_iterator iter
666  = minimal_hash_.find(&subset);
667  if (iter != minimal_hash_.end()) // Found a matching subset.
668  return iter->second;
669  OutputStateId ans = static_cast<OutputStateId>(output_arcs_.size());
670  vector<Element> *subset_ptr = new vector<Element>(subset);
671  output_states_.push_back(subset_ptr);
672  num_elems_ += subset_ptr->size();
673  output_arcs_.push_back(vector<TempArc>());
674  minimal_hash_[subset_ptr] = ans;
675  queue_.push_back(ans);
676  return ans;
677  }
vector< OutputStateId > queue_
vector< vector< TempArc > > output_arcs_
vector< vector< Element > * > output_states_
void NormalizeSubset ( vector< Element > *  elems,
Weight tot_weight,
StringId common_str 
)
inlineprivate

Definition at line 891 of file determinize-lattice-inl.h.

References fst::Divide(), rnnlm::i, KALDI_WARN, and fst::Plus().

893  {
894  if(elems->empty()) { // just set common_str, tot_weight
895  KALDI_WARN << "[empty subset]"; // TEMP
896  // to defaults and return...
897  *common_str = repository_.EmptyString();
898  *tot_weight = Weight::Zero();
899  return;
900  }
901  size_t size = elems->size();
902  vector<IntType> common_prefix;
903  repository_.ConvertToVector((*elems)[0].string, &common_prefix);
904  Weight weight = (*elems)[0].weight;
905  for (size_t i = 1; i < size; i++) {
906  weight = Plus(weight, (*elems)[i].weight);
907  repository_.ReduceToCommonPrefix((*elems)[i].string, &common_prefix);
908  }
909  assert(weight != Weight::Zero()); // we made sure to ignore arcs with zero
910  // weights on them, so we shouldn't have zero here.
911  size_t prefix_len = common_prefix.size();
912  for (size_t i = 0; i < size; i++) {
913  (*elems)[i].weight = Divide((*elems)[i].weight, weight, DIVIDE_LEFT);
914  (*elems)[i].string =
915  repository_.RemovePrefix((*elems)[i].string, prefix_len);
916  }
917  *common_str = repository_.ConvertFromVector(common_prefix);
918  *tot_weight = weight;
919  }
LatticeWeightTpl< FloatType > Divide(const LatticeWeightTpl< FloatType > &w1, const LatticeWeightTpl< FloatType > &w2, DivideType typ=DIVIDE_ANY)
LatticeWeightTpl< FloatType > Plus(const LatticeWeightTpl< FloatType > &w1, const LatticeWeightTpl< FloatType > &w2)
LatticeStringRepository< IntType > repository_
#define KALDI_WARN
Definition: kaldi-error.h:130
fst::StdArc::Weight Weight
void Output ( MutableFst< CompactArc > *  ofst,
bool  destroy = true 
)
inline

Definition at line 272 of file determinize-lattice-inl.h.

References LatticeDeterminizer< Weight, IntType >::determinized_, LatticeDeterminizer< Weight, IntType >::FreeMostMemory(), LatticeDeterminizer< Weight, IntType >::TempArc::ilabel, LatticeDeterminizer< Weight, IntType >::TempArc::nextstate, LatticeDeterminizer< Weight, IntType >::output_arcs_, LatticeDeterminizer< Weight, IntType >::repository_, LatticeDeterminizer< Weight, IntType >::TempArc::string, kaldi::swap(), and LatticeDeterminizer< Weight, IntType >::TempArc::weight.

Referenced by fst::DeterminizeLattice().

272  {
273  assert(determinized_);
274  typedef typename Arc::StateId StateId;
275  StateId nStates = static_cast<StateId>(output_arcs_.size());
276  if (destroy)
277  FreeMostMemory();
278  ofst->DeleteStates();
279  ofst->SetStart(kNoStateId);
280  if (nStates == 0) {
281  return;
282  }
283  for (StateId s = 0;s < nStates;s++) {
284  OutputStateId news = ofst->AddState();
285  assert(news == s);
286  }
287  ofst->SetStart(0);
288  // now process transitions.
289  for (StateId this_state = 0; this_state < nStates; this_state++) {
290  vector<TempArc> &this_vec(output_arcs_[this_state]);
291  typename vector<TempArc>::const_iterator iter = this_vec.begin(), end = this_vec.end();
292 
293  for (;iter != end; ++iter) {
294  const TempArc &temp_arc(*iter);
295  CompactArc new_arc;
296  vector<Label> seq;
297  repository_.ConvertToVector(temp_arc.string, &seq);
298  CompactWeight weight(temp_arc.weight, seq);
299  if (temp_arc.nextstate == kNoStateId) { // is really final weight.
300  ofst->SetFinal(this_state, weight);
301  } else { // is really an arc.
302  new_arc.nextstate = temp_arc.nextstate;
303  new_arc.ilabel = temp_arc.ilabel;
304  new_arc.olabel = temp_arc.ilabel; // acceptor. input == output.
305  new_arc.weight = weight; // includes string and weight.
306  ofst->AddArc(this_state, new_arc);
307  }
308  }
309  // Free up memory. Do this inside the loop as ofst is also allocating memory
310  if (destroy) { vector<TempArc> temp; std::swap(temp, this_vec); }
311  }
312  if (destroy) { vector<vector<TempArc> > temp; std::swap(temp, output_arcs_); }
313  }
fst::StdArc::StateId StateId
LatticeStringRepository< IntType > repository_
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
ArcTpl< CompactWeight > CompactArc
CompactLatticeWeightTpl< Weight, IntType > CompactWeight
vector< vector< TempArc > > output_arcs_
void Output ( MutableFst< Arc > *  ofst,
bool  destroy = true 
)
inline

Definition at line 318 of file determinize-lattice-inl.h.

References LatticeDeterminizer< Weight, IntType >::FreeMostMemory(), rnnlm::i, LatticeDeterminizer< Weight, IntType >::TempArc::ilabel, LatticeDeterminizer< Weight, IntType >::TempArc::nextstate, LatticeDeterminizer< Weight, IntType >::output_arcs_, LatticeDeterminizer< Weight, IntType >::repository_, LatticeDeterminizer< Weight, IntType >::TempArc::string, and LatticeDeterminizer< Weight, IntType >::TempArc::weight.

318  {
319  // Outputs to standard fst.
320  OutputStateId nStates = static_cast<OutputStateId>(output_arcs_.size());
321  ofst->DeleteStates();
322  if (nStates == 0) {
323  ofst->SetStart(kNoStateId);
324  return;
325  }
326  if (destroy)
327  FreeMostMemory();
328  // Add basic states-- but we will add extra ones to account for strings on output.
329  for (OutputStateId s = 0;s < nStates;s++) {
330  OutputStateId news = ofst->AddState();
331  assert(news == s);
332  }
333  ofst->SetStart(0);
334  for (OutputStateId this_state = 0; this_state < nStates; this_state++) {
335  vector<TempArc> &this_vec(output_arcs_[this_state]);
336 
337  typename vector<TempArc>::const_iterator iter = this_vec.begin(), end = this_vec.end();
338  for (; iter != end; ++iter) {
339  const TempArc &temp_arc(*iter);
340  vector<Label> seq;
341  repository_.ConvertToVector(temp_arc.string, &seq);
342 
343  if (temp_arc.nextstate == kNoStateId) { // Really a final weight.
344  // Make a sequence of states going to a final state, with the strings
345  // as labels. Put the weight on the first arc.
346  OutputStateId cur_state = this_state;
347  for (size_t i = 0; i < seq.size(); i++) {
348  OutputStateId next_state = ofst->AddState();
349  Arc arc;
350  arc.nextstate = next_state;
351  arc.weight = (i == 0 ? temp_arc.weight : Weight::One());
352  arc.ilabel = 0; // epsilon.
353  arc.olabel = seq[i];
354  ofst->AddArc(cur_state, arc);
355  cur_state = next_state;
356  }
357  ofst->SetFinal(cur_state, (seq.size() == 0 ? temp_arc.weight : Weight::One()));
358  } else { // Really an arc.
359  OutputStateId cur_state = this_state;
360  // Have to be careful with this integer comparison (i+1 < seq.size()) because unsigned.
361  // i < seq.size()-1 could fail for zero-length sequences.
362  for (size_t i = 0; i+1 < seq.size();i++) {
363  // for all but the last element of seq, create new state.
364  OutputStateId next_state = ofst->AddState();
365  Arc arc;
366  arc.nextstate = next_state;
367  arc.weight = (i == 0 ? temp_arc.weight : Weight::One());
368  arc.ilabel = (i == 0 ? temp_arc.ilabel : 0); // put ilabel on first element of seq.
369  arc.olabel = seq[i];
370  ofst->AddArc(cur_state, arc);
371  cur_state = next_state;
372  }
373  // Add the final arc in the sequence.
374  Arc arc;
375  arc.nextstate = temp_arc.nextstate;
376  arc.weight = (seq.size() <= 1 ? temp_arc.weight : Weight::One());
377  arc.ilabel = (seq.size() <= 1 ? temp_arc.ilabel : 0);
378  arc.olabel = (seq.size() > 0 ? seq.back() : 0);
379  ofst->AddArc(cur_state, arc);
380  }
381  }
382  // Free up memory. Do this inside the loop as ofst is also allocating memory
383  if (destroy) {
384  vector<TempArc> temp; temp.swap(this_vec);
385  }
386  }
387  if (destroy) {
388  vector<vector<TempArc> > temp;
389  temp.swap(output_arcs_);
390  repository_.Destroy();
391  }
392  }
LatticeStringRepository< IntType > repository_
vector< vector< TempArc > > output_arcs_
void ProcessFinal ( OutputStateId  output_state)
inlineprivate

Definition at line 852 of file determinize-lattice-inl.h.

References fst::Compare(), LatticeDeterminizer< Weight, IntType >::TempArc::ilabel, LatticeDeterminizer< Weight, IntType >::TempArc::nextstate, LatticeDeterminizer< Weight, IntType >::Element::state, LatticeDeterminizer< Weight, IntType >::Element::string, LatticeDeterminizer< Weight, IntType >::TempArc::string, fst::Times(), LatticeDeterminizer< Weight, IntType >::Element::weight, and LatticeDeterminizer< Weight, IntType >::TempArc::weight.

852  {
853  const vector<Element> &minimal_subset = *(output_states_[output_state]);
854  // processes final-weights for this subset.
855 
856  // minimal_subset may be empty if the graphs is not connected/trimmed, I think,
857  // do don't check that it's nonempty.
858  bool is_final = false;
859  StringId final_string = NULL; // = NULL to keep compiler happy.
860  Weight final_weight = Weight::Zero();
861  typename vector<Element>::const_iterator iter = minimal_subset.begin(), end = minimal_subset.end();
862  for (; iter != end; ++iter) {
863  const Element &elem = *iter;
864  Weight this_final_weight = Times(elem.weight, ifst_->Final(elem.state));
865  StringId this_final_string = elem.string;
866  if (this_final_weight != Weight::Zero() &&
867  (!is_final || Compare(this_final_weight, this_final_string,
868  final_weight, final_string) == 1)) { // the new
869  // (weight, string) pair is more in semiring than our current
870  // one.
871  is_final = true;
872  final_weight = this_final_weight;
873  final_string = this_final_string;
874  }
875  }
876  if (is_final) {
877  // store final weights in TempArc structure, just like a transition.
878  TempArc temp_arc;
879  temp_arc.ilabel = 0;
880  temp_arc.nextstate = kNoStateId; // special marker meaning "final weight".
881  temp_arc.string = final_string;
882  temp_arc.weight = final_weight;
883  output_arcs_[output_state].push_back(temp_arc);
884  num_arcs_++;
885  }
886  }
const StringRepositoryType::Entry * StringId
LatticeWeightTpl< FloatType > Times(const LatticeWeightTpl< FloatType > &w1, const LatticeWeightTpl< FloatType > &w2)
fst::StdArc::Weight Weight
int Compare(const Weight &a_w, StringId a_str, const Weight &b_w, StringId b_str) const
vector< vector< TempArc > > output_arcs_
vector< vector< Element > * > output_states_
void ProcessState ( OutputStateId  output_state)
inlineprivate

Definition at line 1074 of file determinize-lattice-inl.h.

1074  {
1075  ProcessFinal(output_state);
1076  ProcessTransitions(output_state);
1077  }
void ProcessTransitions(OutputStateId output_state)
void ProcessFinal(OutputStateId output_state)
void ProcessTransition ( OutputStateId  state,
Label  ilabel,
vector< Element > *  subset 
)
inlineprivate

Definition at line 962 of file determinize-lattice-inl.h.

References LatticeDeterminizer< Weight, IntType >::TempArc::ilabel, LatticeDeterminizer< Weight, IntType >::TempArc::nextstate, LatticeDeterminizer< Weight, IntType >::TempArc::string, fst::Times(), and LatticeDeterminizer< Weight, IntType >::TempArc::weight.

962  {
963  MakeSubsetUnique(subset); // remove duplicates with the same state.
964 
965  StringId common_str;
966  Weight tot_weight;
967  NormalizeSubset(subset, &tot_weight, &common_str);
968 
969  OutputStateId nextstate;
970  {
971  Weight next_tot_weight;
972  StringId next_common_str;
973  nextstate = InitialToStateId(*subset,
974  &next_tot_weight,
975  &next_common_str);
976  common_str = repository_.Concatenate(common_str, next_common_str);
977  tot_weight = Times(tot_weight, next_tot_weight);
978  }
979 
980  // Now add an arc to the next state (would have been created if necessary by
981  // InitialToStateId).
982  TempArc temp_arc;
983  temp_arc.ilabel = ilabel;
984  temp_arc.nextstate = nextstate;
985  temp_arc.string = common_str;
986  temp_arc.weight = tot_weight;
987  output_arcs_[state].push_back(temp_arc); // record the arc.
988  num_arcs_++;
989  }
const StringRepositoryType::Entry * StringId
OutputStateId InitialToStateId(const vector< Element > &subset_in, Weight *remaining_weight, StringId *common_prefix)
LatticeStringRepository< IntType > repository_
void MakeSubsetUnique(vector< Element > *subset)
LatticeWeightTpl< FloatType > Times(const LatticeWeightTpl< FloatType > &w1, const LatticeWeightTpl< FloatType > &w2)
fst::StdArc::Weight Weight
void NormalizeSubset(vector< Element > *elems, Weight *tot_weight, StringId *common_str)
vector< vector< TempArc > > output_arcs_
void ProcessTransitions ( OutputStateId  output_state)
inlineprivate

Definition at line 1018 of file determinize-lattice-inl.h.

References LatticeDeterminizer< Weight, IntType >::Element::state, LatticeDeterminizer< Weight, IntType >::Element::string, fst::Times(), and LatticeDeterminizer< Weight, IntType >::Element::weight.

1018  {
1019  const vector<Element> &minimal_subset = *(output_states_[output_state]);
1020  // it's possible that minimal_subset could be empty if there are
1021  // unreachable parts of the graph, so don't check that it's nonempty.
1022  vector<pair<Label, Element> > &all_elems(all_elems_tmp_); // use class member
1023  // to avoid memory allocation/deallocation.
1024  {
1025  // Push back into "all_elems", elements corresponding to all
1026  // non-epsilon-input transitions out of all states in "minimal_subset".
1027  typename vector<Element>::const_iterator iter = minimal_subset.begin(), end = minimal_subset.end();
1028  for (;iter != end; ++iter) {
1029  const Element &elem = *iter;
1030  for (ArcIterator<Fst<Arc> > aiter(*ifst_, elem.state); ! aiter.Done(); aiter.Next()) {
1031  const Arc &arc = aiter.Value();
1032  if (arc.ilabel != 0
1033  && arc.weight != Weight::Zero()) { // Non-epsilon transition -- ignore epsilons here.
1034  pair<Label, Element> this_pr;
1035  this_pr.first = arc.ilabel;
1036  Element &next_elem(this_pr.second);
1037  next_elem.state = arc.nextstate;
1038  next_elem.weight = Times(elem.weight, arc.weight);
1039  if (arc.olabel == 0) // output epsilon
1040  next_elem.string = elem.string;
1041  else
1042  next_elem.string = repository_.Successor(elem.string, arc.olabel);
1043  all_elems.push_back(this_pr);
1044  }
1045  }
1046  }
1047  }
1048  PairComparator pc;
1049  std::sort(all_elems.begin(), all_elems.end(), pc);
1050  // now sorted first on input label, then on state.
1051  typedef typename vector<pair<Label, Element> >::const_iterator PairIter;
1052  PairIter cur = all_elems.begin(), end = all_elems.end();
1053  vector<Element> this_subset;
1054  while (cur != end) {
1055  // Process ranges that share the same input symbol.
1056  Label ilabel = cur->first;
1057  this_subset.clear();
1058  while (cur != end && cur->first == ilabel) {
1059  this_subset.push_back(cur->second);
1060  cur++;
1061  }
1062  // We now have a subset for this ilabel.
1063  assert(!this_subset.empty()); // temp.
1064  ProcessTransition(output_state, ilabel, &this_subset);
1065  }
1066  all_elems.clear(); // as it's a class variable-- want it to stay
1067  // emtpy.
1068  }
LatticeStringRepository< IntType > repository_
LatticeWeightTpl< FloatType > Times(const LatticeWeightTpl< FloatType > &w1, const LatticeWeightTpl< FloatType > &w2)
void ProcessTransition(OutputStateId state, Label ilabel, vector< Element > *subset)
vector< pair< Label, Element > > all_elems_tmp_
vector< vector< Element > * > output_states_
void RebuildRepository ( )
inline

Definition at line 434 of file determinize-lattice-inl.h.

References rnnlm::i, rnnlm::j, and LatticeDeterminizer< Weight, IntType >::Element::string.

434  { // rebuild the string repository,
435  // freeing stuff we don't need.. we call this when memory usage
436  // passes a supplied threshold. We need to accumulate all the
437  // strings we need the repository to "remember", then tell it
438  // to clean the repository.
439  std::vector<StringId> needed_strings;
440  for (size_t i = 0; i < output_arcs_.size(); i++)
441  for (size_t j = 0; j < output_arcs_[i].size(); j++)
442  needed_strings.push_back(output_arcs_[i][j].string);
443 
444  // the following loop covers strings present in minimal_hash_
445  // which are also accessible via output_states_.
446  for (size_t i = 0; i < output_states_.size(); i++)
447  for (size_t j = 0; j < output_states_[i]->size(); j++)
448  needed_strings.push_back((*(output_states_[i]))[j].string);
449 
450  // the following loop covers strings present in initial_hash_.
451  for (typename InitialSubsetHash::const_iterator
452  iter = initial_hash_.begin();
453  iter != initial_hash_.end(); ++iter) {
454  const vector<Element> &vec = *(iter->first);
455  Element elem = iter->second;
456  for (size_t i = 0; i < vec.size(); i++)
457  needed_strings.push_back(vec[i].string);
458  needed_strings.push_back(elem.string);
459  }
460 
461  std::sort(needed_strings.begin(), needed_strings.end());
462  needed_strings.erase(std::unique(needed_strings.begin(),
463  needed_strings.end()),
464  needed_strings.end()); // uniq the strings.
465  repository_.Rebuild(needed_strings);
466  }
LatticeStringRepository< IntType > repository_
vector< vector< TempArc > > output_arcs_
vector< vector< Element > * > output_states_

Member Data Documentation

vector<pair<Label, Element> > all_elems_tmp_
private

Definition at line 1248 of file determinize-lattice-inl.h.

bool determinized_
private
SubsetEqual equal_
private

Definition at line 1228 of file determinize-lattice-inl.h.

SubsetKey hasher_
private

Definition at line 1227 of file determinize-lattice-inl.h.

const Fst<Arc>* ifst_
private
InitialSubsetHash initial_hash_
private

Definition at line 1235 of file determinize-lattice-inl.h.

vector<char> isymbol_or_final_
private

Definition at line 1252 of file determinize-lattice-inl.h.

MinimalSubsetHash minimal_hash_
private
int num_arcs_
private

Definition at line 1222 of file determinize-lattice-inl.h.

int num_elems_
private

Definition at line 1223 of file determinize-lattice-inl.h.

Definition at line 1226 of file determinize-lattice-inl.h.

vector<vector<TempArc> > output_arcs_
private
vector<vector<Element>* > output_states_
private

Definition at line 1216 of file determinize-lattice-inl.h.

vector<OutputStateId> queue_
private

Definition at line 1244 of file determinize-lattice-inl.h.

LatticeStringRepository<IntType> repository_
private

The documentation for this class was generated from the following file: