fstext-utils.h
Go to the documentation of this file.
1 // fstext/fstext-utils.h
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 // 2012-2013 Johns Hopkins University (Author: Daniel Povey)
5 // 2013 Guoguo Chen
6 // 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder)
7 
8 // See ../../COPYING for clarification regarding multiple authors
9 //
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 //
14 // http://www.apache.org/licenses/LICENSE-2.0
15 //
16 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
18 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
19 // MERCHANTABLITY OR NON-INFRINGEMENT.
20 // See the Apache 2 License for the specific language governing permissions and
21 // limitations under the License.
22 
23 #ifndef KALDI_FSTEXT_FSTEXT_UTILS_H_
24 #define KALDI_FSTEXT_FSTEXT_UTILS_H_
25 #include <algorithm>
26 #include <map>
27 #include <set>
28 #include <vector>
29 #include <fst/fstlib.h>
30 #include <fst/fst-decl.h>
33 #include "base/kaldi-common.h" // for error reporting macros.
34 #include "util/text-utils.h" // for SplitStringToVector
35 #include "fst/script/print-impl.h"
36 
37 namespace fst {
38 
41 template<class Arc>
42 typename Arc::Label HighestNumberedOutputSymbol(const Fst<Arc> &fst);
43 
46 template<class Arc>
47 typename Arc::Label HighestNumberedInputSymbol(const Fst<Arc> &fst);
48 
50 template<class Arc>
51 typename Arc::StateId NumArcs(const ExpandedFst<Arc> &fst);
52 
56 template<class Arc, class I>
57 void GetInputSymbols(const Fst<Arc> &fst,
58  bool include_eps,
59  std::vector<I> *symbols);
60 
63 template<class Arc, class I>
64 void GetOutputSymbols(const Fst<Arc> &fst,
65  bool include_eps,
66  std::vector<I> *symbols);
67 
71 template<class Arc>
72 void ClearSymbols(bool clear_input,
73  bool clear_output,
74  MutableFst<Arc> *fst);
75 
76 template<class I>
77 void GetSymbols(const SymbolTable &symtab,
78  bool include_eps,
79  std::vector<I> *syms_out);
80 
81 
82 
83 inline
84 void DeterminizeStarInLog(VectorFst<StdArc> *fst, float delta = kDelta, bool *debug_ptr = NULL,
85  int max_states = -1);
86 
87 
88 // e.g. of using this function: PushInLog<REWEIGHT_TO_INITIAL>(fst, kPushWeights|kPushLabels);
89 
90 template<ReweightType rtype> // == REWEIGHT_TO_{INITIAL, FINAL}
91 void PushInLog(VectorFst<StdArc> *fst, uint32 ptype, float delta = kDelta) {
92 
93  // PushInLog pushes the FST
94  // and returns a new pushed FST (labels and weights pushed to the left).
95  VectorFst<LogArc> *fst_log = new VectorFst<LogArc>; // Want to determinize in log semiring.
96  Cast(*fst, fst_log);
97  VectorFst<StdArc> tmp;
98  *fst = tmp; // free up memory.
99  VectorFst<LogArc> *fst_pushed_log = new VectorFst<LogArc>;
100  Push<LogArc, rtype>(*fst_log, fst_pushed_log, ptype, delta);
101  Cast(*fst_pushed_log, fst);
102  delete fst_log;
103  delete fst_pushed_log;
104 }
105 
106 // Minimizes after encoding; applicable to all FSTs. It is like what you get
107 // from the Minimize() function, except it will not push the weights, or the
108 // symbols. This is better for our recipes, as we avoid ever pushing the
109 // weights. However, it will only minimize optimally if your graphs are such
110 // that the symbols are as far to the left as they can go, and the weights
111 // in combinable paths are the same... hard to formalize this, but it's something
112 // that is satisified by our normal FSTs.
113 template<class Arc>
114 void MinimizeEncoded(VectorFst<Arc> *fst, float delta = kDelta) {
115 
116  Map(fst, QuantizeMapper<Arc>(delta));
117  EncodeMapper<Arc> encoder(kEncodeLabels | kEncodeWeights, ENCODE);
118  Encode(fst, &encoder);
119  internal::AcceptorMinimize(fst);
120  Decode(fst, encoder);
121 }
122 
123 
132 template<class Arc, class I>
133 bool GetLinearSymbolSequence(const Fst<Arc> &fst,
134  std::vector<I> *isymbols_out,
135  std::vector<I> *osymbols_out,
136  typename Arc::Weight *tot_weight_out);
137 
138 
144 template<class Arc>
145 void ConvertNbestToVector(const Fst<Arc> &fst,
146  std::vector<VectorFst<Arc> > *fsts_out);
147 
148 
153 template<class Arc>
154 void NbestAsFsts(const Fst<Arc> &fst,
155  size_t n,
156  std::vector<VectorFst<Arc> > *fsts_out);
157 
158 
159 
160 
162 template<class Arc, class I>
163 void MakeLinearAcceptor(const std::vector<I> &labels, MutableFst<Arc> *ofst);
164 
165 
166 
170 template<class Arc, class I>
171 void MakeLinearAcceptorWithAlternatives(const std::vector<std::vector<I> > &labels,
172  MutableFst<Arc> *ofst);
173 
174 
181 
182 template<class Arc>
183 void SafeDeterminizeWrapper(MutableFst<Arc> *ifst, MutableFst<Arc> *ofst, float delta = kDelta);
184 
185 
188 template<class Arc>
189 void SafeDeterminizeMinimizeWrapper(MutableFst<Arc> *ifst, VectorFst<Arc> *ofst, float delta = kDelta);
190 
191 
194 void SafeDeterminizeMinimizeWrapperInLog(VectorFst<StdArc> *ifst, VectorFst<StdArc> *ofst, float delta = kDelta);
195 
196 
197 
200 template<class Arc, class I>
201 void RemoveSomeInputSymbols(const std::vector<I> &to_remove,
202  MutableFst<Arc> *fst);
203 
204 // MapInputSymbols will replace any input symbol i that is between 0 and
205 // symbol_map.size()-1, with symbol_map[i]. It removes the input symbol
206 // table of the FST.
207 template<class Arc, class I>
208 void MapInputSymbols(const std::vector<I> &symbol_map,
209  MutableFst<Arc> *fst);
210 
211 
212 template<class Arc>
213 void RemoveWeights(MutableFst<Arc> *fst);
214 
215 
216 
217 
222 template<class Arc>
223 bool PrecedingInputSymbolsAreSame(bool start_is_epsilon, const Fst<Arc> &fst);
224 
225 
234 template<class Arc, class F>
235 bool PrecedingInputSymbolsAreSameClass(bool start_is_epsilon, const Fst<Arc> &fst, const F &f);
236 
237 
242 template<class Arc>
243 bool FollowingInputSymbolsAreSame(bool end_is_epsilon, const Fst<Arc> &fst);
244 
245 
246 template<class Arc, class F>
247 bool FollowingInputSymbolsAreSameClass(bool end_is_epsilon, const Fst<Arc> &fst, const F &f);
248 
249 
257 template<class Arc>
258 void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst<Arc> *fst);
259 
260 
262 template<class Arc, class F>
263 void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst<Arc> *fst, const F &f);
264 
265 
275 template<class Arc>
276 void MakeFollowingInputSymbolsSame(bool end_is_epsilon, MutableFst<Arc> *fst);
277 
279 template<class Arc, class F>
280 void MakeFollowingInputSymbolsSameClass(bool end_is_epsilon, MutableFst<Arc> *fst, const F &f);
281 
282 
283 
284 
294 
301 
305 
306 template<class Arc>
307 VectorFst<Arc>* MakeLoopFst(const std::vector<const ExpandedFst<Arc> *> &fsts);
308 
309 
314 template<class Arc>
315 void ApplyProbabilityScale(float scale, MutableFst<Arc> *fst);
316 
317 
318 
319 
320 
332 template<class Arc>
333 bool EqualAlign(const Fst<Arc> &ifst, typename Arc::StateId length,
334  int rand_seed, MutableFst<Arc> *ofst, int num_retries = 10);
335 
336 
337 
338 // RemoveUselessArcs removes arcs such that there is no input symbol
339 // sequence for which the best path through the FST would contain
340 // those arcs [for these purposes, epsilon is not treated as a real symbol].
341 // This is mainly geared towards decoding-graph FSTs which may contain
342 // transitions that have less likely words on them that would never be
343 // taken. We do not claim that this algorithm removes all such arcs;
344 // it just does the best job it can.
345 // Only works for tropical (not log) semiring as it uses
346 // NaturalLess.
347 template<class Arc>
348 void RemoveUselessArcs(MutableFst<Arc> *fst);
349 
350 
351 // PhiCompose is a version of composition where
352 // the right hand FST (fst2) is treated as a backoff
353 // LM, with the phi symbol (e.g. #0) treated as a
354 // "failure transition", only taken when we don't
355 // have a match for the requested symbol.
356 template<class Arc>
357 void PhiCompose(const Fst<Arc> &fst1,
358  const Fst<Arc> &fst2,
359  typename Arc::Label phi_label,
360  MutableFst<Arc> *fst);
361 
362 // PropagateFinal propagates final-probs through
363 // "phi" transitions (note that here, phi_label may
364 // be epsilon if you want). If you have a backoff LM
365 // with special symbols ("phi") on the backoff arcs
366 // instead of epsilon, you may use PhiCompose to compose
367 // with it, but this won't do the right thing w.r.t.
368 // final probabilities. You should first call PropagateFinal
369 // on the FST with phi's i it (fst2 in PhiCompose above),
370 // to fix this. If a state does not have a final-prob,
371 // but has a phi transition, it makes the state's final-prob
372 // (phi-prob * final-prob-of-dest-state), and does this
373 // recursively i.e. follows phi transitions on the dest state
374 // first. It behaves as if there were a super-final state
375 // with a special symbol leading to it, from each currently
376 // final state. Note that this may not behave as desired
377 // if there are epsilons in your FST; it might be better
378 // to remove those before calling this function.
379 
380 template<class Arc>
381 void PropagateFinal(typename Arc::Label phi_label,
382  MutableFst<Arc> *fst);
383 
384 // PhiCompose is a version of composition where
385 // the right hand FST (fst2) has speciall "rho transitions"
386 // which are taken whenever no normal transition matches; these
387 // transitions will be rewritten with whatever symbol was on
388 // the first FST.
389 template<class Arc>
390 void RhoCompose(const Fst<Arc> &fst1,
391  const Fst<Arc> &fst2,
392  typename Arc::Label rho_label,
393  MutableFst<Arc> *fst);
394 
395 
396 
408 template<class Arc>
409 bool IsStochasticFst(const Fst<Arc> &fst,
410  float delta = kDelta, // kDelta = 1.0/1024.0 by default.
411  typename Arc::Weight *min_sum = NULL,
412  typename Arc::Weight *max_sum = NULL);
413 
414 
415 
416 
417 // IsStochasticFstInLog makes sure it's stochastic after casting to log.
418 inline bool IsStochasticFstInLog(const Fst<StdArc> &fst,
419  float delta = kDelta, // kDelta = 1.0/1024.0 by default.
420  StdArc::Weight *min_sum = NULL,
421  StdArc::Weight *max_sum = NULL);
422 
423 
424 } // end namespace fst
425 
426 
427 #include "fstext/fstext-utils-inl.h"
428 
429 #endif
fst::StdArc::StateId StateId
void GetSymbols(const SymbolTable &symtab, bool include_eps, std::vector< I > *syms_out)
void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst< Arc > *fst)
MakePrecedingInputSymbolsSame ensures that all arcs entering any given fst state have the same input ...
void PropagateFinal(typename Arc::Label phi_label, MutableFst< Arc > *fst)
For an extended explanation of the framework of which grammar-fsts are a part, please see Support for...
Definition: graph.dox:21
void PhiCompose(const Fst< Arc > &fst1, const Fst< Arc > &fst2, typename Arc::Label phi_label, MutableFst< Arc > *ofst)
void MinimizeEncoded(VectorFst< Arc > *fst, float delta=kDelta)
Definition: fstext-utils.h:114
void ClearSymbols(bool clear_input, bool clear_output, MutableFst< Arc > *fst)
ClearSymbols sets all the symbols on the input and/or output side of the FST to zero, as specified.
bool GetLinearSymbolSequence(const Fst< Arc > &fst, std::vector< I > *isymbols_out, std::vector< I > *osymbols_out, typename Arc::Weight *tot_weight_out)
GetLinearSymbolSequence gets the symbol sequence from a linear FST.
void GetInputSymbols(const Fst< Arc > &fst, bool include_eps, std::vector< I > *symbols)
GetInputSymbols gets the list of symbols on the input of fst (including epsilon, if include_eps == tr...
void RemoveUselessArcs(MutableFst< Arc > *fst)
void MakeLinearAcceptorWithAlternatives(const std::vector< std::vector< I > > &labels, MutableFst< Arc > *ofst)
Creates an unweighted acceptor with a linear structure, with alternatives at each position...
void DeterminizeStarInLog(VectorFst< StdArc > *fst, float delta, bool *debug_ptr, int max_states)
void MakeLinearAcceptor(const std::vector< I > &labels, MutableFst< Arc > *ofst)
Creates unweighted linear acceptor from symbol sequence.
bool EqualAlign(const Fst< Arc > &ifst, typename Arc::StateId length, int rand_seed, MutableFst< Arc > *ofst, int num_retries)
EqualAlign is similar to RandGen, but it generates a sequence with exactly "length" input symbols...
void PushInLog(VectorFst< StdArc > *fst, uint32 ptype, float delta=kDelta)
Definition: fstext-utils.h:91
void ConvertNbestToVector(const Fst< Arc > &fst, std::vector< VectorFst< Arc > > *fsts_out)
This function converts an FST with a special structure, which is output by the OpenFst functions Shor...
void NbestAsFsts(const Fst< Arc > &fst, size_t n, std::vector< VectorFst< Arc > > *fsts_out)
Takes the n-shortest-paths (using ShortestPath), but outputs the result as a vector of up to n fsts...
struct rnnlm::@11::@12 n
Arc::Label HighestNumberedOutputSymbol(const Fst< Arc > &fst)
Returns the highest numbered output symbol id of the FST (or zero for an empty FST.
bool IsStochasticFstInLog(const Fst< StdArc > &fst, float delta, StdArc::Weight *min_sum, StdArc::Weight *max_sum)
VectorFst< Arc > * MakeLoopFst(const std::vector< const ExpandedFst< Arc > *> &fsts)
MakeLoopFst creates an FST that has a state that is both initial and final (weight == Weight::One())...
bool PrecedingInputSymbolsAreSame(bool start_is_epsilon, const Fst< Arc > &fst)
Returns true if and only if the FST is such that the input symbols on arcs entering any given state a...
bool IsStochasticFst(const Fst< LogArc > &fst, float delta, LogArc::Weight *min_sum, LogArc::Weight *max_sum)
fst::StdArc::Label Label
void ApplyProbabilityScale(float scale, MutableFst< Arc > *fst)
ApplyProbabilityScale is applicable to FSTs in the log or tropical semiring.
bool PrecedingInputSymbolsAreSameClass(bool start_is_epsilon, const Fst< Arc > &fst, const F &f)
This is as PrecedingInputSymbolsAreSame, but with a functor f that maps labels to classes...
void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst< Arc > *fst, const F &f)
As MakePrecedingInputSymbolsSame, but takes a functor object that maps labels to classes.
fst::StdArc::Weight Weight
bool FollowingInputSymbolsAreSame(bool end_is_epsilon, const Fst< Arc > &fst)
Returns true if and only if the FST is such that the input symbols on arcs exiting any given state al...
Arc::Label HighestNumberedInputSymbol(const Fst< Arc > &fst)
Returns the highest numbered input symbol id of the FST (or zero for an empty FST.
void MapInputSymbols(const std::vector< I > &symbol_mapping, MutableFst< Arc > *fst)
void RemoveWeights(MutableFst< Arc > *ifst)
void SafeDeterminizeWrapper(MutableFst< Arc > *ifst, MutableFst< Arc > *ofst, float delta)
Does PreDeterminize and DeterminizeStar and then removes the disambiguation symbols.
void SafeDeterminizeMinimizeWrapperInLog(VectorFst< StdArc > *ifst, VectorFst< StdArc > *ofst, float delta)
SafeDeterminizeMinimizeWapperInLog is as SafeDeterminizeMinimizeWrapper except it first casts tothe l...
void SafeDeterminizeMinimizeWrapper(MutableFst< Arc > *ifst, VectorFst< Arc > *ofst, float delta)
SafeDeterminizeMinimizeWapper is as SafeDeterminizeWrapper except that it also minimizes (encoded min...
void RhoCompose(const Fst< Arc > &fst1, const Fst< Arc > &fst2, typename Arc::Label rho_label, MutableFst< Arc > *ofst)
bool FollowingInputSymbolsAreSameClass(bool end_is_epsilon, const Fst< Arc > &fst, const F &f)
Arc::StateId NumArcs(const ExpandedFst< Arc > &fst)
Returns the total number of arcs in an FST.
void MakeFollowingInputSymbolsSameClass(bool end_is_epsilon, MutableFst< Arc > *fst, const F &f)
As MakeFollowingInputSymbolsSame, but takes a functor object that maps labels to classes.
void MakeFollowingInputSymbolsSame(bool end_is_epsilon, MutableFst< Arc > *fst)
MakeFollowingInputSymbolsSame ensures that all arcs exiting any given fst state have the same input s...
void GetOutputSymbols(const Fst< Arc > &fst, bool include_eps, std::vector< I > *symbols)
GetOutputSymbols gets the list of symbols on the output of fst (including epsilon, if include_eps == true)
void RemoveSomeInputSymbols(const std::vector< I > &to_remove, MutableFst< Arc > *fst)
RemoveSomeInputSymbols removes any symbol that appears in "to_remove", from the input side of the FST...