lattice-to-kws-index.cc
Go to the documentation of this file.
1 // kwsbin/lattice-to-kws-index.cc
2 
3 // Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
4 // Lucas Ondel
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 
22 #include "base/kaldi-common.h"
23 #include "util/common-utils.h"
24 #include "fstext/fstext-utils.h"
25 #include "lat/kaldi-lattice.h"
26 #include "lat/lattice-functions.h"
27 #include "kws/kaldi-kws.h"
28 #include "kws/kws-functions.h"
30 
31 int main(int argc, char *argv[]) {
32  try {
33  using namespace kaldi;
34  using fst::VectorFst;
35  typedef kaldi::int32 int32;
36  typedef kaldi::uint64 uint64;
37 
38  const char *usage =
39  "Create an inverted index of the given lattices. The output index is \n"
40  "in the T*T*T semiring. For details for the semiring, please refer to\n"
41  "Dogan Can and Murat Saraclar's paper named "
42  "\"Lattice Indexing for Spoken Term Detection\"\n"
43  "\n"
44  "Usage: lattice-to-kws-index [options] "
45  " <utter-symtab-rspecifier> <lattice-rspecifier> <index-wspecifier>\n"
46  "e.g.: \n"
47  " lattice-to-kws-index ark:utter.symtab ark:1.lats ark:global.idx\n";
48 
49  ParseOptions po(usage);
50 
51  int32 frame_subsampling_factor = 1;
52  int32 max_silence_frames = 50;
53  bool strict = true;
54  bool allow_partial = true;
55  BaseFloat max_states_scale = 4;
56  po.Register("frame-subsampling-factor", &frame_subsampling_factor,
57  "Frame subsampling factor. (Default value 1)");
58  po.Register("max-silence-frames", &max_silence_frames,
59  "If --frame-subsampling-factor is used, --max-silence-frames "
60  "is relative to the the input, not the output frame rate "
61  "(we divide by frame-subsampling-factor and round to "
62  "the closest integer, to get the number of symbols in the "
63  "lattice).");
64  po.Register("strict", &strict, "Setting --strict=false will cause "
65  "successful termination even if we processed no lattices.");
66  po.Register("max-states-scale", &max_states_scale, "Number of states in the"
67  " original lattice times this scale is the number of states "
68  "allowed when optimizing the index. Negative number means no "
69  "limit on the number of states.");
70  po.Register("allow-partial", &allow_partial, "Allow partial output if fails"
71  " to determinize, otherwise skip determinization if it fails.");
72 
73  po.Read(argc, argv);
74 
75  if (po.NumArgs() != 3) {
76  po.PrintUsage();
77  exit(1);
78  }
79 
80  max_silence_frames = 0.5 +
81  max_silence_frames / static_cast<float>(frame_subsampling_factor);
82  std::string usymtab_rspecifier = po.GetOptArg(1),
83  lats_rspecifier = po.GetArg(2),
84  index_wspecifier = po.GetArg(3);
85 
86  // We use RandomAccessInt32Reader to read the utterance symtab table.
87  RandomAccessInt32Reader usymtab_reader(usymtab_rspecifier);
88 
89  // We read the lattice in as CompactLattice; We need the CompactLattice
90  // structure for the rest of the work
91  SequentialCompactLatticeReader clat_reader(lats_rspecifier);
92 
94  index_writer(index_wspecifier);
95 
96  int32 n_done = 0;
97  int32 n_fail = 0;
98 
99  int32 max_states = -1;
100 
101  for (; !clat_reader.Done(); clat_reader.Next()) {
102  std::string key = clat_reader.Key();
103  CompactLattice clat = clat_reader.Value();
104  clat_reader.FreeCurrent();
105  KALDI_LOG << "Processing lattice " << key;
106 
107  if (max_states_scale > 0) {
108  max_states = static_cast<int32>(
109  max_states_scale * static_cast<BaseFloat>(clat.NumStates()));
110  }
111 
112  // Check if we have the corresponding utterance id.
113  if (!usymtab_reader.HasKey(key)) {
114  KALDI_WARN << "Cannot find utterance id for " << key;
115  n_fail++;
116  continue;
117  }
118 
119  // Topologically sort the lattice, if not already sorted.
120  uint64 props = clat.Properties(fst::kFstProperties, false);
121  if (!(props & fst::kTopSorted)) {
122  if (fst::TopSort(&clat) == false) {
123  KALDI_WARN << "Cycles detected in lattice " << key;
124  n_fail++;
125  continue;
126  }
127  }
128 
129  // Get the alignments
130  std::vector<int32> state_times;
131  CompactLatticeStateTimes(clat, &state_times);
132 
133  // Cluster the arcs in the CompactLattice, write the cluster_id on the
134  // output label side.
135  // ClusterLattice() corresponds to the second part of the preprocessing in
136  // Dogan and Murat's paper -- clustering. Note that we do the first part
137  // of preprocessing (the weight pushing step) later when generating the
138  // factor transducer.
139  KALDI_VLOG(1) << "Arc clustering...";
140  bool success = false;
141  success = kaldi::ClusterLattice(&clat, state_times);
142  if (!success) {
143  KALDI_WARN << "State id's and alignments do not match for lattice "
144  << key;
145  n_fail++;
146  continue;
147  }
148 
149  // The next part is something new, not in the Dogan and Can paper. It is
150  // necessary because we have epsilon arcs, due to silences, in our
151  // lattices. We modify the factor transducer, while maintaining
152  // equivalence, to ensure that states don't have both epsilon *and*
153  // non-epsilon arcs entering them. (and the same, with "entering"
154  // replaced with "leaving"). Later we will find out which states have
155  // non-epsilon arcs leaving/entering them and use it to be more selective
156  // in adding arcs to connect them with the initial/final states. The goal
157  // here is to disallow silences at the beginning or ending of a keyword
158  // occurrence.
159  if (true) {
160  EnsureEpsilonProperty(&clat);
161  fst::TopSort(&clat);
162  // We have to recompute the state times because they will have changed.
163  CompactLatticeStateTimes(clat, &state_times);
164  }
165 
166  // Generate factor transducer
167  // CreateFactorTransducer() corresponds to the "Factor Generation" part of
168  // Dogan and Murat's paper. But we also move the weight pushing step to
169  // this function as we have to compute the alphas and betas anyway.
170  KALDI_VLOG(1) << "Generating factor transducer...";
171  KwsProductFst factor_transducer;
172  int32 utterance_id = usymtab_reader.Value(key);
173  success = kaldi::CreateFactorTransducer(clat,
174  state_times,
175  utterance_id,
176  &factor_transducer);
177  if (!success) {
178  KALDI_WARN << "Cannot generate factor transducer for lattice " << key;
179  n_fail++;
180  }
181 
182  MaybeDoSanityCheck(factor_transducer);
183 
184  // Remove long silence arc
185  // We add the filtering step in our implementation. This is because gap
186  // between two successive words in a query term should be less than 0.5s
187  KALDI_VLOG(1) << "Removing long silence...";
188  RemoveLongSilences(max_silence_frames, state_times, &factor_transducer);
189 
190  MaybeDoSanityCheck(factor_transducer);
191 
192  // Do factor merging, and return a transducer in T*T*T semiring. This step
193  // corresponds to the "Factor Merging" part in Dogan and Murat's paper.
194  KALDI_VLOG(1) << "Merging factors...";
195  KwsLexicographicFst index_transducer;
196  DoFactorMerging(&factor_transducer, &index_transducer);
197 
198  MaybeDoSanityCheck(index_transducer);
199 
200  // Do factor disambiguation. It corresponds to the "Factor Disambiguation"
201  // step in Dogan and Murat's paper.
202  KALDI_VLOG(1) << "Doing factor disambiguation...";
203  DoFactorDisambiguation(&index_transducer);
204 
205  MaybeDoSanityCheck(index_transducer);
206 
207  // Optimize the above factor transducer. It corresponds to the
208  // "Optimization" step in the paper.
209  KALDI_VLOG(1) << "Optimizing factor transducer...";
210  OptimizeFactorTransducer(&index_transducer, max_states, allow_partial);
211 
212  MaybeDoSanityCheck(index_transducer);
213 
214  // Write result
215  index_writer.Write(key, index_transducer);
216 
217  n_done++;
218  }
219 
220  KALDI_LOG << "Done " << n_done << " lattices, failed for " << n_fail;
221  if (strict == true)
222  return (n_done != 0 ? 0 : 1);
223  else
224  return 0;
225  } catch(const std::exception &e) {
226  std::cerr << e.what();
227  return -1;
228  }
229 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
fst::VectorFst< KwsProductArc > KwsProductFst
Definition: kaldi-kws.h:49
void EnsureEpsilonProperty(VectorFst< Arc > *fst)
This function modifies the fst (while maintaining equivalence) in such a way that, after the modification, all states of the FST which have epsilon-arcs entering them, have no non-epsilon arcs entering them, and all states which have epsilon-arcs leaving them, have no non-epsilon arcs leaving them.
bool ClusterLattice(CompactLattice *clat, const std::vector< int32 > &state_times)
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
fst::VectorFst< KwsLexicographicArc > KwsLexicographicFst
Definition: kaldi-kws.h:46
float BaseFloat
Definition: kaldi-types.h:29
void DoFactorMerging(KwsProductFst *factor_transducer, KwsLexicographicFst *index_transducer)
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const T & Value(const std::string &key)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
bool CreateFactorTransducer(const CompactLattice &clat, const std::vector< int32 > &state_times, int32 utterance_id, KwsProductFst *factor_transducer)
int32 CompactLatticeStateTimes(const CompactLattice &lat, vector< int32 > *times)
As LatticeStateTimes, but in the CompactLattice format.
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
bool HasKey(const std::string &key)
fst::VectorFst< CompactLatticeArc > CompactLattice
Definition: kaldi-lattice.h:46
void MaybeDoSanityCheck(const KwsLexicographicFst &index_transducer)
void RemoveLongSilences(int32 max_silence_frames, const std::vector< int32 > &state_times, KwsProductFst *factor_transducer)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
int main(int argc, char *argv[])
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
void DoFactorDisambiguation(KwsLexicographicFst *index_transducer)
#define KALDI_LOG
Definition: kaldi-error.h:153
void OptimizeFactorTransducer(KwsLexicographicFst *index_transducer, int32 max_states, bool allow_partial)
std::string GetOptArg(int param) const