lattice-copy.cc
Go to the documentation of this file.
1 // latbin/lattice-copy.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 // 2013 Johns Hopkins University (author: Daniel Povey)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 
22 #include "base/kaldi-common.h"
23 #include "util/common-utils.h"
24 #include "fstext/fstext-lib.h"
25 #include "lat/kaldi-lattice.h"
26 
27 namespace kaldi {
28  int32 CopySubsetLattices(std::string filename,
29  SequentialLatticeReader *lattice_reader,
30  LatticeWriter *lattice_writer,
31  bool include = true, bool ignore_missing = false,
32  bool sorted = false) {
33  unordered_set<std::string, StringHasher> subset;
34  std::set<std::string> subset_list;
35 
36  bool binary;
37  Input ki(filename, &binary);
38  KALDI_ASSERT(!binary);
39  std::string line;
40  while (std::getline(ki.Stream(), line)) {
41  std::vector<std::string> split_line;
42  SplitStringToVector(line, " \t\r", true, &split_line);
43  if(split_line.empty()) {
44  KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename;
45  }
46  subset.insert(split_line[0]);
47  subset_list.insert(split_line[0]);
48  }
49 
50  int32 num_total = 0;
51  size_t num_success = 0;
52  for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) {
53  if (include && sorted && subset_list.size() > 0
54  && lattice_reader->Key() > *(subset_list.rbegin())) {
55  KALDI_LOG << "The utterance " << lattice_reader->Key()
56  << " is larger than "
57  << "the last key in the include list. Not reading further.";
58  KALDI_LOG << "Wrote " << num_success << " utterances";
59  return 0;
60  }
61 
62  if (include && subset.count(lattice_reader->Key()) > 0) {
63  lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value());
64  num_success++;
65  } else if (!include && subset.count(lattice_reader->Key()) == 0) {
66  lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value());
67  num_success++;
68  }
69  }
70 
71  KALDI_LOG << "Wrote " << num_success << " out of " << num_total
72  << " utterances.";
73 
74  if (ignore_missing) return 0;
75 
76  return (num_success != 0 ? 0 : 1);
77  }
78 
79  int32 CopySubsetLattices(std::string filename,
80  SequentialCompactLatticeReader *lattice_reader,
81  CompactLatticeWriter *lattice_writer,
82  bool include = true, bool ignore_missing = false,
83  bool sorted = false) {
84  unordered_set<std::string, StringHasher> subset;
85  std::set<std::string> subset_list;
86 
87  bool binary;
88  Input ki(filename, &binary);
89  KALDI_ASSERT(!binary);
90  std::string line;
91  while (std::getline(ki.Stream(), line)) {
92  std::vector<std::string> split_line;
93  SplitStringToVector(line, " \t\r", true, &split_line);
94  if(split_line.empty()) {
95  KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename;
96  }
97  subset.insert(split_line[0]);
98  subset_list.insert(split_line[0]);
99  }
100 
101  int32 num_total = 0;
102  size_t num_success = 0;
103  for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) {
104  if (include && sorted && subset_list.size() > 0
105  && lattice_reader->Key() > *(subset_list.rbegin())) {
106  KALDI_LOG << "The utterance " << lattice_reader->Key()
107  << " is larger than "
108  << "the last key in the include list. Not reading further.";
109  KALDI_LOG << "Wrote " << num_success << " utterances";
110  return 0;
111  }
112 
113  if (include && subset.count(lattice_reader->Key()) > 0) {
114  lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value());
115  num_success++;
116  } else if (!include && subset.count(lattice_reader->Key()) == 0) {
117  lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value());
118  num_success++;
119  }
120  }
121 
122  KALDI_LOG << " Wrote " << num_success << " out of " << num_total
123  << " utterances.";
124 
125  if (ignore_missing) return 0;
126 
127  return (num_success != 0 ? 0 : 1);
128  }
129 }
130 
131 int main(int argc, char *argv[]) {
132  try {
133  using namespace kaldi;
134  typedef kaldi::int32 int32;
135  typedef kaldi::int64 int64;
136  using fst::SymbolTable;
137  using fst::VectorFst;
138  using fst::StdArc;
139 
140  const char *usage =
141  "Copy lattices (e.g. useful for changing to text mode or changing\n"
142  "format to standard from compact lattice.)\n"
143  "The --include and --exclude options can be used to copy only a subset "
144  "of lattices, where are the --include option specifies the "
145  "whitelisted utterances that would be copied and --exclude option "
146  "specifies the blacklisted utterances that would not be copied.\n"
147  "Only one of --include and --exclude can be supplied.\n"
148  "Usage: lattice-copy [options] lattice-rspecifier lattice-wspecifier\n"
149  " e.g.: lattice-copy --write-compact=false ark:1.lats ark,t:text.lats\n"
150  "See also: lattice-scale, lattice-to-fst, and\n"
151  " the script egs/wsj/s5/utils/convert_slf.pl\n";
152 
153  ParseOptions po(usage);
154  bool write_compact = true, ignore_missing = false;
155  std::string include_rxfilename;
156  std::string exclude_rxfilename;
157 
158  po.Register("write-compact", &write_compact, "If true, write in normal (compact) form.");
159  po.Register("include", &include_rxfilename,
160  "Text file, the first field of each "
161  "line being interpreted as the "
162  "utterance-id whose lattices will be included");
163  po.Register("exclude", &exclude_rxfilename,
164  "Text file, the first field of each "
165  "line being interpreted as an utterance-id "
166  "whose lattices will be excluded");
167  po.Register("ignore-missing", &ignore_missing,
168  "Exit with status 0 even if no lattices are copied");
169 
170  po.Read(argc, argv);
171 
172  if (po.NumArgs() != 2) {
173  po.PrintUsage();
174  exit(1);
175  }
176 
177  std::string lats_rspecifier = po.GetArg(1),
178  lats_wspecifier = po.GetArg(2);
179 
180  RspecifierOptions opts;
181  ClassifyRspecifier(lats_rspecifier, NULL, &opts);
182  bool sorted = opts.sorted;
183 
184  int32 n_done = 0;
185 
186  if (write_compact) {
187  SequentialCompactLatticeReader lattice_reader(lats_rspecifier);
188  CompactLatticeWriter lattice_writer(lats_wspecifier);
189 
190  if (include_rxfilename != "") {
191  if (exclude_rxfilename != "") {
192  KALDI_ERR << "should not have both --exclude and --include option!";
193  }
194  return CopySubsetLattices(include_rxfilename,
195  &lattice_reader, &lattice_writer,
196  true, ignore_missing, sorted);
197  } else if (exclude_rxfilename != "") {
198  return CopySubsetLattices(exclude_rxfilename,
199  &lattice_reader, &lattice_writer,
200  false, ignore_missing);
201  }
202 
203  for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++)
204  lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value());
205  } else {
206  SequentialLatticeReader lattice_reader(lats_rspecifier);
207  LatticeWriter lattice_writer(lats_wspecifier);
208 
209  if (include_rxfilename != "") {
210  if (exclude_rxfilename != "") {
211  KALDI_ERR << "should not have both --exclude and --include option!";
212  }
213  return CopySubsetLattices(include_rxfilename,
214  &lattice_reader, &lattice_writer,
215  true, ignore_missing, sorted);
216  } else if (exclude_rxfilename != "") {
217  return CopySubsetLattices(exclude_rxfilename,
218  &lattice_reader, &lattice_writer,
219  true, ignore_missing);
220  }
221 
222  for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++)
223  lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value());
224  }
225  KALDI_LOG << "Done copying " << n_done << " lattices.";
226 
227  if (ignore_missing) return 0;
228 
229  return (n_done != 0 ? 0 : 1);
230  } catch(const std::exception &e) {
231  std::cerr << e.what();
232  return -1;
233  }
234 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
int32 CopySubsetLattices(std::string filename, SequentialLatticeReader *lattice_reader, LatticeWriter *lattice_writer, bool include=true, bool ignore_missing=false, bool sorted=false)
Definition: lattice-copy.cc:28
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
fst::StdArc StdArc
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
int main(int argc, char *argv[])
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
RspecifierType ClassifyRspecifier(const std::string &rspecifier, std::string *rxfilename, RspecifierOptions *opts)
Definition: kaldi-table.cc:225
std::istream & Stream()
Definition: kaldi-io.cc:826
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
int NumArgs() const
Number of positional parameters (c.f. argc-1).
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define KALDI_LOG
Definition: kaldi-error.h:153