kaldi-table.h
Go to the documentation of this file.
1 // util/kaldi-table.h
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 // 2013 Johns Hopkins University (author: Daniel Povey)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #ifndef KALDI_UTIL_KALDI_TABLE_H_
22 #define KALDI_UTIL_KALDI_TABLE_H_
23 
24 #include <string>
25 #include <vector>
26 #include <utility>
27 
28 #include "base/kaldi-common.h"
29 #include "util/kaldi-holder.h"
30 
31 namespace kaldi {
32 
33 // Forward declarations of the implementation base classes (used via impl_).
34 template<class Holder> class RandomAccessTableReaderImplBase;
35 template<class Holder> class SequentialTableReaderImplBase;
36 template<class Holder> class TableWriterImplBase;
37 
40 
41 // This header defines the Table classes (RandomAccessTableReader,
42 // SequentialTableReader and TableWriter) and explains what the Holder classes,
43 // which the Table class requires as a template argument, are like. It also
44 // explains the "rspecifier" and "wspecifier" concepts (these are strings that
45 // explain how to read/write objects via archives or scp files). A table is
46 // conceptually a collection of objects of a particular type T indexed by keys
47 // of type std::string (these Keys additionally have an order within
48 // each table).
49 // The Table classes are templated on a type (call it Holder) such that
50 // Holder::T is a typedef equal to T.
51 
52 // see kaldi-holder.h for detail on the Holder classes.
53 
54 typedef std::vector<std::string> KeyList;  // A list of (string) keys.
55 
56 // Documentation for "wspecifier"
57 // "wspecifier" describes how we write a set of objects indexed by keys.
58 // The basic, unadorned wspecifiers are as follows:
59 //
60 // ark:wxfilename
61 // scp:rxfilename
62 // ark,scp:filename,wxfilename
64 //
65 //
66 // We also allow the following modifiers:
67 // t means text mode.
68 // b means binary mode.
69 // f means flush the stream after writing each entry.
70 // (nf means don't flush, and the default is not to flush).
71 // p means permissive mode, when writing to an "scp" file only: will ignore
72 // missing scp entries (i.e. won't write anything for those files, but will
73 // return success status).
74 //
75 // So the following are valid wspecifiers:
76 // ark,b,f:foo
77 // "ark,b,b:| gzip -c > foo"
78 // "ark,scp,t,nf:foo.ark,|gzip -c > foo.scp.gz"
79 // ark,b:-
80 //
81 // The meanings of rxfilename and wxfilename are as described in
82 // kaldi-io.h (they are filenames but include pipes, stdin/stdout
83 // and so on); filename is a regular filename.
84 //
85 
86 // The ark:wxfilename type of wspecifier instructs the class to
87 // write directly to an archive. For small objects (e.g. lists of ints),
88 // the text archive format will generally be human readable with one line
89 // per entry in the archive.
90 //
91 // The type "scp:rxfilename" refers to an scp file which should
92 // already exist on disk, and tells us where to write the data for
93 // each key (usually an actual file); each line of the scp file
94 // would be:
95 // key xfilename
96 //
97 // The type ark,scp:filename,wxfilename means
98 // we write both an archive and an scp file that specifies offsets into the
99 // archive, with lines like:
100 // key filename:12407
101 // where the number is the byte offset into the file.
102 // In this case we restrict the archive-filename to be an actual filename,
103 // as we can't see a situation where an extended filename would make sense
104 // for this (we can't fseek() in pipes).
105 
111 };
112 
114  bool binary;
115  bool flush;
116  bool permissive; // will ignore absent scp entries.
117  WspecifierOptions(): binary(true), flush(false), permissive(false) { }
118 };
119 
120 // ClassifyWspecifier returns the type of the wspecifier string, and, for
121 // each output pointer that is non-NULL, outputs the corresponding extra
122 // information: the archive filename (archive_wxfilename), the script
123 // filename (script_wxfilename) and the options (opts).
124 WspecifierType ClassifyWspecifier(const std::string &wspecifier,
125  std::string *archive_wxfilename,
126  std::string *script_wxfilename,
127  WspecifierOptions *opts);
128 
129 // ReadScriptFile reads an .scp file in its entirety, and appends its entries
130 // (in the order they appear in the scp file) to *script_out, which contains
131 // pairs of (key, xfilename). The .scp
132 // file format is: on each line, key xfilename
133 // where xfilename means rxfilename or wxfilename, and may contain internal
134 // spaces (we trim away any leading or trailing space). The key is space-free.
135 // ReadScriptFile returns true if the format was valid (empty files
136 // are valid).
137 // If 'print_warnings' is true, it will print out warning messages that explain
138 // what kind of error there was.
139 bool ReadScriptFile(const std::string &rxfilename,
140  bool print_warnings,
141  std::vector<std::pair<std::string, std::string> >
142  *script_out);
143 
144 // This version of ReadScriptFile reads from an istream instead of an rxfilename.
145 bool ReadScriptFile(std::istream &is,
146  bool print_warnings,
147  std::vector<std::pair<std::string, std::string> >
148  *script_out);
149 
150 // Writes, for each entry in script, the first element, then ' ', then the
151 // second element, then '\n'. Checks that the keys (first elements of pairs)
152 // are valid tokens (nonempty, no whitespace), and that the values (second
153 // elements of pairs) are newline-free and contain no leading or trailing
154 // space. Returns true on success.
155 bool WriteScriptFile(const std::string &wxfilename,
156  const std::vector<std::pair<std::string, std::string> >
157  &script);
158 
159 // This version of WriteScriptFile writes to an ostream instead of a wxfilename.
160 bool WriteScriptFile(std::ostream &os,
161  const std::vector<std::pair<std::string, std::string> >
162  &script);
163 
164 // Documentation for "rspecifier"
165 // "rspecifier" describes how we read a set of objects indexed by keys.
166 // The possibilities are:
167 //
168 // ark:rxfilename
169 // scp:rxfilename
170 //
171 // We also allow various modifiers:
172 // o means the program will only ask for each key once, which enables
173 // the reader to discard already-asked-for values.
174 // s means the keys are sorted on input (means we don't have to read till
175 // eof if someone asked for a key that wasn't there).
176 // cs means that it is called in sorted order (we are generally asserting
177 // this based on knowledge of how the program works).
178 // p means "permissive", and causes it to skip over keys whose corresponding
179 // scp-file entries cannot be read. [and to ignore errors in archives and
180 // script files, and just consider the "good" entries].
181 // Any of the options above can be prefixed by n to negate it, e.g. no,
182 // ns, ncs, np; but these aren't currently very useful (they are just
183 // equivalent to omitting the corresponding option).
187 // bg means "background". It currently has no effect for random-access readers,
188 // but for sequential readers it will cause it to "read ahead" to the next
189 // value, in a background thread. Recommended when reading larger objects
190 // such as neural-net training examples, especially when you want to
191 // maximize GPU usage.
192 //
193 // b is ignored [for scripting convenience]
194 // t is ignored [for scripting convenience]
195 //
196 //
197 // So for instance the following would be a valid rspecifier:
198 //
199 // "o, s, p, ark:gunzip -c foo.gz|"
200 
202  // These options only make a difference for the RandomAccessTableReader class.
203  bool once; // we assert that the program will only ask for each key once.
204  bool sorted; // we assert that the keys are sorted.
205  bool called_sorted; // we assert that the HasKey() and Value() functions will
206  // also be called in sorted order. [this implies "once" but not vice versa].
207  bool permissive; // If "permissive", when reading from scp files it treats
208  // scp files that can't be read as if the corresponding key were not there.
209  // For archive files it will suppress errors getting thrown if the archive
210  // is corrupted and can't be read to the end.
211  bool background; // For sequential readers, if the background option ("bg")
212  // is provided, it will read ahead to the next object in a
213  // background thread.
214  RspecifierOptions(): once(false), sorted(false),
215  called_sorted(false), permissive(false),
216  background(false) { }
217 };
218 
223 };
224 
225 RspecifierType ClassifyRspecifier(const std::string &rspecifier,
226  std::string *rxfilename,  // output; presumably may be NULL -- TODO confirm
227  RspecifierOptions *opts);  // output; presumably may be NULL -- TODO confirm
228 
229 
232 template<class Holder>
234  public:
235  typedef typename Holder::T T;
236 
237  RandomAccessTableReader(): impl_(NULL) { }
238 
239  // This constructor is equivalent to default constructor + "open", but
240  // throws on error.
241  explicit RandomAccessTableReader(const std::string &rspecifier);
242 
243  // Opens the table.
244  bool Open(const std::string &rspecifier);
245 
246  // Returns true if table is open.
247  bool IsOpen() const { return (impl_ != NULL); }
248 
249  // Close() will close the table [throws if it was not open],
250  // and returns true on success (false if we were reading an
251  // archive and we discovered an error in the archive).
252  bool Close();
253 
254  // Says if it has this key.
255  // If you are using the "permissive" (p) read option,
256  // it will return false for keys whose corresponding entry
257  // in the scp file cannot be read.
258 
259  bool HasKey(const std::string &key);
260 
261  // Value() may throw if you are reading an scp file, you
262  // do not have the "permissive" (p) option, and an entry
263  // in the scp file cannot be read. Typically you won't
264  // want to catch this error.
265  const T &Value(const std::string &key);
266 
268 
269  // Allow copy-constructor only for non-opened readers (needed for inclusion in
270  // stl vector)
272  &other):
273  impl_(NULL) { KALDI_ASSERT(other.impl_ == NULL); }
274  private:
275  // Disallow assignment.
277  void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error
278  // message and dies (with KALDI_ERR) if NULL.
280 };
281 
282 
283 
286 template<class Holder>
288  public:
289  typedef typename Holder::T T;
290 
291  SequentialTableReader(): impl_(NULL) { }
292 
293  // This constructor is equivalent to default constructor + "open", but
294  // throws on error.
295  explicit SequentialTableReader(const std::string &rspecifier);
296 
297  // Opens the table. Returns exit status; but does throw if previously open
298  // stream was in error state. You can call Close to prevent this; anyway,
299  // calling Open more than once is not usually needed.
300  bool Open(const std::string &rspecifier);
301 
302  // Returns true if we're done. It will also return true if there's some kind
303  // of error and we can't read any more; in this case, you can detect the
304  // error by calling Close and checking the return status; otherwise
305  // the destructor will throw.
306  inline bool Done();
307 
308  // Only valid to call Key() if Done() returned false.
309  inline std::string Key();
310 
311  // FreeCurrent() is provided as an optimization to save memory, for large
312  // objects. It instructs the class to deallocate the current value. The
313  // reference Value() will be invalidated by this.
314  void FreeCurrent();
315 
316  // Return reference to the current value. It's only valid to call this if
317  // Done() returned false. The reference is valid till next call to this
318  // object. It will throw if you are reading an scp file, did not specify the
319  // "permissive" (p) option and the file cannot be read. [The permissive
320  // option makes it behave as if that key does not even exist, if the
321  // corresponding file cannot be read.] You probably wouldn't want to catch
322  // this exception; the user can just specify the p option in the rspecifier.
323  // We make this non-const to enable things like shallow swap on the held
324  // object in situations where this would avoid making a redundant copy.
325  T &Value();
326 
327  // Next goes to the next key. It will not throw; any error will
328  // result in Done() returning true, and then the destructor will
329  // throw unless you call Close().
330  void Next();
331 
332  // Returns true if table is open for reading (does not imply
333  // stream is in good state).
334  bool IsOpen() const;
335 
336  // Close() will return false (failure) if Done() became true
337  // because of an error condition rather than because we are
338  // really done [e.g. because of an error or early termination
339  // in the archive].
340  // If there is an error and you don't call Close(), the destructor
341  // will throw.
342  // Close()
343  bool Close();
344 
345  // The destructor may throw. This is the desired behaviour, as it's the way
346  // we signal the error to the user (to detect it, call Close()). The issue is
347  // that otherwise the user has no way to tell whether Done() returned true
348  // because we reached the end of the archive or script, or because there was
349  // an error that prevented further reading.
351 
352  // Allow copy-constructor only for non-opened readers (needed for inclusion in
353  // stl vector)
355  impl_(NULL) { KALDI_ASSERT(other.impl_ == NULL); }
356  private:
357  // Disallow assignment.
359  void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error
360  // message and dies (with KALDI_ERR) if NULL.
362 };
363 
364 
367 template<class Holder>
368 class TableWriter {
369  public:
370  typedef typename Holder::T T;
371 
372  TableWriter(): impl_(NULL) { }
373 
374  // This constructor is equivalent to the default constructor
375  // + Open(), but throws on error. See docs for
376  // wspecifier above.
377  explicit TableWriter(const std::string &wspecifier);
378 
379  // Opens the table. See docs for wspecifier above.
380  // If it returns true, it is open.
381  bool Open(const std::string &wspecifier);
382 
383  // Returns true if open for writing.
384  bool IsOpen() const;
385 
386  // Write the object. Throws KaldiFatalError on error via the KALDI_ERR macro.
387  inline void Write(const std::string &key, const T &value) const;
388 
389 
390  // Flush will flush any archive; it does not return error status
391  // or throw; any errors will be reported on the next Write or Close.
392  // Useful if we may be writing to a command in a pipe and want
393  // to ensure good CPU utilization.
394  void Flush();
395 
396  // Close() is not necessary to call, as the destructor
397  // closes it; it's mainly useful if you want to handle
398  // error states, because the destructor will throw on
399  // error if you do not call Close().
400  bool Close();
401 
402  ~TableWriter();
403 
404  // Allow copy-constructor only for non-opened writers (needed for inclusion in
405  // stl vector)
406  TableWriter(const TableWriter &other): impl_(NULL) {
407  KALDI_ASSERT(other.impl_ == NULL);
408  }
409  private:
410  TableWriter &operator = (const TableWriter&); // Disallow assignment.
411 
412  void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error
413  // message and dies (with KALDI_ERR) if NULL.
414  TableWriterImplBase<Holder> *impl_; // NULL unless the table is open.
415 };
416 
417 
430 
431 template<class Holder>
433  public:
434  typedef typename Holder::T T;
438  RandomAccessTableReaderMapped(const std::string &table_rxfilename,
439  const std::string &utt2spk_rxfilename);
440 
442 
444  bool Open(const std::string &table_rxfilename,
445  const std::string &utt2spk_rxfilename);
446 
447  bool HasKey(const std::string &key);
448  const T &Value(const std::string &key);
449  inline bool IsOpen() const { return reader_.IsOpen(); }
450  inline bool Close() { return reader_.Close(); }
451 
452 
453 
454  // The default copy-constructor will do what we want: it will crash for
455  // already-opened readers, by calling the member-variable copy-constructors.
456  private:
457  // Disallow assignment.
462  std::string utt2spk_rxfilename_; // Used only in diagnostic messages.
463 };
464 
465 
467 } // end namespace kaldi
468 
469 #include "util/kaldi-table-inl.h"
470 
471 #endif // KALDI_UTIL_KALDI_TABLE_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
std::vector< std::string > KeyList
Definition: kaldi-table.h:54
RandomAccessTableReader< TokenHolder > token_reader_
Definition: kaldi-table.h:461
This class is for when you are reading something in random access, but it may actually be stored per-...
Definition: kaldi-table.h:432
SequentialTableReader(const SequentialTableReader< Holder > &other)
Definition: kaldi-table.h:354
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
SequentialTableReaderImplBase< Holder > * impl_
Definition: kaldi-table.h:361
RspecifierType ClassifyRspecifier(const std::string &rspecifier, std::string *rxfilename, RspecifierOptions *opts)
Definition: kaldi-table.cc:225
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
RandomAccessTableReader(const RandomAccessTableReader< Holder > &other)
Definition: kaldi-table.h:271
bool WriteScriptFile(std::ostream &os, const std::vector< std::pair< std::string, std::string > > &script)
Definition: kaldi-table.cc:83
TableWriter(const TableWriter &other)
Definition: kaldi-table.h:406
RspecifierType
Definition: kaldi-table.h:219
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
WspecifierType
Definition: kaldi-table.h:106
RandomAccessTableReader< Holder > reader_
Definition: kaldi-table.h:460
WspecifierType ClassifyWspecifier(const std::string &wspecifier, std::string *archive_wxfilename, std::string *script_wxfilename, WspecifierOptions *opts)
Definition: kaldi-table.cc:135
bool ReadScriptFile(const std::string &rxfilename, bool warn, std::vector< std::pair< std::string, std::string > > *script_out)
Definition: kaldi-table.cc:26
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
TableWriterImplBase< Holder > * impl_
Definition: kaldi-table.h:414
RandomAccessTableReaderImplBase< Holder > * impl_
Definition: kaldi-table.h:279