kaldi-table.cc
Go to the documentation of this file.
1 // util/kaldi-table.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 
11 // http://www.apache.org/licenses/LICENSE-2.0
12 
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABILITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include "util/kaldi-table.h"
21 #include "util/text-utils.h"
22 
23 namespace kaldi {
24 
25 
26 bool ReadScriptFile(const std::string &rxfilename,
27  bool warn,
28  std::vector<std::pair<std::string, std::string> >
29  *script_out) {
30  bool is_binary;
31  Input input;
32 
33  if (!input.Open(rxfilename, &is_binary)) {
34  if (warn) KALDI_WARN << "Error opening script file: " <<
35  PrintableRxfilename(rxfilename);
36  return false;
37  }
38  if (is_binary) {
39  if (warn) KALDI_WARN << "Error: script file appears to be binary: " <<
40  PrintableRxfilename(rxfilename);
41  return false;
42  }
43 
44  bool ans = ReadScriptFile(input.Stream(), warn, script_out);
45  if (warn && !ans)
46  KALDI_WARN << "[script file was: " << PrintableRxfilename(rxfilename) <<
47  "]";
48  return ans;
49 }
50 
51 bool ReadScriptFile(std::istream &is,
52  bool warn,
53  std::vector<std::pair<std::string, std::string> >
54  *script_out) {
55  KALDI_ASSERT(script_out != NULL);
56  std::string line;
57  int line_number = 0;
58  while (getline(is, line)) {
59  line_number++;
60  const char *c = line.c_str();
61  if (*c == '\0') {
62  if (warn)
63  KALDI_WARN << "Empty " << line_number << "'th line in script file";
64  return false; // Empty line so invalid scp file format..
65  }
66 
67  std::string key, rest;
68  SplitStringOnFirstSpace(line, &key, &rest);
69 
70  if (key.empty() || rest.empty()) {
71  if (warn)
72  KALDI_WARN << "Invalid " << line_number << "'th line in script file"
73  <<":\"" << line << '"';
74  return false;
75  }
76  script_out->resize(script_out->size()+1);
77  script_out->back().first = key;
78  script_out->back().second = rest;
79  }
80  return true;
81 }
82 
83 bool WriteScriptFile(std::ostream &os,
84  const std::vector<std::pair<std::string, std::string> >
85  &script) {
86  if (!os.good()) {
87  KALDI_WARN << "WriteScriptFile: attempting to write to invalid stream.";
88  return false;
89  }
90  std::vector<std::pair<std::string, std::string> >::const_iterator iter;
91  for (iter = script.begin(); iter != script.end(); ++iter) {
92  if (!IsToken(iter->first)) {
93  KALDI_WARN << "WriteScriptFile: using invalid token \"" << iter->first <<
94  '"';
95  return false;
96  }
97  if (iter->second.find('\n') != std::string::npos ||
98  (iter->second.length() != 0 &&
99  (isspace(iter->second[0]) ||
100  isspace(iter->second[iter->second.length()-1])))) {
101  // second part contains newline or leading or trailing space.
102  KALDI_WARN << "WriteScriptFile: attempting to write invalid line \"" <<
103  iter->second << '"';
104  return false;
105  }
106  os << iter->first << ' ' << iter->second << '\n';
107  }
108  if (!os.good()) {
109  KALDI_WARN << "WriteScriptFile: stream in error state.";
110  return false;
111  }
112  return true;
113 }
114 
115 bool WriteScriptFile(const std::string &wxfilename,
116  const std::vector<std::pair<std::string, std::string> >
117  &script) {
118  Output output;
119  if (!output.Open(wxfilename, false, false)) { // false, false means not
120  // binary, no binary-mode header.
121  KALDI_ERR << "Error opening output stream for script file: "
122  << PrintableWxfilename(wxfilename);
123  return false;
124  }
125  if (!WriteScriptFile(output.Stream(), script)) {
126  KALDI_ERR << "Error writing script file to stream "
127  << PrintableWxfilename(wxfilename);
128  return false;
129  }
130  return true;
131 }
132 
133 
134 
135 WspecifierType ClassifyWspecifier(const std::string &wspecifier,
136  std::string *archive_wxfilename,
137  std::string *script_wxfilename,
138  WspecifierOptions *opts) {
139  // Examples:
140  // ark,t:wxfilename -> kArchiveWspecifier
141  // ark,b:wxfilename -> kArchiveWspecifier
142  // scp,t:rxfilename -> kScriptWspecifier
143  // scp,t:rxfilename -> kScriptWspecifier
144  // ark,scp,t:filename, wxfilename -> kBothWspecifier
145  // ark,scp:filename, wxfilename -> kBothWspecifier
146  // Note we can include the flush option (f) or no-flush (nf)
147  // anywhere: e.g.
148  // ark,scp,f:filename, wxfilename -> kBothWspecifier
149  // or:
150  // scp,t,nf:rxfilename -> kScriptWspecifier
151 
152  if (archive_wxfilename) archive_wxfilename->clear();
153  if (script_wxfilename) script_wxfilename->clear();
154 
155  size_t pos = wspecifier.find(':');
156  if (pos == std::string::npos) return kNoWspecifier;
157  if (isspace(*(wspecifier.rbegin()))) return kNoWspecifier; // Trailing space
158  // disallowed.
159 
160  std::string before_colon(wspecifier, 0, pos), after_colon(wspecifier, pos+1);
161 
162  std::vector<std::string> split_first_part; // Split part before ':' on ', '.
163  SplitStringToVector(before_colon, ", ", false, &split_first_part); // false==
164  // don't omit empty strings between commas.
165 
167 
168  if (opts != NULL)
169  *opts = WspecifierOptions(); // Make sure all the defaults are as in the
170  // default constructor of the options class.
171 
172  for (size_t i = 0; i < split_first_part.size(); i++) {
173  const std::string &str = split_first_part[i]; // e.g. "b", "t", "f", "ark",
174  // "scp".
175  const char *c = str.c_str();
176  if (!strcmp(c, "b")) {
177  if (opts) opts->binary = true;
178  } else if (!strcmp(c, "f")) {
179  if (opts) opts->flush = true;
180  } else if (!strcmp(c, "nf")) {
181  if (opts) opts->flush = false;
182  } else if (!strcmp(c, "t")) {
183  if (opts) opts->binary = false;
184  } else if (!strcmp(c, "p")) {
185  if (opts) opts->permissive = true;
186  } else if (!strcmp(c, "ark")) {
187  if (ws == kNoWspecifier) ws = kArchiveWspecifier;
188  else
189  return kNoWspecifier; // We do not allow "scp, ark", only "ark,
190  // scp".
191  } else if (!strcmp(c, "scp")) {
192  if (ws == kNoWspecifier) ws = kScriptWspecifier;
193  else if (ws == kArchiveWspecifier) ws = kBothWspecifier;
194  else
195  return kNoWspecifier; // repeated "scp" option: invalid.
196  } else {
197  return kNoWspecifier; // Could not interpret this option.
198  }
199  }
200 
201  switch (ws) {
202  case kArchiveWspecifier:
203  if (archive_wxfilename)
204  *archive_wxfilename = after_colon;
205  break;
206  case kScriptWspecifier:
207  if (script_wxfilename)
208  *script_wxfilename = after_colon;
209  break;
210  case kBothWspecifier:
211  pos = after_colon.find(','); // first comma.
212  if (pos == std::string::npos) return kNoWspecifier;
213  if (archive_wxfilename)
214  *archive_wxfilename = std::string(after_colon, 0, pos);
215  if (script_wxfilename)
216  *script_wxfilename = std::string(after_colon, pos+1);
217  break;
218  case kNoWspecifier: default: break;
219  }
220  return ws;
221 }
222 
223 
224 
225 RspecifierType ClassifyRspecifier(const std::string &rspecifier,
226  std::string *rxfilename,
227  RspecifierOptions *opts) {
228  // Examples
229  // ark:rxfilename -> kArchiveRspecifier
230  // scp:rxfilename -> kScriptRspecifier
231  //
232  // We also allow the meaningless prefixes b, and t,
233  // plus the options o (once), no (not-once),
234  // s (sorted) and ns (not-sorted), p (permissive)
235  // and np (not-permissive).
236  // so the following would be valid:
237  //
238  // f, o, b, np, ark:rxfilename -> kArchiveRspecifier
239  //
240  // Examples:
241  //
242  // b, ark:rxfilename -> kArchiveRspecifier
243  // t, ark:rxfilename -> kArchiveRspecifier
244  // b, scp:rxfilename -> kScriptRspecifier
245  // t, no, s, scp:rxfilename -> kScriptRspecifier
246  // t, ns, scp:rxfilename -> kScriptRspecifier
247 
248  // Improperly formed Rspecifiers will be classified as kNoRspecifier.
249 
250  if (rxfilename) rxfilename->clear();
251 
252  if (opts != NULL)
253  *opts = RspecifierOptions(); // Make sure all the defaults are as in the
254  // default constructor of the options class.
255 
256  size_t pos = rspecifier.find(':');
257  if (pos == std::string::npos) return kNoRspecifier;
258 
259  if (isspace(*(rspecifier.rbegin()))) return kNoRspecifier; // Trailing space
260  // disallowed.
261 
262  std::string before_colon(rspecifier, 0, pos),
263  after_colon(rspecifier, pos+1);
264 
265  std::vector<std::string> split_first_part; // Split part before ':' on ', '.
266  SplitStringToVector(before_colon, ", ", false, &split_first_part); // false==
267  // don't omit empty strings between commas.
268 
270 
271  for (size_t i = 0; i < split_first_part.size(); i++) {
272  const std::string &str = split_first_part[i]; // e.g. "b", "t", "f", "ark",
273  // "scp".
274  const char *c = str.c_str();
275  if (!strcmp(c, "b")); // Ignore this option. It's so we can use the same
276  // specifiers for rspecifiers and wspecifiers.
277  else if (!strcmp(c, "t")); // Ignore this option too.
278  else if (!strcmp(c, "o")) {
279  if (opts) opts->once = true;
280  } else if (!strcmp(c, "no")) {
281  if (opts) opts->once = false;
282  } else if (!strcmp(c, "p")) {
283  if (opts) opts->permissive = true;
284  } else if (!strcmp(c, "np")) {
285  if (opts) opts->permissive = false;
286  } else if (!strcmp(c, "s")) {
287  if (opts) opts->sorted = true;
288  } else if (!strcmp(c, "ns")) {
289  if (opts) opts->sorted = false;
290  } else if (!strcmp(c, "cs")) {
291  if (opts) opts->called_sorted = true;
292  } else if (!strcmp(c, "ncs")) {
293  if (opts) opts->called_sorted = false;
294  } else if (!strcmp(c, "bg")) {
295  if (opts) opts->background = true;
296  } else if (!strcmp(c, "ark")) {
297  if (rs == kNoRspecifier) rs = kArchiveRspecifier;
298  else
299  return kNoRspecifier; // Repeated or combined ark and scp options
300  // invalid.
301  } else if (!strcmp(c, "scp")) {
302  if (rs == kNoRspecifier) rs = kScriptRspecifier;
303  else
304  return kNoRspecifier; // Repeated or combined ark and scp options
305  // invalid.
306  } else {
307  return kNoRspecifier; // Could not interpret this option.
308  }
309  }
310  if ((rs == kArchiveRspecifier || rs == kScriptRspecifier)
311  && rxfilename != NULL)
312  *rxfilename = after_colon;
313  return rs;
314 }
315 
316 
317 
318 
319 
320 
321 } // end namespace kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool Open(const std::string &rxfilename, bool *contents_binary=NULL)
Definition: kaldi-io-inl.h:26
RspecifierType ClassifyRspecifier(const std::string &rspecifier, std::string *rxfilename, RspecifierOptions *opts)
Definition: kaldi-table.cc:225
bool IsToken(const std::string &token)
Returns true if "token" is nonempty, and all characters are printable and whitespace-free.
Definition: text-utils.cc:105
std::istream & Stream()
Definition: kaldi-io.cc:826
std::ostream & Stream()
Definition: kaldi-io.cc:701
bool WriteScriptFile(std::ostream &os, const std::vector< std::pair< std::string, std::string > > &script)
Definition: kaldi-table.cc:83
void SplitStringOnFirstSpace(const std::string &str, std::string *first, std::string *rest)
Removes leading and trailing white space from the string, then splits on the first section of whitesp...
Definition: text-utils.cc:120
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
RspecifierType
Definition: kaldi-table.h:219
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
WspecifierType
Definition: kaldi-table.h:106
WspecifierType ClassifyWspecifier(const std::string &wspecifier, std::string *archive_wxfilename, std::string *script_wxfilename, WspecifierOptions *opts)
Definition: kaldi-table.cc:135
bool Open(const std::string &wxfilename, bool binary, bool write_header)
This opens the stream, with the given mode (binary or text).
Definition: kaldi-io.cc:707
bool ReadScriptFile(const std::string &rxfilename, bool warn, std::vector< std::pair< std::string, std::string > > *script_out)
Definition: kaldi-table.cc:26
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
std::string PrintableRxfilename(const std::string &rxfilename)
PrintableRxfilename turns the rxfilename into a more human-readable form for error reporting...
Definition: kaldi-io.cc:61
std::string PrintableWxfilename(const std::string &wxfilename)
PrintableWxfilename turns the wxfilename into a more human-readable form for error reporting...
Definition: kaldi-io.cc:73
int32 line_number