text-utils.cc
Go to the documentation of this file.
1 // util/text-utils.cc
2 
3 // Copyright 2009-2011 Saarland University; Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 
11 // http://www.apache.org/licenses/LICENSE-2.0
12 
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include "util/text-utils.h"
21 #include <limits>
22 #include <map>
23 #include <algorithm>
24 #include "base/kaldi-common.h"
25 
26 namespace kaldi {
27 
28 
29 template<class F>
30 bool SplitStringToFloats(const std::string &full,
31  const char *delim,
32  bool omit_empty_strings, // typically false
33  std::vector<F> *out) {
34  KALDI_ASSERT(out != NULL);
35  if (*(full.c_str()) == '\0') {
36  out->clear();
37  return true;
38  }
39  std::vector<std::string> split;
40  SplitStringToVector(full, delim, omit_empty_strings, &split);
41  out->resize(split.size());
42  for (size_t i = 0; i < split.size(); i++) {
43  F f = 0;
44  if (!ConvertStringToReal(split[i], &f))
45  return false;
46  (*out)[i] = f;
47  }
48  return true;
49 }
50 
51 // Instantiate the template above for float and double.
52 template
53 bool SplitStringToFloats(const std::string &full,
54  const char *delim,
55  bool omit_empty_strings,
56  std::vector<float> *out);
57 template
58 bool SplitStringToFloats(const std::string &full,
59  const char *delim,
60  bool omit_empty_strings,
61  std::vector<double> *out);
62 
// Splits 'full' at every occurrence of any character in 'delim' and stores
// the pieces in *out (which is cleared first).  When omit_empty_strings is
// true, zero-length pieces are dropped; otherwise they are kept, so an empty
// input produces a single empty piece.
void SplitStringToVector(const std::string &full, const char *delim,
                         bool omit_empty_strings,
                         std::vector<std::string> *out) {
  const size_t full_len = full.size();
  out->clear();
  size_t piece_begin = 0;
  for (;;) {
    const size_t piece_end = full.find_first_of(delim, piece_begin);
    // A piece is empty when a delimiter sits right at its start, or when the
    // previous delimiter was the last character of the input.
    const bool piece_is_empty =
        (piece_end == piece_begin || piece_begin == full_len);
    if (!(omit_empty_strings && piece_is_empty))
      out->push_back(full.substr(piece_begin, piece_end - piece_begin));
    if (piece_end == std::string::npos)
      break;  // that was the last piece.
    piece_begin = piece_end + 1;
  }
}
76 
// Joins the elements of 'vec_in' into one string, placing 'delim' between
// consecutive emitted elements, and stores the result in *str_out.
//   vec_in              the pieces to join
//   delim               separator text inserted between emitted pieces
//   omit_empty_strings  if true, empty elements are skipped entirely
//   str_out             receives the joined string (previous content replaced)
// The separator is decided by whether an element has already been emitted,
// not by looking at the immediate neighbour, so that skipping an empty
// element in the middle (e.g. {"a", "", "b"}) still yields "a<delim>b".
void JoinVectorToString(const std::vector<std::string> &vec_in,
                        const char *delim, bool omit_empty_strings,
                        std::string *str_out) {
  std::string tmp_str;
  bool emitted_any = false;
  for (size_t i = 0; i < vec_in.size(); i++) {
    if (omit_empty_strings && vec_in[i].empty())
      continue;
    if (emitted_any)
      tmp_str.append(delim);
    tmp_str.append(vec_in[i]);
    emitted_any = true;
  }
  // Build into a temporary and swap, so *str_out is only modified at the end.
  str_out->swap(tmp_str);
}
91 
// Strips leading and trailing whitespace (space, tab, newline, carriage
// return, form feed, vertical tab) from *str in place.  A string consisting
// only of whitespace becomes empty.
void Trim(std::string *str) {
  const char *const kWhitespace = " \t\n\r\f\v";

  const std::string::size_type last_kept = str->find_last_not_of(kWhitespace);
  if (last_kept == std::string::npos) {
    // Nothing but whitespace (or already empty).
    str->clear();
    return;
  }
  // Cut the tail first, then the head.
  str->erase(last_kept + 1);
  const std::string::size_type first_kept = str->find_first_not_of(kWhitespace);
  if (first_kept != std::string::npos)
    str->erase(0, first_kept);
}
104 
// Returns true if 'token' is non-empty and contains no ASCII character that
// is unprintable or whitespace.  Bytes outside the ASCII range are accepted
// (so that e.g. accented characters are not rejected), except for 255 which
// is "nbsp", a form of space.
bool IsToken(const std::string &token) {
  if (token.empty()) return false;
  const size_t len = token.length();
  for (size_t pos = 0; pos != len; ++pos) {
    const unsigned char c = token[pos];
    const bool subject_to_check = isascii(c) || c == (unsigned char)255;
    const bool printable_nonspace = isprint(c) && !isspace(c);
    if (subject_to_check && !printable_nonspace)
      return false;
  }
  return true;
}
118 
119 
120 void SplitStringOnFirstSpace(const std::string &str,
121  std::string *first,
122  std::string *rest) {
123  const char *white_chars = " \t\n\r\f\v";
124  typedef std::string::size_type I;
125  const I npos = std::string::npos;
126  I first_nonwhite = str.find_first_not_of(white_chars);
127  if (first_nonwhite == npos) {
128  first->clear();
129  rest->clear();
130  return;
131  }
132  // next_white is first whitespace after first nonwhitespace.
133  I next_white = str.find_first_of(white_chars, first_nonwhite);
134 
135  if (next_white == npos) { // no more whitespace...
136  *first = std::string(str, first_nonwhite);
137  rest->clear();
138  return;
139  }
140  I next_nonwhite = str.find_first_not_of(white_chars, next_white);
141  if (next_nonwhite == npos) {
142  *first = std::string(str, first_nonwhite, next_white-first_nonwhite);
143  rest->clear();
144  return;
145  }
146 
147  I last_nonwhite = str.find_last_not_of(white_chars);
148  KALDI_ASSERT(last_nonwhite != npos); // or coding error.
149 
150  *first = std::string(str, first_nonwhite, next_white-first_nonwhite);
151  *rest = std::string(str, next_nonwhite, last_nonwhite+1-next_nonwhite);
152 }
153 
// Returns true if 'line' contains no '\n', does not start or end with
// whitespace, and consists only of printable characters.  The empty string
// is accepted.
// Characters are converted to unsigned char before being handed to
// isspace()/isprint(): those functions are undefined for negative values,
// which is what a plain (signed) char holds for non-ASCII bytes.
bool IsLine(const std::string &line) {
  if (line.find('\n') != std::string::npos) return false;
  if (line.empty()) return true;
  if (isspace(static_cast<unsigned char>(*(line.begin())))) return false;
  if (isspace(static_cast<unsigned char>(*(line.rbegin())))) return false;
  std::string::const_iterator iter = line.begin(), end = line.end();
  for (; iter != end; ++iter)
    if (!isprint(static_cast<unsigned char>(*iter))) return false;
  return true;
}
164 
// Thin wrapper around a std::istream that reads a single floating-point
// number of type T.  In addition to whatever the stream's own operator >>
// accepts, it recognises textual infinities and NaNs ("inf", "-infinity",
// "nan", the MSVC spellings "1.#INF" / "1.#QNAN", ...) case-insensitively.
// Failure is reported through the failbit of the wrapped stream.
template <class T>
class NumberIstream {
 public:
  explicit NumberIstream(std::istream &i) : in_(i) {}

  // Reads one number into x.  Succeeds only if the stream contains exactly
  // one token; otherwise the wrapped stream ends up with failbit set.
  NumberIstream & operator >> (T &x) {
    if (!in_.good()) return *this;
    in_ >> x;
    if (!in_.fail() && RemainderIsOnlySpaces()) return *this;
    return ParseOnFail(&x);
  }

 private:
  std::istream &in_;

  // Returns true if nothing but spaces is left in the stream; on success the
  // stream's state flags are cleared.  Returns false (without clearing) if
  // another token follows.
  bool RemainderIsOnlySpaces() {
    // tellg() returning -1 means there is no readable position left (e.g. the
    // previous extraction ran into end-of-file), so there is no remainder.
    if (in_.tellg() != std::istream::pos_type(-1)) {
      std::string rem;
      in_ >> rem;

      if (rem.find_first_not_of(' ') != std::string::npos) {
        // there is not only spaces
        return false;
      }
    }

    in_.clear();
    return true;
  }

  // Fallback used when the plain numeric extraction did not succeed: rewinds
  // the stream, re-reads the content as a single word and looks it up in a
  // table of infinity / NaN spellings.  Sets failbit if that fails too.
  NumberIstream & ParseOnFail(T *x) {
    std::string str;
    in_.clear();
    in_.seekg(0);
    // If the stream is broken even before trying
    // to read from it or if there are many tokens,
    // it's pointless to try.
    if (!(in_ >> str) || !RemainderIsOnlySpaces()) {
      in_.setstate(std::ios_base::failbit);
      return *this;
    }

    std::map<std::string, T> inf_nan_map;
    // we'll keep just uppercase values.
    inf_nan_map["INF"] = std::numeric_limits<T>::infinity();
    inf_nan_map["+INF"] = std::numeric_limits<T>::infinity();
    inf_nan_map["-INF"] = - std::numeric_limits<T>::infinity();
    inf_nan_map["INFINITY"] = std::numeric_limits<T>::infinity();
    inf_nan_map["+INFINITY"] = std::numeric_limits<T>::infinity();
    inf_nan_map["-INFINITY"] = - std::numeric_limits<T>::infinity();
    inf_nan_map["NAN"] = std::numeric_limits<T>::quiet_NaN();
    inf_nan_map["+NAN"] = std::numeric_limits<T>::quiet_NaN();
    inf_nan_map["-NAN"] = - std::numeric_limits<T>::quiet_NaN();
    // MSVC
    inf_nan_map["1.#INF"] = std::numeric_limits<T>::infinity();
    inf_nan_map["-1.#INF"] = - std::numeric_limits<T>::infinity();
    inf_nan_map["1.#QNAN"] = std::numeric_limits<T>::quiet_NaN();
    inf_nan_map["-1.#QNAN"] = - std::numeric_limits<T>::quiet_NaN();

    std::transform(str.begin(), str.end(), str.begin(), ::toupper);

    // Single lookup: either we know this spelling or the parse has failed.
    typename std::map<std::string, T>::const_iterator found =
        inf_nan_map.find(str);
    if (found != inf_nan_map.end()) {
      *x = found->second;
    } else {
      in_.setstate(std::ios_base::failbit);
    }

    return *this;
  }
};
235 
236 
237 template <typename T>
238 bool ConvertStringToReal(const std::string &str,
239  T *out) {
240  std::istringstream iss(str);
241 
242  NumberIstream<T> i(iss);
243 
244  i >> *out;
245 
246  if (iss.fail()) {
247  // Number conversion failed.
248  return false;
249  }
250 
251  return true;
252 }
253 
254 template
255 bool ConvertStringToReal(const std::string &str,
256  float *out);
257 template
258 bool ConvertStringToReal(const std::string &str,
259  double *out);
260 
261 
262 
263 /*
264  This function is a helper function of StringsApproxEqual. It should be
265  thought of as a recursive function-- it was designed that way-- but rather
266  than actually recursing (which would cause problems with stack overflow), we
267  just set the args and return to the start.
268 
269  The 'decimal_places_tolerance' argument is just passed in from outside,
270  see the documentation for StringsApproxEqual in text-utils.h to see an
271  explanation. The argument 'places_into_number' provides some information
272  about the strings 'a' and 'b' that precedes the current pointers.
273  For purposes of this comment, let's define the 'decimal' of a number
274  as the part that comes after the decimal point, e.g. in '99.123',
275  '123' would be the decimal. If 'places_into_number' is -1, it means
276  we're not currently inside some place like that (i.e. it's not the
277  case that we're pointing to the '1' or the '2' or the '3').
278  If it's 0, then we'd be pointing to the first place after the decimal,
279  '1' in this case. Note if one of the numbers is shorter than the
280  other, like '99.123' versus '99.1234' and 'a' points to the first '3'
281  while 'b' points to the second '4', 'places_into_number' referes to the
282  shorter of the two, i.e. it would be 2 in this example.
283 
284 
285  */
286 bool StringsApproxEqualInternal(const char *a, const char *b,
287  int32 decimal_places_tolerance,
288  int32 places_into_number) {
289 start:
290  char ca = *a, cb = *b;
291  if (ca == cb) {
292  if (ca == '\0') {
293  return true;
294  } else {
295  if (places_into_number >= 0) {
296  if (isdigit(ca)) {
297  places_into_number++;
298  } else {
299  places_into_number = -1;
300  }
301  } else {
302  if (ca == '.') {
303  places_into_number = 0;
304  }
305  }
306  a++;
307  b++;
308  goto start;
309  }
310  } else {
311  if (places_into_number >= decimal_places_tolerance &&
312  (isdigit(ca) || isdigit(cb))) {
313  // we're potentially willing to accept this difference between the
314  // strings.
315  if (isdigit(ca)) a++;
316  if (isdigit(cb)) b++;
317  // we'll have advanced at least one of the two strings.
318  goto start;
319  } else if (places_into_number >= 0 &&
320  ((ca == '0' && !isdigit(cb)) || (cb == '0' && !isdigit(ca)))) {
321  // this clause is designed to ensure that, for example,
322  // "0.1" would count the same as "0.100001".
323  if (ca == '0') a++;
324  else b++;
325  places_into_number++;
326  goto start;
327  } else {
328  return false;
329  }
330  }
331 
332 }
333 
334 
335 bool StringsApproxEqual(const std::string &a,
336  const std::string &b,
337  int32 decimal_places_tolerance) {
338  return StringsApproxEqualInternal(a.c_str(), b.c_str(),
339  decimal_places_tolerance, -1);
340 }
341 
342 
343 bool ConfigLine::ParseLine(const std::string &line) {
344  data_.clear();
345  whole_line_ = line;
346  if (line.size() == 0) return false; // Empty line
347  size_t pos = 0, size = line.size();
348  while (isspace(line[pos]) && pos < size) pos++;
349  if (pos == size)
350  return false; // whitespace-only line
351  size_t first_token_start_pos = pos;
352  // first get first_token_.
353  while (!isspace(line[pos]) && pos < size) {
354  if (line[pos] == '=') {
355  // If the first block of non-whitespace looks like "foo-bar=...",
356  // then we ignore it: there is no initial token, and FirstToken()
357  // is empty.
358  pos = first_token_start_pos;
359  break;
360  }
361  pos++;
362  }
363  first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos);
364  // first_token_ is expected to be either empty or something like
365  // "component-node", which actually is a slightly more restrictive set of
366  // strings than IsValidName() checks for this is a convenient way to check it.
367  if (!first_token_.empty() && !IsValidName(first_token_))
368  return false;
369 
370  while (pos < size) {
371  if (isspace(line[pos])) {
372  pos++;
373  continue;
374  }
375 
376  // OK, at this point we know that we are pointing at nonspace.
377  size_t next_equals_sign = line.find_first_of("=", pos);
378  if (next_equals_sign == pos || next_equals_sign == std::string::npos) {
379  // we're looking for something like 'key=value'. If there is no equals sign,
380  // or it's not preceded by something, it's a parsing failure.
381  return false;
382  }
383  std::string key(line, pos, next_equals_sign - pos);
384  if (!IsValidName(key)) return false;
385 
386  // handle any quotes. we support key='blah blah' or key="foo bar".
387  // no escaping is supported.
388  if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') {
389  char my_quote = line[next_equals_sign+1];
390  size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2);
391  if (next_quote == std::string::npos) { // no matching quote was found.
392  KALDI_WARN << "No matching quote for " << my_quote << " in config line '"
393  << line << "'";
394  return false;
395  } else {
396  std::string value(line, next_equals_sign + 2,
397  next_quote - next_equals_sign - 2);
398  data_.insert(std::make_pair(key, std::make_pair(value, false)));
399  pos = next_quote + 1;
400  continue;
401  }
402  } else {
403  // we want to be able to parse something like "... input=Offset(a, -1) foo=bar":
404  // in general, config values with spaces in them, even without quoting.
405 
406  size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1),
407  terminating_space = size;
408 
409  if (next_next_equals_sign != std::string::npos) { // found a later equals sign.
410  size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign);
411  if (preceding_space != std::string::npos &&
412  preceding_space > next_equals_sign)
413  terminating_space = preceding_space;
414  }
415  while (isspace(line[terminating_space - 1]) && terminating_space > 0)
416  terminating_space--;
417 
418  std::string value(line, next_equals_sign + 1,
419  terminating_space - (next_equals_sign + 1));
420  data_.insert(std::make_pair(key, std::make_pair(value, false)));
421  pos = terminating_space;
422  }
423  }
424  return true;
425 }
426 
427 bool ConfigLine::GetValue(const std::string &key, std::string *value) {
428  KALDI_ASSERT(value != NULL);
429  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
430  for (; it != data_.end(); ++it) {
431  if (it->first == key) {
432  *value = (it->second).first;
433  (it->second).second = true;
434  return true;
435  }
436  }
437  return false;
438 }
439 
440 bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) {
441  KALDI_ASSERT(value != NULL);
442  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
443  for (; it != data_.end(); ++it) {
444  if (it->first == key) {
445  if (!ConvertStringToReal((it->second).first, value))
446  return false;
447  (it->second).second = true;
448  return true;
449  }
450  }
451  return false;
452 }
453 
454 bool ConfigLine::GetValue(const std::string &key, int32 *value) {
455  KALDI_ASSERT(value != NULL);
456  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
457  for (; it != data_.end(); ++it) {
458  if (it->first == key) {
459  if (!ConvertStringToInteger((it->second).first, value))
460  return false;
461  (it->second).second = true;
462  return true;
463  }
464  }
465  return false;
466 }
467 
468 bool ConfigLine::GetValue(const std::string &key, std::vector<int32> *value) {
469  KALDI_ASSERT(value != NULL);
470  value->clear();
471  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
472  for (; it != data_.end(); ++it) {
473  if (it->first == key) {
474  if (!SplitStringToIntegers((it->second).first, ":,", true, value)) {
475  // KALDI_WARN << "Bad option " << (it->second).first;
476  return false;
477  }
478  (it->second).second = true;
479  return true;
480  }
481  }
482  return false;
483 }
484 
485 bool ConfigLine::GetValue(const std::string &key, bool *value) {
486  KALDI_ASSERT(value != NULL);
487  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
488  for (; it != data_.end(); ++it) {
489  if (it->first == key) {
490  if ((it->second).first.size() == 0) return false;
491  switch (((it->second).first)[0]) {
492  case 'F':
493  case 'f':
494  *value = false;
495  break;
496  case 'T':
497  case 't':
498  *value = true;
499  break;
500  default:
501  return false;
502  }
503  (it->second).second = true;
504  return true;
505  }
506  }
507  return false;
508 }
509 
511  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
512  for (; it != data_.end(); ++it) {
513  if (!(it->second).second) return true;
514  }
515  return false;
516 }
517 
518 std::string ConfigLine::UnusedValues() const {
519  std::string unused_str;
520  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
521  for (; it != data_.end(); ++it) {
522  if (!(it->second).second) {
523  if (unused_str == "")
524  unused_str = it->first + "=" + (it->second).first;
525  else
526  unused_str += " " + it->first + "=" + (it->second).first;
527  }
528  }
529  return unused_str;
530 }
531 
532 // This is like ExpectToken but for two tokens, and it
533 // will either accept token1 and then token2, or just token2.
534 // This is useful in Read functions where the first token
535 // may already have been consumed.
536 void ExpectOneOrTwoTokens(std::istream &is, bool binary,
537  const std::string &token1,
538  const std::string &token2) {
539  KALDI_ASSERT(token1 != token2);
540  std::string temp;
541  ReadToken(is, binary, &temp);
542  if (temp == token1) {
543  ExpectToken(is, binary, token2);
544  } else {
545  if (temp != token2) {
546  KALDI_ERR << "Expecting token " << token1 << " or " << token2
547  << " but got " << temp;
548  }
549  }
550 }
551 
552 
// Returns true if 'name' is non-empty, starts with a letter or '_', and
// otherwise contains only letters, digits, '_', '-' and '.'.
// Each character is converted to unsigned char before being handed to
// isalpha()/isalnum(): those functions are undefined for negative values,
// which is what a plain (signed) char holds for non-ASCII bytes.
bool IsValidName(const std::string &name) {
  if (name.size() == 0) return false;
  for (size_t i = 0; i < name.size(); i++) {
    const unsigned char c = name[i];
    if (i == 0 && !isalpha(c) && c != '_')
      return false;
    if (!isalnum(c) && c != '_' && c != '-' && c != '.')
      return false;
  }
  return true;
}
563 
564 void ReadConfigLines(std::istream &is,
565  std::vector<std::string> *lines) {
566  KALDI_ASSERT(lines != NULL);
567  std::string line;
568  while (std::getline(is, line)) {
569  if (line.size() == 0) continue;
570  size_t start = line.find_first_not_of(" \t");
571  size_t end = line.find_first_of('#');
572  if (start == std::string::npos || start == end) continue;
573  end = line.find_last_not_of(" \t", end - 1);
574  KALDI_ASSERT(end >= start);
575  lines->push_back(line.substr(start, end - start + 1));
576  }
577 }
578 
579 void ParseConfigLines(const std::vector<std::string> &lines,
580  std::vector<ConfigLine> *config_lines) {
581  config_lines->resize(lines.size());
582  for (size_t i = 0; i < lines.size(); i++) {
583  bool ret = (*config_lines)[i].ParseLine(lines[i]);
584  if (!ret) {
585  KALDI_ERR << "Error parsing config line: " << lines[i];
586  }
587  }
588 }
589 
590 
591 } // end namespace kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool ConvertStringToInteger(const std::string &str, Int *out)
Converts a string into an integer via strtoll and returns false if there was any kind of problem (i...
Definition: text-utils.h:118
bool ParseLine(const std::string &line)
Definition: text-utils.cc:343
bool SplitStringToFloats(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< F > *out)
Definition: text-utils.cc:30
bool IsLine(const std::string &line)
Returns true if "line" is free of \n characters and unprintable characters, and does not contain leadi...
Definition: text-utils.cc:154
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g.
Definition: text-utils.h:68
void JoinVectorToString(const std::vector< std::string > &vec_in, const char *delim, bool omit_empty_strings, std::string *str_out)
Joins the elements of a vector of strings into a single string using "delim" as the delimiter...
Definition: text-utils.cc:77
std::string whole_line_
Definition: text-utils.h:233
kaldi::int32 int32
void ReadToken(std::istream &is, bool binary, std::string *str)
ReadToken gets the next token and puts it in str (exception on failure).
Definition: io-funcs.cc:154
bool IsValidName(const std::string &name)
Returns true if 'name' would be a valid name for a component or node in a nnet3Nnet.
Definition: text-utils.cc:553
bool IsToken(const std::string &token)
Returns true if "token" is nonempty, and all characters are printable and whitespace-free.
Definition: text-utils.cc:105
void ExpectOneOrTwoTokens(std::istream &is, bool binary, const std::string &token1, const std::string &token2)
This function is like ExpectToken but for two tokens, and it will either accept token1 and then token...
Definition: text-utils.cc:536
std::string UnusedValues() const
returns e.g.
Definition: text-utils.cc:518
bool StringsApproxEqual(const std::string &a, const std::string &b, int32 decimal_places_tolerance)
This function returns true when two text strings are approximately equal, and false when they are not...
Definition: text-utils.cc:335
void SplitStringOnFirstSpace(const std::string &str, std::string *first, std::string *rest)
Removes leading and trailing white space from the string, then splits on the first section of whitesp...
Definition: text-utils.cc:120
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
Definition: io-funcs.cc:191
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
bool RemainderIsOnlySpaces()
Definition: text-utils.cc:180
NumberIstream(std::istream &i)
Definition: text-utils.cc:168
#define KALDI_ERR
Definition: kaldi-error.h:147
bool ConvertStringToReal(const std::string &str, T *out)
ConvertStringToReal converts a string into either float or double and returns false if there was any ...
Definition: text-utils.cc:238
#define KALDI_WARN
Definition: kaldi-error.h:150
void ReadConfigLines(std::istream &is, std::vector< std::string > *lines)
This function reads in a config file and *appends* its contents to a vector of lines; it is responsib...
Definition: text-utils.cc:564
void Trim(std::string *str)
Removes the beginning and trailing whitespaces from a string.
Definition: text-utils.cc:92
std::istream & operator>>(std::istream &is, Matrix< Real > &M)
bool StringsApproxEqualInternal(const char *a, const char *b, int32 decimal_places_tolerance, int32 places_into_number)
Definition: text-utils.cc:286
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
std::istream & in_
Definition: text-utils.cc:178
bool HasUnusedValues() const
Definition: text-utils.cc:510
void ParseConfigLines(const std::vector< std::string > &lines, std::vector< ConfigLine > *config_lines)
This function converts config-lines from a simple sequence of strings as output by ReadConfigLines()...
Definition: text-utils.cc:579
bool GetValue(const std::string &key, std::string *value)
Definition: text-utils.cc:427
std::string first_token_
Definition: text-utils.h:237
NumberIstream & ParseOnFail(T *x)
Definition: text-utils.cc:195
std::map< std::string, std::pair< std::string, bool > > data_
Definition: text-utils.h:240