text-utils.h
Go to the documentation of this file.
1 // util/text-utils.h
2 
3 // Copyright 2009-2011 Saarland University; Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #ifndef KALDI_UTIL_TEXT_UTILS_H_
21 #define KALDI_UTIL_TEXT_UTILS_H_
22 
23 #include <errno.h>
24 #include <string>
25 #include <algorithm>
26 #include <map>
27 #include <set>
28 #include <vector>
29 #include <limits>
30 #include "base/kaldi-common.h"
31 
32 
33 namespace kaldi {
34 
// Splits the string "full" using any of the single-character delimiters
// listed in "delim", writing the pieces to "*out".
// NOTE(review): "omit_empty_strings" presumably drops empty pieces from the
// output -- confirm against the definition in text-utils.cc.
42 void SplitStringToVector(const std::string &full, const char *delim,
43  bool omit_empty_strings,
44  std::vector<std::string> *out);
45 
// Joins the elements of the vector of strings "vec_in" into a single string,
// written to "*str_out", using "delim" as the delimiter.
// NOTE(review): "omit_empty_strings" presumably skips empty elements of
// "vec_in" -- confirm against the definition in text-utils.cc.
50 void JoinVectorToString(const std::vector<std::string> &vec_in,
51  const char *delim, bool omit_empty_strings,
52  std::string *str_out);
53 
67 template<class I>
68 bool SplitStringToIntegers(const std::string &full,
69  const char *delim,
70  bool omit_empty_strings, // typically false [but
71  // should probably be true
72  // if "delim" is spaces].
73  std::vector<I> *out) {
74  KALDI_ASSERT(out != NULL);
76  if (*(full.c_str()) == '\0') {
77  out->clear();
78  return true;
79  }
80  std::vector<std::string> split;
81  SplitStringToVector(full, delim, omit_empty_strings, &split);
82  out->resize(split.size());
83  for (size_t i = 0; i < split.size(); i++) {
84  const char *this_str = split[i].c_str();
85  char *end = NULL;
86  int64 j = 0;
87  j = KALDI_STRTOLL(this_str, &end);
88  if (end == this_str || *end != '\0') {
89  out->clear();
90  return false;
91  } else {
92  I jI = static_cast<I>(j);
93  if (static_cast<int64>(jI) != j) {
94  // output type cannot fit this integer.
95  out->clear();
96  return false;
97  }
98  (*out)[i] = jI;
99  }
100  }
101  return true;
102 }
103 
// Declaration only; the definition is in text-utils.cc.  The parameter list
// mirrors SplitStringToIntegers(), with the output being a vector of F.
// NOTE(review): presumably returns false when a piece cannot be parsed as a
// floating-point number -- confirm against the definition.
104 // This is defined for F = float and double.
105 template<class F>
106 bool SplitStringToFloats(const std::string &full,
107  const char *delim,
108  bool omit_empty_strings, // typically false
109  std::vector<F> *out);
110 
111 
117 template<class Int>
118 bool ConvertStringToInteger(const std::string &str,
119  Int *out) {
121  const char *this_str = str.c_str();
122  char *end = NULL;
123  errno = 0;
124  int64 i = KALDI_STRTOLL(this_str, &end);
125  if (end != this_str)
126  while (isspace(*end)) end++;
127  if (end == this_str || *end != '\0' || errno != 0)
128  return false;
129  Int iInt = static_cast<Int>(i);
130  if (static_cast<int64>(iInt) != i ||
131  (i < 0 && !std::numeric_limits<Int>::is_signed)) {
132  return false;
133  }
134  *out = iInt;
135  return true;
136 }
137 
138 
// Converts the string "str" into either float or double (type T), writing the
// result to "*out"; returns false if there was any problem with the
// conversion.  Declaration only; the definition is in text-utils.cc.
143 template <typename T>
144 bool ConvertStringToReal(const std::string &str,
145  T *out);
146 
// Removes the beginning and trailing whitespaces from the string "*str"
// (the string is modified in place through the pointer).
148 void Trim(std::string *str);
149 
150 
// Removes leading and trailing white space from "line", then splits it on the
// first section of whitespace, writing the two parts to "*first" and "*rest".
// NOTE(review): behaviour when "line" contains no whitespace is not visible
// from this declaration -- confirm against the definition in text-utils.cc.
156 void SplitStringOnFirstSpace(const std::string &line,
157  std::string *first,
158  std::string *rest);
159 
160 
// Returns true if "token" is nonempty, and all characters are printable and
// whitespace-free.
163 bool IsToken(const std::string &token);
164 
165 
// Returns true if "line" is free of unprintable characters.
// NOTE(review): the generated summary for this function is garbled/truncated;
// it presumably also requires the absence of newline characters and of
// leading or trailing whitespace -- confirm against text-utils.cc.
168 bool IsLine(const std::string &line);
169 
170 
171 
// Returns true when the two text strings "a" and "b" are approximately equal,
// and false when they are not.  "decimal_places_check" defaults to 2.
// NOTE(review): the definition in text-utils.cc names this parameter
// "decimal_places_tolerance"; presumably it sets how many decimal places of
// numbers embedded in the strings are compared -- confirm there.
182 bool StringsApproxEqual(const std::string &a,
183  const std::string &b,
184  int32 decimal_places_check = 2);
185 
205 class ConfigLine {
206  public:
207  // Tries to parse the line as a config-file line. Returns false
208  // if it could not for some reason, e.g. parsing failure. In most cases
209  // prints no warnings; the user should do this. Does not expect comments.
210  bool ParseLine(const std::string &line);
211 
212  // the GetValue functions are overloaded for various types. They return true
213  // if the key exists with value that can be converted to that type, and false
214  // otherwise. They also mark the key-value pair as having been read. It is
215  // not an error to read values twice.
216  bool GetValue(const std::string &key, std::string *value);
217  bool GetValue(const std::string &key, BaseFloat *value);
218  bool GetValue(const std::string &key, int32 *value);
219  // Values may be separated by ":" or by ",".
220  bool GetValue(const std::string &key, std::vector<int32> *value);
221  bool GetValue(const std::string &key, bool *value);
222 
223  bool HasUnusedValues() const;
226  std::string UnusedValues() const;
227 
228  const std::string &FirstToken() const { return first_token_; }
229 
230  const std::string WholeLine() { return whole_line_; }
231  // use default assignment operator and copy constructor.
232  private:
233  std::string whole_line_;
234  // the first token of the line, e.g. if line is
235  // foo-bar baz=bing
236  // then first_token_ would be "foo-bar".
237  std::string first_token_;
238 
239  // data_ maps from key to (value, is-this-value-consumed?).
240  std::map<std::string, std::pair<std::string, bool> > data_;
241 
242 };
243 
// This function is like ExpectToken but for two tokens, read from the stream
// "is" ("binary" selects the stream mode).
// NOTE(review): it accepts token1 followed by token2; judging by the name it
// presumably also accepts a single token on its own -- confirm which one, and
// the failure behaviour, against the definition in text-utils.cc.
247 void ExpectOneOrTwoTokens(std::istream &is, bool binary,
248  const std::string &token1,
249  const std::string &token2);
250 
251 
// Reads in a config file from the stream "is" and *appends* its contents to
// the vector "*lines" (existing elements of "*lines" are kept).
// NOTE(review): any per-line preprocessing (e.g. comment or blank-line
// handling) is not visible from this declaration -- see text-utils.cc.
257 void ReadConfigLines(std::istream &is,
258  std::vector<std::string> *lines);
259 
260 
// Converts config lines from a simple sequence of strings, as output by
// ReadConfigLines(), into ConfigLine objects written to "*config_lines".
// NOTE(review): behaviour on a line that fails to parse is not visible from
// this declaration -- confirm against the definition in text-utils.cc.
270 void ParseConfigLines(const std::vector<std::string> &lines,
271  std::vector<ConfigLine> *config_lines);
272 
273 
// Returns true if 'name' would be a valid name for a component or node in a
// nnet3 Nnet.
277 bool IsValidName(const std::string &name);
278 
279 } // namespace kaldi
280 
281 #endif // KALDI_UTIL_TEXT_UTILS_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool ConvertStringToInteger(const std::string &str, Int *out)
Converts a string into an integer via strtoll and returns false if there was any kind of problem (i...
Definition: text-utils.h:118
const std::string & FirstToken() const
Definition: text-utils.h:228
const std::string WholeLine()
Definition: text-utils.h:230
bool ParseLine(const std::string &line)
Definition: text-utils.cc:343
bool SplitStringToFloats(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< F > *out)
Definition: text-utils.cc:30
#define KALDI_ASSERT_IS_INTEGER_TYPE(I)
Definition: kaldi-utils.h:133
bool IsLine(const std::string &line)
Returns true if "line" is free of \n characters and unprintable characters, and does not contain leadi...
Definition: text-utils.cc:154
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g.
Definition: text-utils.h:68
void JoinVectorToString(const std::vector< std::string > &vec_in, const char *delim, bool omit_empty_strings, std::string *str_out)
Joins the elements of a vector of strings into a single string using "delim" as the delimiter...
Definition: text-utils.cc:77
std::string whole_line_
Definition: text-utils.h:233
kaldi::int32 int32
bool IsValidName(const std::string &name)
Returns true if 'name' would be a valid name for a component or node in a nnet3 Nnet.
Definition: text-utils.cc:553
bool IsToken(const std::string &token)
Returns true if "token" is nonempty, and all characters are printable and whitespace-free.
Definition: text-utils.cc:105
void ExpectOneOrTwoTokens(std::istream &is, bool binary, const std::string &token1, const std::string &token2)
This function is like ExpectToken but for two tokens, and it will either accept token1 and then token...
Definition: text-utils.cc:536
std::string UnusedValues() const
returns e.g.
Definition: text-utils.cc:518
bool StringsApproxEqual(const std::string &a, const std::string &b, int32 decimal_places_tolerance)
This function returns true when two text strings are approximately equal, and false when they are not...
Definition: text-utils.cc:335
void SplitStringOnFirstSpace(const std::string &str, std::string *first, std::string *rest)
Removes leading and trailing white space from the string, then splits on the first section of whitesp...
Definition: text-utils.cc:120
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
bool ConvertStringToReal(const std::string &str, T *out)
ConvertStringToReal converts a string into either float or double and returns false if there was any ...
Definition: text-utils.cc:238
void ReadConfigLines(std::istream &is, std::vector< std::string > *lines)
This function reads in a config file and *appends* its contents to a vector of lines; it is responsib...
Definition: text-utils.cc:564
void Trim(std::string *str)
Removes the beginning and trailing whitespaces from a string.
Definition: text-utils.cc:92
This class is responsible for parsing input like: hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e" and giving you access to the fields of that line.
Definition: text-utils.h:205
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
bool HasUnusedValues() const
Definition: text-utils.cc:510
void ParseConfigLines(const std::vector< std::string > &lines, std::vector< ConfigLine > *config_lines)
This function converts config-lines from a simple sequence of strings as output by ReadConfigLines()...
Definition: text-utils.cc:579
bool GetValue(const std::string &key, std::string *value)
Definition: text-utils.cc:427
#define KALDI_STRTOLL(cur_cstr, end_cstr)
Definition: kaldi-utils.h:152
std::string first_token_
Definition: text-utils.h:237
std::map< std::string, std::pair< std::string, bool > > data_
Definition: text-utils.h:240