subset-feats.cc
Go to the documentation of this file.
1 // featbin/subset-feats.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 // 2014 Hainan Xu
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABILITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include "base/kaldi-common.h"
22 #include "util/common-utils.h"
23 #include "matrix/kaldi-matrix.h"
24 
25 using namespace kaldi;
26 
27 int32 CopyIncludedFeats(std::string filename,
28  SequentialBaseFloatMatrixReader *kaldi_reader,
29  BaseFloatMatrixWriter *kaldi_writer) {
30  unordered_set<std::string, StringHasher> include_set;
31  bool binary;
32  Input ki(filename, &binary);
33  KALDI_ASSERT(!binary);
34  std::string line;
35  while (std::getline(ki.Stream(), line)) {
36  std::vector<std::string> split_line;
37  SplitStringToVector(line, " \t\r", true, &split_line);
38  KALDI_ASSERT(!split_line.empty() &&
39  "Empty line encountered in input from --include option");
40  include_set.insert(split_line[0]);
41  }
42 
43  int32 num_total = 0;
44  size_t num_success = 0;
45  for (; !kaldi_reader->Done(); kaldi_reader->Next(), num_total++) {
46  if (include_set.count(kaldi_reader->Key()) > 0) {
47  kaldi_writer->Write(kaldi_reader->Key(), kaldi_reader->Value());
48  num_success++;
49  }
50  }
51 
52  KALDI_LOG << " Wrote " << num_success << " out of " << num_total
53  << " utterances.";
54  return (num_success != 0 ? 0 : 1);
55 }
56 
57 int32 CopyExcludedFeats(std::string filename,
58  SequentialBaseFloatMatrixReader *kaldi_reader,
59  BaseFloatMatrixWriter *kaldi_writer) {
60  unordered_set<std::string, StringHasher> exclude_set;
61  bool binary;
62  Input ki(filename, &binary);
63  KALDI_ASSERT(!binary);
64  std::string line;
65  while (std::getline(ki.Stream(), line)) {
66  std::vector<std::string> split_line;
67  SplitStringToVector(line, " \t\r", true, &split_line);
68  KALDI_ASSERT(!split_line.empty() &&
69  "Empty line encountered in input from --include option");
70  exclude_set.insert(split_line[0]);
71  }
72 
73  int32 num_total = 0;
74  size_t num_success = 0;
75  for (; !kaldi_reader->Done(); kaldi_reader->Next(), num_total++) {
76  if (exclude_set.count(kaldi_reader->Key()) == 0) {
77  kaldi_writer->Write(kaldi_reader->Key(), kaldi_reader->Value());
78  num_success++;
79  }
80  }
81 
82  KALDI_LOG << " Wrote " << num_success << " out of " << num_total
83  << " utterances.";
84  return (num_success != 0 ? 0 : 1);
85 }
86 
87 int main(int argc, char *argv[]) {
88  try {
89  using namespace kaldi;
90 
91  const char *usage =
92  "Copy a subset of features (by default, the first n feature files)\n"
93  "Usually used where only a small amount of data is needed\n"
94  "Note: if you want a specific subset, it's usually best to\n"
95  "filter the original .scp file with utils/filter_scp.pl\n"
96  "(possibly with the --exclude option). The --include and --exclude\n"
97  "options of this program are intended for specialized uses.\n"
98  "The --include and --exclude options are mutually exclusive, \n"
99  "and both cause the --n option to be ignored.\n"
100  "Usage: subset-feats [options] <in-rspecifier> <out-wspecifier>\n"
101  "e.g.: subset-feats --n=10 ark:- ark:-\n"
102  "or: subset-feats --include=include_uttlist ark:- ark:-\n"
103  "or: subset-feats --exclude=exclude_uttlist ark:- ark:-\n"
104  "See also extract-feature-segments, select-feats, subsample-feats\n";
105 
106  ParseOptions po(usage);
107 
108  int32 n = 10;
109  std::string include_rxfilename;
110  std::string exclude_rxfilename;
111  po.Register("n", &n, "If nonnegative, copy the first n feature files.");
112  po.Register("include", &include_rxfilename,
113  "Text file, the first field of each"
114  " line being interpreted as an "
115  "utterance-id whose features will be included");
116  po.Register("exclude", &exclude_rxfilename,
117  "Text file, the first field of each "
118  "line being interpreted as an utterance-id"
119  " whose features will be excluded");
120 
121  po.Read(argc, argv);
122 
123  if (po.NumArgs() != 2) {
124  po.PrintUsage();
125  exit(1);
126  }
127 
128  std::string rspecifier = po.GetArg(1);
129  std::string wspecifier = po.GetArg(2);
130 
131  KALDI_ASSERT(n >= 0);
132 
133  BaseFloatMatrixWriter kaldi_writer(wspecifier);
134  SequentialBaseFloatMatrixReader kaldi_reader(rspecifier);
135 
136  if (include_rxfilename != "") {
137  if (n != 10) {
138  KALDI_ERR << "Should not have both --include and --n option!";
139  }
140  if (exclude_rxfilename != "") {
141  KALDI_ERR << "should not have both --exclude and --include option!";
142  }
143  return CopyIncludedFeats(include_rxfilename,
144  &kaldi_reader, &kaldi_writer);
145  }
146  else if (exclude_rxfilename != "") {
147  if (n != 10) {
148  KALDI_ERR << "Should not have both --exclude and --n option!";
149  }
150  return CopyExcludedFeats(exclude_rxfilename,
151  &kaldi_reader, &kaldi_writer);
152  }
153 
154  if (n == 0) {
155  KALDI_ERR << "Invalid option --n=0. Should be at least 1";
156  }
157 
158  int32 k = 0;
159  for (; !kaldi_reader.Done() && k < n; kaldi_reader.Next(), k++)
160  kaldi_writer.Write(kaldi_reader.Key(), kaldi_reader.Value());
161 
162  return 0;
163  } catch(const std::exception &e) {
164  std::cerr << e.what();
165  return -1;
166  }
167 }
168 
169 
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
std::istream & Stream()
Definition: kaldi-io.cc:826
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
int main(int argc, char *argv[])
Definition: subset-feats.cc:87
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
struct rnnlm::@11::@12 n
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
int NumArgs() const
Number of positional parameters (c.f. argc-1).
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
int32 CopyExcludedFeats(std::string filename, SequentialBaseFloatMatrixReader *kaldi_reader, BaseFloatMatrixWriter *kaldi_writer)
Definition: subset-feats.cc:57
int32 CopyIncludedFeats(std::string filename, SequentialBaseFloatMatrixReader *kaldi_reader, BaseFloatMatrixWriter *kaldi_writer)
Definition: subset-feats.cc:27
#define KALDI_LOG
Definition: kaldi-error.h:153