subset-feats.cc File Reference
Include dependency graph for subset-feats.cc:

Go to the source code of this file.

Functions

int32 CopyIncludedFeats (std::string filename, SequentialBaseFloatMatrixReader *kaldi_reader, BaseFloatMatrixWriter *kaldi_writer)
 
int32 CopyExcludedFeats (std::string filename, SequentialBaseFloatMatrixReader *kaldi_reader, BaseFloatMatrixWriter *kaldi_writer)
 
int main (int argc, char *argv[])
 

Function Documentation

◆ CopyExcludedFeats()

int32 CopyExcludedFeats ( std::string  filename,
SequentialBaseFloatMatrixReader *  kaldi_reader,
BaseFloatMatrixWriter *  kaldi_writer 
)

Definition at line 57 of file subset-feats.cc.

References SequentialTableReader< Holder >::Done(), KALDI_ASSERT, KALDI_LOG, SequentialTableReader< Holder >::Key(), SequentialTableReader< Holder >::Next(), kaldi::SplitStringToVector(), Input::Stream(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

Referenced by main().

59  {  // Copies every utterance whose key is NOT listed in `filename` from kaldi_reader to kaldi_writer.
60  unordered_set<std::string, StringHasher> exclude_set;  // utterance-ids to leave out
61  bool binary;
62  Input ki(filename, &binary);
63  KALDI_ASSERT(!binary);  // the exclude list is read line by line below, so it must be text
64  std::string line;
65  while (std::getline(ki.Stream(), line)) {
66  std::vector<std::string> split_line;
67  SplitStringToVector(line, " \t\r", true, &split_line);  // split on space/tab/CR, dropping empty fields
68  KALDI_ASSERT(!split_line.empty() &&
69  "Empty line encountered in input from --exclude option");  // string literal is always true; it only labels the failure
70  exclude_set.insert(split_line[0]);  // only the first field of the line is used as the key
71  }
72 
73  int32 num_total = 0;  // utterances seen in the input
74  size_t num_success = 0;  // utterances actually written
75  for (; !kaldi_reader->Done(); kaldi_reader->Next(), num_total++) {
76  if (exclude_set.count(kaldi_reader->Key()) == 0) {  // key not in the exclude list -> copy it
77  kaldi_writer->Write(kaldi_reader->Key(), kaldi_reader->Value());
78  num_success++;
79  }
80  }
81 
82  KALDI_LOG << " Wrote " << num_success << " out of " << num_total
83  << " utterances.";
84  return (num_success != 0 ? 0 : 1);  // exit status: 0 if at least one utterance was written, else 1
85 }
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define KALDI_LOG
Definition: kaldi-error.h:153

◆ CopyIncludedFeats()

int32 CopyIncludedFeats ( std::string  filename,
SequentialBaseFloatMatrixReader *  kaldi_reader,
BaseFloatMatrixWriter *  kaldi_writer 
)

Definition at line 27 of file subset-feats.cc.

References SequentialTableReader< Holder >::Done(), KALDI_ASSERT, KALDI_LOG, SequentialTableReader< Holder >::Key(), SequentialTableReader< Holder >::Next(), kaldi::SplitStringToVector(), Input::Stream(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

Referenced by main().

29  {  // Copies only the utterances whose key IS listed in `filename` from kaldi_reader to kaldi_writer.
30  unordered_set<std::string, StringHasher> include_set;  // utterance-ids to keep
31  bool binary;
32  Input ki(filename, &binary);
33  KALDI_ASSERT(!binary);  // the include list is read line by line below, so it must be text
34  std::string line;
35  while (std::getline(ki.Stream(), line)) {
36  std::vector<std::string> split_line;
37  SplitStringToVector(line, " \t\r", true, &split_line);  // split on space/tab/CR, dropping empty fields
38  KALDI_ASSERT(!split_line.empty() &&
39  "Empty line encountered in input from --include option");  // string literal is always true; it only labels the failure
40  include_set.insert(split_line[0]);  // only the first field of the line is used as the key
41  }
42 
43  int32 num_total = 0;  // utterances seen in the input
44  size_t num_success = 0;  // utterances actually written
45  for (; !kaldi_reader->Done(); kaldi_reader->Next(), num_total++) {
46  if (include_set.count(kaldi_reader->Key()) > 0) {  // key is in the include list -> copy it
47  kaldi_writer->Write(kaldi_reader->Key(), kaldi_reader->Value());
48  num_success++;
49  }
50  }
51 
52  KALDI_LOG << " Wrote " << num_success << " out of " << num_total
53  << " utterances.";
54  return (num_success != 0 ? 0 : 1);  // exit status: 0 if at least one utterance was written, else 1
55 }
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define KALDI_LOG
Definition: kaldi-error.h:153

◆ main()

int main ( int  argc,
char *  argv[] 
)

Definition at line 87 of file subset-feats.cc.

References CopyExcludedFeats(), CopyIncludedFeats(), SequentialTableReader< Holder >::Done(), ParseOptions::GetArg(), KALDI_ASSERT, KALDI_ERR, SequentialTableReader< Holder >::Key(), rnnlm::n, SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), ParseOptions::PrintUsage(), ParseOptions::Read(), ParseOptions::Register(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

87  {  // Entry point: copies the first n feature matrices, or an --include / --exclude filtered subset.
88  try {
89  using namespace kaldi;
90 
91  const char *usage =
92  "Copy a subset of features (by default, the first n feature files)\n"
93  "Usually used where only a small amount of data is needed\n"
94  "Note: if you want a specific subset, it's usually best to\n"
95  "filter the original .scp file with utils/filter_scp.pl\n"
96  "(possibly with the --exclude option). The --include and --exclude\n"
97  "options of this program are intended for specialized uses.\n"
98  "The --include and --exclude options are mutually exclusive, \n"
99  "and both cause the --n option to be ignored.\n"
100  "Usage: subset-feats [options] <in-rspecifier> <out-wspecifier>\n"
101  "e.g.: subset-feats --n=10 ark:- ark:-\n"
102  "or: subset-feats --include=include_uttlist ark:- ark:-\n"
103  "or: subset-feats --exclude=exclude_uttlist ark:- ark:-\n"
104  "See also extract-feature-segments, select-feats, subsample-feats\n";
105 
106  ParseOptions po(usage);
107 
108  int32 n = 10;  // default; the checks below also treat 10 as "--n was not given"
109  std::string include_rxfilename;
110  std::string exclude_rxfilename;
111  po.Register("n", &n, "If nonnegative, copy the first n feature files.");
112  po.Register("include", &include_rxfilename,
113  "Text file, the first field of each"
114  " line being interpreted as an "
115  "utterance-id whose features will be included");
116  po.Register("exclude", &exclude_rxfilename,
117  "Text file, the first field of each "
118  "line being interpreted as an utterance-id"
119  " whose features will be excluded");
120 
121  po.Read(argc, argv);
122 
123  if (po.NumArgs() != 2) {  // exactly <in-rspecifier> <out-wspecifier> are required
124  po.PrintUsage();
125  exit(1);
126  }
127 
128  std::string rspecifier = po.GetArg(1);
129  std::string wspecifier = po.GetArg(2);
130 
131  KALDI_ASSERT(n >= 0);  // negative n is rejected here; n == 0 is rejected further down
132 
133  BaseFloatMatrixWriter kaldi_writer(wspecifier);
134  SequentialBaseFloatMatrixReader kaldi_reader(rspecifier);
135 
136  if (include_rxfilename != "") {
137  if (n != 10) {  // NOTE(review): usage text says --n is ignored here, but a non-default n is an error; an explicit --n=10 is not detected
138  KALDI_ERR << "Should not have both --include and --n option!";
139  }
140  if (exclude_rxfilename != "") {
141  KALDI_ERR << "should not have both --exclude and --include option!";
142  }
143  return CopyIncludedFeats(include_rxfilename,
144  &kaldi_reader, &kaldi_writer);
145  }
146  else if (exclude_rxfilename != "") {
147  if (n != 10) {  // NOTE(review): same default-value check as in the --include branch
148  KALDI_ERR << "Should not have both --exclude and --n option!";
149  }
150  return CopyExcludedFeats(exclude_rxfilename,
151  &kaldi_reader, &kaldi_writer);
152  }
153 
154  if (n == 0) {
155  KALDI_ERR << "Invalid option --n=0. Should be at least 1";
156  }
157 
158  int32 k = 0;  // number of feature matrices copied so far
159  for (; !kaldi_reader.Done() && k < n; kaldi_reader.Next(), k++)  // stop at end of input or after n entries
160  kaldi_writer.Write(kaldi_reader.Key(), kaldi_reader.Value());
161 
162  return 0;
163  } catch(const std::exception &e) {  // presumably where KALDI_ERR / KALDI_ASSERT failures end up - TODO confirm
164  std::cerr << e.what();
165  return -1;
166  }
167 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
struct rnnlm::@11::@12 n
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
int32 CopyExcludedFeats(std::string filename, SequentialBaseFloatMatrixReader *kaldi_reader, BaseFloatMatrixWriter *kaldi_writer)
Definition: subset-feats.cc:57
int32 CopyIncludedFeats(std::string filename, SequentialBaseFloatMatrixReader *kaldi_reader, BaseFloatMatrixWriter *kaldi_writer)
Definition: subset-feats.cc:27