25 using namespace kaldi;
30 unordered_set<std::string, StringHasher> include_set;
32 Input ki(filename, &binary);
35 while (std::getline(ki.
Stream(), line)) {
36 std::vector<std::string> split_line;
39 "Empty line encountered in input from --include option");
40 include_set.insert(split_line[0]);
44 size_t num_success = 0;
45 for (; !kaldi_reader->
Done(); kaldi_reader->
Next(), num_total++) {
46 if (include_set.count(kaldi_reader->
Key()) > 0) {
47 kaldi_writer->
Write(kaldi_reader->
Key(), kaldi_reader->
Value());
52 KALDI_LOG <<
" Wrote " << num_success <<
" out of " << num_total
54 return (num_success != 0 ? 0 : 1);
60 unordered_set<std::string, StringHasher> exclude_set;
62 Input ki(filename, &binary);
65 while (std::getline(ki.
Stream(), line)) {
66 std::vector<std::string> split_line;
69 "Empty line encountered in input from --include option");
70 exclude_set.insert(split_line[0]);
74 size_t num_success = 0;
75 for (; !kaldi_reader->
Done(); kaldi_reader->
Next(), num_total++) {
76 if (exclude_set.count(kaldi_reader->
Key()) == 0) {
77 kaldi_writer->
Write(kaldi_reader->
Key(), kaldi_reader->
Value());
82 KALDI_LOG <<
" Wrote " << num_success <<
" out of " << num_total
84 return (num_success != 0 ? 0 : 1);
87 int main(
int argc,
char *argv[]) {
89 using namespace kaldi;
92 "Copy a subset of features (by default, the first n feature files)\n" 93 "Usually used where only a small amount of data is needed\n" 94 "Note: if you want a specific subset, it's usually best to\n" 95 "filter the original .scp file with utils/filter_scp.pl\n" 96 "(possibly with the --exclude option). The --include and --exclude\n" 97 "options of this program are intended for specialized uses.\n" 98 "The --include and --exclude options are mutually exclusive, \n" 99 "and both cause the --n option to be ignored.\n" 100 "Usage: subset-feats [options] <in-rspecifier> <out-wspecifier>\n" 101 "e.g.: subset-feats --n=10 ark:- ark:-\n" 102 "or: subset-feats --include=include_uttlist ark:- ark:-\n" 103 "or: subset-feats --exclude=exclude_uttlist ark:- ark:-\n" 104 "See also extract-feature-segments, select-feats, subsample-feats\n";
109 std::string include_rxfilename;
110 std::string exclude_rxfilename;
111 po.
Register(
"n", &n,
"If nonnegative, copy the first n feature files.");
112 po.
Register(
"include", &include_rxfilename,
113 "Text file, the first field of each" 114 " line being interpreted as an " 115 "utterance-id whose features will be included");
116 po.
Register(
"exclude", &exclude_rxfilename,
117 "Text file, the first field of each " 118 "line being interpreted as an utterance-id" 119 " whose features will be excluded");
128 std::string rspecifier = po.
GetArg(1);
129 std::string wspecifier = po.
GetArg(2);
136 if (include_rxfilename !=
"") {
138 KALDI_ERR <<
"Should not have both --include and --n option!";
140 if (exclude_rxfilename !=
"") {
141 KALDI_ERR <<
"should not have both --exclude and --include option!";
144 &kaldi_reader, &kaldi_writer);
146 else if (exclude_rxfilename !=
"") {
148 KALDI_ERR <<
"Should not have both --exclude and --n option!";
151 &kaldi_reader, &kaldi_writer);
155 KALDI_ERR <<
"Invalid option --n=0. Should be at least 1";
159 for (; !kaldi_reader.
Done() && k <
n; kaldi_reader.
Next(), k++)
160 kaldi_writer.
Write(kaldi_reader.
Key(), kaldi_reader.
Value());
163 }
catch(
const std::exception &e) {
164 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
int main(int argc, char *argv[])
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
A templated class for reading objects sequentially from an archive or script file; see The Table concept...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
int NumArgs() const
Number of positional parameters (cf. argc-1).
#define KALDI_ASSERT(cond)
int32 CopyExcludedFeats(std::string filename, SequentialBaseFloatMatrixReader *kaldi_reader, BaseFloatMatrixWriter *kaldi_writer)
int32 CopyIncludedFeats(std::string filename, SequentialBaseFloatMatrixReader *kaldi_reader, BaseFloatMatrixWriter *kaldi_writer)