merge-vads.cc
Go to the documentation of this file.
1 // ivectorbin/merge-vads.cc
2 
3 // Copyright 2015 David Snyder
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include "base/kaldi-common.h"
21 #include "util/common-utils.h"
22 #include "matrix/kaldi-matrix.h"
23 #include "util/stl-utils.h"
24 
25 namespace kaldi {
26 
45 void PrepareMap(const std::string map_rxfilename,
46  unordered_map<std::pair<int32, int32>, int32, PairHasher<int32> > *map) {
47  Input map_input(map_rxfilename);
48 
49  // If a map file isn't specified, provide an obvious mapping. The
50  // following mapping assumes "0" corresponds to nonspeech and "1"
51  // corresponds to speech. The combination of two VAD decisions only
52  // results in a decision of speech if both input frames are
53  // classified as speech.
54  if (map_rxfilename.empty()) {
55  (*map)[std::pair<int32, int32>(0, 0)] = 0;
56  (*map)[std::pair<int32, int32>(0, 1)] = 0;
57  (*map)[std::pair<int32, int32>(1, 0)] = 0;
58  (*map)[std::pair<int32, int32>(1, 1)] = 1;
59  } else {
60  std::string line;
61  while (std::getline(map_input.Stream(), line)) {
62  if (line.size() == 0) continue;
63  int32 start = line.find_first_not_of(" \t");
64  int32 end = line.find_first_of('#');
65  if (start == std::string::npos || start == end) continue;
66  end = line.find_last_not_of(" \t", end - 1);
67  KALDI_ASSERT(end >= start);
68  std::vector<std::string> fields;
69  SplitStringToVector(line.substr(start, end - start + 1),
70  " \t\n\r", true, &fields);
71  if (fields.size() != 3) {
72  KALDI_ERR << "Bad line. Expected three fields, got: "
73  << line;
74  }
75  int32 label1 = std::atoi(fields[0].c_str()),
76  label2 = std::atoi(fields[1].c_str()),
77  result_label = std::atoi(fields[2].c_str());
78  (*map)[std::pair<int32, int32>(label1, label2)] = result_label;
79  }
80  }
81 }
82 
83 }
84 
85 int main(int argc, char *argv[]) {
86  using namespace kaldi;
87  typedef kaldi::int32 int32;
88  try {
89  const char *usage =
90  "This program merges two archives of per-frame weights representing\n"
91  "voice activity decisions. By default, the program assumes that the\n"
92  "input vectors consist of floats that are 0.0 if a frame is judged\n"
93  "as nonspeech and 1.0 if it is considered speech. The default\n"
94  "behavior produces a frame-level decision of 1.0 if both input frames\n"
95  "are 1.0, and 0.0 otherwise. Additional classes (e.g., 2.0 for music)\n"
96  "can be handled using the \"map\" option.\n"
97  "\n"
98  "Usage: merge-vads [options] <vad-rspecifier-1> <vad-rspecifier-2>\n"
99  " <vad-wspecifier>\n"
100  "e.g.: merge-vads [options] scp:vad_energy.scp scp:vad_gmm.scp\n"
101  " ark:vad.ark\n"
102  "See also: compute-vad-from-frame-likes, compute-vad, ali-to-post,\n"
103  "post-to-weights\n";
104 
105  ParseOptions po(usage);
106  std::string map_rxfilename;
107  po.Register("map", &map_rxfilename, "This table specifies a mapping "
108  "between the labels of the frame-level decisions in the first and "
109  "second input archives to the integer output label.");
110 
111  po.Read(argc, argv);
112  if (po.NumArgs() != 3) {
113  po.PrintUsage();
114  exit(1);
115  }
116 
117  unordered_map<std::pair<int32, int32>, int32, PairHasher<int32> > map;
118  PrepareMap(map_rxfilename, &map);
119  SequentialBaseFloatVectorReader first_vad_reader(po.GetArg(1));
120  RandomAccessBaseFloatVectorReader second_vad_reader(po.GetArg(2));
121  BaseFloatVectorWriter vad_writer(po.GetArg(3));
122 
123  int32 num_done = 0, num_err = 0;
124  for (;!first_vad_reader.Done(); first_vad_reader.Next()) {
125  std::string utt = first_vad_reader.Key();
126  Vector<BaseFloat> vad1(first_vad_reader.Value());
127  if (!second_vad_reader.HasKey(utt)) {
128  KALDI_WARN << "No vector for utterance " << utt;
129  num_err++;
130  continue;
131  }
132  Vector<BaseFloat> vad2(second_vad_reader.Value(utt));
133  if (vad1.Dim() != vad2.Dim()) {
134  KALDI_WARN << "VAD length mismatch for utterance " << utt;
135  num_err++;
136  continue;
137  }
138  Vector<BaseFloat> vad_result(vad1.Dim());
139  for (int32 i = 0; i < vad1.Dim(); i++) {
140  std::pair<int32, int32> key(static_cast<int32>(vad1(i)),
141  static_cast<int32>(vad2(i)));
142  unordered_map<std::pair<int32, int32>, int32,
143  PairHasher<int32> >::const_iterator iter = map.find(key);
144  if (iter == map.end()) {
145  KALDI_ERR << "Map is missing combination "
146  << vad1(i) << " and " << vad2(i);
147  } else {
148  vad_result(i) = iter->second;
149  }
150  }
151 
152  vad_writer.Write(utt, vad_result);
153  num_done++;
154  }
155  KALDI_LOG << "Merged voice activity detection decisions; "
156  << "processed " << num_done << " utterances successfully; "
157  << num_err << " had errors.";
158  return (num_done != 0 ? 0 : 1);
159  } catch(const std::exception &e) {
160  std::cerr << e.what();
161  return -1;
162  }
163 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
int main(int argc, char *argv[])
Definition: merge-vads.cc:85
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
std::istream & Stream()
Definition: kaldi-io.cc:826
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
void PrepareMap(const std::string &map_rxfilename, int32 num_classes, unordered_map< int32, int32 > *map)
PrepareMap creates a map that specifies the mapping between the input and output class labels...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
int NumArgs() const
Number of positional parameters (c.f. argc-1).
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define KALDI_LOG
Definition: kaldi-error.h:153
A hashing function-object for pairs of ints.
Definition: stl-utils.h:235