compute-vad-from-frame-likes.cc
Go to the documentation of this file.
1 // ivectorbin/compute-vad-from-frame-likes.cc
2 
3 // Copyright 2015 David Snyder
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABILITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include "base/kaldi-common.h"
21 #include "util/common-utils.h"
22 #include "matrix/kaldi-matrix.h"
23 #include "util/parse-options.h"
24 #include "util/stl-utils.h"
25 
26 namespace kaldi {
27 
42 void PrepareMap(const std::string &map_rxfilename, int32 num_classes,
43  unordered_map<int32, int32> *map) {
44  Input map_input(map_rxfilename);
45  for (int32 i = 0; i < num_classes; i++)
46  (*map)[i] = i;
47 
48  if (!map_rxfilename.empty()) {
49  std::string line;
50  while (std::getline(map_input.Stream(), line)) {
51  if (line.size() == 0) continue;
52  int32 start = line.find_first_not_of(" \t");
53  int32 end = line.find_first_of('#'); // Ignore trailing comments
54  if (start == std::string::npos || start == end) continue;
55  end = line.find_last_not_of(" \t", end - 1);
56  KALDI_ASSERT(end >= start);
57  std::vector<std::string> fields;
58  SplitStringToVector(line.substr(start, end - start + 1),
59  " \t\n\r", true, &fields);
60  if (fields.size() != 2) {
61  KALDI_ERR << "Bad line. Expected two fields, got: "
62  << line;
63  }
64  (*map)[std::atoi(fields[0].c_str())] = std::atoi(fields[1].c_str());
65  }
66  }
67 
68  if (map->size() > num_classes)
69  KALDI_ERR << "Map table has " << map->size() << " classes. "
70  << "Expected " << num_classes << " or fewer";
71 }
72 
80 void PreparePriors(const std::string &priors_str, int32 num_classes,
81  std::vector<BaseFloat> *priors) {
82  if (priors_str.empty()) {
83  for (int32 i = 0; i < num_classes; i++)
84  priors->push_back(log(1.0/num_classes)); // Uniform priors
85  } else {
86  SplitStringToFloats(priors_str, ",", false, priors);
87  for (int32 i = 0; i < priors->size(); i++)
88  (*priors)[i] = log((*priors)[i]);
89  }
90 
91  if (priors->size() != num_classes)
92  KALDI_ERR << priors->size() << " priors specified. Expected "
93  << num_classes;
94 }
95 
96 }
97 
98 int main(int argc, char *argv[]) {
99  using namespace kaldi;
100  typedef kaldi::int32 int32;
101  try {
102  const char *usage =
103  "This program computes frame-level voice activity decisions from a\n"
104  "set of input frame-level log-likelihoods. Usually, these\n"
105  "log-likelihoods are the output of fgmm-global-get-frame-likes.\n"
106  "Frames are assigned labels according to the class for which the\n"
107  "log-likelihood (optionally weighted by a prior) is maximal. The\n"
108  "class labels are determined by the order of inputs on the command\n"
109  "line. See options for more details.\n"
110  "\n"
111  "Usage: compute-vad-from-frame-likes [options] <likes-rspecifier-1>\n"
112  " ... <likes-rspecifier-n> <vad-wspecifier>\n"
113  "e.g.: compute-vad-from-frame-likes --map=label_map.txt\n"
114  " scp:likes1.scp scp:likes2.scp ark:vad.ark\n"
115  "See also: fgmm-global-get-frame-likes, compute-vad, merge-vads\n";
116 
117  ParseOptions po(usage);
118  std::string map_rxfilename;
119  std::string priors_str;
120 
121  po.Register("map", &map_rxfilename, "Table that defines the frame-level "
122  "labels. For each row, the first field is the zero-based index of the "
123  "input likelihood archive and the second field is the associated "
124  "integer label.");
125 
126  po.Register("priors", &priors_str, "Comma-separated list that specifies "
127  "the priors for each class. The order of the floats corresponds to "
128  "the index of the input archives. E.g., --priors=0.5,0.2,0.3");
129 
130  po.Read(argc, argv);
131  if (po.NumArgs() < 3) {
132  po.PrintUsage();
133  exit(1);
134  }
135 
136  unordered_map<int32, int32> map;
137  std::vector<BaseFloat> priors;
138  int32 num_classes = po.NumArgs() - 1;
139  PrepareMap(map_rxfilename, num_classes, &map);
140  PreparePriors(priors_str, num_classes, &priors);
141 
142  SequentialBaseFloatVectorReader first_reader(po.GetArg(1));
143  std::vector<RandomAccessBaseFloatVectorReader *> readers;
144  std::string vad_wspecifier = po.GetArg(po.NumArgs());
145  BaseFloatVectorWriter vad_writer(vad_wspecifier);
146 
147  for (int32 i = 2; i < po.NumArgs(); i++) {
150  readers.push_back(reader);
151  }
152 
153  int32 num_done = 0, num_err = 0;
154  for (;!first_reader.Done(); first_reader.Next()) {
155  std::string utt = first_reader.Key();
156  Vector<BaseFloat> like(first_reader.Value());
157  int32 like_dim = like.Dim();
158  std::vector<Vector<BaseFloat> > likes;
159  likes.push_back(like);
160  if (like_dim == 0) {
161  KALDI_WARN << "Empty vector for utterance " << utt;
162  num_err++;
163  continue;
164  }
165  for (int32 i = 0; i < num_classes - 1; i++) {
166  if (!readers[i]->HasKey(utt)) {
167  KALDI_WARN << "No vector for utterance " << utt;
168  num_err++;
169  continue;
170  }
171  Vector<BaseFloat> other_like(readers[i]->Value(utt));
172  if (like_dim != other_like.Dim()) {
173  KALDI_WARN << "Dimension mismatch in input vectors in " << utt
174  << ": " << like_dim << " vs. " << other_like.Dim();
175  num_err++;
176  continue;
177  }
178  likes.push_back(other_like);
179  }
180 
181  Vector<BaseFloat> vad_result(like_dim);
182  for (int32 i = 0; i < like.Dim(); i++) {
183  int32 max_indx = 0;
184  BaseFloat max_post = likes[0](i) + priors[0];
185  for (int32 j = 0; j < num_classes; j++) {
186  BaseFloat other_post = likes[j](i) + priors[j];
187  if (other_post > max_post) {
188  max_indx = j;
189  max_post = other_post;
190  }
191  }
192  unordered_map<int32, int32>::const_iterator iter = map.find(max_indx);
193  if (iter == map.end()) {
194  KALDI_ERR << "Missing label " << max_indx << " in map";
195  } else {
196  vad_result(i) = iter->second;
197  }
198  }
199  vad_writer.Write(utt, vad_result);
200  num_done++;
201  }
202 
203  for (int32 i = 0; i < num_classes - 1; i++)
204  delete readers[i];
205 
206  KALDI_LOG << "Applied frame-level likelihood-based voice activity "
207  << "detection; processed " << num_done
208  << " utterances successfully; " << num_err
209  << " had empty features.";
210  return (num_done != 0 ? 0 : 1);
211  } catch(const std::exception &e) {
212  std::cerr << e.what();
213  return -1;
214  }
215 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool SplitStringToFloats(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< F > *out)
Definition: text-utils.cc:30
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
int main(int argc, char *argv[])
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void PreparePriors(const std::string &priors_str, int32 num_classes, std::vector< BaseFloat > *priors)
PreparePriors creates a table specifying the priors for each class.
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
std::istream & Stream()
Definition: kaldi-io.cc:826
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
void PrepareMap(const std::string &map_rxfilename, int32 num_classes, unordered_map< int32, int32 > *map)
PrepareMap creates a map that specifies the mapping between the input and output class labels...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
int NumArgs() const
Number of positional parameters (c.f. argc-1).
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define KALDI_LOG
Definition: kaldi-error.h:153
RandomAccessTableReader< KaldiObjectHolder< Vector< BaseFloat > > > RandomAccessBaseFloatVectorReader
Definition: table-types.h:62