compute-vad.cc File Reference
Include dependency graph for compute-vad.cc:

Go to the source code of this file.

Functions

int main (int argc, char *argv[])
 

Function Documentation

◆ main()

int main ( int  argc,
char *  argv[] 
)

Definition at line 27 of file compute-vad.cc.

References kaldi::ComputeVadEnergy(), SequentialTableReader< Holder >::Done(), ParseOptions::GetArg(), KALDI_LOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), ParseOptions::PrintUsage(), ParseOptions::Read(), VadEnergyOptions::Register(), ParseOptions::Register(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

// main(): command-line entry point of compute-vad. Reads feature matrices
// from the table given as argument 1, runs ComputeVadEnergy() on each
// utterance, and writes the resulting per-frame decision vector to the table
// given as argument 2.
// Returns 0 if at least one utterance had a non-zero decision sum, 1 if none
// did, and -1 if a std::exception was caught.
27  {
28  try {
29  using namespace kaldi;
30  using kaldi::int32;
31 
32  const char *usage =
33  "This program reads input features and writes out, for each utterance,\n"
34  "a vector of floats that are 1.0 if we judge the frame voiced and 0.0\n"
35  "otherwise. The algorithm is very simple and is based on thresholding\n"
36  "the log mel energy (and taking the consensus of threshold decisions\n"
37  "within a window centered on the current frame). See the options for\n"
38  "more details, and egs/sid/s1/run.sh for examples; this program is\n"
39  "intended for use in speaker-ID.\n"
40  "\n"
41  "Usage: compute-vad [options] <feats-rspecifier> <vad-wspecifier>\n"
42  "e.g.: compute-vad scp:feats.scp ark:vad.ark\n";
43 
// Register this program's own option plus the VadEnergyOptions options with
// the parser, then parse the command line.
44  ParseOptions po(usage);
45  bool omit_unvoiced_utts = false;
46  po.Register("omit-unvoiced-utts", &omit_unvoiced_utts,
47  "If true, do not write out voicing information for "
48  "utterances that were judged 100% unvoiced.");
49  VadEnergyOptions opts;
50  opts.Register(&po);
51  po.Read(argc, argv);
52 
// Exactly two positional arguments are required; otherwise print the usage
// message and exit with status 1.
53  if (po.NumArgs() != 2) {
54  po.PrintUsage();
55  exit(1);
56  }
57 
58  std::string feat_rspecifier = po.GetArg(1);
59  std::string vad_wspecifier = po.GetArg(2);
60 
61  SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier);
62  BaseFloatVectorWriter vad_writer(vad_wspecifier);
63 
// Counters for the final log messages:
//   num_done      - utterances with a non-zero decision sum
//   num_err       - utterances skipped because their feature matrix had no rows
//   num_unvoiced  - utterances whose decision sum was exactly zero
//   tot_length    - total number of frames over all non-empty utterances
//   tot_decision  - total of the decision values over those frames
64  int32 num_done = 0, num_err = 0;
65  int32 num_unvoiced = 0;
66  double tot_length = 0.0, tot_decision = 0.0;
67 
// Visit every utterance in the input table in sequence.
68  for (;!feat_reader.Done(); feat_reader.Next()) {
69  std::string utt = feat_reader.Key();
// Builds a local Matrix from the reader's current value.
70  Matrix<BaseFloat> feat(feat_reader.Value());
// An utterance with zero rows is counted as an error and skipped; nothing is
// written for it.
71  if (feat.NumRows() == 0) {
72  KALDI_WARN << "Empty feature matrix for utterance " << utt;
73  num_err++;
74  continue;
75  }
// One output element per input frame (row of feat).
76  Vector<BaseFloat> vad_result(feat.NumRows());
77 
78  ComputeVadEnergy(opts, feat, &vad_result);
79 
// A sum of exactly zero means no frame was marked voiced; such an utterance
// is counted in num_unvoiced and NOT in num_done.
80  double sum = vad_result.Sum();
81  if (sum == 0.0) {
82  KALDI_WARN << "No frames were judged voiced for utterance " << utt;
83  num_unvoiced++;
84  } else {
85  num_done++;
86  }
// NOTE(review): vad_result.Sum() is evaluated a second time here; the value
// is already available in `sum`.
87  tot_decision += vad_result.Sum();
88  tot_length += vad_result.Dim();
89 
// Write the result unless the utterance is fully unvoiced and the
// --omit-unvoiced-utts option was set to true.
90  if (!(omit_unvoiced_utts && sum == 0)) {
91  vad_writer.Write(utt, vad_result);
92  }
93  }
94 
95  KALDI_LOG << "Applied energy based voice activity detection; "
96  << "processed " << num_done << " utterances successfully; "
97  << num_err << " had empty features, and " << num_unvoiced
98  << " were completely unvoiced.";
// NOTE(review): if no non-empty utterance was read, tot_length is still 0.0
// and the division below is 0.0 / 0.0 - confirm whether a guard is wanted.
99  KALDI_LOG << "Proportion of voiced frames was "
100  << (tot_decision / tot_length) << " over "
101  << tot_length << " frames.";
// Exit status 0 only when at least one utterance had voiced frames.
// NOTE(review): a run in which every utterance is unvoiced returns 1 even
// though results may have been written (when omit_unvoiced_utts is false).
102  return (num_done != 0 ? 0 : 1);
103  } catch(const std::exception &e) {
// Any std::exception raised inside the try block ends up here: its message is
// printed to stderr (without a trailing newline) and -1 is returned.
104  std::cerr << e.what();
105  return -1;
106  }
107 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
void Register(OptionsItf *opts)
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
#define KALDI_WARN
Definition: kaldi-error.h:150
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_LOG
Definition: kaldi-error.h:153
void ComputeVadEnergy(const VadEnergyOptions &opts, const MatrixBase< BaseFloat > &feats, Vector< BaseFloat > *output_voiced)
Compute voice-activity vector for a file: 1 if we judge the frame as voiced, 0 otherwise.