voice-activity-detection.h
Go to the documentation of this file.
1 // ivector/voice-activity-detection.h
2 
3 // Copyright 2013 Daniel Povey
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABILITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 
21 #ifndef KALDI_IVECTOR_VOICE_ACTIVITY_DETECTION_H_
22 #define KALDI_IVECTOR_VOICE_ACTIVITY_DETECTION_H_
23 
24 #include <cassert>
25 #include <cstdlib>
26 #include <string>
27 #include <vector>
28 
29 #include "matrix/matrix-lib.h"
30 #include "util/common-utils.h"
31 #include "base/kaldi-error.h"
32 
33 namespace kaldi {
34 
35 /*
36  Note: we may move the location of this file in the future, e.g. to feat/.
37  This code is geared toward speaker-id applications and is not suitable
38  for automatic speech recognition (ASR) because it makes independent
39  decisions for each frame without imposing any notion of continuity.
40 */
41 
47 
// Default constructor: sets every VAD option to its default value
// (vad_energy_threshold = 5.0, vad_energy_mean_scale = 0.5,
// vad_frames_context = 0, vad_proportion_threshold = 0.6).
// NOTE(review): the member declarations are not visible in this listing --
// confirm that the initializer order below matches their declaration order.
48  VadEnergyOptions(): vad_energy_threshold(5.0),
49  vad_energy_mean_scale(0.5),
50  vad_frames_context(0),
51  vad_proportion_threshold(0.6) { }
// Registers the four VAD options with the given options interface, binding
// each option name to the corresponding member of this object through a
// pointer, together with its help text:
//   vad-energy-threshold, vad-energy-mean-scale,
//   vad-frames-context, vad-proportion-threshold.
// As the help strings state, the effective energy threshold is
// s*m + vad-energy-threshold, where s is vad-energy-mean-scale and m is the
// mean log-energy of the file.
// NOTE(review): opts is dereferenced without a null check; presumably callers
// always pass a valid pointer -- confirm.
52  void Register(OptionsItf *opts) {
53  opts->Register("vad-energy-threshold", &vad_energy_threshold,
54  "Constant term in energy threshold for MFCC0 for VAD (also see "
55  "--vad-energy-mean-scale)");
56  opts->Register("vad-energy-mean-scale", &vad_energy_mean_scale,
57  "If this is set to s, to get the actual threshold we "
58  "let m be the mean log-energy of the file, and use "
59  "s*m + vad-energy-threshold");
60  opts->Register("vad-frames-context", &vad_frames_context,
61  "Number of frames of context on each side of central frame, "
62  "in window for which energy is monitored");
63  opts->Register("vad-proportion-threshold", &vad_proportion_threshold,
64  "Parameter controlling the proportion of frames within "
65  "the window that need to have more energy than the "
66  "threshold");
67  }
68 };
69 
70 
// Computes the voice-activity vector for a file: an entry is 1 if the frame
// is judged as voiced and 0 otherwise.
//   opts            VAD configuration (energy threshold, mean scale, frame
//                   context, proportion threshold).
//   input_features  Feature matrix for the file. NOTE(review): presumably one
//                   row per frame with the energy term (MFCC0) in the first
//                   column -- confirm against the implementation.
//   output_voiced   Output vector receiving the 0/1 decisions.
//                   NOTE(review): presumably resized to one entry per frame --
//                   confirm against the implementation.
80 void ComputeVadEnergy(const VadEnergyOptions &opts,
81  const MatrixBase<BaseFloat> &input_features,
82  Vector<BaseFloat> *output_voiced);
83 
84 
85 } // namespace kaldi
86 
87 
88 
89 #endif // KALDI_IVECTOR_VOICE_ACTIVITY_DETECTION_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
kaldi::int32 int32
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
void Register(OptionsItf *opts)
A class representing a vector.
Definition: kaldi-vector.h:406
void ComputeVadEnergy(const VadEnergyOptions &opts, const MatrixBase< BaseFloat > &feats, Vector< BaseFloat > *output_voiced)
Compute voice-activity vector for a file: 1 if we judge the frame as voiced, 0 otherwise.