35 interpolator_factor(1.0e-05),
36 max_voicing_prob(0.9),
37 max_pitch_change_per_frame(10.0) { }
39 opts->
Register(
"pitch-interval", &pitch_interval,
"Frequency interval in Hz, used " 40 "for the pitch interpolation and smoothing algorithm.");
41 opts->
Register(
"interpolator-factor", &interpolator_factor,
"Factor affecting the " 42 "interpolation algorithm; setting it closer to zero will cause " 43 "it to follow the measured pitch more faithfully but less " 45 opts->
Register(
"max-voicing-prob", &max_voicing_prob,
"Probability of voicing the " 46 "algorithm uses as the observed p(voicing) approaches 1; having " 47 "value <1 allows it to interpolate even if p(voicing) = 1");
48 opts->
Register(
"max-pitch-change-per-frame", &max_pitch_change_per_frame,
49 "This value should be set large enough to no longer affect the " 50 "results, but the larger it is the slower the algorithm will be.");
53 KALDI_ASSERT(pitch_interval > 0.0 && pitch_interval < 20.0 &&
54 interpolator_factor > 0.0 && interpolator_factor < 1.0 &&
55 max_voicing_prob <= 1.0 && max_voicing_prob >= 0.5 &&
56 max_pitch_change_per_frame > 2.0 * pitch_interval);
67 num_frames_changed(0) { }
69 BaseFloat zero_percent = num_frames_zero * 100.0 / num_frames_tot,
70 changed_percent = num_frames_changed * 100.0 / num_frames_tot;
71 KALDI_LOG <<
"Over " << num_frames_tot <<
" frames, " 72 << zero_percent <<
"% were zero at input, and " 73 << changed_percent <<
"% were not zero but were changed.";
86 Backtrace(mat, stats);
93 BaseFloat min_pitch = 1.0e+10, max_pitch = 0.0;
94 pitch_.resize(num_frames_);
95 p_voicing_.resize(num_frames_);
96 for (
int32 f = 0; f < num_frames_; f++) {
97 BaseFloat p_voicing = mat(f, 0), pitch = mat(f, 1);
98 p_voicing *= opts_.max_voicing_prob;
102 if (pitch < min_pitch) min_pitch = pitch;
103 if (pitch > max_pitch) max_pitch = pitch;
105 p_voicing_[f] = p_voicing;
107 if (max_pitch == 0.0) {
111 if (max_pitch <= min_pitch + (2.0 * pitch_interval)) {
116 num_pitches_ = floor((max_pitch - min_pitch) / pitch_interval + 0.5) + 2;
118 min_pitch_.resize(num_frames_);
119 for (
int32 f = 0; f < num_frames_; f++) {
120 min_pitch_[f] = min_pitch - pitch_interval *
RandUniform();
128 int32 int_pitch = floor((pitch - min_pitch_[f]) / pitch_interval + 0.5);
129 KALDI_ASSERT(int_pitch >= 0 && int_pitch < num_pitches_);
130 pitch_[f] = int_pitch;
139 BaseFloat constant_prob = (1.0 - p_voicing_[t]) * opts_.interpolator_factor,
140 specified_prob = p_voicing_[t] + constant_prob;
143 log_ratio =
Log(specified_prob / constant_prob);
144 log_alpha_.Add(log_constant_prob);
146 log_alpha_(pitch_[t]) += log_ratio;
152 log_alpha_.AddVec(0.01, temp_rand);
163 back_pointers_[t].resize(num_pitches_);
169 int32 K = floor(opts_.max_pitch_change_per_frame / pitch_interval + 0.5);
171 for (
int32 p = 0; p < num_pitches_; p++) {
172 int32 min_prev_p = p - K, max_prev_p = p + K;
173 if (min_prev_p < 0) min_prev_p = 0;
174 if (max_prev_p >= num_pitches_) max_prev_p = num_pitches_ - 1;
176 int32 best_prev_p = -1;
177 for (
int32 prev_p = min_prev_p; prev_p <= max_prev_p; prev_p++) {
179 (min_pitch_[t] + p * pitch_interval);
180 BaseFloat this_logprob = prev_log_alpha_(prev_p)
181 - 0.5 * delta_pitch * delta_pitch;
182 if (this_logprob > best_logprob) {
183 best_logprob = this_logprob;
184 best_prev_p = prev_p;
187 back_pointers_[t][p] = best_prev_p;
188 log_alpha_(p) = best_logprob;
200 back_pointers_.resize(num_frames_);
202 log_alpha_.Resize(num_pitches_);
203 prev_log_alpha_.Resize(num_pitches_);
206 for (
int32 t = 1; t < num_frames_; t++) {
207 log_alpha_.Swap(&prev_log_alpha_);
208 ComputeTransitionProb(t);
214 BaseFloat *p_begin = log_alpha_.Data(), *p_end = p_begin + num_pitches_,
215 *p_best = std::max_element(p_begin, p_end);
217 std::vector<int32> best_pitch(num_frames_);
218 int32 best_p = p_best - p_begin;
219 for (
int32 t = num_frames_ - 1; t >= 0; t--) {
225 BaseFloat pitch = min_pitch_[t] + pitch_interval * best_p;
226 (*mat)(t, 1) = pitch;
229 best_p = back_pointers_[t][best_p];
256 while (i < num_frames) {
257 if(features(i, 1) == 0.0) {
260 while( (features(end, 1)) == 0.0 && (end < num_frames))
262 BaseFloat end_value = -1, start_value = -1;
263 if (end < num_frames) end_value = features(end, 1);
264 if (start > 0) start_value = features(start, 1);
266 if (start_value < 0 && end_value < 0) {
277 if (start_value < 0.0) start_value = 0.9 * end_value;
278 if (end_value < 0.0) end_value = 0.9 * start_value;
280 for(
int k = start + 1; k < end; k++)
281 features(k, 1) = start_value +
282 (end_value - start_value) / (end - start) * (k - start);
292 int main(
int argc,
char *argv[]) {
294 using namespace kaldi;
296 "This is a rather special-purpose program which processes 2-dimensional\n" 297 "features consisting of (prob-of-voicing, pitch). By default we do model-based\n" 298 "pitch smoothing and interpolation (see code), or if --linear-interpolation=true,\n" 299 "just linear interpolation across gaps where pitch == 0 (not predicted).\n" 300 "Usage: interpolate-pitch [options...] <feats-rspecifier> <feats-wspecifier>\n";
306 bool linear_interpolation =
false;
309 opts.
Register(
"linear-interpolation",
310 &linear_interpolation,
"If true, just do simple linear " 311 "interpolation across gaps (else, model-based)");
315 opts.
Read(argc, argv);
322 std::string input_rspecifier = opts.
GetArg(1);
323 std::string output_wspecifier = opts.
GetArg(2);
328 if (!kaldi_writer.
Open(output_wspecifier))
329 KALDI_ERR <<
"Could not initialize output with wspecifier " 330 << output_wspecifier;
332 int32 num_done = 0, num_err = 0;
335 for (; !reader.
Done(); reader.
Next()) {
336 std::string utt = reader.
Key();
338 int num_frames = features.
NumRows();
340 if (num_frames == 0 && features.
NumCols() != 2) {
352 kaldi_writer.
Write(utt, features);
355 if (num_done % 10 == 0)
356 KALDI_LOG <<
"Processed " << num_done <<
" utterances";
357 KALDI_VLOG(2) <<
"Processed features for key " << utt;
359 if (!linear_interpolation) stats.
Print();
360 KALDI_LOG <<
"Done " << num_done <<
" utterances, " << num_err
362 return (num_done != 0 ? 0 : 1);
363 }
catch(
const std::exception &e) {
364 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
bool Open(const std::string &wspecifier)
BaseFloat max_pitch_change_per_frame
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
PitchInterpolatorOptions()
BaseFloat interpolator_factor
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
std::vector< BaseFloat > p_voicing_
void ComputeTransitionProb(int32 t)
std::vector< BaseFloat > min_pitch_
A templated class for writing objects to an archive or script file; see The Table concept...
void Register(OptionsItf *opts)
void Backtrace(Matrix< BaseFloat > *mat, PitchInterpolatorStats *stats)
void LinearlyInterpolatePitch(Matrix< BaseFloat > *mat)
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
int main(int argc, char *argv[])
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
BaseFloat max_voicing_prob
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
void SetRandn()
Set vector to random normally-distributed noise.
PitchInterpolator(const PitchInterpolatorOptions &opts, Matrix< BaseFloat > *mat, PitchInterpolatorStats *stats)
Vector< BaseFloat > prev_log_alpha_
void InitValues(const Matrix< BaseFloat > &mat)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
void MultiplyObsProb(int32 t)
A class representing a vector.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
const PitchInterpolatorOptions & opts_
std::vector< int32 > pitch_
Vector< BaseFloat > log_alpha_
std::vector< std::vector< int32 > > back_pointers_