interpolate-pitch.cc
Go to the documentation of this file.
1 // featbin/interpolate-pitch.cc
2 
3 // Copyright 2013 Bagher BabaAli
4 // Johns Hopkins University (author: Daniel Povey)
5 //
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include "base/kaldi-common.h"
22 #include "util/common-utils.h"
23 
24 namespace kaldi {
25 
27  BaseFloat pitch_interval; // Discretization interval in Hz [affects efficiency]
28  BaseFloat interpolator_factor; // Controls how strongly the algorithm tends to
29  // follow the observed pitch contour versus picking its own path, which will
30  // tend to be closer to a straight line.
31  BaseFloat max_voicing_prob; // p(voicing) used at the top of the range, i.e. when
32  // it was observed as one (probably 0.9 is suitable; a value < 1 lets the
// algorithm depart from the observed pitch even if p(voicing) = 1).

// Defaults: pitch interval 4.0, interpolator factor 1.0e-05, maximum voicing
// probability 0.9, maximum pitch change per frame 10.0.
34  PitchInterpolatorOptions(): pitch_interval(4.0),
35  interpolator_factor(1.0e-05),
36  max_voicing_prob(0.9),
37  max_pitch_change_per_frame(10.0) { }
38  void Register(OptionsItf *opts) {
39  opts->Register("pitch-interval", &pitch_interval, "Frequency interval in Hz, used "
40  "for the pitch interpolation and smoothing algorithm.");
41  opts->Register("interpolator-factor", &interpolator_factor, "Factor affecting the "
42  "interpolation algorithm; setting it closer to zero will cause "
43  "it to follow the measured pitch more faithfully but less "
44  "smoothly");
45  opts->Register("max-voicing-prob", &max_voicing_prob, "Probability of voicing the "
46  "algorithm uses as the observed p(voicing) approaches 1; having "
47  "value <1 allows it to interpolate even if p(voicing) = 1");
48  opts->Register("max-pitch-change-per-frame", &max_pitch_change_per_frame,
49  "This value should be set large enough to no longer affect the "
50  "results, but the larger it is the slower the algorithm will be.");
51  }
// Sanity-checks the option values; the assertion fails unless all hold:
//  - pitch_interval lies strictly between 0 and 20;
//  - interpolator_factor lies strictly between 0 and 1;
//  - max_voicing_prob lies in [0.5, 1];
//  - max_pitch_change_per_frame exceeds twice pitch_interval.
52  void Check() const {
53  KALDI_ASSERT(pitch_interval > 0.0 && pitch_interval < 20.0 &&
54  interpolator_factor > 0.0 && interpolator_factor < 1.0 &&
55  max_voicing_prob <= 1.0 && max_voicing_prob >= 0.5 &&
56  max_pitch_change_per_frame > 2.0 * pitch_interval);
57  }
58 };
59 
62  int64 num_frames_zero; // #frames that were zero in original pitch.
63  int64 num_frames_changed; // #frames that were not zero originally, but
64  // which the algorithm changed.
65 
// All counters start at zero; they are incremented frame by frame during the
// backtrace of each utterance.
66  PitchInterpolatorStats(): num_frames_tot(0), num_frames_zero(0),
67  num_frames_changed(0) { }
68  void Print() {
69  BaseFloat zero_percent = num_frames_zero * 100.0 / num_frames_tot,
70  changed_percent = num_frames_changed * 100.0 / num_frames_tot;
71  KALDI_LOG << "Over " << num_frames_tot << " frames, "
72  << zero_percent << "% were zero at input, and "
73  << changed_percent << "% were not zero but were changed.";
74  }
75 };
76 
78  public:
80  Matrix<BaseFloat> *mat,
81  PitchInterpolatorStats *stats):
82  opts_(opts) {
// All the work happens in this constructor: validate the options, discretize
// the input features, run the Viterbi forward pass, then trace back the best
// path, which overwrites column 1 of *mat and accumulates counts into *stats.
// opts_ is a reference member, so `opts` must outlive this object.
83  opts.Check();
84  InitValues(*mat);
85  Forward();
86  Backtrace(mat, stats);
87  }
88  private:
89  void InitValues(const Matrix<BaseFloat> &mat) {
90  BaseFloat pitch_interval = opts_.pitch_interval;
91  num_frames_ = mat.NumRows();
92  KALDI_ASSERT(mat.NumCols() == 2);
93  BaseFloat min_pitch = 1.0e+10, max_pitch = 0.0;
94  pitch_.resize(num_frames_);
95  p_voicing_.resize(num_frames_);
96  for (int32 f = 0; f < num_frames_; f++) {
97  BaseFloat p_voicing = mat(f, 0), pitch = mat(f, 1);
98  p_voicing *= opts_.max_voicing_prob;
99  if (pitch == 0.0) {
100  p_voicing = 0.0; // complete uncertainty about real pitch.
101  } else {
102  if (pitch < min_pitch) min_pitch = pitch;
103  if (pitch > max_pitch) max_pitch = pitch;
104  }
105  p_voicing_[f] = p_voicing;
106  }
107  if (max_pitch == 0.0) { // No voiced frames at all.
108  min_pitch = 100.0;
109  max_pitch = 100.0;
110  }
111  if (max_pitch <= min_pitch + (2.0 * pitch_interval)) {
112  max_pitch = min_pitch + 2.0 * pitch_interval;
113  } // avoid crashes.
114 
115  // Note: the + 2 here is for edge effects.
116  num_pitches_ = floor((max_pitch - min_pitch) / pitch_interval + 0.5) + 2;
117  KALDI_ASSERT(num_pitches_ >= 3);
118  min_pitch_.resize(num_frames_);
119  for (int32 f = 0; f < num_frames_; f++) {
120  min_pitch_[f] = min_pitch - pitch_interval * RandUniform(); // bottom of
121  // discretization range for each frame is randomly different.
122 
123  BaseFloat pitch = mat(f, 1);
124  if (pitch == 0.0) {
125  pitch_[f] = 0; // This will actually be a don't-care value; we just put in
126  // some value that won't crash the algorithm.
127  } else {
128  int32 int_pitch = floor((pitch - min_pitch_[f]) / pitch_interval + 0.5);
129  KALDI_ASSERT(int_pitch >= 0 && int_pitch < num_pitches_);
130  pitch_[f] = int_pitch;
131  }
132  }
133  }
134 
136  // For the forward computation:
137  // Multiplies the observation probabilities into alpha at time t, i.e. adds
// the observation log-probabilities to log_alpha_.
138  // constant_prob is the constant part that does not depend on the pitch value:
139  BaseFloat constant_prob = (1.0 - p_voicing_[t]) * opts_.interpolator_factor,
140  specified_prob = p_voicing_[t] + constant_prob;
141  // specified_prob adds in the extra probability mass at the observed pitch value.
// NOTE(review): if p_voicing_[t] were exactly 1, constant_prob would be 0 and
// the logs below would be taken of 0 and of a division by 0 -- confirm whether
// max-voicing-prob = 1 is meant to be usable.
142  BaseFloat log_constant_prob = Log(constant_prob),
143  log_ratio = Log(specified_prob / constant_prob);
144  log_alpha_.Add(log_constant_prob); // add log_constant_prob to all pitches at this time.
145 
146  log_alpha_(pitch_[t]) += log_ratio; // corrects this to be like adding
147  // log(specified_prob) to the observed pitch at this time. Note: for a frame
148  // with no observation, p_voicing_[t] is zero, so specified_prob equals
// constant_prob, log_ratio is zero, and this has no effect on the
// don't-care bin pitch_[t] == 0.
149 
150  Vector<BaseFloat> temp_rand(num_pitches_);
151  temp_rand.SetRandn(); // Set to Gaussian noise. Type of noise doesn't really matter.
152  log_alpha_.AddVec(0.01, temp_rand); // We add a small amount of noise to the
153  // observation probabilities; this has the effect of breaking symmetries in
154  // a more random way to overcome certain weirdnesses that could otherwise
155  // happen due to the discretization.
156  }
157 
158  // This function updates log_alpha_, as a function of prev_log_alpha_; it also
159  // updates back_pointers_[t], the best predecessor bin for each bin at time t.
161  KALDI_ASSERT(t > 0);
162  BaseFloat pitch_interval = opts_.pitch_interval;
163  back_pointers_[t].resize(num_pitches_);
164 
165  // Transition log-probability between pitch p' on time t-1 and pitch p on
166  // time t is -0.5 * (p - p')^2, with the pitch measured in Hz. We're doing
167  // Viterbi, so always pick the max over the previous frame's pitch bins.
168  KALDI_ASSERT(t > 0 && t < num_frames_);
169  int32 K = floor(opts_.max_pitch_change_per_frame / pitch_interval + 0.5);
170  // K is max #bins we can move; a kind of pruning, for speed.
171  for (int32 p = 0; p < num_pitches_; p++) {
// Candidate predecessors are the bins within K of p, clipped to the valid range.
172  int32 min_prev_p = p - K, max_prev_p = p + K;
173  if (min_prev_p < 0) min_prev_p = 0;
174  if (max_prev_p >= num_pitches_) max_prev_p = num_pitches_ - 1;
// NOTE(review): best_prev_p stays -1 if no candidate scores above -1.0e+10;
// presumably that cannot happen with finite prev_log_alpha_ -- confirm.
175  BaseFloat best_logprob = -1.0e+10;
176  int32 best_prev_p = -1;
177  for (int32 prev_p = min_prev_p; prev_p <= max_prev_p; prev_p++) {
// Difference in Hz; each frame has its own bottom-of-range offset min_pitch_.
178  BaseFloat delta_pitch = (min_pitch_[t-1] + prev_p * pitch_interval) -
179  (min_pitch_[t] + p * pitch_interval);
180  BaseFloat this_logprob = prev_log_alpha_(prev_p)
181  - 0.5 * delta_pitch * delta_pitch;
182  if (this_logprob > best_logprob) {
183  best_logprob = this_logprob;
184  best_prev_p = prev_p;
185  }
186  }
187  back_pointers_[t][p] = best_prev_p;
188  log_alpha_(p) = best_logprob;
189  }
190  }
191 
192  void Forward() {
193  // Viterbi in a discrete model of the pitch, in which the observation
194  // probability of a pitch is p(voicing) at the observed pitch, and
195  // interpolator_factor_ * 1.0 - p(voicing) at all other pitches. the
196  // transition log-probability is -0.5 times the squared difference in pitch.
197  // [We measure this in Hz, not in integer values, to make it more invariant
198  // to the discretization interval].
199 
200  back_pointers_.resize(num_frames_);
201 
202  log_alpha_.Resize(num_pitches_);
203  prev_log_alpha_.Resize(num_pitches_);
204  log_alpha_.Set(0.0);
205  MultiplyObsProb(0);
206  for (int32 t = 1; t < num_frames_; t++) {
207  log_alpha_.Swap(&prev_log_alpha_);
208  ComputeTransitionProb(t);
209  MultiplyObsProb(t);
210  }
211  }
// Traces the best path back from the last frame, writing the chosen pitch (in
// Hz) into column 1 of *mat and accumulating frame counts into *stats.
213  const BaseFloat pitch_interval = opts_.pitch_interval;
// Start from the highest-scoring bin of log_alpha_, which after the forward
// pass holds the scores for the final frame.
214  BaseFloat *p_begin = log_alpha_.Data(), *p_end = p_begin + num_pitches_,
215  *p_best = std::max_element(p_begin, p_end);
216 
// NOTE(review): best_pitch is never read in this function.
217  std::vector<int32> best_pitch(num_frames_);
218  int32 best_p = p_best - p_begin; // best discrete pitch p at time T-1.
219  for (int32 t = num_frames_ - 1; t >= 0; t--) {
220  { // Update stats:
// NOTE(review): pitch_[t] is 0 for unobserved frames, but an observed pitch
// can also discretize to bin 0; such frames are counted as zero here -- confirm.
221  stats->num_frames_tot++;
222  if (pitch_[t] == 0) stats->num_frames_zero++;
223  else if (best_p != pitch_[t]) stats->num_frames_changed++;
224  }
// Convert the bin index back to Hz using this frame's own range offset.
225  BaseFloat pitch = min_pitch_[t] + pitch_interval * best_p;
226  (*mat)(t, 1) = pitch;
227  KALDI_ASSERT(best_p >= 0 && best_p < num_pitches_);
// Step to the best predecessor bin on the previous frame.
228  if (t > 0)
229  best_p = back_pointers_[t][best_p];
230  }
231  }
233  std::vector<BaseFloat> min_pitch_; // Bottom of discretization range, per frame.
234  // Previously this was a single BaseFloat, but for better pseudo-randomization
235  // we have a slightly perturbed value for each frame now, so it's a vector.
236  int32 num_frames_; // number of frames;
237  int32 num_pitches_; // Number of discrete pitch intervals.
238  std::vector<int32> pitch_; // observed pitch, discretized (bin index); it's a
239  // don't-care value, set to 0, if the algorithm had no observation.
240  std::vector<BaseFloat> p_voicing_; // p(voicing) times the max_voicing_prob
241  // option; or zero if pitch was 0.0 for this frame.
242  std::vector<std::vector<int32> > back_pointers_; // at each t, points to best pitch
243  // on time t-1.
244 
247 };
248 
249 
250 
251 // Linear Interpolation for places where the pitch value is zero
253  int32 num_frames = mat->NumRows();
254  int i = 0;
255  Matrix<BaseFloat> &features = *mat;
256  while (i < num_frames) {
257  if(features(i, 1) == 0.0) {
258  int start = i - 1;
259  int end = i;
260  while( (features(end, 1)) == 0.0 && (end < num_frames))
261  end++;
262  BaseFloat end_value = -1, start_value = -1;
263  if (end < num_frames) end_value = features(end, 1);
264  if (start > 0) start_value = features(start, 1);
265 
266  if (start_value < 0 && end_value < 0) {
267  // the whole file is unvoiced -> just put an arbitrary value,
268  // it will all be normalized out anyway.
269  start_value = 1.0;
270  end_value = 1.0;
271  }
272  // If we don't have a value for one end of the range, i.e. at the start or
273  // end, set it to 0.9 times the pitch value that we have at the other end
274  // of the range. The reason we don't set it to that value itself, is that
275  // then over this segment we would have zero time-derivative, so if we
276  // took time derivatives we would have an artificial spike at zero.
277  if (start_value < 0.0) start_value = 0.9 * end_value;
278  if (end_value < 0.0) end_value = 0.9 * start_value;
279 
280  for(int k = start + 1; k < end; k++)
281  features(k, 1) = start_value +
282  (end_value - start_value) / (end - start) * (k - start);
283  i = end;
284  }
285  i++;
286  }
287 }
288 
289 
290 } // namespace kaldi
291 
292 int main(int argc, char *argv[]) {
293  try {
294  using namespace kaldi;
295  const char *usage =
296  "This is a rather special-purpose program which processes 2-dimensional\n"
297  "features consisting of (prob-of-voicing, pitch). By default we do model-based\n"
298  "pitch smoothing and interpolation (see code), or if --linear-interpolation=true,\n"
299  "just linear interpolation across gaps where pitch == 0 (not predicted).\n"
300  "Usage: interpolate-pitch [options...] <feats-rspecifier> <feats-wspecifier>\n";
301 
302 
303  // construct all the global objects
304  ParseOptions opts(usage);
305 
306  bool linear_interpolation = false;
307  PitchInterpolatorOptions interpolate_opts;
308 
309  opts.Register("linear-interpolation",
310  &linear_interpolation, "If true, just do simple linear "
311  "interpolation across gaps (else, model-based)");
312  interpolate_opts.Register(&opts);
313 
314  // parse options (+filling the registered variables)
315  opts.Read(argc, argv);
316 
317  if (opts.NumArgs() != 2) {
318  opts.PrintUsage();
319  exit(1);
320  }
321 
322  std::string input_rspecifier = opts.GetArg(1);
323  std::string output_wspecifier = opts.GetArg(2);
324 
325  SequentialBaseFloatMatrixReader reader(input_rspecifier);
326  BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter<something>.
327 
328  if (!kaldi_writer.Open(output_wspecifier))
329  KALDI_ERR << "Could not initialize output with wspecifier "
330  << output_wspecifier;
331 
332  int32 num_done = 0, num_err = 0;
334 
335  for (; !reader.Done(); reader.Next()) {
336  std::string utt = reader.Key();
337  Matrix<BaseFloat> features = reader.Value();
338  int num_frames = features.NumRows();
339 
340  if (num_frames == 0 && features.NumCols() != 2) {
341  KALDI_WARN << "Feature file has bad size "
342  << features.NumRows() << " by " << features.NumCols();
343  num_err++;
344  continue;
345  }
346 
347  if (linear_interpolation) LinearlyInterpolatePitch(&features);
348  else {
349  // work happens in constructor of this class.
350  PitchInterpolator pi(interpolate_opts, &features, &stats);
351  }
352  kaldi_writer.Write(utt, features);
353  num_done++;
354 
355  if (num_done % 10 == 0)
356  KALDI_LOG << "Processed " << num_done << " utterances";
357  KALDI_VLOG(2) << "Processed features for key " << utt;
358  }
359  if (!linear_interpolation) stats.Print();
360  KALDI_LOG << "Done " << num_done << " utterances, " << num_err
361  << " with errors.";
362  return (num_done != 0 ? 0 : 1);
363  } catch(const std::exception &e) {
364  std::cerr << e.what();
365  return -1;
366  }
367 }
368 
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool Open(const std::string &wspecifier)
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
Definition: kaldi-math.h:151
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
std::vector< BaseFloat > p_voicing_
std::vector< BaseFloat > min_pitch_
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Register(OptionsItf *opts)
void Backtrace(Matrix< BaseFloat > *mat, PitchInterpolatorStats *stats)
void LinearlyInterpolatePitch(Matrix< BaseFloat > *mat)
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
int main(int argc, char *argv[])
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
double Log(double x)
Definition: kaldi-math.h:100
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
void SetRandn()
Set vector to random normally-distributed noise.
PitchInterpolator(const PitchInterpolatorOptions &opts, Matrix< BaseFloat > *mat, PitchInterpolatorStats *stats)
Vector< BaseFloat > prev_log_alpha_
void InitValues(const Matrix< BaseFloat > &mat)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
const PitchInterpolatorOptions & opts_
#define KALDI_LOG
Definition: kaldi-error.h:153
std::vector< int32 > pitch_
Vector< BaseFloat > log_alpha_
std::vector< std::vector< int32 > > back_pointers_