build-tree-utils.cc
Go to the documentation of this file.
1 // tree/build-tree-utils.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation; Haihua Xu
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include <set>
21 #include <queue>
22 #include "util/stl-utils.h"
23 #include "tree/build-tree-utils.h"
24 
25 
26 
27 namespace kaldi {
28 
29 void WriteBuildTreeStats(std::ostream &os, bool binary, const BuildTreeStatsType &stats) {
30  WriteToken(os, binary, "BTS");
31  uint32 size = stats.size();
32  WriteBasicType(os, binary, size);
33  for (size_t i = 0; i < size; i++) {
34  WriteEventType(os, binary, stats[i].first);
35  bool nonNull = (stats[i].second != NULL);
36  WriteBasicType(os, binary, nonNull);
37  if (nonNull) stats[i].second->Write(os, binary);
38  }
39  if (os.fail()) {
40  KALDI_ERR << "WriteBuildTreeStats: write failed.";
41  }
42  if (!binary) os << '\n';
43 }
44 
45 
46 void ReadBuildTreeStats(std::istream &is, bool binary, const Clusterable &example, BuildTreeStatsType *stats) {
47  KALDI_ASSERT(stats != NULL);
48  KALDI_ASSERT(stats->empty());
49  ExpectToken(is, binary, "BTS");
50  uint32 size;
51  ReadBasicType(is, binary, &size);
52  stats->resize(size);
53  for (size_t i = 0; i < size; i++) {
54  ReadEventType(is, binary, &((*stats)[i].first));
55  bool nonNull;
56  ReadBasicType(is, binary, &nonNull);
57  if (nonNull) (*stats)[i].second = example.ReadNew(is, binary);
58  else (*stats)[i].second = NULL;
59  }
60 }
61 
62 
64  const BuildTreeStatsType &stats,
65  std::vector<EventValueType> *ans) {
66  bool all_present = true;
67  std::set<EventValueType> values;
68  BuildTreeStatsType::const_iterator iter = stats.begin(), end = stats.end();
69  for (; iter != end; ++iter) {
70  EventValueType val;
71  if (EventMap::Lookup(iter->first, key, &val))
72  values.insert(val);
73  else
74  all_present = false;
75  }
76  if (ans)
77  CopySetToVector(values, ans);
78  return all_present;
79 }
80 
81 static void GetEventKeys(const EventType &vec, std::vector<EventKeyType> *keys) {
82  keys->resize(vec.size());
83  EventType::const_iterator iter = vec.begin(), end = vec.end();
84  std::vector<EventKeyType>::iterator out_iter = keys->begin();
85  for (; iter!= end; ++iter, ++out_iter)
86  *out_iter = iter->first;
87 }
88 
89 
90 // recall:
91 // typedef std::vector<std::pair<EventType, Clusterable*> > BuildTreeStatsType;
92 void FindAllKeys(const BuildTreeStatsType &stats, AllKeysType keys_type, std::vector<EventKeyType> *keys_out) {
93  KALDI_ASSERT(keys_out != NULL);
94  BuildTreeStatsType::const_iterator iter = stats.begin(), end = stats.end();
95  if (iter == end) return; // empty set of keys.
96  std::vector<EventKeyType> keys;
97  GetEventKeys(iter->first, &keys);
98  ++iter;
99  for (; iter!= end; ++iter) {
100  std::vector<EventKeyType> keys2;
101  GetEventKeys(iter->first, &keys2);
102  if (keys_type == kAllKeysInsistIdentical) {
103  if (keys2 != keys)
104  KALDI_ERR << "AllKeys: keys in events are not all the same [called with kAllKeysInsistIdentical and all are not identical.";
105  } else if (keys_type == kAllKeysIntersection) {
106  std::vector<EventKeyType> new_keys(std::max(keys.size(), keys2.size()));
107  // following line relies on sorted order of event keys.
108  std::vector<EventKeyType>::iterator end_iter =
109  std::set_intersection(keys.begin(), keys.end(), keys2.begin(), keys2.end(), new_keys.begin());
110  new_keys.erase(end_iter, new_keys.end());
111  keys = new_keys;
112  } else { // union.
113  KALDI_ASSERT(keys_type == kAllKeysUnion);
114  std::vector<EventKeyType> new_keys(keys.size()+keys2.size());
115  // following line relies on sorted order of event keys.
116  std::vector<EventKeyType>::iterator end_iter =
117  std::set_union(keys.begin(), keys.end(), keys2.begin(), keys2.end(), new_keys.begin());
118  new_keys.erase(end_iter, new_keys.end());
119  keys = new_keys;
120  }
121  }
122  *keys_out = keys;
123 }
124 
125 
127  int32 *num_leaves) {
128  // First-- map the stats to each leaf in the EventMap.
129  std::vector<BuildTreeStatsType> split_stats;
130  SplitStatsByMap(stats, orig, &split_stats);
131  // Now for each leaf that has stats in it, do the table split according to the given name.
132  std::vector<EventMap*> splits(split_stats.size(), NULL);
133  for (EventAnswerType leaf = 0; leaf < (EventAnswerType)split_stats.size(); leaf++) {
134  if (!split_stats[leaf].empty()) {
135  // first work out the possible values the name takes.
136  std::vector<EventValueType> vals; // vals are put here, sorted.
137  bool all_present = PossibleValues(key, split_stats[leaf], &vals);
138  KALDI_ASSERT(all_present); // currently do not support mapping undefined values.
139  KALDI_ASSERT(!vals.empty() && vals.front() >= 0); // don't support mapping negative values
140  // at present time-- would need different EventMap type, not TableEventMap.
141  std::vector<EventMap*> table(vals.back()+1, (EventMap*)NULL);
142  for (size_t idx = 0;idx < vals.size();idx++) {
143  EventValueType val = vals[idx];
144  if (idx == 0) table[val] = new ConstantEventMap(leaf); // reuse current leaf.
145  else table[val] = new ConstantEventMap( (*num_leaves)++ ); // else take new leaf id.
146  }
147  // takes ownershipof stats.
148  splits[leaf] = new TableEventMap(key, table);
149  }
150  }
151  EventMap *ans = orig.Copy(splits);
152  DeletePointers(&splits);
153  return ans;
154 }
155 
156 EventMap *DoTableSplitMultiple(const EventMap &orig, const std::vector<EventKeyType> &keys, const BuildTreeStatsType &stats, int32 *num_leaves) {
157 
158  if (keys.empty()) return orig.Copy();
159  else {
160  EventMap *cur = NULL; // would make it &orig, except for const issues.
161  for (size_t i = 0; i < keys.size(); i++) {
162  EventMap *next = DoTableSplit( (cur ? *cur : orig), keys[i], stats, num_leaves);
163  delete cur; // delete intermediate maps.
164  cur = next;
165  }
166  return cur;
167  }
168 }
169 
170 
171 
172 void SplitStatsByMap(const BuildTreeStatsType &stats, const EventMap &e, std::vector<BuildTreeStatsType> *stats_out) {
173  BuildTreeStatsType::const_iterator iter, end = stats.end();
174  KALDI_ASSERT(stats_out != NULL);
175  stats_out->clear();
176  size_t size = 0;
177  for (iter = stats.begin(); iter != end; ++iter) {
178  const EventType &evec = iter->first;
179  EventAnswerType ans;
180  if (!e.Map(evec, &ans)) // this is an error--could not map it.
181  KALDI_ERR << "SplitStatsByMap: could not map event vector " << EventTypeToString(evec)
182  << "if error seen during tree-building, check that "
183  << "--context-width and --central-position match stats, "
184  << "and that phones that are context-independent (CI) during "
185  << "stats accumulation do not share roots with non-CI phones.";
186  size = std::max(size, (size_t)(ans+1));
187  }
188  stats_out->resize(size);
189  for (iter = stats.begin(); iter != end; ++iter) {
190  const EventType &evec = iter->first;
191  EventAnswerType ans;
192  bool b = e.Map(evec, &ans);
193  KALDI_ASSERT(b);
194  (*stats_out)[ans].push_back(*iter);
195  }
196 }
197 
198 void SplitStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key, std::vector<BuildTreeStatsType> *stats_out) {
199  BuildTreeStatsType::const_iterator iter, end = stats_in.end();
200  KALDI_ASSERT(stats_out != NULL);
201  stats_out->clear();
202  size_t size = 0;
203  // This loop works out size of output vector.
204  for (iter = stats_in.begin(); iter != end; ++iter) {
205  const EventType &evec = iter->first;
206  EventValueType val;
207  if (! EventMap::Lookup(evec, key, &val)) // no such key.
208  KALDI_ERR << "SplitStats: key "<< key << " is not present in event vector " << EventTypeToString(evec);
209  size = std::max(size, (size_t)(val+1));
210  }
211  stats_out->resize(size);
212  // This loop splits up the stats.
213  for (iter = stats_in.begin(); iter != end; ++iter) {
214  const EventType &evec = iter->first;
215  EventValueType val;
216  EventMap::Lookup(evec, key, &val); // will not fail.
217  (*stats_out)[val].push_back(*iter);
218  }
219 }
220 
221 
223  EventKeyType key,
224  std::vector<EventValueType> &values,
225  bool include_if_present, // true-> retain only in "values",
226  // false-> retain only not in "values".
227  BuildTreeStatsType *stats_out) {
228  KALDI_ASSERT(IsSortedAndUniq(values));
229  BuildTreeStatsType::const_iterator iter, end = stats_in.end();
230  KALDI_ASSERT(stats_out != NULL);
231  stats_out->clear();
232 
233  for (iter = stats_in.begin(); iter != end; ++iter) {
234  const EventType &evec = iter->first;
235  EventValueType val;
236  if (! EventMap::Lookup(evec, key, &val)) // no such key. // HERE.
237  KALDI_ERR << "SplitStats: key "<< key << " is not present in event vector " << EventTypeToString(evec);
238  bool in_values = std::binary_search(values.begin(), values.end(), val);
239  if (in_values == include_if_present)
240  stats_out->push_back(*iter);
241  }
242 }
243 
244 
246  Clusterable *ans = NULL;
247  BuildTreeStatsType::const_iterator iter = stats_in.begin(), end = stats_in.end();
248  for (; iter != end; ++iter) {
249  Clusterable *cl = iter->second;
250  if (cl != NULL) {
251  if (!ans) ans = cl->Copy();
252  else ans->Add(*cl);
253  }
254  }
255  return ans;
256 }
257 
259  BaseFloat ans = 0.0;
260  BuildTreeStatsType::const_iterator iter = stats_in.begin(), end = stats_in.end();
261  for (; iter != end; ++iter) {
262  Clusterable *cl = iter->second;
263  if (cl != NULL) ans += cl->Normalizer();
264  }
265  return ans;
266 }
267 
269  BaseFloat ans = 0.0;
270  BuildTreeStatsType::const_iterator iter = stats_in.begin(), end = stats_in.end();
271  for (; iter != end; ++iter) {
272  Clusterable *cl = iter->second;
273  if (cl != NULL) ans += cl->Objf();
274  }
275  return ans;
276 }
277 
278 
279 void SumStatsVec(const std::vector<BuildTreeStatsType> &stats_in, std::vector<Clusterable*> *stats_out) {
280  KALDI_ASSERT(stats_out != NULL && stats_out->empty());
281  stats_out->resize(stats_in.size(), NULL);
282  for (size_t i = 0;i < stats_in.size();i++) (*stats_out)[i] = SumStats(stats_in[i]);
283 }
284 
285 BaseFloat ObjfGivenMap(const BuildTreeStatsType &stats_in, const EventMap &e) {
286  std::vector<BuildTreeStatsType> split_stats;
287  SplitStatsByMap(stats_in, e, &split_stats);
288  std::vector<Clusterable*> summed_stats;
289  SumStatsVec(split_stats, &summed_stats);
290  BaseFloat ans = SumClusterableObjf(summed_stats);
291  DeletePointers(&summed_stats);
292  return ans;
293 }
294 
295 // This function computes the best initial split of these stats [with this key].
296 // Returns best objf change (>=0).
297 BaseFloat ComputeInitialSplit(const std::vector<Clusterable*> &summed_stats,
298  const Questions &q_opts, EventKeyType key,
299  std::vector<EventValueType> *yes_set) {
300  KALDI_ASSERT(yes_set != NULL);
301  yes_set->clear();
302  const QuestionsForKey &key_opts = q_opts.GetQuestionsOf(key);
303 
304  // "total" needed for optimization in AddToClustersOptimized,
305  // and also used to work otu total objf.
306  Clusterable *total = SumClusterable(summed_stats);
307  if (total == NULL) return 0.0; // because there were no stats or non-NULL stats.
308  BaseFloat unsplit_objf = total->Objf();
309 
310  const std::vector<std::vector<EventValueType> > &questions_of_this_key = key_opts.initial_questions;
311 
312  int32 best_idx = -1;
313  BaseFloat best_objf_change = 0;
314 
315  for (size_t i = 0; i < questions_of_this_key.size(); i++) {
316  const std::vector<EventValueType> &yes_set = questions_of_this_key[i];
317  std::vector<int32> assignments(summed_stats.size(), 0); // 0 is index of "no".
318  std::vector<Clusterable*> clusters(2); // no and yes clusters.
319  for (std::vector<EventValueType>::const_iterator iter = yes_set.begin(); iter != yes_set.end(); ++iter) {
320  KALDI_ASSERT(*iter>=0);
321  if (*iter < (EventValueType)assignments.size()) assignments[*iter] = 1;
322  }
323  kaldi::AddToClustersOptimized(summed_stats, assignments, *total, &clusters);
324  BaseFloat this_objf = SumClusterableObjf(clusters);
325 
326  if (this_objf < unsplit_objf- 0.001*std::abs(unsplit_objf)) { // got worse; should never happen.
327  // of course small differences can be caused by roundoff.
328  KALDI_WARN << "Objective function got worse when building tree: "<< this_objf << " < " << unsplit_objf;
329  KALDI_ASSERT(!(this_objf < unsplit_objf - 0.01*(200 + std::abs(unsplit_objf)))); // do assert on more stringent check.
330  }
331 
332  BaseFloat this_objf_change = this_objf - unsplit_objf;
333  if (this_objf_change > best_objf_change) {
334  best_objf_change = this_objf_change;
335  best_idx = i;
336  }
337  DeletePointers(&clusters);
338  }
339  delete total;
340  if (best_idx != -1)
341  *yes_set = questions_of_this_key[best_idx];
342  return best_objf_change;
343 }
344 
345 
346 // returns best delta-objf.
347 // If key does not exist, returns 0 and sets yes_set_out to empty.
349  const Questions &q_opts,
350  EventKeyType key,
351  std::vector<EventValueType> *yes_set_out) {
352  if (stats.size()<=1) return 0.0; // cannot split if only zero or one instance of stats.
353  if (!PossibleValues(key, stats, NULL)) {
354  yes_set_out->clear();
355  return 0.0; // Can't split as key not always defined.
356  }
357  std::vector<Clusterable*> summed_stats; // indexed by value corresponding to key. owned here.
358  { // compute summed_stats
359  std::vector<BuildTreeStatsType> split_stats;
360  SplitStatsByKey(stats, key, &split_stats);
361  SumStatsVec(split_stats, &summed_stats);
362  }
363 
364  std::vector<EventValueType> yes_set;
365  BaseFloat improvement = ComputeInitialSplit(summed_stats,
366  q_opts, key, &yes_set);
367  // find best basic question.
368 
369  std::vector<int32> assignments(summed_stats.size(), 0); // assigns to "no" (0) by default.
370  for (std::vector<EventValueType>::const_iterator iter = yes_set.begin(); iter != yes_set.end(); ++iter) {
371  KALDI_ASSERT(*iter>=0);
372  if (*iter < (EventValueType)assignments.size()) {
373  // this guard necessary in case stats did not have all the
374  // values present in "yes_set".
375  assignments[*iter] = 1; // assign to "yes" (1).
376  }
377  }
378  std::vector<Clusterable*> clusters(2, (Clusterable*)NULL); // no, yes.
379  kaldi::AddToClusters(summed_stats, assignments, &clusters);
380 
381  EnsureClusterableVectorNotNull(&summed_stats);
383 
384  // even if improvement == 0 we continue; if we do RefineClusters we may get further improvement.
385  // now do the RefineClusters stuff. Note that this is null-op if
386  // q_opts.GetQuestionsOf(key).refine_opts.num_iters == 0. We could check for this but don't bother;
387  // it happens in RefineClusters anyway.
388 
389  if (q_opts.GetQuestionsOf(key).refine_opts.num_iters > 0) {
390  // If we want to refine the questions... (a bit like k-means w/ 2 classes).
391  // Note: the only reason we introduced the if-statement is so the yes_set
392  // doesn't get modified (truncated, actually) if we do the refine stuff with
393  // zero iters.
394  BaseFloat refine_impr = RefineClusters(summed_stats, &clusters, &assignments,
395  q_opts.GetQuestionsOf(key).refine_opts);
396  KALDI_ASSERT(refine_impr > std::min(-1.0, -0.1*fabs(improvement)));
397  // refine_impr should always be positive
398  improvement += refine_impr;
399  yes_set.clear();
400  for (size_t i = 0;i < assignments.size();i++) if (assignments[i] == 1) yes_set.push_back(i);
401  }
402  *yes_set_out = yes_set;
403 
404  DeletePointers(&clusters);
405 #ifdef KALDI_PARANOID
406  { // Check the "ans" is correct.
407  KALDI_ASSERT(clusters.size() == 2 && clusters[0] == 0 && clusters[1] == 0);
408  AddToClusters(summed_stats, assignments, &clusters);
409  BaseFloat impr;
410  if (clusters[0] == NULL || clusters[1] == NULL) impr = 0.0;
411  else impr = clusters[0]->Distance(*(clusters[1]));
412  if (!ApproxEqual(impr, improvement) && fabs(impr-improvement) > 0.01) {
413  KALDI_WARN << "FindBestSplitForKey: improvements do not agree: "<< impr
414  << " vs. " << improvement;
415  }
416  DeletePointers(&clusters);
417  }
418 #endif
419  DeletePointers(&summed_stats);
420  return improvement; // objective-function improvement.
421 }
422 
423 
424 
425 /*
426  DecisionTreeBuilder is a class used in SplitDecisionTree
427 */
428 
430  public:
432  if (!yes_) { // leaf.
433  return new ConstantEventMap(leaf_);
434  } else {
435  return new SplitEventMap(key_, yes_set_, yes_->GetMap(), no_->GetMap());
436  }
437  }
438  BaseFloat BestSplit() { return best_split_impr_; } // returns objf improvement (>=0) of best possible split.
439  void DoSplit(int32 *next_leaf) {
440  if (!yes_) { // not already split; we are a leaf, so split.
441  DoSplitInternal(next_leaf);
442  } else { // find which of our children is best to split, and split that.
443  (yes_->BestSplit() >= no_->BestSplit() ? yes_ : no_)->DoSplit(next_leaf);
444  best_split_impr_ = std::max(yes_->BestSplit(), no_->BestSplit()); // may have changed.
445  }
446  }
448  const Questions &q_opts): q_opts_(q_opts), yes_(NULL), no_(NULL), leaf_(leaf), stats_(stats) {
449  // not, this must work when stats is empty too. [just gives zero improvement, non-splittable].
450  FindBestSplit();
451  }
453  delete yes_;
454  delete no_;
455  }
456  private:
457  void DoSplitInternal(int32 *next_leaf) {
458  // Does the split; applicable only to leaf nodes.
459  KALDI_ASSERT(!yes_); // make sure children not already set up.
461  EventAnswerType yes_leaf = leaf_, no_leaf = (*next_leaf)++;
462  leaf_ = -1; // we now have no leaf.
463  // Now split the stats.
464  BuildTreeStatsType yes_stats, no_stats;
465  yes_stats.reserve(stats_.size()); no_stats.reserve(stats_.size()); // probably better than multiple resizings.
466  for (BuildTreeStatsType::const_iterator iter = stats_.begin(); iter != stats_.end(); ++iter) {
467  const EventType &vec = iter->first;
468  EventValueType val;
469  if (!EventMap::Lookup(vec, key_, &val)) KALDI_ERR << "DoSplitInternal: key has no value.";
470  if (std::binary_search(yes_set_.begin(), yes_set_.end(), val)) yes_stats.push_back(*iter);
471  else no_stats.push_back(*iter);
472  }
473 #ifdef KALDI_PARANOID
474  { // Check objf improvement.
475  Clusterable *yes_clust = SumStats(yes_stats), *no_clust = SumStats(no_stats);
476  BaseFloat impr_check = yes_clust->Distance(*no_clust);
477  // this is a negated objf improvement from merging (== objf improvement from splitting).
478  if (!ApproxEqual(impr_check, best_split_impr_, 0.01)) {
479  KALDI_WARN << "DoSplitInternal: possible problem: "<< impr_check << " != " << best_split_impr_;
480  }
481  delete yes_clust; delete no_clust;
482  }
483 #endif
484  yes_ = new DecisionTreeSplitter(yes_leaf, yes_stats, q_opts_);
485  no_ = new DecisionTreeSplitter(no_leaf, no_stats, q_opts_);
486  best_split_impr_ = std::max(yes_->BestSplit(), no_->BestSplit());
487  stats_.clear(); // note: pointers in stats_ were not owned here.
488  }
489  void FindBestSplit() {
490  // This sets best_split_impr_, key_ and yes_set_.
491  // May just pick best question, or may iterate a bit (depends on
492  // q_opts; see FindBestSplitForKey for details)
493  std::vector<EventKeyType> all_keys;
494  q_opts_.GetKeysWithQuestions(&all_keys);
495  if (all_keys.size() == 0) {
496  KALDI_WARN << "DecisionTreeSplitter::FindBestSplit(), no keys available to split on (maybe no key covered all of your events, or there was a problem with your questions configuration?)";
497  }
498  best_split_impr_ = 0;
499  for (size_t i = 0; i < all_keys.size(); i++) {
500  if (q_opts_.HasQuestionsForKey(all_keys[i])) {
501  std::vector<EventValueType> temp_yes_set;
502  BaseFloat split_improvement = FindBestSplitForKey(stats_, q_opts_, all_keys[i], &temp_yes_set);
503  if (split_improvement > best_split_impr_) {
504  best_split_impr_ = split_improvement;
505  yes_set_ = temp_yes_set;
506  key_ = all_keys[i];
507  }
508  }
509  }
510  }
511 
512 
513 
514  // Data members... Always used:
517 
518  // If already split:
521 
522  // Otherwise:
524  BuildTreeStatsType stats_; // vector of stats. pointers inside there not owned here.
525 
526  // key and "yes set" of best split:
528  std::vector<EventValueType> yes_set_;
529 
530 };
531 
533  const BuildTreeStatsType &stats,
534  Questions &q_opts,
535  BaseFloat thresh,
536  int32 max_leaves, // max_leaves<=0 -> no maximum.
537  int32 *num_leaves,
538  BaseFloat *obj_impr_out,
539  BaseFloat *smallest_split_change_out) {
540  KALDI_ASSERT(num_leaves != NULL && *num_leaves > 0); // can't be 0 or input_map would be empty.
541  int32 num_empty_leaves = 0;
542  BaseFloat like_impr = 0.0;
543  BaseFloat smallest_split_change = 1.0e+20;
544  std::vector<DecisionTreeSplitter*> builders;
545  { // set up "builders" [one for each current leaf]. This array is never extended.
546  // the structures generated during splitting remain as trees at each array location.
547  std::vector<BuildTreeStatsType> split_stats;
548  SplitStatsByMap(stats, input_map, &split_stats);
549  KALDI_ASSERT(split_stats.size() != 0);
550  builders.resize(split_stats.size()); // size == #leaves.
551  for (size_t i = 0;i < split_stats.size();i++) {
552  EventAnswerType leaf = static_cast<EventAnswerType>(i);
553  if (split_stats[i].size() == 0) num_empty_leaves++;
554  builders[i] = new DecisionTreeSplitter(leaf, split_stats[i], q_opts);
555  }
556  }
557 
558  { // Do the splitting.
559  int32 count = 0;
560  std::priority_queue<std::pair<BaseFloat, size_t> > queue; // use size_t because logically these
561  // are just indexes into the array, not leaf-ids (after splitting they are no longer leaf id's).
562  // Initialize queue.
563  for (size_t i = 0; i < builders.size(); i++)
564  queue.push(std::make_pair(builders[i]->BestSplit(), i));
565  // Note-- queue's size never changes from now. All the alternatives leaves to split are
566  // inside the "DecisionTreeSplitter*" objects, in a tree structure.
567  while (queue.top().first > thresh
568  && (max_leaves<=0 || *num_leaves < max_leaves)) {
569  smallest_split_change = std::min(smallest_split_change, queue.top().first);
570  size_t i = queue.top().second;
571  like_impr += queue.top().first;
572  builders[i]->DoSplit(num_leaves);
573  queue.pop();
574  queue.push(std::make_pair(builders[i]->BestSplit(), i));
575  count++;
576  }
577  KALDI_LOG << "DoDecisionTreeSplit: split "<< count << " times, #leaves now " << (*num_leaves);
578  }
579 
580  if (smallest_split_change_out)
581  *smallest_split_change_out = smallest_split_change;
582 
583  EventMap *answer = NULL;
584 
585  { // Create the output EventMap.
586  std::vector<EventMap*> sub_trees(builders.size());
587  for (size_t i = 0; i < sub_trees.size();i++) sub_trees[i] = builders[i]->GetMap();
588  answer = input_map.Copy(sub_trees);
589  for (size_t i = 0; i < sub_trees.size();i++) delete sub_trees[i];
590  }
591  // Free up memory.
592  for (size_t i = 0;i < builders.size();i++) delete builders[i];
593 
594  if (obj_impr_out != NULL) *obj_impr_out = like_impr;
595  return answer;
596 }
597 
598 
600  const BuildTreeStatsType &stats,
601  BaseFloat thresh,
602  std::vector<EventMap*> *mapping) {
603  // First map stats
604  KALDI_ASSERT(stats.size() != 0);
605  std::vector<BuildTreeStatsType> split_stats;
606  SplitStatsByMap(stats, e_in, &split_stats);
607  std::vector<Clusterable*> summed_stats;
608  SumStatsVec(split_stats, &summed_stats);
609 
610  std::vector<int32> indexes;
611  std::vector<Clusterable*> summed_stats_contiguous;
612  size_t max_index = 0;
613  for (size_t i = 0;i < summed_stats.size();i++) {
614  if (summed_stats[i] != NULL) {
615  indexes.push_back(i);
616  summed_stats_contiguous.push_back(summed_stats[i]);
617  if (i > max_index) max_index = i;
618  }
619  }
620  if (summed_stats_contiguous.empty()) {
621  KALDI_WARN << "ClusterBottomUp: nothing to cluster.";
622  return 0; // nothing merged.
623  }
624 
625  std::vector<int32> assignments;
626  BaseFloat normalizer = SumClusterableNormalizer(summed_stats_contiguous),
627  change;
628  change = ClusterBottomUp(summed_stats_contiguous,
629  thresh,
630  0, // no min-clust: use threshold for now.
631  NULL, // don't need clusters out.
632  &assignments); // this algorithm is quadratic, so might be quite slow.
633 
634 
635  KALDI_ASSERT(assignments.size() == summed_stats_contiguous.size() && !assignments.empty());
636  size_t num_clust = * std::max_element(assignments.begin(), assignments.end()) + 1;
637  int32 num_combined = summed_stats_contiguous.size() - num_clust;
638  KALDI_ASSERT(num_combined >= 0);
639 
640  KALDI_VLOG(2) << "ClusterBottomUp combined "<< num_combined
641  << " leaves and gave a likelihood change of " << change
642  << ", normalized = " << (change/normalizer)
643  << ", normalizer = " << normalizer;
644  KALDI_ASSERT(change < 0.0001); // should be negative or zero.
645 
646  KALDI_ASSERT(mapping != NULL);
647  if (max_index >= mapping->size()) mapping->resize(max_index+1, NULL);
648 
649  for (size_t i = 0;i < summed_stats_contiguous.size();i++) {
650  size_t index = indexes[i];
651  size_t new_index = indexes[assignments[i]]; // index assigned by clusterig-- map to existing indices in the map,
652  // that we clustered from, so we don't conflict with indices in other parts
653  // of the tree.
654  KALDI_ASSERT((*mapping)[index] == NULL || "Error: Cluster seems to have been "
655  "called for different parts of the tree with overlapping sets of "
656  "indices.");
657  (*mapping)[index] = new ConstantEventMap(new_index);
658  }
659  DeletePointers(&summed_stats);
660  return num_combined;
661 }
662 
663 
664 EventMap *RenumberEventMap(const EventMap &e_in, int32 *num_leaves) {
665  EventType empty_vec;
666  std::vector<EventAnswerType> initial_leaves; // before renumbering.
667  e_in.MultiMap(empty_vec, &initial_leaves);
668  if (initial_leaves.empty()) {
669  KALDI_ASSERT(num_leaves);
670  if (num_leaves) *num_leaves = 0;
671  return e_in.Copy();
672  }
673  SortAndUniq(&initial_leaves);
674  EventAnswerType max_leaf_plus_one = initial_leaves.back() + 1; // will typically, but not always, equal *num_leaves.
675  std::vector<EventMap*> mapping(max_leaf_plus_one, (EventMap*)NULL);
676  std::vector<EventAnswerType>::iterator iter = initial_leaves.begin(), end = initial_leaves.end();
677  EventAnswerType cur_leaf = 0;
678  for (; iter != end; ++iter) {
679  KALDI_ASSERT(*iter >= 0 && *iter<max_leaf_plus_one);
680  mapping[*iter] = new ConstantEventMap(cur_leaf++);
681  }
682  EventMap *ans = e_in.Copy(mapping);
683  DeletePointers(&mapping);
684  KALDI_ASSERT((size_t)cur_leaf == initial_leaves.size());
685  if (num_leaves) *num_leaves = cur_leaf;
686  return ans;
687 }
688 
690  const std::vector<int32> &mapping_in) {
691  std::vector<EventMap*> mapping(mapping_in.size());
692  for (size_t i = 0; i < mapping_in.size(); i++)
693  mapping[i] = new ConstantEventMap(mapping_in[i]);
694  EventMap *ans = e_in.Copy(mapping);
695  DeletePointers(&mapping);
696  return ans;
697 }
698 
700  BaseFloat thresh, int32 *num_removed_ptr) {
701  std::vector<EventMap*> mapping;
702  int32 num_removed = ClusterEventMapGetMapping(e_in, stats, thresh, &mapping);
703  EventMap *ans = e_in.Copy(mapping);
704  DeletePointers(&mapping);
705  if (num_removed_ptr != NULL) *num_removed_ptr = num_removed;
706  return ans;
707 
708 }
709 
711  std::vector<std::vector<EventValueType> > &values,
712  int32 *num_leaves) {
713  // the use of "pdfs" as the name of the next variable reflects the anticipated
714  // use of this function.
715  std::vector<std::vector<EventAnswerType> > pdfs(values.size());
716  for (size_t i = 0; i < values.size(); i++) {
717  EventType evec;
718  for (size_t j = 0; j < values[i].size(); j++) {
719  evec.push_back(MakeEventPair(key, values[i][j]));
720  size_t size_at_start = pdfs[i].size();
721  e_in.MultiMap(evec, &(pdfs[i])); // append any corresponding pdfs to pdfs[i].
722  if (pdfs[i].size() == size_at_start) { // Nothing added... unexpected.
723  KALDI_WARN << "ShareEventMapLeaves: had no leaves for key = " << key
724  << ", value = " << (values[i][j]);
725  }
726  }
727  SortAndUniq(&(pdfs[i]));
728  }
729  std::vector<EventMap*> remapping;
730  for (size_t i = 0; i < values.size(); i++) {
731  if (pdfs[i].empty())
732  KALDI_WARN << "ShareEventMapLeaves: no leaves in one bucket."; // not expected.
733  else {
734  EventAnswerType map_to_this = pdfs[i][0]; // map all in this bucket
735  // to this value.
736  for (size_t j = 1; j < pdfs[i].size(); j++) {
737  EventAnswerType leaf = pdfs[i][j];
738  KALDI_ASSERT(leaf>=0);
739  if (remapping.size() <= static_cast<size_t>(leaf))
740  remapping.resize(leaf+1, NULL);
741  KALDI_ASSERT(remapping[leaf] == NULL);
742  remapping[leaf] = new ConstantEventMap(map_to_this);
743  }
744  }
745  }
746  EventMap *shared = e_in.Copy(remapping);
747  DeletePointers(&remapping);
748  EventMap *renumbered = RenumberEventMap(*shared, num_leaves);
749  delete shared;
750  return renumbered;
751 }
752 
753 
755  KALDI_ASSERT(stats != NULL);
756  BuildTreeStatsType::iterator iter = stats->begin(), end = stats->end();
757  for (; iter!= end; ++iter) if (iter->second != NULL) { delete iter->second; iter->second = NULL; } // set to NULL for extra safety.
758 }
759 
761  const std::vector<EventValueType> *phones,
762  int32 default_length) {
763  std::vector<BuildTreeStatsType> stats_by_phone;
764  try {
765  SplitStatsByKey(stats, P, &stats_by_phone);
766  } catch(const KaldiFatalError &) {
767  KALDI_ERR <<
768  "You seem to have provided invalid stats [no central-phone key].";
769  }
770  std::map<EventValueType, EventAnswerType> phone_to_length;
771  for (size_t p = 0; p < stats_by_phone.size(); p++) {
772  if (! stats_by_phone[p].empty()) {
773  std::vector<BuildTreeStatsType> stats_by_length;
774  try {
775  SplitStatsByKey(stats_by_phone[p], kPdfClass, &stats_by_length);
776  } catch(const KaldiFatalError &) {
777  KALDI_ERR <<
778  "You seem to have provided invalid stats [no position key].";
779  }
780  size_t length = stats_by_length.size();
781  for (size_t i = 0; i < length; i++) {
782  if (stats_by_length[i].empty()) {
783  KALDI_ERR << "There are no stats available for position " << i
784  << " of phone " << p;
785  }
786  }
787  phone_to_length[p] = length;
788  }
789  }
790  if (phones != NULL) { // set default length for unseen phones.
791  for (size_t i = 0; i < phones->size(); i++) {
792  if (phone_to_length.count( (*phones)[i] ) == 0) { // unseen.
793  phone_to_length[(*phones)[i]] = default_length;
794  }
795  }
796  }
797  EventMap *ans = new TableEventMap(P, phone_to_length);
798  return ans;
799 }
800 
801 // Recursive routine that is a helper to ClusterEventMapRestricted.
802 // returns number removed.
804  const BuildTreeStatsType &stats,
805  BaseFloat thresh,
806  std::vector<EventKeyType> keys,
807  std::vector<EventMap*> *leaf_mapping) {
808  if (keys.size() == 0) {
809  return ClusterEventMapGetMapping(e_in, stats, thresh, leaf_mapping);
810  } else { // split on the key.
811  int32 ans = 0;
812  std::vector<BuildTreeStatsType> split_stats;
813  SplitStatsByKey(stats, keys.back(), &split_stats);
814  keys.pop_back();
815  for (size_t i = 0; i< split_stats.size(); i++)
816  if (split_stats[i].size() != 0)
817  ans += ClusterEventMapRestrictedHelper(e_in, split_stats[i], thresh, keys, leaf_mapping);
818  return ans;
819  }
820 }
821 
823  const BuildTreeStatsType &stats,
824  BaseFloat thresh,
825  const std::vector<EventKeyType> &keys,
826  int32 *num_removed) {
827  std::vector<EventMap*> leaf_mapping; // For output of ClusterEventMapGetMapping.
828 
829  int32 nr = ClusterEventMapRestrictedHelper(e_in, stats, thresh, keys, &leaf_mapping);
830  if (num_removed != NULL) *num_removed = nr;
831 
832  EventMap *ans = e_in.Copy(leaf_mapping);
833  DeletePointers(&leaf_mapping);
834  return ans;
835 }
836 
837 
839  const BuildTreeStatsType &stats,
840  BaseFloat thresh,
841  const EventMap &e_restrict,
842  int32 *num_removed_ptr) {
843  std::vector<EventMap*> leaf_mapping;
844 
845  std::vector<BuildTreeStatsType> split_stats;
846  int num_removed = 0;
847  SplitStatsByMap(stats, e_restrict, &split_stats);
848  for (size_t i = 0; i < split_stats.size(); i++) {
849  if (!split_stats[i].empty())
850  num_removed += ClusterEventMapGetMapping(e_in, split_stats[i], thresh,
851  &leaf_mapping);
852  }
853 
854  if (num_removed_ptr != NULL) *num_removed_ptr = num_removed;
855 
856  EventMap *ans = e_in.Copy(leaf_mapping);
857  DeletePointers(&leaf_mapping);
858  return ans;
859 }
860 
862  const EventMap &e_in,
863  const BuildTreeStatsType &stats,
864  int32 num_clusters_required,
865  const EventMap &e_restrict,
866  int32 *num_removed_ptr) {
867  std::vector<BuildTreeStatsType> split_stats;
868  SplitStatsByMap(stats, e_restrict, &split_stats);
869 
870  if (num_clusters_required < split_stats.size()) {
871  KALDI_WARN << "num-clusters-required is less than size of map. Not doing anything.";
872  if (num_removed_ptr) *num_removed_ptr = 0;
873  return e_in.Copy();
874  }
875 
876  std::vector<std::vector<int32> > indexes(split_stats.size());
877  std::vector<std::vector<Clusterable*> > summed_stats_contiguous(split_stats.size());
878 
879  BaseFloat normalizer = 0.0;
880 
881  size_t max_index = 0;
882 
883  int32 num_non_empty_clusters_required = num_clusters_required;
884 
885  int32 num_non_empty_clusters_in_map = 0;
886  int32 num_non_empty_clusters = 0;
887 
888  for (size_t i = 0; i < split_stats.size(); i++) {
889  if (!split_stats[i].empty()) {
890  num_non_empty_clusters_in_map++;
891 
892  std::vector<BuildTreeStatsType> split_stats_i;
893  SplitStatsByMap(split_stats[i], e_in, &split_stats_i);
894  std::vector<Clusterable*> summed_stats_i;
895  SumStatsVec(split_stats_i, &summed_stats_i);
896 
897  for (size_t j = 0; j < summed_stats_i.size(); j++) {
898  if (summed_stats_i[j] != NULL) {
899  num_non_empty_clusters++;
900  indexes[i].push_back(j);
901  summed_stats_contiguous[i].push_back(summed_stats_i[j]);
902  if (j > max_index) max_index = j;
903  }
904  }
905 
906  normalizer += SumClusterableNormalizer(summed_stats_contiguous[i]);
907  } else {
908  // Even if split_stats[i] is empty, a cluster will be assigned to
909  // that. To compensate, we decrease the num-clusters required.
910  num_non_empty_clusters_required--;
911  }
912  }
913 
914  KALDI_VLOG(1) << "Number of non-empty clusters in map = " << num_non_empty_clusters_in_map;
915  KALDI_VLOG(1) << "Number of non-empty clusters = " << num_non_empty_clusters;
916 
917  if (num_non_empty_clusters_required > num_non_empty_clusters) {
918  KALDI_WARN << "Cannot get required num-clusters " << num_clusters_required
919  << " as number of non-empty clusters required is larger than "
920  << " number of non-empty clusters: " << num_non_empty_clusters_required
921  << " > " << num_non_empty_clusters;
922  if (num_removed_ptr) *num_removed_ptr = 0;
923  return e_in.Copy();
924  }
925 
926  std::vector<std::vector<int32> > assignments;
928  summed_stats_contiguous,
929  std::numeric_limits<BaseFloat>::infinity(),
930  num_non_empty_clusters_required,
931  NULL, // don't need clusters out.
932  &assignments); // this algorithm is quadratic, so might be quite slow.
933 
934  KALDI_ASSERT(assignments.size() == split_stats.size());
935 
936  int32 num_combined = 0;
937  for (size_t i = 0; i < split_stats.size(); i++) {
938  KALDI_ASSERT(assignments[i].size() == summed_stats_contiguous[i].size());
939  if (assignments[i].size() == 0) continue;
940  size_t num_clust_i = *std::max_element(assignments[i].begin(),
941  assignments[i].end()) + 1;
942  num_combined += summed_stats_contiguous[i].size() - num_clust_i;
943  }
944 
945  KALDI_VLOG(2) << "ClusterBottomUpCompartmentalized combined " << num_combined
946  << " leaves and gave a likelihood change of " << change
947  << ", normalized = " << (change / normalizer)
948  << ", normalizer = " << normalizer;
949  KALDI_ASSERT(change < 0.0001); // should be negative or zero.
950 
951  std::vector<EventMap*> leaf_mapping(max_index + 1, NULL);
952 
953  for (size_t i = 0; i < split_stats.size(); i++) {
954  for (size_t j = 0; j < summed_stats_contiguous[i].size(); j++) {
955  size_t index = indexes[i][j];
956  size_t new_index = indexes[i][assignments[i][j]];
957  // index assigned by clusterig-- map to existing indices in the map,
958  // that we clustered from, so we don't conflict with indices in other parts
959  // of the tree.
960  KALDI_ASSERT(leaf_mapping[index] == NULL || "Error: Cluster seems to have been "
961  "called for different parts of the tree with overlapping sets of "
962  "indices.");
963  leaf_mapping[index] = new ConstantEventMap(new_index);
964  }
965  DeletePointers(&summed_stats_contiguous[i]);
966  }
967 
968  if (num_removed_ptr) *num_removed_ptr = num_combined;
969 
970  EventMap *ans = e_in.Copy(leaf_mapping);
971  DeletePointers(&leaf_mapping);
972  return ans;
973 }
974 
// Builds the initial "stub" EventMap used before decision-tree splitting.
// Each phone set gets either one shared leaf (if share_roots[i]) or one leaf
// per pdf-class; the sets are joined into a binary tree that splits on the
// central-phone key P.  New leaf ids are drawn from *num_leaves_out, which is
// incremented for each leaf created.
EventMap *GetStubMap(int32 P,
                     const std::vector<std::vector<int32> > &phone_sets,
                     const std::vector<int32> &phone2num_pdf_classes,
                     const std::vector<bool> &share_roots,
                     int32 *num_leaves_out) {

  { // Checking inputs: sets must be nonempty, sorted, and pairwise disjoint.
    KALDI_ASSERT(!phone_sets.empty() && share_roots.size() == phone_sets.size());
    std::set<int32> all_phones;
    for (size_t i = 0; i < phone_sets.size(); i++) {
      KALDI_ASSERT(IsSortedAndUniq(phone_sets[i]));
      KALDI_ASSERT(!phone_sets[i].empty());
      for (size_t j = 0; j < phone_sets[i].size(); j++) {
        KALDI_ASSERT(all_phones.count(phone_sets[i][j]) == 0); // check not present.
        all_phones.insert(phone_sets[i][j]);
      }
    }
  }

  // Initially create a single leaf for each phone set.

  // max_set_size / highest_numbered_phone drive the choice between the
  // table-map shortcut and the recursive split below.
  size_t max_set_size = 0;
  int32 highest_numbered_phone = 0;
  for (size_t i = 0; i < phone_sets.size(); i++) {
    max_set_size = std::max(max_set_size, phone_sets[i].size());
    highest_numbered_phone =
        std::max(highest_numbered_phone,
                 * std::max_element(phone_sets[i].begin(), phone_sets[i].end()));
  }

  if (phone_sets.size() == 1) { // there is only one set so the recursion finishes.
    if (share_roots[0]) { // if "shared roots" return a single leaf.
      return new ConstantEventMap( (*num_leaves_out)++ );
    } else { // not sharing roots -> work out the length and return a
             // TableEventMap splitting on length.
      EventAnswerType max_len = 0;
      for (size_t i = 0; i < phone_sets[0].size(); i++) {
        EventAnswerType len;
        EventValueType phone = phone_sets[0][i];
        KALDI_ASSERT(static_cast<size_t>(phone) < phone2num_pdf_classes.size());
        len = phone2num_pdf_classes[phone];
        KALDI_ASSERT(len > 0);
        if (i == 0) max_len = len;
        else {
          // Phones in one (unshared-root) set should normally have equal
          // lengths; tolerate a mismatch by taking the maximum.
          if (len != max_len) {
            KALDI_WARN << "Mismatching lengths within a phone set: " << len
                       << " vs. " << max_len << " [unusual, but not necessarily fatal]. ";
            max_len = std::max(len, max_len);
          }
        }
      }
      // Allocate one leaf for each pdf-class 0 .. max_len-1.
      std::map<EventValueType, EventAnswerType> m;
      for (EventAnswerType p = 0; p < max_len; p++)
        m[p] = (*num_leaves_out)++;
      return new TableEventMap(kPdfClass, // split on hmm-position
                               m);
    }
  } else if (max_set_size == 1
             && static_cast<int32>(phone_sets.size()) <= 2*highest_numbered_phone) {
    // create table map splitting on phone-- more efficient.
    // the part after the && checks that this would not contain a very sparse vector.
    std::map<EventValueType, EventMap*> m;
    for (size_t i = 0; i < phone_sets.size(); i++) {
      // Recurse on each singleton set to build its stub, then index the
      // stubs by phone in a single TableEventMap.
      std::vector<std::vector<int32> > phone_sets_tmp;
      phone_sets_tmp.push_back(phone_sets[i]);
      std::vector<bool> share_roots_tmp;
      share_roots_tmp.push_back(share_roots[i]);
      EventMap *this_stub = GetStubMap(P, phone_sets_tmp, phone2num_pdf_classes,
                                       share_roots_tmp,
                                       num_leaves_out);
      KALDI_ASSERT(m.count(phone_sets_tmp[0][0]) == 0);
      m[phone_sets_tmp[0][0]] = this_stub;
    }
    return new TableEventMap(P, m);
  } else {
    // Do a split. Recurse.
    size_t half_sz = phone_sets.size() / 2;
    std::vector<std::vector<int32> >::const_iterator half_phones =
        phone_sets.begin() + half_sz;
    std::vector<bool>::const_iterator half_share =
        share_roots.begin() + half_sz;
    std::vector<std::vector<int32> > phone_sets_1, phone_sets_2;
    std::vector<bool> share_roots_1, share_roots_2;
    phone_sets_1.insert(phone_sets_1.end(), phone_sets.begin(), half_phones);
    phone_sets_2.insert(phone_sets_2.end(), half_phones, phone_sets.end());
    share_roots_1.insert(share_roots_1.end(), share_roots.begin(), half_share);
    share_roots_2.insert(share_roots_2.end(), half_share, share_roots.end());

    EventMap *map1 = GetStubMap(P, phone_sets_1, phone2num_pdf_classes, share_roots_1, num_leaves_out);
    EventMap *map2 = GetStubMap(P, phone_sets_2, phone2num_pdf_classes, share_roots_2, num_leaves_out);

    // The "yes" set of the split on key P is every phone in the first half.
    // NOTE(review): element type is EventKeyType though these are phone
    // *values*; both are int32 typedefs, so this compiles unchanged.
    std::vector<EventKeyType> all_in_first_set;
    for (size_t i = 0; i < half_sz; i++)
      for (size_t j = 0; j < phone_sets[i].size(); j++)
        all_in_first_set.push_back(phone_sets[i][j]);
    std::sort(all_in_first_set.begin(), all_in_first_set.end());
    KALDI_ASSERT(IsSortedAndUniq(all_in_first_set));
    return new SplitEventMap(P, all_in_first_set, map1, map2);
  }
}
1075 
1076 // convert stats to different (possibly smaller) context-window size.
1077 bool ConvertStats(int32 oldN, int32 oldP, int32 newN, int32 newP,
1078  BuildTreeStatsType *stats) {
1079  bool warned = false;
1080  KALDI_ASSERT(stats != NULL && oldN > 0 && newN > 0 && oldP >= 0
1081  && newP >= 0 && newP < newN && oldP < oldN);
1082  if (newN > oldN) { // can't add unseen context.
1083  KALDI_WARN << "Cannot convert stats to larger context: " << newN
1084  << " > " << oldN;
1085  return false;
1086  }
1087  if (newP > oldP) {
1088  KALDI_WARN << "Cannot convert stats to have more left-context: " << newP
1089  << " > " << oldP;
1090  }
1091  if (newN-newP-1 > oldN-oldP-1) {
1092  KALDI_WARN << "Cannot convert stats to have more right-context: " << (newN-newP-1)
1093  << " > " << (oldN-oldP-1);
1094  }
1095  // if shift < 0, this is by how much we "shift down" key
1096  // values.
1097  int32 shift = newP - oldP; // shift <= 0.
1098 
1099  for (size_t i = 0; i < stats->size(); i++) {
1100  EventType &evec = (*stats)[i].first;
1101  EventType evec_new;
1102  for (size_t j = 0; j < evec.size(); j++) {
1103  EventKeyType key = evec[j].first;
1104  if (key >= 0 && key < oldN) { //
1105  key += shift;
1106  if (key >= 0 && key < newN) // within new context window...
1107  evec_new.push_back(std::make_pair(key, evec[j].second));
1108  } else {
1109  if (key != -1) {
1110  // don't understand this key value but assume for now
1111  // it's something that doesn't interact with the context window.
1112  if (!warned) {
1113  KALDI_WARN << "Stats had keys defined that we cannot interpret";
1114  warned = true;
1115  }
1116  }
1117  evec_new.push_back(evec[j]);
1118  }
1119  }
1120  evec = evec_new; // Assign the modified EventVector with possibly
1121  // deleted keys.
1122  }
1123  return true;
1124 }
1125 
1126 
1127 } // end namespace kaldi
void GetKeysWithQuestions(std::vector< EventKeyType > *keys_out) const
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
BaseFloat SumNormalizer(const BuildTreeStatsType &stats_in)
Sums the normalizer [typically, data-count] over the stats.
EventMap * DoTableSplitMultiple(const EventMap &orig, const std::vector< EventKeyType > &keys, const BuildTreeStatsType &stats, int32 *num_leaves)
DoTableSplitMultiple does a complete split on all the keys, in order from keys[0], keys[1] and so on.
static void GetEventKeys(const EventType &vec, std::vector< EventKeyType > *keys)
void CopySetToVector(const std::set< T > &s, std::vector< T > *v)
Copies the elements of a set to a vector.
Definition: stl-utils.h:86
virtual void Add(const Clusterable &other)=0
Add other stats.
void DeletePointers(std::vector< A *> *v)
Deletes any non-NULL pointers in the vector v, and sets the corresponding entries of v to NULL...
Definition: stl-utils.h:184
std::pair< EventKeyType, EventValueType > MakeEventPair(EventKeyType k, EventValueType v)
Definition: event-map.h:62
bool ConvertStats(int32 oldN, int32 oldP, int32 newN, int32 newP, BuildTreeStatsType *stats)
Converts stats from a given context-window (N) and central-position (P) to a different N and P...
This class defines, for each EventKeyType, a set of initial questions that it tries and also a number...
BaseFloat RefineClusters(const std::vector< Clusterable *> &points, std::vector< Clusterable *> *clusters, std::vector< int32 > *assignments, RefineClustersOptions cfg)
RefineClusters is mainly used internally by other clustering algorithms.
EventMap * SplitDecisionTree(const EventMap &input_map, const BuildTreeStatsType &stats, Questions &q_opts, BaseFloat thresh, int32 max_leaves, int32 *num_leaves, BaseFloat *obj_impr_out, BaseFloat *smallest_split_change_out)
Does a decision-tree split at the leaves of an EventMap.
DecisionTreeSplitter * yes_
Clusterable * SumStats(const BuildTreeStatsType &stats_in)
Sums stats, or returns NULL if stats_in has no non-NULL stats.
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:55
virtual BaseFloat Objf() const =0
Return the objective function associated with the stats [assuming ML estimation]. ...
BaseFloat SumClusterableNormalizer(const std::vector< Clusterable *> &vec)
Returns the total normalizer (usually count) of the cluster (pointers may be NULL).
std::string EventTypeToString(const EventType &evec)
Definition: event-map.cc:253
void SplitStatsByMap(const BuildTreeStatsType &stats, const EventMap &e, std::vector< BuildTreeStatsType > *stats_out)
Splits stats according to the EventMap, indexing them at output by the leaf type. ...
EventMap * ShareEventMapLeaves(const EventMap &e_in, EventKeyType key, std::vector< std::vector< EventValueType > > &values, int32 *num_leaves)
ShareEventMapLeaves performs a quite specific function that allows us to generate trees where...
void FindAllKeys(const BuildTreeStatsType &stats, AllKeysType keys_type, std::vector< EventKeyType > *keys_out)
FindAllKeys puts in *keys the (sorted, unique) list of all key identities in the stats.
const QuestionsForKey & GetQuestionsOf(EventKeyType key) const
static int32 ClusterEventMapRestrictedHelper(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, std::vector< EventKeyType > keys, std::vector< EventMap *> *leaf_mapping)
kaldi::int32 int32
BaseFloat FindBestSplitForKey(const BuildTreeStatsType &stats, const Questions &q_opts, EventKeyType key, std::vector< EventValueType > *yes_set_out)
FindBestSplitForKey is a function used in DoDecisionTreeSplit.
BaseFloat ClusterBottomUpCompartmentalized(const std::vector< std::vector< Clusterable *> > &points, BaseFloat thresh, int32 min_clust, std::vector< std::vector< Clusterable *> > *clusters_out, std::vector< std::vector< int32 > > *assignments_out)
This is a bottom-up clustering where the points are pre-clustered in a set of compartments, such that only points in the same compartment are clustered together.
Kaldi fatal runtime error exception.
Definition: kaldi-error.h:89
void SortAndUniq(std::vector< T > *vec)
Sorts and uniq's (removes duplicates from) a vector.
Definition: stl-utils.h:39
std::vector< EventValueType > yes_set_
void SplitStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key, std::vector< BuildTreeStatsType > *stats_out)
SplitStatsByKey splits stats up according to the value of a particular key, which must be always defi...
virtual bool Map(const EventType &event, EventAnswerType *ans) const =0
virtual Clusterable * Copy() const =0
Return a copy of this object.
bool PossibleValues(EventKeyType key, const BuildTreeStatsType &stats, std::vector< EventValueType > *ans)
Convenience function e.g.
void DeleteBuildTreeStats(BuildTreeStatsType *stats)
This frees the Clusterable* pointers in "stats", where non-NULL, and sets them to NULL...
void ReadBuildTreeStats(std::istream &is, bool binary, const Clusterable &example, BuildTreeStatsType *stats)
Reads BuildTreeStats object.
static const EventKeyType kPdfClass
Definition: context-dep.h:39
void WriteEventType(std::ostream &os, bool binary, const EventType &evec)
Definition: event-map.cc:228
virtual BaseFloat Distance(const Clusterable &other) const
Return the objective function decrease from merging the two clusters, negated to be a positive number...
EventMap * GetToLengthMap(const BuildTreeStatsType &stats, int32 P, const std::vector< EventValueType > *phones, int32 default_length)
const size_t count
AllKeysType
Typedef used when we get "all keys" from a set of stats– used in specifying which kinds of questions...
std::vector< std::pair< EventKeyType, EventValueType > > EventType
Definition: event-map.h:58
void DoSplitInternal(int32 *next_leaf)
void EnsureClusterableVectorNotNull(std::vector< Clusterable *> *stats)
Fills in any (NULL) holes in "stats" vector, with empty stats, because certain algorithms require non...
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
Definition: io-funcs.cc:191
BaseFloat ClusterBottomUp(const std::vector< Clusterable *> &points, BaseFloat max_merge_thresh, int32 min_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out)
A bottom-up clustering algorithm.
int32 EventKeyType
Things of type EventKeyType can take any value.
Definition: event-map.h:45
void SumStatsVec(const std::vector< BuildTreeStatsType > &stats_in, std::vector< Clusterable *> *stats_out)
Sum a vector of stats.
QuestionsForKey is a class used to define the questions for a key, and also options that allow us to ...
std::vector< std::vector< EventValueType > > initial_questions
int ClusterEventMapGetMapping(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, std::vector< EventMap *> *mapping)
"ClusterEventMapGetMapping" clusters the leaves of the EventMap, with "thresh" a delta-likelihood thr...
void AddToClusters(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, std::vector< Clusterable *> *clusters)
Given stats and a vector "assignments" of the same size (that maps to cluster indices), sums the stats up into "clusters." It will add to any stats already present in "clusters" (although typically "clusters" will be empty when called), and it will extend with NULL pointers for any unseen indices.
void ReadEventType(std::istream &is, bool binary, EventType *evec)
Definition: event-map.cc:239
#define KALDI_ERR
Definition: kaldi-error.h:147
RefineClustersOptions refine_opts
void AddToClustersOptimized(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, const Clusterable &total, std::vector< Clusterable *> *clusters)
AddToClustersOptimized does the same as AddToClusters (it sums up the stats within each cluster...
#define KALDI_WARN
Definition: kaldi-error.h:150
EventMap * ClusterEventMapRestrictedByKeys(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, const std::vector< EventKeyType > &keys, int32 *num_removed)
This is as ClusterEventMap, but first splits the stats on the keys specified in "keys" (e...
DecisionTreeSplitter * no_
EventMap * RenumberEventMap(const EventMap &e_in, int32 *num_leaves)
RenumberEventMap [intended to be used after calling ClusterEventMap] renumbers an EventMap so its lea...
EventMap * MapEventMapLeaves(const EventMap &e_in, const std::vector< int32 > &mapping_in)
This function remaps the event-map leaves using this mapping, indexed by the number at leaf...
virtual EventMap * Copy(const std::vector< EventMap *> &new_leaves) const =0
BaseFloat ObjfGivenMap(const BuildTreeStatsType &stats_in, const EventMap &e)
Cluster the stats given the event map return the total objf given those clusters. ...
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
Definition: io-funcs.cc:134
EventMap * GetStubMap(int32 P, const std::vector< std::vector< int32 > > &phone_sets, const std::vector< int32 > &phone2num_pdf_classes, const std::vector< bool > &share_roots, int32 *num_leaves_out)
GetStubMap is used in tree-building functions to get the initial to-states map, before the decision-t...
EventMap * ClusterEventMapToNClustersRestrictedByMap(const EventMap &e_in, const BuildTreeStatsType &stats, int32 num_clusters_required, const EventMap &e_restrict, int32 *num_removed_ptr)
This version of ClusterEventMapRestrictedByMap clusters to get a specific number of clusters as speci...
virtual BaseFloat Normalizer() const =0
Return the normalizer (typically, count) associated with the stats.
DecisionTreeSplitter(EventAnswerType leaf, const BuildTreeStatsType &stats, const Questions &q_opts)
static bool Lookup(const EventType &event, EventKeyType key, EventValueType *ans)
Definition: event-map.cc:290
A class that is capable of representing a generic mapping from EventType (which is a vector of (key...
Definition: event-map.h:86
bool HasQuestionsForKey(EventKeyType key) const
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
BaseFloat SumObjf(const BuildTreeStatsType &stats_in)
Sums the objective function over the stats.
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:34
BaseFloat SumClusterableObjf(const std::vector< Clusterable *> &vec)
Returns the total objective function after adding up all the statistics in the vector (pointers may b...
EventMap * ClusterEventMap(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, int32 *num_removed_ptr)
This is as ClusterEventMapGetMapping but a more convenient interface that exposes less of the interna...
BaseFloat ComputeInitialSplit(const std::vector< Clusterable *> &summed_stats, const Questions &q_opts, EventKeyType key, std::vector< EventValueType > *yes_set)
int32 EventAnswerType
As far as the event-map code itself is concerned, things of type EventAnswerType may take any value e...
Definition: event-map.h:56
void FilterStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key, std::vector< EventValueType > &values, bool include_if_present, BuildTreeStatsType *stats_out)
FilterStatsByKey filters the stats according the value of a specified key.
int32 EventValueType
Given current code, things of type EventValueType should generally be nonnegative and in a reasonably...
Definition: event-map.h:51
std::vector< std::pair< EventType, Clusterable * > > BuildTreeStatsType
virtual Clusterable * ReadNew(std::istream &os, bool binary) const =0
Read data from a stream and return the corresponding object (const function; it's a class member beca...
virtual void MultiMap(const EventType &event, std::vector< EventAnswerType > *ans) const =0
EventMap * ClusterEventMapRestrictedByMap(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, const EventMap &e_restrict, int32 *num_removed_ptr)
This version of ClusterEventMapRestricted restricts the clustering to only allow things that "e_restr...
bool IsSortedAndUniq(const std::vector< T > &vec)
Returns true if the vector is sorted and contains each element only once.
Definition: stl-utils.h:63
#define KALDI_LOG
Definition: kaldi-error.h:153
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
Definition: kaldi-math.h:265
Clusterable * SumClusterable(const std::vector< Clusterable *> &vec)
Sums stats (ptrs may be NULL). Returns NULL if no non-NULL stats present.
void DoSplit(int32 *next_leaf)
EventMap * DoTableSplit(const EventMap &orig, EventKeyType key, const BuildTreeStatsType &stats, int32 *num_leaves)
DoTableSplit does a complete split on this key (e.g.
void WriteBuildTreeStats(std::ostream &os, bool binary, const BuildTreeStatsType &stats)
Writes BuildTreeStats object. This works even if pointers are NULL.