build-tree.cc
Go to the documentation of this file.
1 // tree/build-tree.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include <set>
21 #include <queue>
22 #include "util/stl-utils.h"
23 #include "tree/build-tree-utils.h"
25 #include "tree/build-tree.h"
26 
27 namespace kaldi {
28 
29 
30 void GenRandStats(int32 dim, int32 num_stats, int32 N, int32 P,
31  const std::vector<int32> &phone_ids,
32  const std::vector<int32> &phone2hmm_length,
33  const std::vector<bool> &is_ctx_dep,
34  bool ensure_all_phones_covered,
35  BuildTreeStatsType *stats_out) {
36 
37  KALDI_ASSERT(dim > 0);
38  KALDI_ASSERT(num_stats > 0);
39  KALDI_ASSERT(N > 0);
40  KALDI_ASSERT(P < N);
41  KALDI_ASSERT(phone_ids.size() != 0);
42  KALDI_ASSERT(stats_out != NULL && stats_out->empty());
43  int32 max_phone = *std::max_element(phone_ids.begin(), phone_ids.end());
44  KALDI_ASSERT(phone2hmm_length.size() >= static_cast<size_t>(1 + max_phone));
45  KALDI_ASSERT(is_ctx_dep.size() >= static_cast<size_t>(1 + max_phone));
46 
47  // Make sure phone id's distinct.
48  {
49  std::vector<int32> tmp(phone_ids);
50  SortAndUniq(&tmp);
51  KALDI_ASSERT(tmp.size() == phone_ids.size());
52  }
53  size_t num_phones = phone_ids.size();
54 
55  // Decide on an underlying "mean" for phones...
56  Matrix<BaseFloat> phone_vecs(max_phone+1, dim);
57  for (int32 i = 0;i < max_phone+1;i++)
58  for (int32 j = 0;j < dim;j++) phone_vecs(i, j) = RandGauss() * (2.0 / (j+1));
59 
60 
61  std::map<EventType, Clusterable*> stats_tmp;
62 
63  std::vector<bool> covered(1 + max_phone, false);
64 
65  bool all_covered = false;
66  for (int32 i = 0;i < num_stats || (ensure_all_phones_covered && !all_covered);i++) {
67  // decide randomly on a phone-in-context.
68  std::vector<int32> phone_vec(N);
69  for (size_t i = 0;i < (size_t)N;i++) phone_vec[i] = phone_ids[(Rand() % num_phones)];
70 
71  int32 hmm_length = phone2hmm_length[phone_vec[P]];
72  KALDI_ASSERT(hmm_length > 0);
73  covered[phone_vec[P]] = true;
74 
75  // For each position [in the central phone]...
76  for (int32 j = 0; j < hmm_length; j++) {
77  // create event vector.
78  EventType event_vec;
79  event_vec.push_back(std::make_pair(kPdfClass, (EventValueType)j)); // record the position.
80  for (size_t pos = 0; pos < (size_t)N; pos++) {
81  if (pos == (size_t)(P) || is_ctx_dep[phone_vec[P]])
82  event_vec.push_back(std::make_pair((EventKeyType)pos, (EventValueType)phone_vec[pos]));
83  // The if-statement above ensures we do not record the context of "context-free"
84  // phone (e.g., silence).
85  }
86 
87  Vector<BaseFloat> mean(dim); // mean of Gaussian.
88  GaussClusterable *this_stats = new GaussClusterable(dim, 0.1); // 0.1 is var floor.
89  { // compute stats; this block attempts to simulate the process of "real" data
90  // collection and does not correspond to any code you would write in a real
91  // scenario.
92  Vector<BaseFloat> weights(N); // weight of each component.
93  for (int32 k = 0; k < N; k++) {
94  BaseFloat k_pos = (N - 0.5 - k) / N; // between 0 and 1, less for lower k...
95  BaseFloat j_pos = (hmm_length - 0.5 - j) / hmm_length;
96  // j_pos is between 0 and 1, less for lower j.
97 
98  BaseFloat weight = j_pos*k_pos + (1.0-j_pos)*(1.0-k_pos);
99  // if j_pos close to zero, gives larger weight to k_pos close
100  // to zero.
101  if (k == P) weight += 1.0;
102  weights(k) = weight;
103  }
104  KALDI_ASSERT(weights.Sum() != 0);
105  weights.Scale(1.0 / weights.Sum());
106  for (int32 k = 0; k < N; k++)
107  mean.AddVec(weights(k), phone_vecs.Row(phone_vec[k]));
109  if (Rand() % 2 == 0) count = 1000.0 * RandUniform();
110  else count = 100.0 * RandUniform();
111 
112  int32 num_samples = 10;
113  for (size_t p = 0;p < (size_t)num_samples; p++) {
114  Vector<BaseFloat> sample(mean); // copy mean.
115  for (size_t d = 0; d < (size_t)dim; d++) sample(d) += RandGauss(); // unit var.
116  this_stats->AddStats(sample, count / num_samples);
117  }
118  }
119 
120  if (stats_tmp.count(event_vec) != 0) {
121  stats_tmp[event_vec]->Add(*this_stats);
122  delete this_stats;
123  } else {
124  stats_tmp[event_vec] = this_stats;
125  }
126  }
127  all_covered = true;
128  for (size_t i = 0; i< num_phones; i++) if (!covered[phone_ids[i]]) all_covered = false;
129  }
130  CopyMapToVector(stats_tmp, stats_out);
131  KALDI_ASSERT(stats_out->size() > 0);
132 }
133 
134 
135 
137  const std::vector<std::vector<int32> > &phone_sets,
138  const std::vector<int32> &phone2num_pdf_classes,
139  const std::vector<bool> &share_roots,
140  const std::vector<bool> &do_split,
141  const BuildTreeStatsType &stats,
142  BaseFloat thresh,
143  int32 max_leaves,
144  BaseFloat cluster_thresh, // typically == thresh. If negative, use smallest split.
145  int32 P,
146  bool round_num_leaves) {
147  KALDI_ASSERT(thresh > 0 || max_leaves > 0);
148  KALDI_ASSERT(stats.size() != 0);
149  KALDI_ASSERT(!phone_sets.empty()
150  && phone_sets.size() == share_roots.size()
151  && do_split.size() == phone_sets.size());
152 
153  // the inputs will be further checked in GetStubMap.
154  int32 num_leaves = 0; // allocator for leaves.
155 
156  EventMap *tree_stub = GetStubMap(P,
157  phone_sets,
158  phone2num_pdf_classes,
159  share_roots,
160  &num_leaves);
161  KALDI_LOG << "BuildTree: before building trees, map has "<< num_leaves << " leaves.";
162 
163 
164  BaseFloat impr;
165  BaseFloat smallest_split = 1.0e+10;
166 
167 
168  std::vector<int32> nonsplit_phones;
169  for (size_t i = 0; i < phone_sets.size(); i++)
170  if (!do_split[i])
171  nonsplit_phones.insert(nonsplit_phones.end(), phone_sets[i].begin(), phone_sets[i].end());
172 
173  std::sort(nonsplit_phones.begin(), nonsplit_phones.end());
174 
175  KALDI_ASSERT(IsSortedAndUniq(nonsplit_phones));
176  BuildTreeStatsType filtered_stats;
177  FilterStatsByKey(stats, P, nonsplit_phones, false, // retain only those not
178  // in "nonsplit_phones"
179  &filtered_stats);
180 
181  EventMap *tree_split = SplitDecisionTree(*tree_stub,
182  filtered_stats,
183  qopts, thresh, max_leaves,
184  &num_leaves, &impr, &smallest_split);
185 
186  if (cluster_thresh < 0.0) {
187  KALDI_LOG << "Setting clustering threshold to smallest split " << smallest_split;
188  cluster_thresh = smallest_split;
189  }
190 
191  BaseFloat normalizer = SumNormalizer(stats),
192  impr_normalized = impr / normalizer,
193  normalizer_filt = SumNormalizer(filtered_stats),
194  impr_normalized_filt = impr / normalizer_filt;
195 
196  KALDI_VLOG(1) << "After decision tree split, num-leaves = " << num_leaves
197  << ", like-impr = " << impr_normalized << " per frame over "
198  << normalizer << " frames.";
199 
200  KALDI_VLOG(1) << "Including just phones that were split, improvement is "
201  << impr_normalized_filt << " per frame over "
202  << normalizer_filt << " frames.";
203 
204 
205  if (cluster_thresh != 0.0) { // Cluster the tree.
206  BaseFloat objf_before_cluster = ObjfGivenMap(stats, *tree_split);
207 
208  // Now do the clustering.
209  int32 num_removed = 0;
210  EventMap *tree_clustered = ClusterEventMapRestrictedByMap(*tree_split,
211  stats,
212  cluster_thresh,
213  *tree_stub,
214  &num_removed);
215  KALDI_LOG << "BuildTree: removed "<< num_removed << " leaves.";
216 
217  int32 num_leaves_out = 0;
218  EventMap *tree_renumbered;
219  if (round_num_leaves) {
220  // Round the number of leaves to a multiple of 8 by clustering the leaves
221  // and merging them within each cluster.
222  int32 num_leaves_required = ((num_leaves - num_removed) / 8) * 8;
223  std::vector<EventMap*> leaf_mapping;
224 
225  int32 num_removed_in_rounding = 0;
227  *tree_clustered, stats, num_leaves_required, *tree_stub,
228  &num_removed_in_rounding);
229 
230  if (num_removed_in_rounding > 0)
231  KALDI_LOG << "BuildTree: Rounded num leaves to multiple of 8 by"
232  << " removing " << num_removed_in_rounding << " leaves.";
233 
234  if (num_leaves - num_removed - num_removed_in_rounding !=
235  num_leaves_required) {
236  KALDI_WARN << "Did not get expected number of leaves: "
237  << num_leaves << " - " << num_removed << " - "
238  << num_removed_in_rounding
239  << " != " << num_leaves_required;
240  }
241 
242  tree_renumbered = RenumberEventMap(*tree_rounded, &num_leaves_out);
243 
244  if (num_leaves_out != num_leaves_required) {
245  KALDI_WARN << "num-leaves-out != num-leaves-required: "
246  << num_leaves_out << " != " << num_leaves_required;
247  }
248 
249  delete tree_rounded;
250  } else {
251  tree_renumbered = RenumberEventMap(*tree_clustered, &num_leaves_out);
252  }
253 
254  BaseFloat objf_after_cluster = ObjfGivenMap(stats, *tree_renumbered);
255 
256  KALDI_VLOG(1) << "Objf change due to clustering "
257  << ((objf_after_cluster-objf_before_cluster) / normalizer)
258  << " per frame.";
259  KALDI_VLOG(1) << "Normalizing over only split phones, this is: "
260  << ((objf_after_cluster-objf_before_cluster) / normalizer_filt)
261  << " per frame.";
262  KALDI_VLOG(1) << "Num-leaves is now "<< num_leaves_out;
263 
264  delete tree_clustered;
265  delete tree_split;
266  delete tree_stub;
267  return tree_renumbered;
268  } else {
269  if (round_num_leaves) {
270  // Round the number of leaves to a multiple of 8 by clustering the leaves
271  // and merging them within each cluster.
272  // The final number of leaves will be 'num_leaves_required'.
273  BaseFloat objf_before_cluster = ObjfGivenMap(stats, *tree_split);
274 
275  int32 num_leaves_required = (num_leaves / 8) * 8;
276  std::vector<EventMap*> leaf_mapping;
277 
278  int32 num_removed_in_rounding = 0;
280  *tree_split, stats, num_leaves_required, *tree_stub,
281  &num_removed_in_rounding);
282 
283  if (num_removed_in_rounding > 0)
284  KALDI_LOG << "BuildTree: Rounded num leaves to multiple of 8 by"
285  << " removing " << num_removed_in_rounding << " leaves.";
286 
287  KALDI_ASSERT(num_removed_in_rounding < 8);
288 
289  int32 num_leaves_out;
290  EventMap* tree_renumbered = RenumberEventMap(*tree_rounded, &num_leaves_out);
291 
292  BaseFloat objf_after_cluster = ObjfGivenMap(stats, *tree_renumbered);
293 
294  KALDI_VLOG(1) << "Objf change due to clustering "
295  << ((objf_after_cluster-objf_before_cluster) / normalizer)
296  << " per frame.";
297  KALDI_VLOG(1) << "Normalizing over only split phones, this is: "
298  << ((objf_after_cluster-objf_before_cluster) / normalizer_filt)
299  << " per frame.";
300  KALDI_VLOG(1) << "Num-leaves is now "<< num_leaves_out;
301 
302  delete tree_stub;
303  delete tree_rounded;
304  return tree_renumbered;
305  }
306 
307  delete tree_stub;
308  return tree_split;
309  }
310 }
311 
312 
313 // This function, called from BuildTreeTwoLevel, computes the mapping from the
314 // leaves of the big tree to the leaves of the small tree. It does this by
315 // working out which stats correspond to which leaf of the big tree, then
316 // mapping those stats to the leaves of the small tree and seeing where they go.
317 // It only works of all leaves of the big tree have stats-- but they should have
318 // stats, or there would have been an error in tree building.
319 
320 static void ComputeTreeMapping(const EventMap &small_tree,
321  const EventMap &big_tree,
322  const BuildTreeStatsType &stats,
323  std::vector<int32> *leaf_map) {
324  std::vector<BuildTreeStatsType> split_stats_small; // stats split by small tree
325  int32 num_leaves_big = big_tree.MaxResult() + 1,
326  num_leaves_small = small_tree.MaxResult() + 1;
327  SplitStatsByMap(stats, small_tree, &split_stats_small);
328  KALDI_ASSERT(static_cast<int32>(split_stats_small.size()) <=
329  num_leaves_small);
330  leaf_map->clear();
331  leaf_map->resize(num_leaves_big, -1); // fill with -1.
332 
333  std::vector<int32> small_leaves_unseen; // a list of small leaves that had no stats..
334  // this is used as a workaround for when there are no stats at leaves...
335  // it's really an error condition and it will cause errors later (e.g. when
336  // you initialize your model), but at this point we will try to handle it
337  // gracefully.
338 
339  for (int32 i = 0; i < num_leaves_small; i++) {
340  if (static_cast<size_t>(i) >= split_stats_small.size() ||
341  split_stats_small[i].empty()) {
342  KALDI_WARN << "No stats mapping to " << i << " in small tree. "
343  << "Continuing but this is a serious error.";
344  small_leaves_unseen.push_back(i);
345  } else {
346  for (size_t j = 0; j < split_stats_small[i].size(); j++) {
347  int32 leaf = 0; // = 0 to keep compiler happy. Leaf in big tree.
348  bool ok = big_tree.Map(split_stats_small[i][j].first, &leaf);
349  if (!ok)
350  KALDI_ERR << "Could not map stats with big tree: probable code error.";
351  if (leaf < 0 || leaf >= num_leaves_big)
352  KALDI_ERR << "Leaf out of range: " << leaf << " vs. " << num_leaves_big;
353  if ((*leaf_map)[leaf] != -1 && (*leaf_map)[leaf] != i)
354  KALDI_ERR << "Inconsistent mapping for big tree: "
355  << i << " vs. " << (*leaf_map)[leaf];
356  (*leaf_map)[leaf] = i;
357  }
358  }
359  }
360  // Now make sure that all leaves in the big tree have a leaf in the small tree
361  // assigned to them. If not we try to clean up... this should never normally
362  // happen and if it does it's due to trying to assign tree roots to unseen phones,
363  // which will anyway cause an error in a later stage of system building.
364  for (int32 leaf = 0; leaf < num_leaves_big; leaf++) {
365  int32 small_leaf = (*leaf_map)[leaf];
366  if (small_leaf == -1) {
367  KALDI_WARN << "In ComputeTreeMapping, could not get mapping from leaf "
368  << leaf;
369  if (!small_leaves_unseen.empty()) {
370  small_leaf = small_leaves_unseen.back();
371  KALDI_WARN << "Assigning it to unseen small-tree leaf " << small_leaf;
372  small_leaves_unseen.pop_back();
373  (*leaf_map)[leaf] = small_leaf;
374  } else {
375  KALDI_WARN << "Could not find any unseen small-tree leaf to assign "
376  << "it to. Making it zero, but this is bad. ";
377  (*leaf_map)[leaf] = 0;
378  }
379  } else if (small_leaf < 0 || small_leaf >= num_leaves_small)
380  KALDI_ERR << "Leaf in leaf mapping out of range: for big-map leaf "
381  << leaf << ", mapped to " << small_leaf << ", vs. "
382  << num_leaves_small;
383  }
384 }
385 
386 
388  const std::vector<std::vector<int32> > &phone_sets,
389  const std::vector<int32> &phone2num_pdf_classes,
390  const std::vector<bool> &share_roots,
391  const std::vector<bool> &do_split,
392  const BuildTreeStatsType &stats,
393  int32 max_leaves_first,
394  int32 max_leaves_second,
395  bool cluster_leaves,
396  int32 P,
397  std::vector<int32> *leaf_map) {
398 
399  KALDI_LOG << "****BuildTreeTwoLevel: building first level tree";
400  EventMap *first_level_tree = BuildTree(qopts, phone_sets,
401  phone2num_pdf_classes,
402  share_roots, do_split, stats, 0.0,
403  max_leaves_first, 0.0, P);
404  KALDI_ASSERT(first_level_tree != NULL);
405  KALDI_LOG << "****BuildTreeTwoLevel: done building first level tree";
406 
407 
408  std::vector<int32> nonsplit_phones;
409  for (size_t i = 0; i < phone_sets.size(); i++)
410  if (!do_split[i])
411  nonsplit_phones.insert(nonsplit_phones.end(), phone_sets[i].begin(), phone_sets[i].end());
412  std::sort(nonsplit_phones.begin(), nonsplit_phones.end());
413 
414  KALDI_ASSERT(IsSortedAndUniq(nonsplit_phones));
415  BuildTreeStatsType filtered_stats;
416  FilterStatsByKey(stats, P, nonsplit_phones, false, // retain only those not
417  // in "nonsplit_phones"
418  &filtered_stats);
419 
420  int32 num_leaves = first_level_tree->MaxResult() + 1,
421  old_num_leaves = num_leaves;
422 
423  BaseFloat smallest_split = 0.0;
424 
425  BaseFloat impr;
426  EventMap *tree = SplitDecisionTree(*first_level_tree,
427  filtered_stats,
428  qopts, 0.0, max_leaves_second,
429  &num_leaves, &impr, &smallest_split);
430 
431  KALDI_LOG << "Building second-level tree: increased #leaves from "
432  << old_num_leaves << " to " << num_leaves << ", smallest split was "
433  << smallest_split;
434 
435  BaseFloat normalizer = SumNormalizer(stats),
436  impr_normalized = impr / normalizer;
437 
438  KALDI_LOG << "After second decision tree split, num-leaves = "
439  << num_leaves << ", like-impr = " << impr_normalized
440  << " per frame over " << normalizer << " frames.";
441 
442  if (cluster_leaves) { // Cluster the leaves of the tree.
443  KALDI_LOG << "Clustering leaves of larger tree.";
444  BaseFloat objf_before_cluster = ObjfGivenMap(stats, *tree);
445 
446  // Now do the clustering.
447  int32 num_removed = 0;
448  EventMap *tree_clustered = ClusterEventMapRestrictedByMap(*tree,
449  stats,
450  smallest_split,
451  *first_level_tree,
452  &num_removed);
453  KALDI_LOG << "BuildTreeTwoLevel: removed " << num_removed << " leaves.";
454 
455  int32 num_leaves = 0;
456  EventMap *tree_renumbered = RenumberEventMap(*tree_clustered, &num_leaves);
457 
458  BaseFloat objf_after_cluster = ObjfGivenMap(stats, *tree_renumbered);
459 
460  KALDI_LOG << "Objf change due to clustering "
461  << ((objf_after_cluster-objf_before_cluster) / SumNormalizer(stats))
462  << " per frame.";
463  KALDI_LOG << "Num-leaves now "<< num_leaves;
464  delete tree;
465  delete tree_clustered;
466  tree = tree_renumbered;
467  }
468 
469  ComputeTreeMapping(*first_level_tree,
470  *tree,
471  stats,
472  leaf_map);
473 
474  { // Next do another renumbering of "tree" so that leaves with the
475  // same value in "first_level_tree" are contiguous.
476  std::vector<std::pair<int32, int32> > leaf_pairs;
477  for (size_t i = 0; i < leaf_map->size(); i++)
478  leaf_pairs.push_back(std::make_pair((*leaf_map)[i], static_cast<int32>(i)));
479  // pair of (small-tree-number, big-tree-number).
480  std::sort(leaf_pairs.begin(), leaf_pairs.end());
481  std::vector<int32> old2new_map(leaf_map->size()),
482  new_leaf_map(leaf_map->size());
483  // Note: old2new_map maps from old indices to new indices, in the
484  // renumbering; new_leaf_map maps from 2nd-level tree indices to
485  // 1st-level tree indices.
486  for (size_t i = 0; i < leaf_pairs.size(); i++) {
487  int32 old_number = leaf_pairs[i].second, new_number = i;
488  old2new_map[old_number] = new_number;
489  new_leaf_map[new_number] = (*leaf_map)[old_number];
490  }
491  *leaf_map = new_leaf_map;
492  EventMap *renumbered_tree = MapEventMapLeaves(*tree, old2new_map);
493  delete tree;
494  tree = renumbered_tree;
495  }
496 
497  delete first_level_tree;
498  return tree;
499 }
500 
501 
502 void ReadSymbolTableAsIntegers(std::string filename,
503  bool include_eps,
504  std::vector<int32> *syms) {
505  std::ifstream is(filename.c_str());
506  if (!is.good())
507  KALDI_ERR << "ReadSymbolTableAsIntegers: could not open symbol table "<<filename;
508  std::string line;
509  KALDI_ASSERT(syms != NULL);
510  syms->clear();
511  while (getline(is, line)) {
512  std::string sym;
513  int64 index;
514  std::istringstream ss(line);
515  ss >> sym >> index >> std::ws;
516  if (ss.fail() || !ss.eof()) {
517  KALDI_ERR << "Bad line in symbol table: "<< line<<", file is: "<<filename;
518  }
519  if (include_eps || index != 0)
520  syms->push_back(index);
521  if (index == 0 && sym != "<eps>") {
522  KALDI_WARN << "Symbol zero is "<<sym<<", traditionally <eps> is used. Make sure this is not a \"real\" symbol.";
523  }
524  }
525  size_t sz = syms->size();
526  SortAndUniq(syms);
527  if (syms->size() != sz)
528  KALDI_ERR << "Symbol table "<<filename<<" seems to contain duplicate symbols.";
529 }
530 
531 // Used in ObtainSetsOfPhones, this function removes duplicates from a vector of vectors,
532 // while otherwise preserving the order. It also prints how many it removed.
533 static void RemoveDuplicates(std::vector<std::vector<int32 > > *vecs) {
534  unordered_set<std::vector<int32>, VectorHasher<int32> > vec_set;
535  std::vector<std::vector<int32 > > new_vecs;
536  new_vecs.reserve(vecs->size());
537  int32 num_not_inserted = 0;
538  for (std::vector<std::vector<int32 > >::const_iterator iter = vecs->begin(),
539  end = vecs->end(); iter != end; iter++) {
540  if (vec_set.insert(*iter).second) { // if this vector was not already in
541  // the set...
542  new_vecs.push_back(*iter);
543  } else {
544  num_not_inserted++;
545  }
546  }
547  KALDI_VLOG(2) << "Removed " << num_not_inserted
548  << " duplicates from the phone sets.";
549  vecs->swap(new_vecs);
550 }
551 
552 
557 static void ObtainSetsOfPhones(const std::vector<std::vector<int32> > &phone_sets, // the original phone sets, may
558  // just be individual phones.
559  const std::vector<int32> &assignments, // phone-sets->clusters
560  const std::vector<int32> &clust_assignments, // clust->parent
561  int32 num_leaves, // number of clusters present..
562  std::vector<std::vector<int32> > *sets_out) {
563  KALDI_ASSERT(sets_out != NULL);
564  sets_out->clear();
565  std::vector<std::vector<int32> > raw_sets(clust_assignments.size());
566 
567  KALDI_ASSERT(num_leaves < static_cast<int32>(clust_assignments.size()));
568  KALDI_ASSERT(assignments.size() == phone_sets.size());
569  for (size_t i = 0; i < assignments.size(); i++) {
570  int32 clust = assignments[i]; // this is an index into phone_sets.
571  KALDI_ASSERT(clust>=0 && clust < num_leaves);
572  for (size_t j = 0; j < phone_sets[i].size(); j++) {
573  // and not just a hole.
574  raw_sets[clust].push_back(phone_sets[i][j]);
575  }
576  }
577  // for all clusters including the top-level cluster:
578  // [note that the top-level cluster contains all phones, but it may actually
579  // be useful because sometimes we cluster just the non-silence phones, so
580  // the list of all phones is a way of asking about silence in such a way
581  // that epsilon (end-or-begin-of-utterance) gets lumped with silence.
582  for (int32 j = 0; j < static_cast<int32>(clust_assignments.size()); j++) {
583  int32 parent = clust_assignments[j];
584  std::sort(raw_sets[j].begin(), raw_sets[j].end());
585  KALDI_ASSERT(IsSortedAndUniq(raw_sets[j])); // should be no dups.
586  if (parent < static_cast<int32>(clust_assignments.size())-1) { // parent is not out of range [i.e. not the top one]...
587  // add all j's phones to its parent.
588  raw_sets[parent].insert(raw_sets[parent].end(),
589  raw_sets[j].begin(),
590  raw_sets[j].end());
591  }
592  }
593  // Reverse the 'raw_sets' so the most important things (top-level questions)
594  // appear at the front... this will end up mattering because of the
595  // --truncate-leftmost-questions option to compile-questions.
596  std::reverse(raw_sets.begin(), raw_sets.end());
597 
598  // Now add the original sets-of-phones to the raw sets, to make sure all of
599  // these are present. (The main reason they might be absent is if the stats
600  // are empty, but we want to ensure they are all there regardless). note these
601  // will be actual singleton sets if the sets-of-phones each contain just one
602  // phone, which in some sense is the normal situation.
603  for (size_t i = 0; i < phone_sets.size(); i++) {
604  raw_sets.push_back(phone_sets[i]);
605  }
606  // Remove duplicate sets from "raw_sets".
607  RemoveDuplicates(&raw_sets);
608  sets_out->reserve(raw_sets.size());
609  for (size_t i = 0; i < raw_sets.size(); i++)
610  if (! raw_sets[i].empty()) // if the empty set is present, remove it...
611  sets_out->push_back(raw_sets[i]);
612 }
613 
614 
616  const std::vector<std::vector<int32> > &phone_sets_in,
617  const std::vector<int32> &all_pdf_classes_in,
618  int32 P,
619  std::vector<std::vector<int32> > *questions_out) {
620  std::vector<std::vector<int32> > phone_sets(phone_sets_in);
621  std::vector<int32> phones;
622  for (size_t i = 0; i < phone_sets.size() ;i++) {
623  std::sort(phone_sets[i].begin(), phone_sets[i].end());
624  if (phone_sets[i].empty())
625  KALDI_ERR << "Empty phone set in AutomaticallyObtainQuestions";
626  if (!IsSortedAndUniq(phone_sets[i]))
627  KALDI_ERR << "Phone set in AutomaticallyObtainQuestions contains duplicate phones";
628  for (size_t j = 0; j < phone_sets[i].size(); j++)
629  phones.push_back(phone_sets[i][j]);
630  }
631  std::sort(phones.begin(), phones.end());
632  if (!IsSortedAndUniq(phones))
633  KALDI_ERR << "Phones are present in more than one phone set.";
634  if (phones.empty())
635  KALDI_ERR << "No phones provided.";
636 
637  std::vector<int32> all_pdf_classes(all_pdf_classes_in);
638  SortAndUniq(&all_pdf_classes);
639  KALDI_ASSERT(!all_pdf_classes.empty());
640 
641  BuildTreeStatsType retained_stats;
642  FilterStatsByKey(stats, kPdfClass, all_pdf_classes,
643  true, // retain only the listed positions
644  &retained_stats);
645 
646  if (retained_stats.size() * 10 < stats.size()) {
647  std::ostringstream ss;
648  for (size_t i = 0; i < all_pdf_classes.size(); i++)
649  ss << all_pdf_classes[i] << ' ';
650  KALDI_WARN << "After filtering the tree statistics to retain only stats where "
651  << "pdf-class is in the set { " << ss.str() << "}, most of your "
652  << "stats disappeared: the size changed from " << stats.size()
653  << " to " << retained_stats.size() << ". You might be using "
654  << "a nonstandard topology but forgot to modify the "
655  << "--pdf-class-list option (it defaults to { 1 } which is "
656  << "the central state in a 3-state left-to-right topology)."
657  << " E.g. a 1-state HMM topology would require the option "
658  << "--pdf-class-list=0.";
659  }
660 
661 
662  std::vector<BuildTreeStatsType> split_stats; // split by phone.
663  SplitStatsByKey(retained_stats, P, &split_stats);
664 
665  std::vector<Clusterable*> summed_stats; // summed up by phone.
666  SumStatsVec(split_stats, &summed_stats);
667 
668  int32 max_phone = phones.back();
669  if (static_cast<int32>(summed_stats.size()) < max_phone+1) {
670  // this can happen if the last phone had no data.. if we are using
671  // stress-marked, position-marked phones, this can happen. The later
672  // code will assume that a summed_stats entry exists for all phones.
673  summed_stats.resize(max_phone+1, NULL);
674  }
675 
676  for (int32 i = 0; static_cast<size_t>(i) < summed_stats.size(); i++) { // A check.
677  if (summed_stats[i] != NULL &&
678  !binary_search(phones.begin(), phones.end(), i)) {
679  KALDI_WARN << "Phone "<< i << " is present in stats but is not in phone list [make sure you intended this].";
680  }
681  }
682 
683  EnsureClusterableVectorNotNull(&summed_stats); // make sure no NULL pointers in summed_stats.
684  // will replace them with pointers to empty stats.
685 
686  std::vector<Clusterable*> summed_stats_per_set(phone_sets.size(), NULL); // summed up by set.
687  for (size_t i = 0; i < phone_sets.size(); i++) {
688  const std::vector<int32> &this_set = phone_sets[i];
689  summed_stats_per_set[i] = summed_stats[this_set[0]]->Copy();
690  for (size_t j = 1; j < this_set.size(); j++)
691  summed_stats_per_set[i]->Add(*(summed_stats[this_set[j]]));
692  }
693 
694  int32 num_no_data = 0;
695  for (size_t i = 0; i < summed_stats_per_set.size(); i++) { // A check.
696  if (summed_stats_per_set[i]->Normalizer() == 0.0) {
697  num_no_data++;
698  std::ostringstream ss;
699  ss << "AutomaticallyObtainQuestions: no stats available for phone set: ";
700  for (size_t j = 0; j < phone_sets[i].size(); j++)
701  ss << phone_sets[i][j] << ' ' ;
702  KALDI_WARN << ss.str();
703  }
704  }
705  if (num_no_data + 1 >= summed_stats_per_set.size()) {
706  std::ostringstream ss;
707  for (size_t i = 0; i < all_pdf_classes.size(); i++)
708  ss << all_pdf_classes[i] << ' ';
709  KALDI_WARN << "All or all but one of your classes of phones had no data. "
710  << "Note that we only consider data where pdf-class is in the "
711  << "set ( " << ss.str() << "). If you have an unusual HMM "
712  << "topology this may not be what you want; use the "
713  << "--pdf-class-list option to change this if needed. See "
714  << "also any warnings above.";
715  }
716 
717 
718  TreeClusterOptions topts;
719  topts.kmeans_cfg.num_tries = 10; // This is a slow-but-accurate setting,
720  // we do it this way since there are typically few phones.
721 
722  std::vector<int32> assignments; // assignment of phones to clusters. dim == summed_stats.size().
723  std::vector<int32> clust_assignments; // Parent of each cluster. Dim == #clusters.
724  int32 num_leaves; // number of leaf-level clusters.
725  TreeCluster(summed_stats_per_set,
726  summed_stats_per_set.size(), // max-#clust is all of the points.
727  NULL, // don't need the clusters out.
728  &assignments,
729  &clust_assignments,
730  &num_leaves,
731  topts);
732 
733  // process the information obtained by TreeCluster into the
734  // form we want at output.
735  ObtainSetsOfPhones(phone_sets,
736  assignments,
737  clust_assignments,
738  num_leaves,
739  questions_out);
740 
741  // The memory in summed_stats was newly allocated. [the other algorithms
742  // used here do not allocate].
743  DeletePointers(&summed_stats);
744  DeletePointers(&summed_stats_per_set);
745 }
746 
747 
749  const std::vector<std::vector<int32> > &phone_sets_in,
750  const std::vector<int32> &all_pdf_classes_in,
751  int32 P,
752  int32 num_classes,
753  std::vector<std::vector<int32> > *sets_out) {
754  std::vector<std::vector<int32> > phone_sets(phone_sets_in);
755  std::vector<int32> phones;
756  for (size_t i = 0; i < phone_sets.size() ;i++) {
757  std::sort(phone_sets[i].begin(), phone_sets[i].end());
758  if (phone_sets[i].empty())
759  KALDI_ERR << "Empty phone set in AutomaticallyObtainQuestions";
760  if (!IsSortedAndUniq(phone_sets[i]))
761  KALDI_ERR << "Phone set in AutomaticallyObtainQuestions contains duplicate phones";
762  for (size_t j = 0; j < phone_sets[i].size(); j++)
763  phones.push_back(phone_sets[i][j]);
764  }
765  std::sort(phones.begin(), phones.end());
766  if (!IsSortedAndUniq(phones))
767  KALDI_ERR << "Phones are present in more than one phone set.";
768  if (phones.empty())
769  KALDI_ERR << "No phones provided.";
770 
771  std::vector<int32> all_pdf_classes(all_pdf_classes_in);
772  SortAndUniq(&all_pdf_classes);
773  KALDI_ASSERT(!all_pdf_classes.empty());
774 
775  BuildTreeStatsType retained_stats;
776  FilterStatsByKey(stats, kPdfClass, all_pdf_classes,
777  true, // retain only the listed positions
778  &retained_stats);
779 
780 
781  std::vector<BuildTreeStatsType> split_stats; // split by phone.
782  SplitStatsByKey(retained_stats, P, &split_stats);
783 
784  std::vector<Clusterable*> summed_stats; // summed up by phone.
785  SumStatsVec(split_stats, &summed_stats);
786 
787  int32 max_phone = phones.back();
788  if (static_cast<int32>(summed_stats.size()) < max_phone+1) {
789  // this can happen if the last phone had no data.. if we are using
790  // stress-marked, position-marked phones, this can happen. The later
791  // code will assume that a summed_stats entry exists for all phones.
792  summed_stats.resize(max_phone+1, NULL);
793  }
794 
795  for (int32 i = 0; static_cast<size_t>(i) < summed_stats.size(); i++) {
796  // just a check.
797  if (summed_stats[i] != NULL &&
798  !binary_search(phones.begin(), phones.end(), i)) {
799  KALDI_WARN << "Phone "<< i << " is present in stats but is not in phone list [make sure you intended this].";
800  }
801  }
802 
803  EnsureClusterableVectorNotNull(&summed_stats); // make sure no NULL pointers in summed_stats.
804  // will replace them with pointers to empty stats.
805 
806  std::vector<Clusterable*> summed_stats_per_set(phone_sets.size(), NULL); // summed up by set.
807  for (size_t i = 0; i < phone_sets.size(); i++) {
808  const std::vector<int32> &this_set = phone_sets[i];
809  summed_stats_per_set[i] = summed_stats[this_set[0]]->Copy();
810  for (size_t j = 1; j < this_set.size(); j++)
811  summed_stats_per_set[i]->Add(*(summed_stats[this_set[j]]));
812  }
813 
814  for (size_t i = 0; i < summed_stats_per_set.size(); i++) { // A check.
815  if (summed_stats_per_set[i]->Normalizer() == 0.0) {
816  std::ostringstream ss;
817  ss << "AutomaticallyObtainQuestions: no stats available for phone set: ";
818  for (size_t j = 0; j < phone_sets[i].size(); j++)
819  ss << phone_sets[i][j] << ' ' ;
820  KALDI_WARN << ss.str();
821  }
822  }
823 
824  ClusterKMeansOptions opts; // Just using the default options which are a reasonable
825  // compromise between speed and accuracy.
826 
827  std::vector<int32> assignments;
828  BaseFloat objf_impr = ClusterKMeans(summed_stats_per_set,
829  num_classes,
830  NULL,
831  &assignments,
832  opts);
833 
834  BaseFloat count = SumClusterableNormalizer(summed_stats_per_set);
835 
836  KALDI_LOG << "ClusterKMeans: objf change from clustering [versus single set] is "
837  << (objf_impr/count) << " over " << count << " frames.";
838 
839  sets_out->resize(num_classes);
840  KALDI_ASSERT(assignments.size() == phone_sets.size());
841  for (size_t i = 0; i < assignments.size(); i++) {
842  int32 class_idx = assignments[i];
843  KALDI_ASSERT(static_cast<size_t>(class_idx) < sets_out->size());
844  for (size_t j = 0; j < phone_sets[i].size(); j++)
845  (*sets_out)[class_idx].push_back(phone_sets[i][j]);
846  }
847  for (size_t i = 0; i < sets_out->size(); i++) {
848  std::sort( (*sets_out)[i].begin(), (*sets_out)[i].end() ); // just good
849  // practice to have them sorted as who knows if whatever we need them for
850  // will require sorting...
851  KALDI_ASSERT(IsSortedAndUniq( (*sets_out)[i] ));
852  }
853  DeletePointers(&summed_stats);
854  DeletePointers(&summed_stats_per_set);
855 }
856 
857 void ReadRootsFile(std::istream &is,
858  std::vector<std::vector<int32> > *phone_sets,
859  std::vector<bool> *is_shared_root,
860  std::vector<bool> *is_split_root) {
861  KALDI_ASSERT(phone_sets != NULL && is_shared_root != NULL &&
862  is_split_root != NULL && phone_sets->empty()
863  && is_shared_root->empty() && is_split_root->empty());
864 
865  std::string line;
866  int line_number = 0;
867  while ( ! getline(is, line).fail() ) {
868  line_number++;
869  std::istringstream ss(line);
870  std::string shared;
871  ss >> shared;
872  if (ss.fail() && shared != "shared" && shared != "not-shared")
873  KALDI_ERR << "Bad line in roots file: line "<< line_number << ": " << line;
874  is_shared_root->push_back(shared == "shared");
875 
876  std::string split;
877  ss >> split;
878  if (ss.fail() && shared != "split" && shared != "not-split")
879  KALDI_ERR << "Bad line in roots file: line "<< line_number << ": " << line;
880  is_split_root->push_back(split == "split");
881 
882  phone_sets->push_back(std::vector<int32>());
883  int32 i;
884  while ( !(ss >> i).fail() ) {
885  phone_sets->back().push_back(i);
886  }
887  std::sort(phone_sets->back().begin(), phone_sets->back().end());
888  if (!IsSortedAndUniq(phone_sets->back()) || phone_sets->back().empty()
889  || phone_sets->back().front() <= 0)
890  KALDI_ERR << "Bad line in roots file [empty, or contains non-positive "
891  << " or duplicate phone-ids]: line " << line_number << ": "
892  << line;
893  }
894  if (phone_sets->empty())
895  KALDI_ERR << "Empty roots file ";
896 }
897 
898 
899 
900 } // end namespace kaldi
void AddStats(const VectorBase< BaseFloat > &vec, BaseFloat weight=1.0)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
BaseFloat SumNormalizer(const BuildTreeStatsType &stats_in)
Sums the normalizer [typically, data-count] over the stats.
EventMap * BuildTreeTwoLevel(Questions &qopts, const std::vector< std::vector< int32 > > &phone_sets, const std::vector< int32 > &phone2num_pdf_classes, const std::vector< bool > &share_roots, const std::vector< bool > &do_split, const BuildTreeStatsType &stats, int32 max_leaves_first, int32 max_leaves_second, bool cluster_leaves, int32 P, std::vector< int32 > *leaf_map)
BuildTreeTwoLevel builds a two-level tree, useful for example in building tied mixture systems with m...
Definition: build-tree.cc:387
void DeletePointers(std::vector< A *> *v)
Deletes any non-NULL pointers in the vector v, and sets the corresponding entries of v to NULL...
Definition: stl-utils.h:184
A hashing function-object for vectors.
Definition: stl-utils.h:216
This class defines, for each EventKeyType, a set of initial questions that it tries and also a number...
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
Definition: kaldi-math.h:151
EventMap * SplitDecisionTree(const EventMap &input_map, const BuildTreeStatsType &stats, Questions &q_opts, BaseFloat thresh, int32 max_leaves, int32 *num_leaves, BaseFloat *obj_impr_out, BaseFloat *smallest_split_change_out)
Does a decision-tree split at the leaves of an EventMap.
virtual EventAnswerType MaxResult() const
Definition: event-map.h:142
BaseFloat SumClusterableNormalizer(const std::vector< Clusterable *> &vec)
Returns the total normalizer (usually count) of the cluster (pointers may be NULL).
BaseFloat ClusterKMeans(const std::vector< Clusterable *> &points, int32 num_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, ClusterKMeansOptions cfg)
ClusterKMeans is a K-means-like clustering algorithm.
void AutomaticallyObtainQuestions(BuildTreeStatsType &stats, const std::vector< std::vector< int32 > > &phone_sets_in, const std::vector< int32 > &all_pdf_classes_in, int32 P, std::vector< std::vector< int32 > > *questions_out)
Outputs sets of phones that are reasonable for questions to ask in the tree-building algorithm...
Definition: build-tree.cc:615
void SplitStatsByMap(const BuildTreeStatsType &stats, const EventMap &e, std::vector< BuildTreeStatsType > *stats_out)
Splits stats according to the EventMap, indexing them at output by the leaf type. ...
static void RemoveDuplicates(std::vector< std::vector< int32 > > *vecs)
Definition: build-tree.cc:533
float RandGauss(struct RandomState *state=NULL)
Definition: kaldi-math.h:155
kaldi::int32 int32
void SortAndUniq(std::vector< T > *vec)
Sorts and uniq&#39;s (removes duplicates) from a vector.
Definition: stl-utils.h:39
void GenRandStats(int32 dim, int32 num_stats, int32 N, int32 P, const std::vector< int32 > &phone_ids, const std::vector< int32 > &phone2hmm_length, const std::vector< bool > &is_ctx_dep, bool ensure_all_phones_covered, BuildTreeStatsType *stats_out)
GenRandStats generates random statistics of the form used by BuildTree.
Definition: build-tree.cc:30
void SplitStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key, std::vector< BuildTreeStatsType > *stats_out)
SplitStatsByKey splits stats up according to the value of a particular key, which must be always defi...
virtual bool Map(const EventType &event, EventAnswerType *ans) const =0
static const EventKeyType kPdfClass
Definition: context-dep.h:39
const size_t count
std::vector< std::pair< EventKeyType, EventValueType > > EventType
Definition: event-map.h:58
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
void EnsureClusterableVectorNotNull(std::vector< Clusterable *> *stats)
Fills in any (NULL) holes in "stats" vector, with empty stats, because certain algorithms require non...
void ReadSymbolTableAsIntegers(std::string filename, bool include_eps, std::vector< int32 > *syms)
included here because it&#39;s used in some tree-building calling code.
Definition: build-tree.cc:502
static void ObtainSetsOfPhones(const std::vector< std::vector< int32 > > &phone_sets, const std::vector< int32 > &assignments, const std::vector< int32 > &clust_assignments, int32 num_leaves, std::vector< std::vector< int32 > > *sets_out)
ObtainSetsOfPhones is called by AutomaticallyObtainQuestions.
Definition: build-tree.cc:557
int32 EventKeyType
Things of type EventKeyType can take any value.
Definition: event-map.h:45
void SumStatsVec(const std::vector< BuildTreeStatsType > &stats_in, std::vector< Clusterable *> *stats_out)
Sum a vector of stats.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
EventMap * RenumberEventMap(const EventMap &e_in, int32 *num_leaves)
RenumberEventMap [intended to be used after calling ClusterEventMap] renumbers an EventMap so its lea...
EventMap * MapEventMapLeaves(const EventMap &e_in, const std::vector< int32 > &mapping_in)
This function remaps the event-map leaves using this mapping, indexed by the number at leaf...
BaseFloat ObjfGivenMap(const BuildTreeStatsType &stats_in, const EventMap &e)
Cluster the stats given the event map return the total objf given those clusters. ...
void Scale(Real alpha)
Multiplies all elements by this constant.
EventMap * GetStubMap(int32 P, const std::vector< std::vector< int32 > > &phone_sets, const std::vector< int32 > &phone2num_pdf_classes, const std::vector< bool > &share_roots, int32 *num_leaves_out)
GetStubMap is used in tree-building functions to get the initial to-states map, before the decision-t...
EventMap * ClusterEventMapToNClustersRestrictedByMap(const EventMap &e_in, const BuildTreeStatsType &stats, int32 num_clusters_required, const EventMap &e_restrict, int32 *num_removed_ptr)
This version of ClusterEventMapRestrictedByMap clusters to get a specific number of clusters as speci...
int Rand(struct RandomState *state)
Definition: kaldi-math.cc:45
Real Sum() const
Returns sum of the elements.
static void ComputeTreeMapping(const EventMap &small_tree, const EventMap &big_tree, const BuildTreeStatsType &stats, std::vector< int32 > *leaf_map)
Definition: build-tree.cc:320
A class that is capable of representing a generic mapping from EventType (which is a vector of (key...
Definition: event-map.h:86
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
EventMap * BuildTree(Questions &qopts, const std::vector< std::vector< int32 > > &phone_sets, const std::vector< int32 > &phone2num_pdf_classes, const std::vector< bool > &share_roots, const std::vector< bool > &do_split, const BuildTreeStatsType &stats, BaseFloat thresh, int32 max_leaves, BaseFloat cluster_thresh, int32 P, bool round_num_leaves)
BuildTree is the normal way to build a set of decision trees.
Definition: build-tree.cc:136
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
void ReadRootsFile(std::istream &is, std::vector< std::vector< int32 > > *phone_sets, std::vector< bool > *is_shared_root, std::vector< bool > *is_split_root)
Reads the roots file (throws on error).
Definition: build-tree.cc:857
void FilterStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key, std::vector< EventValueType > &values, bool include_if_present, BuildTreeStatsType *stats_out)
FilterStatsByKey filters the stats according the value of a specified key.
BaseFloat TreeCluster(const std::vector< Clusterable *> &points, int32 max_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out, TreeClusterOptions cfg)
TreeCluster is a top-down clustering algorithm, using a binary tree (not necessarily balanced)...
int32 EventValueType
Given current code, things of type EventValueType should generally be nonnegative and in a reasonably...
Definition: event-map.h:51
ClusterKMeansOptions kmeans_cfg
std::vector< std::pair< EventType, Clusterable * > > BuildTreeStatsType
GaussClusterable wraps Gaussian statistics in a form accessible to generic clustering algorithms...
int32 line_number
EventMap * ClusterEventMapRestrictedByMap(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, const EventMap &e_restrict, int32 *num_removed_ptr)
This version of ClusterEventMapRestricted restricts the clustering to only allow things that "e_restr...
bool IsSortedAndUniq(const std::vector< T > &vec)
Returns true if the vector is sorted and contains each element only once.
Definition: stl-utils.h:63
#define KALDI_LOG
Definition: kaldi-error.h:153
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
void CopyMapToVector(const std::map< A, B > &m, std::vector< std::pair< A, B > > *v)
Copies the (key, value) pairs in a map to a vector of pairs.
Definition: stl-utils.h:112
void KMeansClusterPhones(BuildTreeStatsType &stats, const std::vector< std::vector< int32 > > &phone_sets_in, const std::vector< int32 > &all_pdf_classes_in, int32 P, int32 num_classes, std::vector< std::vector< int32 > > *sets_out)
This function clusters the phones (or some initially specified sets of phones) into sets of phones...
Definition: build-tree.cc:748