build-tree-utils-test.cc
Go to the documentation of this file.
1 // tree/build-tree-utils-test.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include "util/stl-utils.h"
21 #include "util/kaldi-io.h"
22 #include "tree/build-tree-utils.h"
24 
25 
26 namespace kaldi {
27 
29  int32 nleaves = 0;
30  EventMap *tree = TrivialTree(&nleaves);
31  EventType empty; EventAnswerType ans;
32  bool b = tree->Map(empty, &ans);
33  KALDI_ASSERT(b);
34  KALDI_ASSERT(nleaves == 1);
35  delete tree;
36 }
37 
39  BuildTreeStatsType stats;
40  std::map<EventKeyType, std::set<EventValueType> > all_key_vals;
41  for (size_t i = 0;i < 20;i++) {
42  std::map<EventKeyType, EventValueType> key_vals;
43  for (size_t j = 0;j < 20;j++) {
44  EventKeyType k = RandInt(-5, 10);
45  EventValueType v = RandInt(0, 10);
46  if (key_vals.count(k) == 0) {
47  key_vals[ k ] = v;
48  all_key_vals[k].insert(v);
49  }
50  }
51  EventType evec;
52  CopyMapToVector(key_vals, &evec);
53  stats.push_back(std::pair<EventType, Clusterable*>(evec, (Clusterable*)NULL));
54  }
55  for (std::map<EventKeyType, std::set<EventValueType> >::iterator iter = all_key_vals.begin();
56  iter != all_key_vals.end(); iter++) {
57  EventKeyType key = iter->first;
58  std::vector<EventValueType> vals1, vals2;
59  CopySetToVector(iter->second, &vals1);
60  PossibleValues(key, stats, &vals2);
61  if (vals1 != vals2) {
62  printf("vals differ!\n");
63  for (size_t i = 0;i < vals1.size();i++) std::cout << vals1[i] << " ";
64  std::cout<<'\n';
65  for (size_t i = 0;i < vals2.size();i++) std::cout << vals2[i] << " ";
66  std::cout<<'\n';
67  KALDI_ASSERT(0);
68  }
69  }
70 }
71 
73  {
74  BuildTreeStatsType stats;
75  EventType evec;
76  // this example is of ctx window (10, 11, 12), and pdf-class = 1.
77  evec.push_back(std::pair<int32, int32>(-1, 1));
78  evec.push_back(std::pair<int32, int32>(0, 10));
79  evec.push_back(std::pair<int32, int32>(1, 11));
80  evec.push_back(std::pair<int32, int32>(2, 12));
81  stats.push_back(std::make_pair(evec, static_cast<Clusterable*>(NULL)));
82  int32 oldN = 3, oldP = 1, newN = 1, newP = 0;
83  ConvertStats(oldN, oldP, newN, newP, &stats);
84  EventType new_evec = stats[0].first;
85  KALDI_ASSERT(new_evec.size() == 2); // keys: -1, 0.
86  KALDI_ASSERT(new_evec[0].first == -1 && new_evec[0].second == 1);
87  KALDI_ASSERT(new_evec[1].first == 0 && new_evec[1].second == 11);
88  }
89 
90  { // as above, but convert to left bigram (N = 2, P = 1).
91  BuildTreeStatsType stats;
92  EventType evec;
93  // this example is of ctx window (10, 11, 12), and pdf-class = 1.
94  evec.push_back(std::pair<int32, int32>(-1, 1));
95  evec.push_back(std::pair<int32, int32>(0, 10));
96  evec.push_back(std::pair<int32, int32>(1, 11));
97  evec.push_back(std::pair<int32, int32>(2, 12));
98  stats.push_back(std::make_pair(evec, static_cast<Clusterable*>(NULL)));
99  int32 oldN = 3, oldP = 1, newN = 2, newP = 1;
100  ConvertStats(oldN, oldP, newN, newP, &stats);
101  EventType new_evec = stats[0].first;
102  KALDI_ASSERT(new_evec.size() == 3); // keys: -1, 0, 1.
103  KALDI_ASSERT(new_evec[0].first == -1 && new_evec[0].second == 1);
104  KALDI_ASSERT(new_evec[1].first == 0 && new_evec[1].second == 10);
105  KALDI_ASSERT(new_evec[2].first == 1 && new_evec[2].second == 11);
106  }
107 
108  { // as above, but leave unchanged.
109  BuildTreeStatsType stats;
110  EventType evec;
111  // this example is of ctx window (10, 11, 12), and pdf-class = 1.
112  evec.push_back(std::pair<int32, int32>(-1, 1));
113  evec.push_back(std::pair<int32, int32>(0, 10));
114  evec.push_back(std::pair<int32, int32>(1, 11));
115  evec.push_back(std::pair<int32, int32>(2, 12));
116  stats.push_back(std::make_pair(evec, static_cast<Clusterable*>(NULL)));
117  int32 oldN = 3, oldP = 1, newN = 3, newP = 1;
118  ConvertStats(oldN, oldP, newN, newP, &stats);
119  EventType new_evec = stats[0].first;
120  KALDI_ASSERT(new_evec == evec);
121  }
122 }
124  {
125  BuildTreeStatsType stats;
126  for(int32 i = 0; i < 100; i++) {
127  EventType evec;
128  if (Rand() % 2)
129  evec.push_back(std::make_pair(12, Rand() % 10));
130  evec.push_back(std::make_pair(10, Rand() % 10));
131  if (Rand() % 2)
132  evec.push_back(std::make_pair(8, Rand() % 10));
133  std::sort(evec.begin(), evec.end());
134  stats.push_back(std::make_pair(evec, static_cast<Clusterable*>(NULL)));
135  }
136  std::vector<BuildTreeStatsType> stats_vec;
137  SplitStatsByKey(stats, 10, &stats_vec);
138  for(int32 i = 0; i < static_cast<int32>(stats_vec.size()); i++) {
139  for(int32 j = 0; j < static_cast<int32>(stats_vec[i].size()); j++) {
140  EventAnswerType ans;
141  bool ok = EventMap::Lookup(stats_vec[i][j].first, 10, &ans);
142  KALDI_ASSERT(ok && ans == i);
143  }
144  }
145  }
146 }
147 
149  for (size_t iter = 0;iter < 10;iter++) {
150  BuildTreeStatsType stats;
151  std::set<EventKeyType> all_keys_union;
152  std::set<EventKeyType> all_keys_intersection;
153 
154  for (size_t i = 0;i < 3;i++) {
155  std::map<EventKeyType, EventValueType> key_vals;
156  for (size_t j = 0;j < 5;j++) {
157  EventKeyType k = RandInt(-2, 1);
158  EventValueType v = RandInt(0, 10);
159  key_vals[k] = v;
160  }
161  EventType evec;
162  CopyMapToVector(key_vals, &evec);
163  stats.push_back(std::pair<EventType, Clusterable*>(evec, (Clusterable*) NULL));
164  std::set<EventKeyType> s;
165  CopyMapKeysToSet(key_vals, &s);
166  if (i == 0) { all_keys_union = s; all_keys_intersection = s; }
167  else {
168  std::set<EventKeyType> new_intersection;
169  for (std::set<EventKeyType>::iterator iter = s.begin(); iter != s.end(); iter++) {
170  all_keys_union.insert(*iter);
171  if (all_keys_intersection.count(*iter) != 0) new_intersection.insert(*iter);
172  }
173  all_keys_intersection = new_intersection;
174  }
175  }
176 
177  { // test in union mode.
178  std::vector<EventKeyType> keys1, keys2;
179  CopySetToVector(all_keys_union, &keys1);
180  FindAllKeys(stats, kAllKeysUnion, &keys2);
181  KALDI_ASSERT(keys1 == keys2);
182  }
183  { // test in intersection mode.
184  std::vector<EventKeyType> keys1, keys2;
185  CopySetToVector(all_keys_intersection, &keys1);
186  FindAllKeys(stats, kAllKeysIntersection, &keys2);
187  KALDI_ASSERT(keys1 == keys2);
188  }
189  { // test in insist-same mode.
190  std::vector<EventKeyType> keys1, keys2;
191  CopySetToVector(all_keys_intersection, &keys1);
192  try {
193  FindAllKeys(stats, kAllKeysInsistIdentical, &keys2); // note, it SHOULD throw an exception here.
194  // This is for testing purposes. It gets caught.
195  KALDI_ASSERT(keys1 == keys2);
196  KALDI_ASSERT(all_keys_union == all_keys_intersection);
197  } catch(...) { // it should throw exception if all keys are not the same.
198  KALDI_LOG << "Ignore previous error.";
199  KALDI_ASSERT(all_keys_union != all_keys_intersection);
200  }
201  }
202  }
203 }
204 
205 
207  EventValueType numvals = 11;
208  for (size_t iter = 0;iter < 10;iter++) {
209  BuildTreeStatsType stats;
210  int32 nKeys = 3;
211  EventKeyType k = RandInt(-1, nKeys-2); // key we will split on.
212  std::set<EventValueType> all_vals;
213  for (size_t i = 0;i < 10;i++) {
214  EventType evec;
215  for (EventKeyType kk = -1;kk < nKeys-1;kk++) {
216  EventValueType v = RandInt(0, numvals);
217  if (kk == k) all_vals.insert(v);
218  evec.push_back(std::make_pair(kk, v));
219  }
220  stats.push_back(std::pair<EventType, Clusterable*>(evec, (Clusterable*) NULL));
221  }
222  int32 nleaves = 0;
223  EventMap *trivial_map = TrivialTree(&nleaves);
224 
225  EventMap *table_map = DoTableSplit(*trivial_map, k, stats, &nleaves);
226  KALDI_ASSERT(nleaves <= numvals);
227  for (size_t i = 0;i < 10;i++) {
228  size_t idx1 = RandInt(0, stats.size()-1), idx2 = RandInt(0, stats.size()-1);
229  EventAnswerType ans1;
230  table_map->Map(stats[idx1].first, &ans1);
231  EventAnswerType ans2;
232  table_map->Map(stats[idx2].first, &ans2);
233 
234  EventValueType val1, val2;
235  bool b = EventMap::Lookup(stats[idx1].first, k, &val1)
236  && EventMap::Lookup(stats[idx2].first, k, &val2);
237  KALDI_ASSERT(b);
238  KALDI_ASSERT(val1 >= 0 );
239  KALDI_ASSERT( (val1 == val2) == (ans1 == ans2) );
240  }
241  for (EventValueType i = 0;i < numvals+1;i++) {
242  if (all_vals.count(i) == 0) {
243  EventType v; v.push_back(std::make_pair(k, i));
244  EventAnswerType ans;
245  bool b = table_map->Map(v, &ans);
246  KALDI_ASSERT(!b); // check it maps stuff we never saw to undefined.
247  }
248  }
249  delete trivial_map;
250  delete table_map;
251  }
252 }
253 
254 
255 
257  EventKeyType key = 0; // just one key.
258  for (size_t iter = 0;iter < 1;iter++) { // in loop anyway.
259  BuildTreeStatsType stats;
260  EventValueType cur_value = 0;
261  int32 num_clust = 10;
262  for (int32 i = 0;i < num_clust;i++) { // this will correspond to the "cluster".
263  size_t n = 1 + Rand() % 3;
264  for (size_t j = 0;j < n;j++) {
265  BaseFloat scalar = static_cast<BaseFloat>(i) + RandUniform()*0.001;
266  EventType evec;
267  evec.push_back(std::make_pair(key, cur_value++));
268  stats.push_back(std::make_pair(evec, static_cast<Clusterable*>(new ScalarClusterable(scalar))));
269  }
270  }
271 
272  int32 nleaves = 0;
273  EventMap *trivial_map = TrivialTree(&nleaves);
274 
275  EventMap *table_map = DoTableSplit(*trivial_map, key, stats, &nleaves);
276  KALDI_ASSERT(nleaves == cur_value);
277 
278  std::vector<EventMap*> mapping;
279  int32 num_reduced = ClusterEventMapGetMapping(*table_map, stats, 0.1, &mapping);
280 
281  std::cout << "TestCluster(): num_reduced = "<<num_reduced<<", expected: "<<cur_value<<" - "<<num_clust<<" = "<<(cur_value-num_clust)<<'\n';
282  KALDI_ASSERT(num_reduced == cur_value - num_clust);
283 
284  EventMap *clustered_map = table_map->Copy(mapping);
285 
286  EventAnswerType new_nleaves;
287  EventMap *renumbered_map = RenumberEventMap(*clustered_map, &new_nleaves);
288  KALDI_ASSERT(new_nleaves == num_clust);
289 
290  std::vector<EventAnswerType> orig_answers, clustered_answers, renumbered_answers;
291 
292  EventType empty_vec;
293  table_map->MultiMap(empty_vec, &orig_answers);
294  clustered_map->MultiMap(empty_vec, &clustered_answers);
295  renumbered_map->MultiMap(empty_vec, &renumbered_answers);
296 
297  SortAndUniq(&orig_answers);
298  SortAndUniq(&clustered_answers);
299  SortAndUniq(&renumbered_answers);
300  KALDI_ASSERT(orig_answers.size() == (size_t) cur_value);
301  KALDI_ASSERT(clustered_answers.size() == (size_t) num_clust);
302  KALDI_ASSERT(renumbered_answers.size() == (size_t) num_clust);
303  KALDI_ASSERT(renumbered_map->MaxResult()+1 == num_clust);
304 
305  DeletePointers(&mapping);
306  delete renumbered_map;
307  delete clustered_map;
308  delete table_map;
309  delete trivial_map;
310  DeleteBuildTreeStats(&stats);
311  }
312 }
313 
314 
316  EventKeyType key = 0; // just one key.
317  for (size_t iter = 0;iter < 1;iter++) { // in loop anyway.
318  BuildTreeStatsType stats;
319  BuildTreeStatsType stats_reduced;
320  EventValueType cur_value = 0;
321  int32 num_clust = 10;
322  for (int32 i = 0;i < num_clust;i++) { // this will correspond to the "cluster".
323  size_t n = 1 + Rand() % 3;
324  for (size_t j = 0;j < n;j++) {
325  BaseFloat scalar = static_cast<BaseFloat>(i) + RandUniform()*0.001;
326  EventType evec;
327  evec.push_back(std::make_pair(key, cur_value++));
328  stats.push_back(std::make_pair(evec, static_cast<Clusterable*>(new ScalarClusterable(scalar))));
329  if (Rand() % 10 < 5) stats_reduced.push_back(stats.back());
330  }
331  }
332 
333  int32 nleaves = 0;
334  EventMap *trivial_map = TrivialTree(&nleaves);
335 
336  EventMap *table_map = DoTableSplit(*trivial_map, key, stats, &nleaves);
337  KALDI_ASSERT(nleaves == cur_value);
338 
339  std::vector<EventMap*> mapping;
340  int32 num_reduced = ClusterEventMapGetMapping(*table_map, stats_reduced, 0.1, &mapping);
341 
342  std::cout << "TestCluster(): num_reduced = "<<num_reduced<<", expected [ignoring gaps]: "<<cur_value<<" - "<<num_clust<<" = "<<(cur_value-num_clust)<<'\n';
343  // KALDI_ASSERT(num_reduced == cur_value - num_clust);
344 
345  EventMap *clustered_map = table_map->Copy(mapping);
346 
347  EventAnswerType new_nleaves;
348  EventMap *renumbered_map = RenumberEventMap(*clustered_map, &new_nleaves);
349  // KALDI_ASSERT(new_nleaves == num_clust);
350 
351  std::vector<EventAnswerType> orig_answers, clustered_answers, renumbered_answers;
352 
353  EventType empty_vec;
354  table_map->MultiMap(empty_vec, &orig_answers);
355  clustered_map->MultiMap(empty_vec, &clustered_answers);
356  renumbered_map->MultiMap(empty_vec, &renumbered_answers);
357 
358  SortAndUniq(&orig_answers);
359  SortAndUniq(&clustered_answers);
360  SortAndUniq(&renumbered_answers);
361  // KALDI_ASSERT(orig_answers.size() == (size_t) cur_value);
362  // KALDI_ASSERT(clustered_answers.size() == (size_t) num_clust);
363  // KALDI_ASSERT(renumbered_answers.size() == (size_t) num_clust);
364  // KALDI_ASSERT(renumbered_map->MaxResult()+1 == num_clust);
365 
366  DeletePointers(&mapping);
367  delete renumbered_map;
368  delete clustered_map;
369  delete table_map;
370  delete trivial_map;
371  DeleteBuildTreeStats(&stats);
372  }
373 }
374 
375 
377  // This second testing routine checks that ClusterEventMap does not renumber leaves whose stats
378  // we exclude.
379  // ClusterEventMapGetMapping (the internal version) were tested in another
380  // testing routine.
381 
382  EventKeyType key = 0; // just one key.
383  for (size_t iter = 0;iter < 1;iter++) { // in loop anyway.
384  BuildTreeStatsType stats;
385  EventValueType cur_value = 0;
386  int32 num_clust = 10;
387  for (int32 i = 0;i < num_clust;i++) { // this will correspond to the "cluster".
388  size_t n = 1 + Rand() % 3;
389  for (size_t j = 0;j < n;j++) {
390  BaseFloat scalar = static_cast<BaseFloat>(i) + RandUniform()*0.001;
391  EventType evec;
392  evec.push_back(std::make_pair(key, cur_value++));
393  stats.push_back(std::make_pair(evec, static_cast<Clusterable*>(new ScalarClusterable(scalar))));
394  }
395  }
396 
397  int32 nleaves = 0;
398  EventMap *trivial_map = TrivialTree(&nleaves);
399 
400  EventMap *table_map = DoTableSplit(*trivial_map, key, stats, &nleaves);
401  KALDI_ASSERT(nleaves == cur_value);
402 
403  std::set<EventValueType> exclude_leaves;
404  for (size_t i = 0;i < 4;i++) exclude_leaves.insert(Rand() % num_clust);
405  BuildTreeStatsType stats_excluded;
406  BuildTreeStatsType stats_included;
407  for (size_t i = 0;i < stats.size();i++) {
408  if (exclude_leaves.count(stats[i].first[0].second) != 0) { // this code relies on the fact that there is just one event in the EventVector.
409  stats_excluded.push_back(stats[i]);
410  } else {
411  stats_included.push_back(stats[i]);
412  }
413  }
414  KALDI_ASSERT(!stats_excluded.empty()&&!stats_included.empty() && stats_excluded.size()+stats_included.size() == stats.size());
415 
416  int32 num_reduced;
417  EventMap *clustered_map = ClusterEventMap(*table_map, stats_included, 0.1, &num_reduced);
418 
419  std::cout << "TestCluster*(): num_reduced = "<<num_reduced<<", expected [without exclusion]: "<<cur_value<<" - "<<num_clust<<" = "<<(cur_value-num_clust)<<'\n';
420 
421  // Make sure stats we excluded are not renumbered.
422  for (size_t i = 0;i < stats_excluded.size();i++) {
423  const EventType &evec = stats_excluded[i].first;
424  EventAnswerType ans; table_map->Map(evec, &ans);
425  EventAnswerType ans2; clustered_map->Map(evec, &ans2);
426  KALDI_ASSERT(ans == ans2);
427  }
428 
429  delete clustered_map;
430  delete table_map;
431  delete trivial_map;
432  DeleteBuildTreeStats(&stats);
433  }
434 }
435 
436 
438 
439  // TestClusterEventMapRestricted() tests that ClusterEventMapRestricted()
440  // does not combine leaves that we were trying to keep separate.
441 
442  bool test_by_key = (Rand()%2 == 0);
443 
444  int32 num_keys = 1 + Rand() % 4;
445  std::vector<EventKeyType> keys;
446 
447  { // randomly choose keys. Will always define all of them.
448  std::set<EventKeyType> keys_set;
449  while (keys_set.size() < (size_t)num_keys)
450  keys_set.insert( (Rand() % (num_keys + 10)) - 3 );
451  CopySetToVector(keys_set, &keys);
452  }
453 
454 
455  BuildTreeStatsType stats;
456 
457  int32 n_stats = 1 + (Rand() % 10);
458  n_stats *= n_stats; // up to 81 stats.
459 
460  for (size_t i = 0; i < (size_t)n_stats; i++) {
461  EventType evec;
462 
463  for (size_t j = 0; j < keys.size(); j++) {
464  EventValueType val = Rand() % 100;
465  evec.push_back(std::make_pair(keys[j], val));
466  }
467  stats.push_back(std::make_pair(evec, static_cast<Clusterable*>(new ScalarClusterable(RandGauss()))));
468  }
469 
470  std::vector<EventKeyType> special_keys;
471  for (size_t i = 0; i < keys.size(); i++)
472  if (RandUniform() < 0.5) special_keys.push_back(keys[i]);
473 
474 
475  int32 nleaves = 0;
476  EventMap *trivial_map = TrivialTree(&nleaves);
477 
478 
479  // We do a complete split on these keys.
480  EventMap *table_split_map = DoTableSplitMultiple(*trivial_map, special_keys,
481  stats, &nleaves);
482  int32 nleaves_after_table_split = nleaves;
483  std::cout << "TestClusterEventMapRestricted: after splitting on "<<special_keys.size()<<" keys, nleaves = " <<nleaves<<'\n';
484  // We now do decision tree split.
485 
486  Questions qo; // all default.
487  int32 num_quest = Rand() % 10, num_iters = rand () % 5;
488  qo.InitRand(stats, num_quest, num_iters, kAllKeysInsistIdentical);
489  float thresh = 0.001;
490  int32 max_leaves = 50;
491  BaseFloat smallest_split = 0.0;
492  BaseFloat impr;
493  EventMap *split_tree = SplitDecisionTree(*table_split_map, stats, qo, thresh, max_leaves,
494  &nleaves, &impr, &smallest_split);
495  KALDI_ASSERT((nleaves <= max_leaves || nleaves == nleaves_after_table_split) && smallest_split >= thresh);
496 
497  std::cout << "TestClusterEventMapRestricted: after building decision tree, " <<nleaves<<'\n';
498 
499  thresh = 1000; // will ensure everything is combined.
500  {
501  int32 num_removed;
502  EventMap *map_clustered = ClusterEventMap(*split_tree, stats,
503  thresh, &num_removed);
504  std::cout << "ClusterEventMap: num_removed = "<<num_removed;
505  KALDI_ASSERT(num_removed == nleaves - 1);
506  delete map_clustered;
507  }
508 
509  {
510  int32 num_removed;
511 
512  EventMap *map_clustered = NULL;
513  if (test_by_key)
514  map_clustered = ClusterEventMapRestrictedByKeys(*split_tree, stats,
515  thresh, special_keys,
516  &num_removed);
517  else
518  map_clustered = ClusterEventMapRestrictedByMap(*split_tree, stats,
519  thresh, *table_split_map,
520  &num_removed);
521 
522  std::cout << "ClusterEventMapRestricted: num_removed = "<<num_removed;
523  // should take it back to status after table split.
524  KALDI_ASSERT(num_removed == nleaves - nleaves_after_table_split);
525  delete map_clustered;
526  }
527 
528  delete split_tree;
529  delete trivial_map;
530  delete table_split_map;
531  DeleteBuildTreeStats(&stats);
532 }
533 
534 
536  // this is modified from TestClusterEventMapRestricted() [rather arbitrarily].
537 
538 
539  int32 num_keys = 1 + Rand() % 4;
540  std::vector<EventKeyType> keys;
541 
542  { // randomly choose keys. Will always define all of them.
543  std::set<EventKeyType> keys_set;
544  while (keys_set.size() < (size_t)num_keys)
545  keys_set.insert( (Rand() % (num_keys + 10)) - 3 );
546  CopySetToVector(keys_set, &keys);
547  }
548 
549 
550  BuildTreeStatsType stats;
551 
552  int32 n_stats = 1 + (Rand() % 10);
553  n_stats *= n_stats; // up to 81 stats.
554 
555  for (size_t i = 0; i < (size_t)n_stats; i++) {
556  EventType evec;
557 
558  for (size_t j = 0; j < keys.size(); j++) {
559  EventValueType val = Rand() % 100;
560  evec.push_back(std::make_pair(keys[j], val));
561  }
562  stats.push_back(std::make_pair(evec, static_cast<Clusterable*>(new ScalarClusterable(RandGauss()))));
563  }
564 
565  std::vector<EventKeyType> special_keys;
566  for (size_t i = 0; i < keys.size(); i++)
567  if (RandUniform() < 0.5) special_keys.push_back(keys[i]);
568 
569 
570  int32 nleaves = 0;
571  EventMap *trivial_map = TrivialTree(&nleaves);
572 
573 
574  // We do a complete split on these keys.
575  EventMap *table_split_map = DoTableSplitMultiple(*trivial_map, special_keys,
576  stats, &nleaves);
577 
578  std::cout << "TestClusterEventMapRestricted: after splitting on "<<special_keys.size()<<" keys, nleaves = " <<nleaves<<'\n';
579  // We now do decision tree split.
580  int nleaves_after_table_split = nleaves;
581  Questions qo; // all default.
582  int32 num_quest = Rand() % 10, num_iters = rand () % 5;
583  qo.InitRand(stats, num_quest, num_iters, kAllKeysInsistIdentical);
584  float thresh = 0.001;
585  int32 max_leaves = 100;
586  BaseFloat impr;
587  BaseFloat smallest_split;
588  EventMap *split_tree = SplitDecisionTree(*table_split_map, stats, qo, thresh, max_leaves,
589  &nleaves, &impr, &smallest_split);
590  KALDI_ASSERT((nleaves <= max_leaves || nleaves == nleaves_after_table_split) && smallest_split >= thresh);
591 
592  std::cout << "TestShareEventMapLeaves: after building decision tree, " <<nleaves<<'\n';
593 
594  if (special_keys.size() == 0) {
595  KALDI_WARN << "TestShareEventMapLeaves(): could not test since key not always defined.";
596  delete split_tree;
597  delete trivial_map;
598  delete table_split_map;
599  DeleteBuildTreeStats(&stats);
600  return;
601  }
602  EventKeyType key = special_keys[Rand() % special_keys.size()];
603  std::vector<EventValueType> values;
604  bool always_defined = PossibleValues(key, stats, &values);
605  KALDI_ASSERT(always_defined);
606 
607  std::set<EventValueType> to_share;
608  for (size_t i = 0; i < 3; i++) to_share.insert(values[Rand() % values.size()]);
609 
610  std::vector<std::vector<EventValueType> > share_value;
611  for (std::set<EventValueType>::iterator iter = to_share.begin();
612  iter != to_share.end();
613  iter++) {
614  share_value.resize(share_value.size()+1);
615  share_value.back().push_back(*iter);
616  }
617  int num_leaves;
618  EventMap *shared = ShareEventMapLeaves(*split_tree,
619  key,
620  share_value,
621  &num_leaves);
622  KALDI_ASSERT(num_leaves <= nleaves);
623  for (size_t i = 0; i < share_value.size(); i++) {
624  EventType evec;
625  std::vector<EventAnswerType> answers;
626  evec.push_back(MakeEventPair(key, share_value[i][0]));
627  shared->MultiMap(evec, &answers);
628  SortAndUniq(&answers);
629  KALDI_ASSERT(answers.size() == 1); // should have been shared.
630  }
631  delete shared;
632 
633  delete split_tree;
634  delete trivial_map;
635  delete table_split_map;
636  DeleteBuildTreeStats(&stats);
637 }
638 
639 
641  // testing Questions::InitRand() function. Also testing I/O of Questions class.
642  for (int32 p = 0;p < 10;p++) {
643  std::vector<EventKeyType> keys_all, keys_some;
644  {
645  std::set<EventKeyType> keys_all_set, keys_some_set;
646  int32 num_all = Rand() % 3, num_some = Rand() % 3;
647  for (int32 i = 0;i < num_all;i++) keys_all_set.insert(Rand() % 10);
648  for (int32 i = 0;i < num_some;i++) {
649  int32 k = Rand() % 10;
650  if (keys_all_set.count(k) == 0) keys_some_set.insert(k);
651  }
652  CopySetToVector(keys_all_set, &keys_all);
653  CopySetToVector(keys_some_set, &keys_some);
654  }
655  std::set<EventKeyType> keys_all_saw_set;
656  // Now we have two distinct sets of keys keys_all and keys_some.
657  // We now create the Clusterable* stuff.
658  BuildTreeStatsType dummy_stats; // dummy because the Clusterable *pointers are actually NULL.
659  size_t n_stats = Rand() % 100;
660  // make sure we sometimes have empty or just one stats: may find extra bugs.
661  if (n_stats > 90) n_stats = 0;
662  if (n_stats > 80) n_stats = 1;
663 
664  for (size_t i = 0;i < n_stats;i++) { // Create stats...
665  EventType evec;
666  for (size_t j = 0;j < keys_all.size();j++) {
667  evec.push_back(std::make_pair( keys_all[j], (EventValueType)(Rand() % 10)));
668  keys_all_saw_set.insert(keys_all[j]);
669  }
670  for (size_t j = 0;j < keys_some.size();j++)
671  if (Rand() % 2 == 0) { // randomly w.p. 1/2
672  evec.push_back(std::make_pair( keys_some[j], (EventValueType)(Rand() % 10)));
673  keys_all_saw_set.insert(keys_some[j]);
674  }
675  std::sort(evec.begin(), evec.end()); // sorts on keys.
676  EventMap::Check(evec);
677  dummy_stats.push_back(std::make_pair(evec, (Clusterable*)NULL));
678  }
679  Questions qo; // all default.
680  bool intersection = (p%2 == 0);
681  int32 num_quest = Rand() % 10, num_iters = rand () % 5;
682  qo.InitRand(dummy_stats, num_quest, num_iters, intersection ? kAllKeysIntersection : kAllKeysUnion);
683 
684  for (int i = 0; i < 2; i++) {
685  // Here, test I/O of questions class.
686  bool binary = (i == 0);
687  std::ostringstream oss;
688  qo.Write(oss, binary);
689 
690  std::istringstream iss(oss.str());
691  Questions qo2;
692  qo2.Read(iss, binary);
693 
694  std::ostringstream oss2;
695  qo2.Write(oss2, binary);
696 
697  if (oss.str() != oss2.str()) {
698  KALDI_ERR << "Questions I/O failure: " << oss.str() << " vs. " << oss2.str();
699  }
700  }
701 
702  if (n_stats > 0) {
703  if (p < 2) {
704  for (size_t i = 0;i < keys_all.size();i++) {
705  KALDI_ASSERT(qo.HasQuestionsForKey(keys_all[i]));
706  const QuestionsForKey &opts = qo.GetQuestionsOf(keys_all[i]);
707  std::cout << "num-quest: "<< opts.initial_questions.size() << '\n';
708  for (size_t j = 0;j < opts.initial_questions.size();j++) {
709  for (size_t k = 0;k < opts.initial_questions[j].size();k++)
710  std::cout << opts.initial_questions[j][k] <<" ";
711  std::cout << '\n';
712  }
713  }
714  }
715  if (intersection) {
716  for (size_t i = 0;i < keys_all.size();i++) {
717  KALDI_ASSERT(qo.HasQuestionsForKey(keys_all[i]));
718  qo.GetQuestionsOf(keys_all[i]);
719  }
720  } else { // union: expect to see all keys that were in the data.
721  for (std::set<int32>::iterator iter = keys_all_saw_set.begin(); iter != keys_all_saw_set.end(); iter++) {
722  KALDI_ASSERT(qo.HasQuestionsForKey(*iter));
723  }
724  }
725  }
726  }
727 }
728 
729 
731  // part of the code is the same as the code for testing Questions::InitRand() function.
732  for (int32 p = 0;p < 4;p++) {
733  std::vector<EventKeyType> keys_all, keys_some;
734  {
735  std::set<EventKeyType> keys_all_set, keys_some_set;
736  int32 num_all = Rand() % 3, num_some = Rand() % 3;
737  for (int32 i = 0;i < num_all;i++) keys_all_set.insert(Rand() % 10);
738  for (int32 i = 0;i < num_some;i++) {
739  int32 k = Rand() % 10;
740  if (keys_all_set.count(k) == 0) keys_some_set.insert(k);
741  }
742  CopySetToVector(keys_all_set, &keys_all);
743  CopySetToVector(keys_some_set, &keys_some);
744  }
745  std::set<EventKeyType> keys_all_saw_set;
746  // Now we have two distinct sets of keys keys_all and keys_some.
747  // We now create the Clusterable* stuff.
748  BuildTreeStatsType stats; // dummy because the Clusterable *pointers are actually NULL.
749  size_t n_stats = Rand() % 100;
750  // make sure we sometimes have empty or just one stats: may find extra bugs.
751  if (n_stats > 90) n_stats = 0;
752  if (n_stats > 80) n_stats = 1;
753 
754  for (size_t i = 0;i < n_stats;i++) { // Create stats...
755  EventType evec;
756  for (size_t j = 0;j < keys_all.size();j++) {
757  evec.push_back(std::make_pair( keys_all[j], (EventValueType)(Rand() % 10)));
758  keys_all_saw_set.insert(keys_all[j]);
759  }
760  for (size_t j = 0;j < keys_some.size();j++)
761  if (Rand() % 2 == 0) { // randomly w.p. 1/2
762  evec.push_back(std::make_pair( keys_some[j], (EventValueType)(Rand() % 10)));
763  keys_all_saw_set.insert(keys_some[j]);
764  }
765  std::sort(evec.begin(), evec.end()); // sorts on keys.
766  EventMap::Check(evec);
767  stats.push_back(std::make_pair(evec, (Clusterable*) new ScalarClusterable(RandGauss())));
768  }
769  Questions qo; // all default.
770  BaseFloat thresh = 0.00001; // these stats have a count of 1... need v. small thresh to get them to merge.
771  // 0.000001 tries to ensure everything is split.
772 
773  bool intersection = true; // keep borrowed code later on happy.
774 
775  int32 num_quest = Rand() % 10, num_iters = rand () % 5;
776  qo.InitRand(stats, num_quest, num_iters, kAllKeysIntersection);
777 
778  if (n_stats > 0) {
779  if (p < 2) {
780  for (size_t i = 0;i < keys_all.size();i++) {
781  KALDI_ASSERT(qo.HasQuestionsForKey(keys_all[i]));
782  const QuestionsForKey &opts = qo.GetQuestionsOf(keys_all[i]);
783  std::cout << "num-quest: "<< opts.initial_questions.size() << '\n';
784  for (size_t j = 0;j < opts.initial_questions.size();j++) {
785  for (size_t k = 0;k < opts.initial_questions[j].size();k++)
786  std::cout << opts.initial_questions[j][k] <<" ";
787  std::cout << '\n';
788  }
789  }
790  }
791  if (intersection) {
792  for (size_t i = 0;i < keys_all.size();i++) {
793  KALDI_ASSERT(qo.HasQuestionsForKey(keys_all[i]));
794  qo.GetQuestionsOf(keys_all[i]);
795  }
796  } else { // union: expect to see all keys that were in the data.
797  for (std::set<int32>::iterator iter = keys_all_saw_set.begin(); iter != keys_all_saw_set.end(); iter++) {
798  KALDI_ASSERT(qo.HasQuestionsForKey(*iter));
799  }
800  }
801  std::cout << "num_quest = " <<num_quest<<", num_iters = "<<num_iters<<'\n';
802  // OK, now want to do decision-tree split.
803 
804  int32 num_leaves = 0;
805  int32 max_leaves = 50;
806  EventMap *trivial_tree = TrivialTree(&num_leaves);
807 
808  BaseFloat impr, smallest_split;
809  EventMap *split_tree = SplitDecisionTree(*trivial_tree, stats, qo, thresh, max_leaves,
810  &num_leaves, &impr, &smallest_split);
811  KALDI_ASSERT(num_leaves <= max_leaves && smallest_split >= thresh);
812 
813  {
814  BaseFloat impr_check = ObjfGivenMap(stats, *split_tree) - ObjfGivenMap(stats, *trivial_tree);
815  std::cout << "Objf impr is " << impr << ", computed differently: " <<impr_check<<'\n';
816  KALDI_ASSERT(fabs(impr - impr_check) < 0.1);
817  }
818 
819 
820  std::cout << "After splitting, num_leaves = " << num_leaves << '\n';
821 
822  std::vector<BuildTreeStatsType> mapped_stats;
823  SplitStatsByMap(stats, *split_tree, &mapped_stats);
824  std::cout << "Assignments of stats to leaves is:\n";
825  for (size_t i = 0; i < mapped_stats.size(); i++) {
826  std::cout << " [ leaf "<<i<<"]: ";
827  for (size_t j = 0; j < mapped_stats[i].size(); j++) {
828  std::cout << ((ScalarClusterable*)(mapped_stats[i][j].second))->Info() << " ";
829  }
830  std::cout << '\n';
831  }
832  delete trivial_tree;
833  delete split_tree;
834  DeleteBuildTreeStats(&stats);
835  }
836  }
837 }
838 void TestBuildTreeStatsIo(bool binary) {
839  for (int32 p = 0; p < 10; p++) {
840  size_t num_stats = Rand() % 20;
841  BuildTreeStatsType stats;
842  for (size_t i = 0; i < num_stats; i++) {
843  EventType ev;
844  int32 ev_sz = Rand() % 5;
845  for (int32 i = 0; i < ev_sz; i++) {
846  EventKeyType key = (i == 0 ? 0 : ev[i-1].first) + Rand() % 2, value = Rand() % 10;
847  ev.push_back(std::make_pair(key, value));
848  }
849  stats.push_back(std::make_pair(ev, (Clusterable*) NULL));
850  }
851  const char *filename = "tmpf";
852  WriteBuildTreeStats(Output(filename, binary).Stream(), binary, stats);
853 
854  {
855  bool binary_in;
856  BuildTreeStatsType stats2;
857  GaussClusterable gc; // just need some random Clusterable object
858  Input ki(filename, &binary_in);
860  binary_in, gc, &stats2);
861  KALDI_ASSERT(stats == stats2);
862  }
863  }
864 
865  unlink("tmpf");
866 }
867 
868 
869 
870 } // end namespace kaldi
871 
872 int main() {
873  using namespace kaldi;
874  for (size_t i = 0;i < 2;i++) {
875  TestTrivialTree();
885  TestBuildTreeStatsIo(false);
886  TestBuildTreeStatsIo(true);
889  TestFindAllKeys(); // put at end because it throws+catches internally.
890  }
891 }
892 
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
EventMap * DoTableSplitMultiple(const EventMap &orig, const std::vector< EventKeyType > &keys, const BuildTreeStatsType &stats, int32 *num_leaves)
DoTableSplitMultiple does a complete split on all the keys, in order from keys[0], keys[1] and so on.
void CopySetToVector(const std::set< T > &s, std::vector< T > *v)
Copies the elements of a set to a vector.
Definition: stl-utils.h:86
void DeletePointers(std::vector< A *> *v)
Deletes any non-NULL pointers in the vector v, and sets the corresponding entries of v to NULL...
Definition: stl-utils.h:184
std::pair< EventKeyType, EventValueType > MakeEventPair(EventKeyType k, EventValueType v)
Definition: event-map.h:62
bool ConvertStats(int32 oldN, int32 oldP, int32 newN, int32 newP, BuildTreeStatsType *stats)
Converts stats from a given context-window (N) and central-position (P) to a different N and P...
void TestClusterEventMapGetMappingAndRenumberEventMap()
This class defines, for each EventKeyType, a set of initial questions that it tries and also a number...
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
Definition: kaldi-math.h:151
EventMap * SplitDecisionTree(const EventMap &input_map, const BuildTreeStatsType &stats, Questions &q_opts, BaseFloat thresh, int32 max_leaves, int32 *num_leaves, BaseFloat *obj_impr_out, BaseFloat *smallest_split_change_out)
Does a decision-tree split at the leaves of an EventMap.
void TestDoTableSplit()
virtual EventAnswerType MaxResult() const
Definition: event-map.h:142
void SplitStatsByMap(const BuildTreeStatsType &stats, const EventMap &e, std::vector< BuildTreeStatsType > *stats_out)
Splits stats according to the EventMap, indexing them at output by the leaf type. ...
EventMap * ShareEventMapLeaves(const EventMap &e_in, EventKeyType key, std::vector< std::vector< EventValueType > > &values, int32 *num_leaves)
ShareEventMapLeaves performs a quite specific function that allows us to generate trees where...
void FindAllKeys(const BuildTreeStatsType &stats, AllKeysType keys_type, std::vector< EventKeyType > *keys_out)
FindAllKeys puts in *keys the (sorted, unique) list of all key identities in the stats.
const QuestionsForKey & GetQuestionsOf(EventKeyType key) const
float RandGauss(struct RandomState *state=NULL)
Definition: kaldi-math.h:155
kaldi::int32 int32
void TestTrivialTree()
void SortAndUniq(std::vector< T > *vec)
Sorts and uniq&#39;s (removes duplicates) from a vector.
Definition: stl-utils.h:39
void SplitStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key, std::vector< BuildTreeStatsType > *stats_out)
SplitStatsByKey splits stats up according to the value of a particular key, which must be always defi...
virtual bool Map(const EventType &event, EventAnswerType *ans) const =0
bool PossibleValues(EventKeyType key, const BuildTreeStatsType &stats, std::vector< EventValueType > *ans)
Convenience function e.g.
void DeleteBuildTreeStats(BuildTreeStatsType *stats)
This frees the Clusterable* pointers in "stats", where non-NULL, and sets them to NULL...
void ReadBuildTreeStats(std::istream &is, bool binary, const Clusterable &example, BuildTreeStatsType *stats)
Reads BuildTreeStats object.
int main()
static void Check(const EventType &event)
Definition: event-map.cc:281
std::istream & Stream()
Definition: kaldi-io.cc:826
std::vector< std::pair< EventKeyType, EventValueType > > EventType
Definition: event-map.h:58
void CopyMapKeysToSet(const std::map< A, B > &m, std::set< A > *s)
Copies the keys in a map to a set.
Definition: stl-utils.h:150
void TestFindAllKeys()
void InitRand(const BuildTreeStatsType &stats, int32 num_quest, int32 num_iters_refine, AllKeysType all_keys_type)
InitRand attempts to generate "reasonable" random questions.
struct rnnlm::@11::@12 n
int32 EventKeyType
Things of type EventKeyType can take any value.
Definition: event-map.h:45
QuestionsForKey is a class used to define the questions for a key, and also options that allow us to ...
std::vector< std::vector< EventValueType > > initial_questions
void TestSplitStatsByKey()
void Read(std::istream &is, bool binary)
int ClusterEventMapGetMapping(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, std::vector< EventMap *> *mapping)
"ClusterEventMapGetMapping" clusters the leaves of the EventMap, with "thresh" a delta-likelihood thr...
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
EventMap * ClusterEventMapRestrictedByKeys(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, const std::vector< EventKeyType > &keys, int32 *num_removed)
This is as ClusterEventMap, but first splits the stats on the keys specified in "keys" (e...
EventMap * RenumberEventMap(const EventMap &e_in, int32 *num_leaves)
RenumberEventMap [intended to be used after calling ClusterEventMap] renumbers an EventMap so its lea...
void TestShareEventMapLeaves()
virtual EventMap * Copy(const std::vector< EventMap *> &new_leaves) const =0
BaseFloat ObjfGivenMap(const BuildTreeStatsType &stats_in, const EventMap &e)
Cluster the stats given the event map return the total objf given those clusters. ...
int Rand(struct RandomState *state)
Definition: kaldi-math.cc:45
void TestBuildTreeStatsIo(bool binary)
static bool Lookup(const EventType &event, EventKeyType key, EventValueType *ans)
Definition: event-map.cc:290
void TestClusterEventMapRestricted()
A class that is capable of representing a generic mapping from EventType (which is a vector of (key...
Definition: event-map.h:86
bool HasQuestionsForKey(EventKeyType key) const
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void TestPossibleValues()
EventMap * ClusterEventMap(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, int32 *num_removed_ptr)
This is as ClusterEventMapGetMapping but a more convenient interface that exposes less of the interna...
void TestQuestionsInitRand()
int32 EventAnswerType
As far as the event-map code itself is concerned, things of type EventAnswerType may take any value e...
Definition: event-map.h:56
int32 EventValueType
Given current code, things of type EventValueType should generally be nonnegative and in a reasonably...
Definition: event-map.h:51
std::vector< std::pair< EventType, Clusterable * > > BuildTreeStatsType
void TestClusterEventMap()
GaussClusterable wraps Gaussian statistics in a form accessible to generic clustering algorithms...
virtual void MultiMap(const EventType &event, std::vector< EventAnswerType > *ans) const =0
EventMap * ClusterEventMapRestrictedByMap(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, const EventMap &e_restrict, int32 *num_removed_ptr)
This version of ClusterEventMapRestricted restricts the clustering to only allow things that "e_restr...
#define KALDI_LOG
Definition: kaldi-error.h:153
void CopyMapToVector(const std::map< A, B > &m, std::vector< std::pair< A, B > > *v)
Copies the (key, value) pairs in a map to a vector of pairs.
Definition: stl-utils.h:112
void TestConvertStats()
EventMap * TrivialTree(int32 *num_leaves)
Returns a tree with just one node.
void TestClusterEventMapGetMappingAndRenumberEventMap2()
void TestSplitDecisionTree()
void Write(std::ostream &os, bool binary) const
int32 RandInt(int32 min_val, int32 max_val, struct RandomState *state)
Definition: kaldi-math.cc:95
EventMap * DoTableSplit(const EventMap &orig, EventKeyType key, const BuildTreeStatsType &stats, int32 *num_leaves)
DoTableSplit does a complete split on this key (e.g.
void WriteBuildTreeStats(std::ostream &os, bool binary, const BuildTreeStatsType &stats)
Writes BuildTreeStats object. This works even if pointers are NULL.
ScalarClusterable clusters scalars with x^2 loss.