cluster-utils-test.cc
Go to the documentation of this file.
1 // tree/cluster-utils-test.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include "tree/cluster-utils.h"
22 #include "util/stl-utils.h"
23 
24 namespace kaldi {
25 static void TestClusterUtils() { // just some very basic tests of the GaussClusterable class.
26  BaseFloat varFloor = 0.1;
27  size_t dim = 1 + Rand() % 10;
28  size_t nGauss = 1 + Rand() % 10;
29  std::vector< GaussClusterable * > v(nGauss);
30  for (size_t i = 0;i < nGauss;i++) {
31  v[i] = new GaussClusterable(dim, varFloor);
32  }
33  for (size_t i = 0;i < nGauss;i++) {
34  size_t nPoints = 1 + Rand() % 30;
35  for (size_t j = 0;j < nPoints;j++) {
36  BaseFloat post = 0.5 *(Rand()%3);
37  Vector<BaseFloat> vec(dim);
38  for (size_t k = 0;k < dim;k++) vec(k) = RandGauss();
39  v[i]->AddStats(vec, post);
40  }
41  }
42  for (size_t i = 0;i+1 < nGauss;i++) {
43  BaseFloat like_before = (v[i]->Objf() + v[i+1]->Objf()) / (v[i]->Normalizer() + v[i+1]->Normalizer());
44  Clusterable *tmp = v[i]->Copy();
45  tmp->Add( *(v[i+1]));
46  BaseFloat like_after = tmp->Objf() / tmp->Normalizer();
47  KALDI_LOG << "Like_before = " << like_before <<", after = "<<like_after <<" over "<<tmp->Normalizer()<<" frames.";
48  if (tmp->Normalizer() > 0.1)
49  KALDI_ASSERT(like_after <= like_before); // should get worse after combining stats.
50  delete tmp;
51  }
52  for (size_t i = 0;i < nGauss;i++)
53  delete v[i];
54 }
55 
56 static void TestClusterUtilsVector() { // just some very basic tests of the VectorClusterable class.
57  size_t dim = 2 + Rand() % 10;
58  size_t num_vectors = 1 + Rand() % 10;
59  std::vector<VectorClusterable*> v(num_vectors);
60  for (size_t i = 0;i < num_vectors;i++) {
61  BaseFloat weight = RandUniform();
62  Vector<BaseFloat> vec(dim);
63  vec.SetRandn();
64  v[i] = new VectorClusterable(vec, weight);
65 
66  {
67  VectorClusterable *tmp = static_cast<VectorClusterable*>(v[i]->Copy()),
68  *tmp2 = static_cast<VectorClusterable*>(v[i]->Copy());
69  tmp->Add(*tmp2);
70  KALDI_ASSERT(fabs(tmp->Objf()) < 0.001);
71  if (i > 0) {
72  tmp->Add(*(v[i-1]));
73  KALDI_ASSERT(tmp->Objf() < 0.0);
74  }
75  delete tmp;
76  delete tmp2;
77  }
78  }
79 
80  for (size_t i = 0; i+1 < num_vectors; i++) {
81  BaseFloat like_before = (v[i]->Objf() + v[i+1]->Objf()) / (v[i]->Normalizer() + v[i+1]->Normalizer());
82  Clusterable *tmp = v[i]->Copy();
83  tmp->Add( *(v[i+1]));
84  BaseFloat like_after = tmp->Objf() / tmp->Normalizer();
85  KALDI_LOG << "Like_before = " << like_before <<", after = "<<like_after <<" over "<<tmp->Normalizer()<<" frames.";
86  if (tmp->Normalizer() > 0.1)
87  KALDI_ASSERT(like_after <= like_before); // should get worse after combining stats.
88  delete tmp;
89  }
90  for (size_t i = 0;i < num_vectors;i++)
91  delete v[i];
92 }
93 
94 
95 static void TestObjfPlus() {
96  ScalarClusterable a(1.0), b(2.5);
97  AssertEqual(a.Objf(), (BaseFloat)0.0);
98  AssertEqual(b.Objf(), (BaseFloat)0.0);
99  AssertEqual( a.ObjfPlus(b), -0.5 * (1.0-2.5)*(1.0-2.5)); // 0.5 because half-distance, squared = 1/4, times two points...
100  KALDI_LOG << "Non-binary Output:";
101  a.Write(std::cerr, false);
102  std::cerr << "\nBinary Output:\n";
103  a.Write(std::cerr, true);
104  std::cerr << "\n";
105 }
106 
107 static void TestObjfMinus() {
108  ScalarClusterable a(1.0), b(2.5);
109  AssertEqual(a.Objf(), 0.0);
110  AssertEqual(b.Objf(), 0.0);
111  a.Add(b);
112  AssertEqual(a.ObjfMinus(b), 0.0);
113  a.Add(b);
114  AssertEqual(a.ObjfMinus(b), -0.5 * (1.0-2.5)*(1.0-2.5));
115 }
116 
117 static void TestDistance() {
118  ScalarClusterable a(1.0), b(2.5);
119  AssertEqual(a.Objf(), 0.0);
120  AssertEqual(b.Objf(), 0.0);
121  AssertEqual(a.ObjfPlus(b), -a.Distance(b)); // since distance is negated objf-change, and original objfs were zero.
122 } // end namespace kaldi
123 
124 
126  ScalarClusterable a(1.0), b(2.5);
127  AssertEqual(a.Objf(), 0.0);
128  AssertEqual(b.Objf(), 0.0);
129  a.Add(b);
130  std::vector<Clusterable*> vec;
131  vec.push_back(&a);
132  vec.push_back(&a);
133  AssertEqual(SumClusterableObjf(vec), 2*vec[0]->Objf());
134  AssertEqual(SumClusterableNormalizer(vec), 2*vec[0]->Normalizer());
135 }
136 
137 static void TestSum() {
138  ScalarClusterable a(1.0), b(2.5);
139  std::vector<Clusterable*> vec;
140  vec.push_back(&a);
141  vec.push_back(&b);
142  Clusterable *sum = SumClusterable(vec);
143  AssertEqual(a.ObjfPlus(b), sum->Objf());
144  delete sum;
145 }
146 
148  ScalarClusterable a(1.0), b(2.5);
149  std::vector<Clusterable*> vec(4);
150  vec[1] = a.Copy(); vec[3] = a.Copy();
152  KALDI_ASSERT(vec[0] != NULL && vec[2] != NULL && vec[0]->Objf() == 0 && vec[2]->Objf() == 0 && vec[0] != vec[2] && vec[0] != vec[1]);
153  DeletePointers(&vec);
154 }
155 
156 static void TestAddToClusters() {
157  ScalarClusterable a(1.0), b(2.5), c(3.0);
158  std::vector<Clusterable*> stats(3);
159  stats[0] = a.Copy(); stats[1] = b.Copy(); stats[2] = c.Copy();
160  std::vector<int32> assignments(3);
161  assignments[0] = 1; assignments[1] = 1; assignments[2] = 4;
162  std::vector<Clusterable*> clusters;
163  std::vector<Clusterable*> clusters2;
164  AddToClusters(stats, assignments, &clusters);
165 
166  AddToClusters(stats, assignments, &clusters2); // do this twice.
167  AddToClusters(stats, assignments, &clusters2);
168 
169  KALDI_ASSERT(clusters.size() == 5);
170  KALDI_ASSERT(clusters[0] == NULL && clusters[1] != NULL && clusters[4] != NULL);
171  for (size_t i = 0;i < 5;i++) {
172  if (clusters[i] != NULL) {
173  AssertEqual(clusters2[i]->Objf(), clusters[i]->Objf()*2);
174  }
175  }
176  AssertEqual(c.Mean(), ((ScalarClusterable*)clusters[4])->Mean());
177  AssertEqual( ((ScalarClusterable*)clusters[1])->Mean(), 0.5*(1.0+2.5));
178  DeletePointers(&stats);
179  DeletePointers(&clusters);
180  DeletePointers(&clusters2);
181 }
182 
184  for (size_t p = 0;p < 100;p++) {
185  size_t n_stats = Rand() % 5;
186  n_stats = n_stats * n_stats; // more interestingly distributed.
187  std::vector<Clusterable*> stats(n_stats);
188  for (size_t i = 0;i < n_stats;i++) {
189  if (Rand() % 5 < 4) {
191  if (Rand() % 2 == 0) ptr->Add(*ptr); // make count equal 2. for more randomness.
192  stats[i] = ptr;
193  } else stats[i] = NULL; // make some zero. supposed to be robust to this.
194  }
195  size_t n_clust = 1 + Rand() % 4;
196  std::vector<int32> assignments(n_stats);
197  for (size_t i = 0;i < assignments.size();i++)
198  assignments[i] = Rand() % n_clust;
199  std::vector<Clusterable*> clusts1;
200  std::vector<Clusterable*> clusts2;
201  Clusterable *total = SumClusterable(stats);
202  if (total == NULL) { // no stats were non-NULL.
203  KALDI_ASSERT(stats.size() == 0 || stats[0] == NULL);
204  DeletePointers(&stats);
205  continue;
206  }
207  AddToClusters(stats, assignments, &clusts1);
208  AddToClustersOptimized(stats, assignments, *total, &clusts2);
209 
210  BaseFloat tot1 = SumClusterableNormalizer(stats),
211  tot2 = SumClusterableNormalizer(clusts1),
212  tot3 = SumClusterableNormalizer(clusts2);
213  AssertEqual(tot1, tot2);
214  AssertEqual(tot1, tot3);
215  KALDI_ASSERT(clusts1.size() == clusts2.size());
216  for (size_t i = 0;i < clusts1.size();i++) {
217  if (clusts1[i] != NULL || clusts2[i] != NULL) {
218  KALDI_ASSERT(clusts1[i] != NULL && clusts2[i] != NULL);
219  AssertEqual(clusts1[i]->Normalizer(), clusts2[i]->Normalizer());
220  AssertEqual( ((ScalarClusterable*)clusts1[i])->Mean(),
221  ((ScalarClusterable*)clusts2[i])->Mean() );
222  }
223  }
224  delete total;
225  DeletePointers(&clusts1);
226  DeletePointers(&clusts2);
227  DeletePointers(&stats);
228  }
229 }
230 
231 
232 static void TestClusterBottomUp() {
233  for (size_t i = 0;i < 10;i++) {
234  size_t n_clust = Rand() % 10;
235  std::vector<Clusterable*> points;
236  for (size_t j = 0;j < n_clust;j++) {
237  size_t n_points = 1 + Rand() % 5;
238  BaseFloat clust_center = (BaseFloat)j;
239  for (size_t k = 0;k < n_points;k++) points.push_back(new ScalarClusterable(clust_center + RandUniform()*0.01));
240  }
241 
242  BaseFloat max_merge_thresh = 0.1;
243  size_t min_clust = Rand() % 10; // use max_merge_thresh to control #clust.
244  std::vector<Clusterable*> clusters;
245  std::vector<int32> assignments;
246 
247  for (size_t i = 0;i < points.size();i++) {
248  size_t j = Rand() % points.size();
249  if (i != j) std::swap(points[i], points[j]); // randomize order.
250  }
251 
252 
253  float ans = ClusterBottomUp(points, max_merge_thresh, min_clust, &clusters, &assignments);
254 
255  KALDI_ASSERT(ans < 0.000001); // objf change should be negative.
256  KALDI_LOG << "Objf change from bottom-up clustering is "<<ans<<'\n';
257 
258  ClusterBottomUp(points, max_merge_thresh, min_clust, NULL, NULL); // make sure no crash.
259 
260  if (0) { // for debug if it breaks.
261  for (size_t i = 0;i < points.size();i++) {
262  KALDI_LOG << "point " << i << ": " << ((ScalarClusterable*)points[i])->Info() << " -> " << assignments[i];
263  }
264  for (size_t i = 0;i < clusters.size();i++) {
265  KALDI_LOG << "clust " << i << ": " << ((ScalarClusterable*)clusters[i])->Info();
266  }
267  }
268 
269  KALDI_ASSERT(clusters.size() == std::max(n_clust, std::min(points.size(), min_clust)));
270 
271  for (size_t i = 0;i < points.size();i++) {
272  size_t j = Rand() % points.size();
273  BaseFloat xi = ((ScalarClusterable*)points[i])->Mean(),
274  xj = ((ScalarClusterable*)points[j])->Mean();
275  if (fabs(xi-xj) < 0.011) {
276  if (clusters.size() == n_clust) KALDI_ASSERT(assignments[i] == assignments[j]);
277  } else KALDI_ASSERT(assignments[i] != assignments[j]);
278  }
279  DeletePointers(&clusters);
280  DeletePointers(&points);
281  }
282 
283 }
284 
285 
286 static void TestRefineClusters() {
287  for (size_t n = 0;n < 4;n++) {
288  // Test it by creating a random clustering and verifying that it does not make it worse, and
289  // if done with the optimal parameters, makes it optimal.
290  size_t n_clust = Rand() % 10;
291  std::vector<Clusterable*> points;
292  for (size_t j = 0;j < n_clust;j++) {
293  size_t n_points = 1 + Rand() % 5;
294  BaseFloat clust_center = (BaseFloat)j;
295  for (size_t k = 0;k < n_points;k++) points.push_back(new ScalarClusterable(clust_center + RandUniform()*0.01));
296  }
297  std::vector<Clusterable*> clusters(n_clust);
298  std::vector<int32> assignments(points.size());
299  for (size_t i = 0;i < clusters.size();i++) clusters[i] = new ScalarClusterable();
300  // assign each point to a random cluster.
301  for (size_t i = 0;i < points.size();i++) {
302  assignments[i] = Rand() % n_clust;
303  clusters[assignments[i]]->Add(*(points[i]));
304  }
305  BaseFloat points_objf = SumClusterableObjf(points),
306  clust_objf_before = SumClusterableObjf(clusters),
307  clust_objf_after;
308 
309  KALDI_ASSERT(points_objf >= clust_objf_before -
310  (std::abs(points_objf)+std::abs(clust_objf_before))*0.001);
311 
313  cfg.num_iters = 10000; // very large.
314  cfg.top_n = 2 + (Rand() % 20);
315  BaseFloat impr = RefineClusters(points, &clusters, &assignments, cfg);
316 
317  clust_objf_after = SumClusterableObjf(clusters);
318  KALDI_LOG << "TestRefineClusters: objfs are: "<<points_objf<<" "<<clust_objf_before<<" "<<clust_objf_after<<", impr = "<<impr<<'\n';
319  if (cfg.top_n >=(int32) n_clust) { // check exact.
320  KALDI_ASSERT(clust_objf_after <= 0.01*points.size());
321  }
322  AssertEqual(clust_objf_after - clust_objf_before, impr);
323  DeletePointers(&clusters);
324  DeletePointers(&points);
325 
326  }
327 }
328 
329 static void TestClusterKMeans() {
330  size_t n_points_tot = 0, n_wrong_tot = 0;
331  for (size_t n = 0;n < 3;n++) {
332  // Test it by creating a random clustering and verifying that it does not make it worse, and
333  // if done with the optimal parameters, makes it optimal.
334  size_t n_clust = Rand() % 10;
335  std::vector<Clusterable*> points;
336  std::vector<int32> assignments_ref;
337  for (size_t j = 0;j < n_clust;j++) {
338  size_t n_points = 1 + Rand() % 5;
339  BaseFloat clust_center = (BaseFloat)j;
340  for (size_t k = 0;k < n_points;k++) {
341  points.push_back(new ScalarClusterable(clust_center + RandUniform()*0.01));
342  assignments_ref.push_back(j);
343  }
344  }
345  std::vector<Clusterable*> clusters;
346  std::vector<int32> assignments;
348 
349  BaseFloat ans = ClusterKMeans(points, n_clust, &clusters, &assignments, kcfg);
350 
351  if (n < 3) ClusterKMeans(points, n_clust, NULL, NULL, kcfg); // make sure no crash.
352 
353  BaseFloat clust_objf = SumClusterableObjf(clusters);
354 
355  KALDI_LOG << "TestClusterKmeans: objf after clustering is: "<<clust_objf<<", impr is: "<<ans<<'\n';
356 
357  if (clusters.size() != n_clust) {
358  KALDI_LOG << "Warning: unexpected number of clusters "<<clusters.size()<<" vs. "<<n_clust<<"";
359  }
360  KALDI_ASSERT(assignments.size() == points.size());
361 
362  if (clust_objf < -1.0 * points.size()) { // a bit high...
363  KALDI_LOG << "Warning: ClusterKMeans did not work quite as well as expected";
364  }
365 
366  int32 num_wrong = 0;
367  for (size_t i = 0;i < points.size();i++) {
368  size_t j = Rand() % points.size();
369  if (assignments_ref[i] == assignments_ref[j]) {
370  if (assignments[i] != assignments[j]) num_wrong++;
371  } else
372  if (assignments[i] == assignments[j]) num_wrong++;
373  }
374  KALDI_LOG << "num_wrong = "<<num_wrong<<'\n';
375 
376  n_points_tot += points.size();
377  n_wrong_tot += num_wrong;
378 
379  DeletePointers(&clusters);
380  DeletePointers(&points);
381  }
382  if (n_wrong_tot*4 > n_points_tot) {
383  KALDI_LOG << "Got too many wrong in k-means test [may not be fatal, but check it out.";
384  KALDI_ASSERT(0);
385  }
386 }
387 
388 static void TestClusterKMeansVector() {
389  size_t n_points_tot = 0, n_wrong_tot = 0;
390  for (size_t n = 0; n < 3; n++) {
391  std::vector<int32> assignments_ref;
392  int32 dim = 5 + Rand() % 5;
393  // Test it by creating a random clustering and verifying that it does not make it worse, and
394  // if done with the optimal parameters, makes it optimal.
395  size_t n_clust = Rand() % 10;
396  std::vector<Clusterable*> points;
397  for (size_t j = 0; j < n_clust; j++) {
398  size_t n_points = 1 + Rand() % 5;
399 
400  Vector<BaseFloat> clust_center(dim);
401  clust_center.SetRandn();
402  for (size_t k = 0; k < n_points; k++) {
403  Vector<BaseFloat> point(dim);
404  point.SetRandn();
405  point.Scale(0.01);
406  point.AddVec(1.0, clust_center);
407  BaseFloat weight = 0.5 + 0.432 * (Rand() % 5);
408  points.push_back(new VectorClusterable(point, weight));
409  assignments_ref.push_back(j);
410  }
411  }
412  std::vector<Clusterable*> clusters;
413  std::vector<int32> assignments;
415  kcfg.num_tries = 5;
416 
417  BaseFloat ans = ClusterKMeans(points, n_clust, &clusters, &assignments, kcfg);
418 
419  if (n < 3) ClusterKMeans(points, n_clust, NULL, NULL, kcfg); // make sure no crash.
420 
421  BaseFloat clust_objf = SumClusterableObjf(clusters);
422 
423  KALDI_LOG << "TestClusterKmeans: objf after clustering is: "<<clust_objf<<", impr is: "<<ans<<'\n';
424 
425  if (clusters.size() != n_clust) {
426  KALDI_LOG << "Warning: unexpected number of clusters "<<clusters.size()<<" vs. "<<n_clust<<"";
427  }
428  KALDI_ASSERT(assignments.size() == points.size());
429 
430  if (clust_objf < -1.0 * points.size()) { // a bit high...
431  KALDI_LOG << "Warning: ClusterKMeans did not work quite as well as expected";
432  }
433 
434 
435  int32 num_wrong = 0;
436  for (size_t i = 0;i < points.size();i++) {
437  size_t j = Rand() % points.size();
438  if (assignments_ref[i] == assignments_ref[j]) {
439  if (assignments[i] != assignments[j]) num_wrong++;
440  } else
441  if (assignments[i] == assignments[j]) num_wrong++;
442  }
443 
444  n_points_tot += points.size();
445  n_wrong_tot += num_wrong;
446 
447  KALDI_LOG << "num_wrong = " << num_wrong << ", num-points-tot = "
448  << n_points_tot;
449 
450  DeletePointers(&clusters);
451  DeletePointers(&points);
452  }
453  if (n_wrong_tot*4 > n_points_tot) {
454  KALDI_LOG << "Got too many wrong in k-means test [may not be fatal, but check it out.";
455  KALDI_ASSERT(0);
456  }
457 }
458 
459 
460 
461 static void TestTreeCluster() {
462  size_t n_points_tot = 0, n_wrong_tot = 0;
463  for (size_t n = 0;n < 10;n++) {
464 
465  int32 n_clust = Rand() % 10;
466  std::vector<Clusterable*> points;
467  for (int32 j = 0;j < n_clust;j++) {
468  int32 n_points = 1 + Rand() % 5;
469  BaseFloat clust_center = (BaseFloat)j;
470  for (int32 k = 0;k < n_points;k++) points.push_back(new ScalarClusterable(clust_center + RandUniform()*0.01));
471  }
472  std::vector<Clusterable*> clusters_ext;
473  std::vector<int32> assignments;
474  std::vector<int32> clust_assignments;
475  TreeClusterOptions tcfg;
476  tcfg.thresh = 0.01; // should prevent us splitting things in same bucket.
477  int32 num_leaves = 0;
478  BaseFloat ans = TreeCluster(points, n_clust, &clusters_ext, &assignments, &clust_assignments, &num_leaves, tcfg);
479 
480  if (n < 3) TreeCluster(points, n_clust, NULL, NULL, NULL, NULL, tcfg); // make sure no crash
481 
482  KALDI_ASSERT(num_leaves == n_clust);
483  KALDI_ASSERT(clusters_ext.size() >= static_cast<size_t>(n_clust));
484  std::vector<Clusterable*> clusters(clusters_ext);
485  clusters.resize(n_clust); // ignore non-leaves.
486  BaseFloat clust_objf = SumClusterableObjf(clusters);
487 
488  KALDI_LOG << "TreeCluster: objf after clustering is: "<<clust_objf<<", impr is: "<<ans<<'\n';
489 
490  if (n < 2) // avoid generating too much output.
491  KALDI_LOG << "Num nodes is "<<clusters_ext.size() <<", leaves "<<num_leaves;
492  for (int32 i = 0;i<static_cast<int32>(clusters_ext.size());i++) {
493  if (n < 2) // avoid generating too much output.
494  KALDI_LOG << "Cluster "<<i<<": "<<((ScalarClusterable*)clusters_ext[i])->Info()<<", parent is: "<< clust_assignments[i]<<"";
495  KALDI_ASSERT(clust_assignments[i]>i || (i+1 == static_cast<int32>(clusters_ext.size()) && clust_assignments[i] == i));
496  if (i == static_cast<int32>(clusters_ext.size())-1)
497  KALDI_ASSERT(clust_assignments[i] == i); // top node.
498  }
499  DeletePointers(&clusters_ext);
500  DeletePointers(&points);
501  }
502  if (n_wrong_tot*4 > n_points_tot) {
503  KALDI_LOG << "Got too many wrong in k-means test [may not be fatal, but check it out.";
504  KALDI_ASSERT(0);
505  }
506 }
507 
508 
509 static void TestClusterTopDown() {
510  size_t n_points_tot = 0, n_wrong_tot = 0;
511  for (size_t n = 0;n < 10;n++) {
512 
513  size_t n_clust = Rand() % 10;
514  std::vector<Clusterable*> points;
515  for (size_t j = 0;j < n_clust;j++) {
516  size_t n_points = 1 + Rand() % 5;
517  BaseFloat clust_center = (BaseFloat)j;
518  for (size_t k = 0;k < n_points;k++) points.push_back(new ScalarClusterable(clust_center + RandUniform()*0.01));
519  }
520  std::vector<Clusterable*> clusters;
521  std::vector<int32> assignments;
522  TreeClusterOptions tcfg;
523  tcfg.thresh = 0.01; // should prevent us splitting things in same bucket.
524 
525 
526  BaseFloat ans = ClusterTopDown(points, n_clust, &clusters, &assignments, tcfg);
527 
528  if (n < 3) ClusterTopDown(points, n_clust, NULL, NULL, tcfg); // make sure doesn't crash.
529 
530  BaseFloat clust_objf = SumClusterableObjf(clusters);
531 
532  KALDI_LOG << "ClusterTopDown: objf after clustering is: "<<clust_objf<<", impr is: "<<ans<<'\n';
533 
534  if (n<=2) // avoid generating too much output.
535  KALDI_LOG << "Num nodes is "<<clusters.size()<<'\n';
536  for (size_t i = 0;i < clusters.size();i++) {
537  if (n<=2) { // avoid generating too much output.
538  KALDI_LOG << "Cluster "<<i<<": "<<((ScalarClusterable*)clusters[i])->Info()<<", objf is: "<<clusters[i]->Objf()<<"";
539  }
540  }
541  KALDI_ASSERT(clusters.size() == n_clust);
542  DeletePointers(&clusters);
543  DeletePointers(&points);
544  }
545  if (n_wrong_tot*4 > n_points_tot) {
546  KALDI_LOG << "Got too many wrong in k-means test [may not be fatal, but check it out.";
547  KALDI_ASSERT(0);
548  }
549 }
550 
551 
552 
553 } // end namespace kaldi
554 
555 int main() {
556  using namespace kaldi;
557 
558  for (size_t i = 0; i < 2; i++) {
561  }
563  TestObjfPlus();
564  TestObjfMinus();
565  TestDistance();
567  TestSum();
571  TestTreeCluster();
576 }
static void TestClusterUtils()
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
static void TestClusterTopDown()
virtual void Add(const Clusterable &other)=0
Add other stats.
int main()
void DeletePointers(std::vector< A *> *v)
Deletes any non-NULL pointers in the vector v, and sets the corresponding entries of v to NULL...
Definition: stl-utils.h:184
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
Definition: kaldi-math.h:151
BaseFloat RefineClusters(const std::vector< Clusterable *> &points, std::vector< Clusterable *> *clusters, std::vector< int32 > *assignments, RefineClustersOptions cfg)
RefineClusters is mainly used internally by other clustering algorithms.
static void TestEnsureClusterableVectorNotNull()
virtual BaseFloat Objf() const =0
Return the objective function associated with the stats [assuming ML estimation]. ...
BaseFloat SumClusterableNormalizer(const std::vector< Clusterable *> &vec)
Returns the total normalizer (usually count) of the cluster (pointers may be NULL).
BaseFloat ClusterKMeans(const std::vector< Clusterable *> &points, int32 num_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, ClusterKMeansOptions cfg)
ClusterKMeans is a K-means-like clustering algorithm.
static void TestRefineClusters()
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
float RandGauss(struct RandomState *state=NULL)
Definition: kaldi-math.h:155
kaldi::int32 int32
VectorClusterable wraps vectors in a form accessible to generic clustering algorithms.
virtual void Add(const Clusterable &other_in)
Add other stats.
virtual Clusterable * Copy() const =0
Return a copy of this object.
virtual void Add(const Clusterable &other_in)
Add other stats.
float BaseFloat
Definition: kaldi-types.h:29
static void TestObjfMinus()
void EnsureClusterableVectorNotNull(std::vector< Clusterable *> *stats)
Fills in any (NULL) holes in "stats" vector, with empty stats, because certain algorithms require non...
static void TestDistance()
BaseFloat ClusterBottomUp(const std::vector< Clusterable *> &points, BaseFloat max_merge_thresh, int32 min_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out)
A bottom-up clustering algorithm.
struct rnnlm::@11::@12 n
void AddToClusters(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, std::vector< Clusterable *> *clusters)
Given stats and a vector "assignments" of the same size (that maps to cluster indices), sums the stats up into "clusters." It will add to any stats already present in "clusters" (although typically "clusters" will be empty when called), and it will extend with NULL pointers for any unseen indices.
static void TestAddToClustersOptimized()
virtual Clusterable * Copy() const
Return a copy of this object.
void AddToClustersOptimized(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, const Clusterable &total, std::vector< Clusterable *> *clusters)
AddToClustersOptimized does the same as AddToClusters (it sums up the stats within each cluster...
BaseFloat ClusterTopDown(const std::vector< Clusterable *> &points, int32 max_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, TreeClusterOptions cfg)
A clustering algorithm that internally uses TreeCluster, but does not give you the information about ...
void Scale(Real alpha)
Multiplies all elements by this constant.
static void TestObjfPlus()
static void TestSumObjfAndSumNormalizer()
int Rand(struct RandomState *state)
Definition: kaldi-math.cc:45
void SetRandn()
Set vector to random normally-distributed noise.
virtual BaseFloat Normalizer() const =0
Return the normalizer (typically, count) associated with the stats.
static void TestTreeCluster()
A class representing a vector.
Definition: kaldi-vector.h:406
static void TestClusterBottomUp()
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
static void TestClusterUtilsVector()
virtual BaseFloat Objf() const
Return the objective function associated with the stats [assuming ML estimation]. ...
static void TestAddToClusters()
static void TestClusterKMeansVector()
static void AssertEqual(float a, float b, float relative_tolerance=0.001)
assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))
Definition: kaldi-math.h:276
virtual BaseFloat Objf() const
Return the objective function associated with the stats [assuming ML estimation]. ...
static void TestClusterKMeans()
BaseFloat SumClusterableObjf(const std::vector< Clusterable *> &vec)
Returns the total objective function after adding up all the statistics in the vector (pointers may b...
BaseFloat TreeCluster(const std::vector< Clusterable *> &points, int32 max_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out, TreeClusterOptions cfg)
TreeCluster is a top-down clustering algorithm, using a binary tree (not necessarily balanced)...
GaussClusterable wraps Gaussian statistics in a form accessible to generic clustering algorithms...
#define KALDI_LOG
Definition: kaldi-error.h:153
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
static void TestSum()
Clusterable * SumClusterable(const std::vector< Clusterable *> &vec)
Sums stats (ptrs may be NULL). Returns NULL if no non-NULL stats present.
void Copy(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
Copies elements from src into tgt as given by copy_from_indices.
Definition: cu-math.cc:173
ScalarClusterable clusters scalars with x^2 loss.