cluster-utils.cc
1 // tree/cluster-utils.cc
2 
3 // Copyright 2012 Arnab Ghoshal
4 // Copyright 2009-2011 Microsoft Corporation; Saarland University
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABILITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include <functional>
22 #include <queue>
23 #include <vector>
24 using std::vector;
25 
26 #include "base/kaldi-math.h"
27 #include "util/stl-utils.h"
28 #include "tree/cluster-utils.h"
29 
30 namespace kaldi {
31 
32 typedef uint16 uint_smaller;
33 typedef int16 int_smaller;
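// Note: these narrower integer types are used in the priority-queue elements
// and per-point bookkeeping structures below to save memory; the clustering
// routines assert that the number of points fits in uint_smaller (the value
// -1 is reserved as "undefined").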
34 
35 // ============================================================================
36 // Some convenience functions used in the clustering routines
37 // ============================================================================
38 
39 BaseFloat SumClusterableObjf(const std::vector<Clusterable*> &vec) {
40  BaseFloat ans = 0.0;
41  for (size_t i = 0; i < vec.size(); i++) {
42  if (vec[i] != NULL) {
43  BaseFloat objf = vec[i]->Objf();
44  if (KALDI_ISNAN(objf)) {
45  KALDI_WARN << "SumClusterableObjf, NaN objf";
46  } else {
47  ans += objf;
48  }
49  }
50  }
51  return ans;
52 }
53 
54 BaseFloat SumClusterableNormalizer(const std::vector<Clusterable*> &vec) {
55  BaseFloat ans = 0.0;
56  for (size_t i = 0; i < vec.size(); i++) {
57  if (vec[i] != NULL) {
58  BaseFloat objf = vec[i]->Normalizer();
59  if (KALDI_ISNAN(objf)) {
60  KALDI_WARN << "SumClusterableNormalizer, NaN normalizer";
61  } else {
62  ans += objf;
63  }
64  }
65  }
66  return ans;
67 }
68 
69 Clusterable* SumClusterable(const std::vector<Clusterable*> &vec) {
70  Clusterable *ans = NULL;
71  for (size_t i = 0; i < vec.size(); i++) {
72  if (vec[i] != NULL) {
73  if (ans == NULL)
74  ans = vec[i]->Copy();
75  else
76  ans->Add(*(vec[i]));
77  }
78  }
79  return ans;
80 }
81 
82 void EnsureClusterableVectorNotNull(std::vector<Clusterable*> *stats) {
83  KALDI_ASSERT(stats != NULL);
84  std::vector<Clusterable*>::iterator itr = stats->begin(), end = stats->end();
85  if (itr == end) return; // Nothing to do.
86  Clusterable *nonNullExample = NULL;
87  for (; itr != end; ++itr) {
88  if (*itr != NULL) {
89  nonNullExample = *itr;
90  break;
91  }
92  }
93  if (nonNullExample == NULL) {
94  KALDI_ERR << "All stats are NULL."; // crash. logic error.
95  }
96  itr = stats->begin();
97  Clusterable *nonNullExampleCopy = nonNullExample->Copy();
98  // sets stats to zero. do this just once (on copy) for efficiency.
99  nonNullExampleCopy->SetZero();
100  for (; itr != end; ++itr) {
101  if (*itr == NULL) {
102  *itr = nonNullExampleCopy->Copy();
103  }
104  }
105  delete nonNullExampleCopy;
106 }
107 
108 void AddToClusters(const std::vector<Clusterable*> &stats,
109  const std::vector<int32> &assignments,
110  std::vector<Clusterable*> *clusters) {
111  KALDI_ASSERT(assignments.size() == stats.size());
112  int32 size = stats.size();
113  if (size == 0) return; // Nothing to do.
114  KALDI_ASSERT(clusters != NULL);
115  int32 max_assignment = *std::max_element(assignments.begin(),
116  assignments.end());
117  if (static_cast<int32> (clusters->size()) <= max_assignment)
118  clusters->resize(max_assignment + 1, NULL); // extend with NULLs.
119  for (int32 i = 0; i < size; i++) {
120  if (stats[i] != NULL) {
121  if ((*clusters)[assignments[i]] == NULL)
122  (*clusters)[assignments[i]] = stats[i]->Copy();
123  else
124  (*clusters)[assignments[i]]->Add(*(stats[i]));
125  }
126  }
127 }
128 
129 
130 void AddToClustersOptimized(const std::vector<Clusterable*> &stats,
131  const std::vector<int32> &assignments,
132  const Clusterable &total,
133  std::vector<Clusterable*> *clusters) {
134 #ifdef KALDI_PARANOID
135  // Make sure "total" is actually the sum of stats in "stats".
136  {
137  BaseFloat stats_norm = SumClusterableNormalizer(stats),
138  tot_norm = total.Normalizer();
139  AssertEqual(stats_norm, tot_norm, 0.01);
140  }
141 #endif
142 
143  KALDI_ASSERT(assignments.size() == stats.size());
144  int32 size = stats.size();
145  if (size == 0) return; // Nothing to do.
146  KALDI_ASSERT(clusters != NULL);
147  int32 num_clust = 1 + *std::max_element(assignments.begin(),
148  assignments.end());
149  if (static_cast<int32> (clusters->size()) < num_clust)
150  clusters->resize(num_clust, NULL); // extend with NULLs.
151  std::vector<int32> num_stats_for_cluster(num_clust, 0);
152  int32 num_total_stats = 0;
153  for (int32 i = 0; i < size; i++) {
154  if (stats[i] != NULL) {
155  num_total_stats++;
156  num_stats_for_cluster[assignments[i]]++;
157  }
158  }
159  if (num_total_stats == 0) return; // Nothing to do.
160 
161  // it will only ever be efficient to subtract for at most one index.
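 // (Worked example: if 900 of 1000 stats are assigned to cluster c, it is
 // cheaper to add the precomputed "total" to c once and subtract the 100
 // stats that belong to other clusters than to add the 900 stats one by one.)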
162  int32 subtract_index = -1;
163  for (int32 c = 0; c < num_clust; c++) {
164  if (num_stats_for_cluster[c] > num_total_stats - num_stats_for_cluster[c]) {
165  subtract_index = c;
166  if ((*clusters)[c] == NULL)
167  (*clusters)[c] = total.Copy();
168  else
169  (*clusters)[c]->Add(total);
170  break;
171  }
172  }
173 
174  for (int32 i = 0; i < size; i++) {
175  if (stats[i] != NULL) {
176  int32 assignment = assignments[i];
177  if (assignment != (int32) subtract_index) {
178  if ((*clusters)[assignment] == NULL)
179  (*clusters)[assignment] = stats[i]->Copy();
180  else
181  (*clusters)[assignment]->Add(*(stats[i]));
182  }
183  if (subtract_index != -1 && assignment != subtract_index)
184  (*clusters)[subtract_index]->Sub(*(stats[i]));
185  }
186  }
187 }
188 
189 // ============================================================================
190 // Bottom-up clustering routines
191 // ============================================================================
192 
193 class BottomUpClusterer {
194  public:
195  BottomUpClusterer(const std::vector<Clusterable*> &points,
196  BaseFloat max_merge_thresh,
197  int32 min_clust,
198  std::vector<Clusterable*> *clusters_out,
199  std::vector<int32> *assignments_out)
200  : ans_(0.0), points_(points), max_merge_thresh_(max_merge_thresh),
201  min_clust_(min_clust), clusters_(clusters_out != NULL? clusters_out
202  : &tmp_clusters_), assignments_(assignments_out != NULL ?
203  assignments_out : &tmp_assignments_) {
204  nclusters_ = npoints_ = points.size();
205  dist_vec_.resize((npoints_ * (npoints_ - 1)) / 2);
206  }
207 
208  BaseFloat Cluster();
209  ~BottomUpClusterer() { DeletePointers(&tmp_clusters_); }
210 
211  private:
212  void Renumber();
213  void InitializeAssignments();
214  void SetInitialDistances();
215  bool CanMerge(int32 i, int32 j, BaseFloat dist);
216  /// CanMerge returns true if i and j are existing clusters, and the distance
217  /// (negated objf-change) "dist" is accurate for this pair, so the merge can
218  /// be applied without recomputing the distance.
219  void MergeClusters(int32 i, int32 j);
220  /// Merge j into i and delete j.
221  void ReconstructQueue();
222 
223  void SetDistance(int32 i, int32 j);
224  BaseFloat& Distance(int32 i, int32 j) {
225  KALDI_ASSERT(i < npoints_ && j < i);
226  return dist_vec_[(i * (i - 1)) / 2 + j];
227  }
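 // dist_vec_ stores the strictly lower triangle of the pairwise distance
 // matrix, packed row by row, so pair (i, j) with j < i lives at index
 // (i*(i-1))/2 + j.  E.g. with 4 points the layout is:
 //   0:(1,0)  1:(2,0)  2:(2,1)  3:(3,0)  4:(3,1)  5:(3,2)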
228 
229  BaseFloat ans_;
230  const std::vector<Clusterable*> &points_;
231  BaseFloat max_merge_thresh_;
232  int32 min_clust_;
233  std::vector<Clusterable*> *clusters_;
234  std::vector<int32> *assignments_;
235 
236  std::vector<Clusterable*> tmp_clusters_;
237  std::vector<int32> tmp_assignments_;
238 
239  std::vector<BaseFloat> dist_vec_;
240  int32 nclusters_;
241  int32 npoints_;
242  typedef std::pair<BaseFloat, std::pair<uint_smaller, uint_smaller> > QueueElement;
243  // Priority queue using greater (lowest distances are highest priority).
244  typedef std::priority_queue<QueueElement, std::vector<QueueElement>,
245  std::greater<QueueElement> > QueueType;
246  QueueType queue_;
247 };
248 
249 BaseFloat BottomUpClusterer::Cluster() {
250  KALDI_VLOG(2) << "Initializing cluster assignments.";
251  InitializeAssignments();
252  KALDI_VLOG(2) << "Setting initial distances.";
253  SetInitialDistances();
254 
255  KALDI_VLOG(2) << "Clustering...";
256  while (nclusters_ > min_clust_ && !queue_.empty()) {
257  std::pair<BaseFloat, std::pair<uint_smaller, uint_smaller> > pr = queue_.top();
258  BaseFloat dist = pr.first;
259  int32 i = (int32) pr.second.first, j = (int32) pr.second.second;
260  queue_.pop();
261  if (CanMerge(i, j, dist)) MergeClusters(i, j);
262  }
263  KALDI_VLOG(2) << "Renumbering clusters to contiguous numbers.";
264  Renumber();
265  return ans_;
266 }
267 
268 void BottomUpClusterer::Renumber() {
269  KALDI_VLOG(2) << "Freeing up distance vector.";
270  {
271  vector<BaseFloat> tmp;
272  tmp.swap(dist_vec_);
273  }
274 
275 // Commented the following out since it was causing the process to take up too
276 // much memory with larger models. While the swap() method of STL types swaps
277 // the data pointers, std::swap() creates a temporary copy. -Arnab
278 // KALDI_VLOG(2) << "Freeing up the queue";
279 // // first free up memory by getting rid of queue. this is a special trick
280 // // to force STL to free memory.
281 // {
282 // QueueType tmp;
283 // std::swap(tmp, queue_);
284 // }
285 
286  // called after clustering, renumbers to make clusters contiguously
287  // numbered. also processes assignments_ to remove chains of references.
288  KALDI_VLOG(2) << "Creating new copy of non-NULL clusters.";
289  std::vector<uint_smaller> mapping(npoints_, static_cast<uint_smaller> (-1)); // mapping from intermediate to final clusters.
290  std::vector<Clusterable*> new_clusters(nclusters_);
291  int32 clust = 0;
292  for (int32 i = 0; i < npoints_; i++) {
293  if ((*clusters_)[i] != NULL) {
294  KALDI_ASSERT(clust < nclusters_);
295  new_clusters[clust] = (*clusters_)[i];
296  mapping[i] = clust;
297  clust++;
298  }
299  }
300  KALDI_ASSERT(clust == nclusters_);
301 
302  KALDI_VLOG(2) << "Creating new copy of assignments.";
303  std::vector<int32> new_assignments(npoints_);
304  for (int32 i = 0; i < npoints_; i++) { // now reprocess assignments_.
305  int32 ii = i;
306  while ((*assignments_)[ii] != ii)
307  ii = (*assignments_)[ii]; // follow the chain.
308  KALDI_ASSERT((*clusters_)[ii] != NULL); // cannot have assignment to nonexistent cluster.
309  KALDI_ASSERT(mapping[ii] != static_cast<uint_smaller>(-1));
310  new_assignments[i] = mapping[ii];
311  }
312  clusters_->swap(new_clusters);
313  assignments_->swap(new_assignments);
314 }
315 
316 void BottomUpClusterer::InitializeAssignments() {
317  clusters_->resize(npoints_);
318  assignments_->resize(npoints_);
319  for (int32 i = 0; i < npoints_; i++) { // initialize as 1-1 mapping.
320  (*clusters_)[i] = points_[i]->Copy();
321  (*assignments_)[i] = i;
322  }
323 }
324 
325 void BottomUpClusterer::SetInitialDistances() {
326  for (int32 i = 0; i < npoints_; i++) {
327  for (int32 j = 0; j < i; j++) {
328  BaseFloat dist = (*clusters_)[i]->Distance(*((*clusters_)[j]));
329  dist_vec_[(i * (i - 1)) / 2 + j] = dist;
330  if (dist <= max_merge_thresh_)
331  queue_.push(std::make_pair(dist, std::make_pair(static_cast<uint_smaller>(i),
332  static_cast<uint_smaller>(j))));
333  }
334  }
335 }
336 
337 bool BottomUpClusterer::CanMerge(int32 i, int32 j, BaseFloat dist) {
338  KALDI_ASSERT(i != j && i < npoints_ && j < npoints_);
339  if ((*clusters_)[i] == NULL || (*clusters_)[j] == NULL)
340  return false;
341  BaseFloat cached_dist = dist_vec_[(i * (i - 1)) / 2 + j];
342  return (std::fabs(cached_dist - dist) <= 1.0e-05 * std::fabs(dist));
343 }
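// Note on the queue strategy: entries are never removed when a distance
// changes; SetDistance() simply pushes a fresh pair, and a stale entry that is
// popped later fails the check above (its distance no longer matches the value
// cached in dist_vec_) and is ignored by Cluster().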
344 
345 void BottomUpClusterer::MergeClusters(int32 i, int32 j) {
346  KALDI_ASSERT(i != j && i < npoints_ && j < npoints_);
347  (*clusters_)[i]->Add(*((*clusters_)[j]));
348  delete (*clusters_)[j];
349  (*clusters_)[j] = NULL;
350  // note that we may have to follow the chain within "assignment_" to get
351  // final assignments.
352  (*assignments_)[j] = i;
353  // subtract negated objective function change, i.e. add objective function
354  // change.
355  ans_ -= dist_vec_[(i * (i - 1)) / 2 + j];
356  nclusters_--;
357  // Now update "distances".
358  for (int32 k = 0; k < npoints_; k++) {
359  if (k != i && (*clusters_)[k] != NULL) {
360  if (k < i)
361  SetDistance(i, k); // SetDistance requires k < i.
362  else
363  SetDistance(k, i);
364  }
365  }
366 }
367 
368 void BottomUpClusterer::ReconstructQueue() {
369  // empty queue [since there is no clear()]
370  {
371  QueueType tmp;
372  std::swap(tmp, queue_);
373  }
374  for (int32 i = 0; i < npoints_; i++) {
375  if ((*clusters_)[i] != NULL) {
376  for (int32 j = 0; j < i; j++) {
377  if ((*clusters_)[j] != NULL) {
378  BaseFloat dist = dist_vec_[(i * (i - 1)) / 2 + j];
379  if (dist <= max_merge_thresh_) {
380  queue_.push(std::make_pair(dist, std::make_pair(
381  static_cast<uint_smaller>(i), static_cast<uint_smaller>(j))));
382  }
383  }
384  }
385  }
386  }
387 }
388 
389 void BottomUpClusterer::SetDistance(int32 i, int32 j) {
390  KALDI_ASSERT(i < npoints_ && j < i && (*clusters_)[i] != NULL
391  && (*clusters_)[j] != NULL);
392  BaseFloat dist = (*clusters_)[i]->Distance(*((*clusters_)[j]));
393  dist_vec_[(i * (i - 1)) / 2 + j] = dist; // set the distance in the array.
394  if (dist < max_merge_thresh_) {
395  queue_.push(std::make_pair(dist, std::make_pair(static_cast<uint_smaller>(i),
396  static_cast<uint_smaller>(j))));
397  }
398  // Control memory use by getting rid of orphaned queue entries every time
399  // the queue reaches at least twice its maximum possible number of valid entries.
400  if (queue_.size() >= static_cast<size_t> (npoints_ * npoints_)) {
401  ReconstructQueue();
402  }
403 }
404 
405 
406 
407 BaseFloat ClusterBottomUp(const std::vector<Clusterable*> &points,
408  BaseFloat max_merge_thresh,
409  int32 min_clust,
410  std::vector<Clusterable*> *clusters_out,
411  std::vector<int32> *assignments_out) {
412  KALDI_ASSERT(max_merge_thresh >= 0.0 && min_clust >= 0);
413  KALDI_ASSERT(!ContainsNullPointers(points));
414  int32 npoints = points.size();
415  // make sure fits in uint_smaller and does not hit the -1 which is reserved.
416  KALDI_ASSERT(sizeof(uint_smaller)==sizeof(uint32) ||
417  npoints < static_cast<int32>(static_cast<uint_smaller>(-1)));
418 
419  KALDI_VLOG(2) << "Initializing clustering object.";
420  BottomUpClusterer bc(points, max_merge_thresh, min_clust, clusters_out, assignments_out);
421  BaseFloat ans = bc.Cluster();
422  if (clusters_out) KALDI_ASSERT(!ContainsNullPointers(*clusters_out));
423  return ans;
424 }
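// Illustrative usage sketch (not part of the original file): it assumes the
// ScalarClusterable class from tree/clusterable-classes.h and a hypothetical
// vector "data"; the threshold and cluster count here are arbitrary.
//
//   std::vector<Clusterable*> points;
//   for (size_t i = 0; i < data.size(); i++)
//     points.push_back(new ScalarClusterable(data[i]));
//   std::vector<Clusterable*> clusters;
//   std::vector<int32> assignments;
//   BaseFloat objf_change =
//       ClusterBottomUp(points, std::numeric_limits<BaseFloat>::max(),
//                       10 /* min_clust */, &clusters, &assignments);
//   // ... use clusters / assignments ...
//   DeletePointers(&clusters);
//   DeletePointers(&points);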
425 
426 
427 // ============================================================================
428 // Compartmentalized bottom-up clustering routines
429 // ============================================================================
430 
431 struct CompBotClustElem {
432  BaseFloat dist;
433  int32 compartment, point1, point2;
434  CompBotClustElem(BaseFloat d, int32 comp, int32 i, int32 j)
435  : dist(d), compartment(comp), point1(i), point2(j) {}
436 };
437 
438 bool operator > (const CompBotClustElem &a, const CompBotClustElem &b) {
439  return a.dist > b.dist;
440 }
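// Because the queue below is declared with std::greater<CompBotClustElem>,
// which uses this operator, the pair with the smallest distance (i.e. the best
// merge candidate) is always at the top of the priority queue.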
441 
442 class CompartmentalizedBottomUpClusterer {
443  public:
444  CompartmentalizedBottomUpClusterer(
445  const vector< vector<Clusterable*> > &points, BaseFloat max_merge_thresh,
446  int32 min_clust)
447  : points_(points), max_merge_thresh_(max_merge_thresh),
448  min_clust_(min_clust) {
449  ncompartments_ = points.size();
450  nclusters_ = 0;
451  npoints_.resize(ncompartments_);
452  for (int32 comp = 0; comp < ncompartments_; comp++) {
453  npoints_[comp] = points[comp].size();
454  nclusters_ += npoints_[comp];
455  }
456  }
457  BaseFloat Cluster(vector< vector<Clusterable*> > *clusters_out,
458  vector< vector<int32> > *assignments_out);
459  ~CompartmentalizedBottomUpClusterer() {
460  for (vector< vector<Clusterable*> >::iterator itr = clusters_.begin(),
461  end = clusters_.end(); itr != end; ++itr)
462  DeletePointers(&(*itr));
463  }
464 
465  private:
466  // Renumbers to make clusters contiguously numbered. Called after clustering.
467  // Also processes assignments_ to remove chains of references.
468  void Renumber(int32 compartment);
469  void InitializeAssignments();
470  void SetInitialDistances();
471  bool CanMerge(int32 compartment, int32 i, int32 j, BaseFloat dist);
472  /// CanMerge returns true if i and j are existing clusters, and the distance
473  /// (negated objf-change) "dist" is accurate for this pair, so the merge can
474  /// be applied without recomputing the distance.
475  BaseFloat MergeClusters(int32 compartment, int32 i, int32 j);
476  /// Merge j into i and delete j. Returns obj function change.
477  void ReconstructQueue();
478 
479  void SetDistance(int32 compartment, int32 i, int32 j);
480 
481  const vector< vector<Clusterable*> > &points_;
482  BaseFloat max_merge_thresh_;
483  int32 min_clust_;
484  vector< vector<Clusterable*> > clusters_;
485  vector< vector<int32> > assignments_;
486  vector< vector<BaseFloat> > dist_vec_;
487  int32 ncompartments_, nclusters_;
488  vector<int32> npoints_;
489  // Priority queue using greater (lowest distances are highest priority).
490  typedef std::priority_queue< CompBotClustElem, std::vector<CompBotClustElem>,
491  std::greater<CompBotClustElem> > QueueType;
492  QueueType queue_;
493 };
494 
495 BaseFloat CompartmentalizedBottomUpClusterer::Cluster(
496  vector< vector<Clusterable*> > *clusters_out,
497  vector< vector<int32> > *assignments_out) {
498  InitializeAssignments();
499  SetInitialDistances();
500  BaseFloat total_obj_change = 0;
501 
502  while (nclusters_ > min_clust_ && !queue_.empty()) {
503  CompBotClustElem qelem = queue_.top();
504  queue_.pop();
505  if (CanMerge(qelem.compartment, qelem.point1, qelem.point2, qelem.dist))
506  total_obj_change += MergeClusters(qelem.compartment, qelem.point1,
507  qelem.point2);
508  }
509  for (int32 comp = 0; comp < ncompartments_; comp++)
510  Renumber(comp);
511  if (clusters_out != NULL) clusters_out->swap(clusters_);
512  if (assignments_out != NULL) assignments_out->swap(assignments_);
513  return total_obj_change;
514 }
515 
516 void CompartmentalizedBottomUpClusterer::Renumber(int32 comp) {
517  // first free up memory by getting rid of queue. this is a special trick
518  // to force STL to free memory.
519  {
520  QueueType tmp;
521  std::swap(tmp, queue_);
522  }
523 
524  // First find the number of surviving clusters in the compartment.
525  int32 clusts_in_compartment = 0;
526  for (int32 i = 0; i < npoints_[comp]; i++) {
527  if (clusters_[comp][i] != NULL)
528  clusts_in_compartment++;
529  }
530  KALDI_ASSERT(clusts_in_compartment <= nclusters_);
531 
532  // mapping from intermediate to final clusters.
533  vector<uint_smaller> mapping(npoints_[comp], static_cast<uint_smaller> (-1));
534  vector<Clusterable*> new_clusters(clusts_in_compartment);
535 
536  // Now copy the surviving clusters in a fresh array.
537  clusts_in_compartment = 0;
538  for (int32 i = 0; i < npoints_[comp]; i++) {
539  if (clusters_[comp][i] != NULL) {
540  new_clusters[clusts_in_compartment] = clusters_[comp][i];
541  mapping[i] = clusts_in_compartment;
542  clusts_in_compartment++;
543  }
544  }
545 
546  // Next, process the assignments.
547  std::vector<int32> new_assignments(npoints_[comp]);
548  for (int32 i = 0; i < npoints_[comp]; i++) {
549  int32 ii = i;
550  while (assignments_[comp][ii] != ii)
551  ii = assignments_[comp][ii]; // follow the chain.
552  // cannot assign to nonexistent cluster.
553  KALDI_ASSERT(clusters_[comp][ii] != NULL);
554  KALDI_ASSERT(mapping[ii] != static_cast<uint_smaller>(-1));
555  new_assignments[i] = mapping[ii];
556  }
557  clusters_[comp].swap(new_clusters);
558  assignments_[comp].swap(new_assignments);
559 }
560 
561 void CompartmentalizedBottomUpClusterer::InitializeAssignments() {
562  clusters_.resize(ncompartments_);
563  assignments_.resize(ncompartments_);
564  for (int32 comp = 0; comp < ncompartments_; comp++) {
565  clusters_[comp].resize(npoints_[comp]);
566  assignments_[comp].resize(npoints_[comp]);
567  for (int32 i = 0; i < npoints_[comp]; i++) { // initialize as 1-1 mapping.
568  clusters_[comp][i] = points_[comp][i]->Copy();
569  assignments_[comp][i] = i;
570  }
571  }
572 }
573 
574 void CompartmentalizedBottomUpClusterer::SetInitialDistances() {
575  dist_vec_.resize(ncompartments_);
576  for (int32 comp = 0; comp < ncompartments_; comp++) {
577  dist_vec_[comp].resize((npoints_[comp] * (npoints_[comp] - 1)) / 2);
578  for (int32 i = 0; i < npoints_[comp]; i++)
579  for (int32 j = 0; j < i; j++)
580  SetDistance(comp, i, j);
581  }
582 }
583 
584 bool CompartmentalizedBottomUpClusterer::CanMerge(int32 comp, int32 i, int32 j,
585  BaseFloat dist) {
586  KALDI_ASSERT(comp < ncompartments_ && i < npoints_[comp] && j < i);
587  if (clusters_[comp][i] == NULL || clusters_[comp][j] == NULL)
588  return false;
589  BaseFloat cached_dist = dist_vec_[comp][(i * (i - 1)) / 2 + j];
590  return (std::fabs(cached_dist - dist) <= 1.0e-05 * std::fabs(dist));
591 }
592 
593 BaseFloat CompartmentalizedBottomUpClusterer::MergeClusters(int32 comp, int32 i,
594  int32 j) {
595  KALDI_ASSERT(comp < ncompartments_ && i < npoints_[comp] && j < i);
596  clusters_[comp][i]->Add(*(clusters_[comp][j]));
597  delete clusters_[comp][j];
598  clusters_[comp][j] = NULL;
599  // note that we may have to follow the chain within "assignment_" to get
600  // final assignments.
601  assignments_[comp][j] = i;
602  // objective function change.
603  BaseFloat ans = -dist_vec_[comp][(i * (i - 1)) / 2 + j];
604  nclusters_--;
605  // Now update "distances".
606  for (int32 k = 0; k < npoints_[comp]; k++) {
607  if (k != i && clusters_[comp][k] != NULL) {
608  if (k < i)
609  SetDistance(comp, i, k); // SetDistance requires k < i.
610  else
611  SetDistance(comp, k, i);
612  }
613  }
614  // Control memory use by getting rid of orphaned queue entries every time
615  // it's at least twice the maximum possible size.
616  if (queue_.size() >= static_cast<size_t> (nclusters_ * nclusters_)) {
617  ReconstructQueue();
618  }
619  return ans;
620 }
621 
622 void CompartmentalizedBottomUpClusterer::ReconstructQueue() {
623  // empty queue [since there is no clear()]
624  {
625  QueueType tmp;
626  std::swap(tmp, queue_);
627  }
628  for (int32 comp = 0; comp < ncompartments_; comp++) {
629  for (int32 i = 0; i < npoints_[comp]; i++) {
630  if (clusters_[comp][i] == NULL) continue;
631  for (int32 j = 0; j < i; j++) {
632  if (clusters_[comp][j] == NULL) continue;
633  SetDistance(comp, i, j);
634  }
635  }
636  }
637 }
638 
639 void CompartmentalizedBottomUpClusterer::SetDistance(int32 comp,
640  int32 i, int32 j) {
641  KALDI_ASSERT(comp < ncompartments_ && i < npoints_[comp] && j < i);
642  KALDI_ASSERT(clusters_[comp][i] != NULL && clusters_[comp][j] != NULL);
643  BaseFloat dist = clusters_[comp][i]->Distance(*(clusters_[comp][j]));
644  dist_vec_[comp][(i * (i - 1)) / 2 + j] = dist;
645  if (dist < max_merge_thresh_) {
646  queue_.push(CompBotClustElem(dist, comp, static_cast<uint_smaller>(i),
647  static_cast<uint_smaller>(j)));
648  }
649 }
650 
651 
652 
653 BaseFloat ClusterBottomUpCompartmentalized(
654  const std::vector< std::vector<Clusterable*> > &points, BaseFloat thresh,
655  int32 min_clust, std::vector< std::vector<Clusterable*> > *clusters_out,
656  std::vector< std::vector<int32> > *assignments_out) {
657  KALDI_ASSERT(thresh >= 0.0 && min_clust >= 0);
658  int32 npoints = 0, num_non_empty_compartments = 0;
659  for (vector< vector<Clusterable*> >::const_iterator itr = points.begin(),
660  end = points.end(); itr != end; ++itr) {
661  KALDI_ASSERT(!ContainsNullPointers(*itr));
662  npoints += itr->size();
663  if (itr->size() > 0) num_non_empty_compartments++;
664  }
665  KALDI_ASSERT(min_clust >= num_non_empty_compartments); // Code does not merge compartments.
666  // make sure fits in uint_smaller and does not hit the -1 which is reserved.
667  KALDI_ASSERT(sizeof(uint_smaller)==sizeof(uint32) ||
668  npoints < static_cast<int32>(static_cast<uint_smaller>(-1)));
669 
670  CompartmentalizedBottomUpClusterer bc(points, thresh, min_clust);
671  BaseFloat ans = bc.Cluster(clusters_out, assignments_out);
672  if (clusters_out) {
673  for (vector< vector<Clusterable*> >::iterator itr = clusters_out->begin(),
674  end = clusters_out->end(); itr != end; ++itr) {
675  KALDI_ASSERT(!ContainsNullPointers(*itr));
676  }
677  }
678  return ans;
679 }
680 
681 
682 // ============================================================================
683 // Clustering through refinement routines
684 // ============================================================================
685 
686 class RefineClusterer {
687 
688  public:
689  // size used in point_info structure (we store a lot of these so don't want
690  // to just make it int32). Also used as a time-id (cannot have more moves of
691  // points, than can fit in this time). Must be big enough to store num-clust.
692  typedef int32 LocalInt;
693  typedef uint_smaller ClustIndexInt;
694 
695  RefineClusterer(const std::vector<Clusterable*> &points,
696  std::vector<Clusterable*> *clusters,
697  std::vector<int32> *assignments,
698  RefineClustersOptions cfg)
699  : points_(points), clusters_(clusters), assignments_(assignments),
700  cfg_(cfg) {
701  KALDI_ASSERT(cfg_.top_n >= 2);
702  num_points_ = points_.size();
703  num_clust_ = static_cast<int32> (clusters->size());
704 
705  // so can fit clust-id in LocalInt
706  if (cfg_.top_n > (int32) num_clust_) cfg_.top_n
707  = static_cast<int32> (num_clust_);
708  KALDI_ASSERT(cfg_.top_n == static_cast<int32>(static_cast<ClustIndexInt>(cfg_.top_n)));
709  t_ = 0;
710  my_clust_index_.resize(num_points_);
711  // will set all PointInfo's to 0 too (they will be up-to-date).
712  clust_time_.resize(num_clust_, 0);
713  clust_objf_.resize(num_clust_);
714  for (int32 i = 0; i < num_clust_; i++)
715  clust_objf_[i] = (*clusters_)[i]->Objf();
716  info_.resize(num_points_ * cfg_.top_n);
717  ans_ = 0;
718  InitPoints();
719  }
720 
721  BaseFloat Refine() {
722  if (cfg_.top_n <= 1) return 0.0; // nothing to do.
723  Iterate();
724  return ans_;
725  }
726  // at some point, check that cfg_.top_n > 1 still holds after it has been capped to num_clust_.
727  private:
728  void InitPoint(int32 point) {
729  // Find closest clusters to this point.
730  // distances are really negated objf changes, ignoring terms that don't vary with the "other" cluster.
731 
732  std::vector<std::pair<BaseFloat, LocalInt> > distances;
733  distances.reserve(num_clust_-1);
734  int32 my_clust = (*assignments_)[point];
735  Clusterable *point_cl = points_[point];
736 
737  for (int32 clust = 0;clust < num_clust_;clust++) {
738  if (clust != my_clust) {
739  Clusterable *tmp = (*clusters_)[clust]->Copy();
740  tmp->Add(*point_cl);
741  BaseFloat other_clust_objf = clust_objf_[clust];
742  BaseFloat other_clust_plus_me_objf = (*clusters_)[clust]->ObjfPlus(* (points_[point]));
743 
744  BaseFloat distance = other_clust_objf-other_clust_plus_me_objf; // negated delta-objf, with only "varying" terms.
745  distances.push_back(std::make_pair(distance, (LocalInt)clust));
746  delete tmp;
747  }
748  }
749  if ((cfg_.top_n-1-1) >= 0) {
750  std::nth_element(distances.begin(), distances.begin()+(cfg_.top_n-1-1), distances.end());
751  }
752  // top_n-1 is the # of elements we want to retain. -1 because we need the iterator
753  // that points to the end of that range (i.e. not potentially off the end of the array).
754 
755  for (int32 index = 0;index < cfg_.top_n-1;index++) {
756  point_info &info = GetInfo(point, index);
757  int32 clust = distances[index].second;
758  info.clust = clust;
759  BaseFloat distance = distances[index].first;
760  BaseFloat other_clust_objf = clust_objf_[clust];
761  BaseFloat other_clust_plus_me_objf = -(distance - other_clust_objf);
762  info.objf = other_clust_plus_me_objf;
763  info.time = 0;
764  }
765  // now put the last element in, which is my current cluster.
766  point_info &info = GetInfo(point, cfg_.top_n-1);
767  info.clust = my_clust;
768  info.time = 0;
769  info.objf = (*clusters_)[my_clust]->ObjfMinus(*(points_[point]));
770  my_clust_index_[point] = cfg_.top_n-1;
771  }
772  void InitPoints() {
773  // finds, for each point, the closest cfg_.top_n clusters (including its own cluster).
774  // this may be the most time-consuming step of the algorithm.
775  for (int32 p = 0;p < num_points_;p++) InitPoint(p);
776  }
777  void Iterate() {
778  int32 iter, num_iters = cfg_.num_iters;
779  for (iter = 0;iter < num_iters;iter++) {
780  int32 cur_t = t_;
781  for (int32 point = 0;point < num_points_;point++) {
782  if (t_+1 == 0) {
783  KALDI_WARN << "Stopping iterating at int32 moves";
784  return; // once we use up all time points, must return-- this
785  // should rarely happen as int32 is large.
786  }
787  ProcessPoint(point);
788  }
789  if (t_ == cur_t) break; // nothing changed so we converged.
790  }
791  }
792  void MovePoint(int32 point, int32 new_index) {
793  // move point to a different cluster.
794  t_++;
795  int32 old_index = my_clust_index_[point]; // index into info
796  // array corresponding to current cluster.
797  KALDI_ASSERT(new_index < cfg_.top_n && new_index != old_index);
798  point_info &old_info = GetInfo(point, old_index),
799  &new_info = GetInfo(point, new_index);
800  my_clust_index_[point] = new_index; // update to new index.
801 
802  int32 old_clust = old_info.clust, new_clust = new_info.clust;
803  KALDI_ASSERT( (*assignments_)[point] == old_clust);
804  (*assignments_)[point] = new_clust;
805  (*clusters_)[old_clust]->Sub( *(points_[point]) );
806  (*clusters_)[new_clust]->Add( *(points_[point]) );
807  UpdateClust(old_clust);
808  UpdateClust(new_clust);
809  }
810  void UpdateClust(int32 clust) {
811  KALDI_ASSERT(clust < num_clust_);
812  clust_objf_[clust] = (*clusters_)[clust]->Objf();
813  clust_time_[clust] = t_;
814  }
815  void ProcessPoint(int32 point) {
816  // note: calling code uses the fact
817  // that it only ever increases t_ by one.
818  KALDI_ASSERT(point < num_points_);
819  // (1) Make sure own-cluster like is updated.
820  int32 self_index = my_clust_index_[point]; // index <cfg_.top_n of own cluster.
821  point_info &self_info = GetInfo(point, self_index);
822  int32 self_clust = self_info.clust; // cluster index of own cluster.
823  KALDI_ASSERT(self_index < cfg_.top_n);
824  UpdateInfo(point, self_index);
825 
826  float own_clust_objf = clust_objf_[self_clust];
827  float own_clust_minus_me_objf = self_info.objf; // objf of own cluster minus self.
828  // Now check the other "close" clusters and see if we want to move there.
829  for (int32 index = 0;index < cfg_.top_n;index++) {
830  if (index != self_index) {
831  UpdateInfo(point, index);
832  point_info &other_info = GetInfo(point, index);
833  BaseFloat other_clust_objf = clust_objf_[other_info.clust];
834  BaseFloat other_clust_plus_me_objf = other_info.objf;
835  BaseFloat impr = other_clust_plus_me_objf + own_clust_minus_me_objf
836  - other_clust_objf - own_clust_objf;
837  if (impr > 0) { // better to switch...
838  ans_ += impr;
839  MovePoint(point, index);
840  return; // the stuff we precomputed at the top is invalidated now, and it's
841  // easiest just to wait till next time we visit this point.
842  }
843  }
844  }
845  }
846 
847  void UpdateInfo(int32 point, int32 idx) {
848  point_info &pinfo = GetInfo(point, idx);
849  if (pinfo.time < clust_time_[pinfo.clust]) { // it's not up-to-date...
850  Clusterable *tmp_cl = (*clusters_)[pinfo.clust]->Copy();
851  if (idx == my_clust_index_[point]) {
852  tmp_cl->Sub( *(points_[point]) );
853  } else{
854  tmp_cl->Add( *(points_[point]) );
855  }
856  pinfo.time = t_;
857  pinfo.objf = tmp_cl->Objf();
858  delete tmp_cl;
859  }
860  }
861 
862  typedef struct {
863  LocalInt clust;
864  LocalInt time;
865  BaseFloat objf; // Objf of this cluster plus this point (or minus, if own cluster).
866  } point_info;
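 // info_ (declared below) is a flat [num_points_ x cfg_.top_n] array of
 // point_info, indexed as point * cfg_.top_n + idx (see GetInfo).  For each
 // point, the slot at my_clust_index_[point] describes its current cluster
 // (objf is that cluster's objf *minus* the point); the remaining slots hold
 // the closest candidate clusters (objf is the cluster's objf *plus* the point).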
867 
868  point_info &GetInfo(int32 point, int32 idx) {
869  KALDI_ASSERT(point < num_points_ && idx < cfg_.top_n);
870  int32 i = point*cfg_.top_n + idx;
871  KALDI_PARANOID_ASSERT(i < static_cast<int32>(info_.size()));
872  return info_[i];
873  }
874 
875  const std::vector<Clusterable*> &points_;
876  std::vector<Clusterable*> *clusters_;
877  std::vector<int32> *assignments_;
878 
879  std::vector<point_info> info_; // size is [num_points_ * cfg_.top_n].
880  std::vector<ClustIndexInt> my_clust_index_; // says for each point, which index 0...cfg_.top_n-1 currently
881  // corresponds to its own cluster.
882 
883  std::vector<LocalInt> clust_time_; // Modification time of cluster.
884  std::vector<BaseFloat> clust_objf_; // [clust], objf for cluster.
885 
886  BaseFloat ans_; // objf improvement.
887 
888  int32 num_clust_;
889  int32 num_points_;
890  int32 t_; // current "time"; incremented each time a point is moved.
891  RefineClustersOptions cfg_; // note, we change top_n in config; don't make this member a reference member.
892 };
893 
894 
895 BaseFloat RefineClusters(const std::vector<Clusterable*> &points,
896  std::vector<Clusterable*> *clusters,
897  std::vector<int32> *assignments,
898  RefineClustersOptions cfg) {
899 #ifndef KALDI_PARANOID // don't do this check in "paranoid" mode as we want to expose bugs.
900  if (cfg.num_iters <= 0) { return 0.0; } // nothing to do.
901 #endif
902  KALDI_ASSERT(clusters != NULL && assignments != NULL);
903  KALDI_ASSERT(!ContainsNullPointers(points) && !ContainsNullPointers(*clusters));
904  RefineClusterer rc(points, clusters, assignments, cfg);
905  BaseFloat ans = rc.Refine();
906  KALDI_ASSERT(!ContainsNullPointers(*clusters));
907  return ans;
908 }
909 
910 // ============================================================================
911 // K-means like clustering routines
912 // ============================================================================
913 
914 /// ClusterKMeansOnce is called internally by ClusterKMeans; it is equivalent
915 /// to calling ClusterKMeans with cfg.num_tries == 1.  Returns the objective
916 /// function improvement versus having all points in a single cluster.
917 
918 BaseFloat ClusterKMeansOnce(const std::vector<Clusterable*> &points,
919  int32 num_clust,
920  std::vector<Clusterable*> *clusters_out,
921  std::vector<int32> *assignments_out,
922  ClusterKMeansOptions &cfg) {
923  std::vector<int32> my_assignments;
924  int32 num_points = points.size();
925  KALDI_ASSERT(clusters_out != NULL);
926  KALDI_ASSERT(num_points != 0);
927  KALDI_ASSERT(num_clust <= num_points);
928 
929  KALDI_ASSERT(clusters_out->empty()); // or we wouldn't know what to do with pointers in there.
930  clusters_out->resize(num_clust, (Clusterable*)NULL);
931  assignments_out->resize(num_points);
932 
933  { // This block assigns points to clusters.
934  // This is done pseudo-randomly using Rand() so that
935  // if we call ClusterKMeans multiple times we get different answers (so we can choose
936  // the best if we want).
937  int32 skip; // randomly choose a "skip" that's coprime to num_points.
938  if (num_points == 1) {
939  skip = 1;
940  } else {
941  skip = 1 + (Rand() % (num_points-1)); // a number between 1 and num_points-1.
942  while (Gcd(skip, num_points) != 1) { // while skip is not coprime to num_points...
943  if (skip == num_points-1) skip = 0;
944  skip++; // skip is now still between 1 and num_points-1. will cycle through
945  // all of 1...num_points-1.
946  }
947  }
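 // Because "skip" is coprime to num_points, the sequence i, i+skip, i+2*skip,
 // ... (mod num_points) visits every point exactly once; e.g. with
 // num_points = 10 and skip = 3 the visiting order is 0, 3, 6, 9, 2, 5, 8, 1, 4, 7.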
948  int32 i, j, count = 0;
949  for (i = 0, j = 0; count != num_points;i = (i+skip)%num_points, j = (j+1)%num_clust, count++) {
950  // i cycles pseudo-randomly through all points; j advances by 1 each time,
951  // modulo num_clust, so points are dealt out to the clusters in turn.
952  // assign point i to cluster j.
953  if ((*clusters_out)[j] == NULL) (*clusters_out)[j] = points[i]->Copy();
954  else (*clusters_out)[j]->Add(*(points[i]));
955  (*assignments_out)[i] = j;
956  }
957  }
958 
959 
960  BaseFloat normalizer = SumClusterableNormalizer(*clusters_out);
961  BaseFloat ans;
962  { // work out initial value of "ans" (objective function improvement).
963  Clusterable *all_stats = SumClusterable(*clusters_out);
964  ans = SumClusterableObjf(*clusters_out) - all_stats->Objf(); // improvement just from the random
965  // initialization.
966  if (ans < -0.01 && ans < -0.01 * fabs(all_stats->Objf())) { // something bad happened.
967  KALDI_WARN << "ClusterKMeans: objective function after random assignment to clusters is worse than in single cluster: "<< (all_stats->Objf()) << " changed by " << ans << ". Perhaps your stats class has the wrong properties?";
968  }
969  delete all_stats;
970  }
971  for (int32 iter = 0;iter < cfg.num_iters;iter++) {
972  // Keep refining clusters by reassigning points.
973  BaseFloat objf_before;
974  if (cfg.verbose) objf_before =SumClusterableObjf(*clusters_out);
975  BaseFloat impr = RefineClusters(points, clusters_out, assignments_out, cfg.refine_cfg);
976  BaseFloat objf_after;
977  if (cfg.verbose) objf_after = SumClusterableObjf(*clusters_out);
978  ans += impr;
979  if (cfg.verbose)
980  KALDI_LOG << "ClusterKMeans: on iteration "<<(iter)<<", objf before = "<<(objf_before)<<", impr = "<<(impr)<<", objf after = "<<(objf_after)<<", normalized by "<<(normalizer)<<" = "<<(objf_after/normalizer);
981  if (impr == 0) break;
982  }
983  return ans;
984 }
985 
986 BaseFloat ClusterKMeans(const std::vector<Clusterable*> &points,
987  int32 num_clust,
988  std::vector<Clusterable*> *clusters_out,
989  std::vector<int32> *assignments_out,
990  ClusterKMeansOptions cfg) {
991  if (points.size() == 0) {
992  if (clusters_out) KALDI_ASSERT(clusters_out->empty()); // or we wouldn't know whether to free the pointers.
993  if (assignments_out) assignments_out->clear();
994  return 0.0;
995  }
996  KALDI_ASSERT(cfg.num_tries>=1 && cfg.num_iters>=1);
997  if (clusters_out) KALDI_ASSERT(clusters_out->empty()); // or we wouldn't know whether to deallocate.
998  if (cfg.num_tries == 1) {
999  std::vector<int32> assignments;
1000  return ClusterKMeansOnce(points, num_clust, clusters_out, (assignments_out != NULL?assignments_out:&assignments), cfg);
1001  } else { // multiple tries.
1002  if (clusters_out) {
1003  KALDI_ASSERT(clusters_out->empty()); // we don't know the ownership of any pointers in there, otherwise.
1004  }
1005  BaseFloat best_ans = 0.0;
1006  for (int32 i = 0;i < cfg.num_tries;i++) {
1007  std::vector<Clusterable*> clusters_tmp;
1008  std::vector<int32> assignments_tmp;
1009  BaseFloat ans = ClusterKMeansOnce(points, num_clust, &clusters_tmp, &assignments_tmp, cfg);
1010  KALDI_ASSERT(!ContainsNullPointers(clusters_tmp));
1011  if (i == 0 || ans > best_ans) {
1012  best_ans = ans;
1013  if (clusters_out) {
1014  if (clusters_out->size()) DeletePointers(clusters_out);
1015  *clusters_out = clusters_tmp;
1016  clusters_tmp.clear(); // suppress deletion of pointers.
1017  }
1018  if (assignments_out) *assignments_out = assignments_tmp;
1019  }
1020  // delete anything remaining in clusters_tmp (we cleared it if we used
1021  // the pointers.
1022  DeletePointers(&clusters_tmp);
1023  }
1024  return best_ans;
1025  }
1026 }
1027 
1028 // ============================================================================
1029 // Routines for clustering using a top-down tree
1030 // ============================================================================
1031 
1032 class TreeClusterer {
1033  public:
1034  TreeClusterer(const std::vector<Clusterable*> &points,
1035  int32 max_clust,
1036  TreeClusterOptions cfg):
1037  points_(points), max_clust_(max_clust), ans_(0.0), cfg_(cfg)
1038  {
1039  KALDI_ASSERT(cfg_.branch_factor > 1);
1040  Init();
1041  }
1042  BaseFloat Cluster(std::vector<Clusterable*> *clusters_out,
1043  std::vector<int32> *assignments_out,
1044  std::vector<int32> *clust_assignments_out,
1045  int32 *num_leaves_out) {
1046  while (static_cast<int32>(leaf_nodes_.size()) < max_clust_ && !queue_.empty()) {
1047  std::pair<BaseFloat, Node*> pr = queue_.top();
1048  queue_.pop();
1049  ans_ += pr.first;
1050  DoSplit(pr.second);
1051  }
1052  CreateOutput(clusters_out, assignments_out, clust_assignments_out,
1053  num_leaves_out);
1054  return ans_;
1055  }
1056 
1057  ~TreeClusterer() {
1058  for (int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size());leaf++) {
1059  delete leaf_nodes_[leaf]->node_total;
1060  DeletePointers(&(leaf_nodes_[leaf]->leaf.clusters));
1061  delete leaf_nodes_[leaf];
1062  }
1063  for (int32 nonleaf = 0; nonleaf < static_cast<int32>(nonleaf_nodes_.size()); nonleaf++) {
1064  delete nonleaf_nodes_[nonleaf]->node_total;
1065  delete nonleaf_nodes_[nonleaf];
1066  }
1067  }
1068 
1069 
1070  private:
1071  struct Node {
1072  bool is_leaf;
1073  int32 index; // index into leaf_nodes or nonleaf_nodes as applicable.
1074  Node *parent;
1075  Clusterable *node_total; // sum of all data with this node.
1076  struct {
1077  std::vector<Clusterable*> points;
1078  std::vector<int32> point_indices;
1079  BaseFloat best_split;
1080  std::vector<Clusterable*> clusters; // [branch_factor]... if we do split.
1081  std::vector<int32> assignments; // assignments of points to clusters.
1082  } leaf;
1083  std::vector<Node*> children; // vector of size branch_factor. if non-leaf.
1084  // pointers not owned here but in vectors leaf_nodes_, nonleaf_nodes_.
1085  };
1086 
1087 
1088  void CreateOutput(std::vector<Clusterable*> *clusters_out,
1089  std::vector<int32> *assignments_out,
1090  std::vector<int32> *clust_assignments_out,
1091  int32 *num_leaves_out) {
1092  if (num_leaves_out) *num_leaves_out = leaf_nodes_.size();
1093  if (assignments_out)
1094  CreateAssignmentsOutput(assignments_out);
1095  if (clust_assignments_out)
1096  CreateClustAssignmentsOutput(clust_assignments_out);
1097  if (clusters_out)
1098  CreateClustersOutput(clusters_out);
1099  }
1100 
1101  // This creates the output index corresponding to an index "index" into the array nonleaf_nodes_.
1102  // reverse numbering so root node is last.
1103  int32 NonleafOutputIndex(int32 index) {
1104  return leaf_nodes_.size() + nonleaf_nodes_.size() - 1 - index;
1105  }
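 // Example: with 4 leaves and 3 nonleaf nodes, leaves keep output indices 0..3,
 // nonleaf_nodes_[2] maps to 4, nonleaf_nodes_[1] to 5, and nonleaf_nodes_[0]
 // (the root, which was split first) to 6, the last output index.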
1106  void CreateAssignmentsOutput(std::vector<int32> *assignments_out) {
1107  assignments_out->clear();
1108  assignments_out->resize(points_.size(), (int32)(-1)); // fill with -1.
1109  for (int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size()); leaf++) {
1110  std::vector<int32> &indices = leaf_nodes_[leaf]->leaf.point_indices;
1111  for (int32 i = 0; i < static_cast<int32>(indices.size()); i++) {
1112  KALDI_ASSERT(static_cast<size_t>(indices[i]) < points_.size());
1113  KALDI_ASSERT((*assignments_out)[indices[i]] == (int32)(-1)); // check we're not assigning twice.
1114  (*assignments_out)[indices[i]] = leaf;
1115  }
1116  }
1117 #ifdef KALDI_PARANOID
1118  for (size_t i = 0;i<assignments_out->size();i++) KALDI_ASSERT((*assignments_out)[i] != (int32)(-1));
1119 #endif
1120  }
1121  void CreateClustAssignmentsOutput(std::vector<int32> *clust_assignments_out) {
1122  clust_assignments_out->resize(leaf_nodes_.size() + nonleaf_nodes_.size());
1123  for (int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size()); leaf++) {
1124  int32 parent_index;
1125  if (leaf_nodes_[leaf]->parent == NULL) { // tree with only one node.
1126  KALDI_ASSERT(leaf_nodes_.size() == 1 && nonleaf_nodes_.size() == 0 && leaf == 0);
1127  parent_index = 0;
1128  } else {
1129  if (leaf_nodes_[leaf]->parent->is_leaf) parent_index = leaf_nodes_[leaf]->parent->index;
1130  else parent_index = NonleafOutputIndex(leaf_nodes_[leaf]->parent->index);
1131  }
1132  (*clust_assignments_out)[leaf] = parent_index;
1133  }
1134  for (int32 nonleaf = 0; nonleaf < static_cast<int32>(nonleaf_nodes_.size()); nonleaf++) {
1135  int32 index = NonleafOutputIndex(nonleaf);
1136  int32 parent_index;
1137  if (nonleaf_nodes_[nonleaf]->parent == NULL) parent_index = index; // top node. make it own parent.
1138  else {
1139  KALDI_ASSERT(! nonleaf_nodes_[nonleaf]->parent->is_leaf); // parent is nonleaf since child is nonleaf.
1140  parent_index = NonleafOutputIndex(nonleaf_nodes_[nonleaf]->parent->index);
1141  }
1142  (*clust_assignments_out)[index] = parent_index;
1143  }
1144  }
1145  void CreateClustersOutput(std::vector<Clusterable*> *clusters_out) {
1146  clusters_out->resize(leaf_nodes_.size() + nonleaf_nodes_.size());
1147  for (int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size()); leaf++) {
1148  (*clusters_out)[leaf] = leaf_nodes_[leaf]->node_total;
1149  leaf_nodes_[leaf]->node_total = NULL; // suppress delete.
1150  }
1151  for (int32 nonleaf = 0; nonleaf < static_cast<int32>(nonleaf_nodes_.size()); nonleaf++) {
1152  int32 index = NonleafOutputIndex(nonleaf);
1153  (*clusters_out)[index] = nonleaf_nodes_[nonleaf]->node_total;
1154  nonleaf_nodes_[nonleaf]->node_total = NULL; // suppress delete.
1155  }
1156  }
1157  void DoSplit(Node *node) {
1158  KALDI_ASSERT(node->is_leaf && node->leaf.best_split > cfg_.thresh*0.999); // 0.999 is to avoid potential floating-point weirdness under compiler optimizations.
1159  KALDI_ASSERT(node->children.size() == 0);
1160  node->children.resize(cfg_.branch_factor);
1161  for (int32 i = 0;i < cfg_.branch_factor;i++) {
1162  Node *child = new Node;
1163  node->children[i] = child;
1164  child->is_leaf = true;
1165  child->parent = node;
1166  child->node_total = node->leaf.clusters[i];
1167  if (i == 0) {
1168  child->index = node->index; // assign node's own index in leaf_nodes_ to 1st child.
1169  leaf_nodes_[child->index] = child;
1170  } else {
1171  child->index = leaf_nodes_.size(); // generate new indices for other children.
1172  leaf_nodes_.push_back(child);
1173  }
1174  }
1175 
1176  KALDI_ASSERT(node->leaf.assignments.size() == node->leaf.points.size());
1177  KALDI_ASSERT(node->leaf.point_indices.size() == node->leaf.points.size());
1178  for (int32 i = 0; i < static_cast<int32>(node->leaf.points.size()); i++) {
1179  int32 child_index = node->leaf.assignments[i];
1180  KALDI_ASSERT(child_index < static_cast<int32>(cfg_.branch_factor));
1181  node->children[child_index]->leaf.points.push_back(node->leaf.points[i]);
1182  node->children[child_index]->leaf.point_indices.push_back(node->leaf.point_indices[i]);
1183  }
1184  node->leaf.points.clear();
1185  node->leaf.point_indices.clear();
1186  node->leaf.clusters.clear(); // already assigned pointers to children.
1187  node->leaf.assignments.clear();
1188  node->is_leaf = false;
1189  node->index = nonleaf_nodes_.size(); // new index at end of nonleaf_nodes_.
1190  nonleaf_nodes_.push_back(node);
1191  for (int32 i = 0;i < static_cast<int32>(cfg_.branch_factor);i++)
1192  FindBestSplit(node->children[i]);
1193  }
1194  void FindBestSplit(Node *node) {
1195  // takes a leaf node that has just been set up, and does ClusterKMeans with k = cfg_branch_factor.
1196  KALDI_ASSERT(node->is_leaf);
1197  if (node->leaf.points.size() == 0) {
1198  KALDI_WARN << "Warning: tree clustering: leaf with no data";
1199  node->leaf.best_split = 0; return;
1200  }
1201  if (node->leaf.points.size()<=1) { node->leaf.best_split = 0; return; }
1202  else {
1203  // use kmeans.
1204  BaseFloat impr = ClusterKMeans(node->leaf.points,
1205  cfg_.branch_factor,
1206  &node->leaf.clusters,
1207  &node->leaf.assignments,
1208  cfg_.kmeans_cfg);
1209  node->leaf.best_split = impr;
1210  if (impr > cfg_.thresh)
1211  queue_.push(std::make_pair(impr, node));
1212  }
1213  }
1214  void Init() { // Initializes top node.
1215  Node *top_node = new Node;
1216  top_node->index = leaf_nodes_.size(); // == 0 currently.
1217  top_node->parent = NULL; // no parent since root of tree.
1218  top_node->is_leaf = true;
1219  leaf_nodes_.push_back(top_node);
1220  top_node->leaf.points = points_;
1221  top_node->node_total = SumClusterable(points_);
1222  top_node->leaf.point_indices.resize(points_.size());
1223  for (size_t i = 0;i<points_.size();i++) top_node->leaf.point_indices[i] = i;
1224  FindBestSplit(top_node); // this should always be called when new node is created.
1225  }
1226 
1227  std::vector<Node*> leaf_nodes_;
1228  std::vector<Node*> nonleaf_nodes_;
1229 
1230  const std::vector<Clusterable*> &points_;
1231  int32 max_clust_;
1232  BaseFloat ans_; // objf improvement.
1233 
1234  std::priority_queue<std::pair<BaseFloat, Node*> > queue_; // contains leaves.
1235 
1236  TreeClusterOptions cfg_;
1237 };
1238 
1239 
1240 BaseFloat TreeCluster(const std::vector<Clusterable*> &points,
1241  int32 max_clust, // this is a max only.
1242  std::vector<Clusterable*> *clusters_out,
1243  std::vector<int32> *assignments_out,
1244  std::vector<int32> *clust_assignments_out,
1245  int32 *num_leaves_out,
1246  TreeClusterOptions cfg) {
1247  if (points.size() == 0) {
1248  if (clusters_out) clusters_out->clear();
1249  if (assignments_out) assignments_out->clear();
1250  if (clust_assignments_out) clust_assignments_out->clear();
1251  if (num_leaves_out) *num_leaves_out = 0;
1252  return 0.0;
1253  }
1254  TreeClusterer tc(points, max_clust, cfg);
1255  BaseFloat ans = tc.Cluster(clusters_out, assignments_out, clust_assignments_out, num_leaves_out);
1256  if (clusters_out) KALDI_ASSERT(!ContainsNullPointers(*clusters_out));
1257  return ans;
1258 }
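// Summary of the output layout produced above: entries 0..num_leaves-1 of
// *clusters_out are the leaf clusters, followed by the nonleaf-node totals with
// the root last; (*clust_assignments_out)[i] is the output index of node i's
// parent (the root is its own parent); and (*assignments_out)[p] is the index
// of the leaf that point p ends up in.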
1259 
1260 
1261 BaseFloat ClusterTopDown(const std::vector<Clusterable*> &points,
1262  int32 max_clust, // max # of clusters.
1263  std::vector<Clusterable*> *clusters_out,
1264  std::vector<int32> *assignments_out,
1265  TreeClusterOptions cfg) {
1266  int32 num_leaves = 0;
1267  BaseFloat ans = TreeCluster(points, max_clust, clusters_out, assignments_out, NULL, &num_leaves, cfg);
1268  if (clusters_out != NULL) {
1269  for (size_t j = num_leaves;j<clusters_out->size();j++) delete (*clusters_out)[j];
1270  clusters_out->resize(num_leaves); // number of leaf-level clusters in tree.
1271  }
1272  return ans;
1273 }
1274 
1275 
1276 void RefineClustersOptions::Write(std::ostream &os, bool binary) const {
1277  WriteToken(os, binary, "<RefineClustersOptions>");
1278  WriteBasicType(os, binary, num_iters);
1279  WriteBasicType(os, binary, top_n);
1280  WriteToken(os, binary, "</RefineClustersOptions>");
1281 }
1282 
1283 void RefineClustersOptions::Read(std::istream &is, bool binary) {
1284  ExpectToken(is, binary, "<RefineClustersOptions>");
1285  ReadBasicType(is, binary, &num_iters);
1286  ReadBasicType(is, binary, &top_n);
1287  ExpectToken(is, binary, "</RefineClustersOptions>");
1288 }
1289 
1290 
1291 } // end namespace kaldi.