AgglomerativeClusterer Class Reference

The AgglomerativeClusterer class contains the necessary mechanisms for the actual clustering algorithm. More...

#include <agglomerative-clustering.h>

Collaboration diagram for AgglomerativeClusterer:

Public Member Functions

 AgglomerativeClusterer (const Matrix< BaseFloat > &costs, BaseFloat threshold, int32 min_clusters, int32 first_pass_max_points, BaseFloat max_cluster_fraction, std::vector< int32 > *assignments_out)
 
void Cluster ()
 
void ClusterSinglePass ()
 
void ClusterTwoPass ()
 

Private Types

typedef std::pair< BaseFloat, uint32 > QueueElement
 
typedef std::priority_queue< QueueElement, std::vector< QueueElement >, std::greater< QueueElement > > QueueType
 

Private Member Functions

uint32 EncodePair (int32 i, int32 j)
 
std::pair< int32, int32DecodePair (uint32 key)
 
void InitializeClusters (int32 first, int32 last)
 
void ComputeClusters (int32 min_clusters)
 
void AddClustersToSecondPass ()
 
void AssignClusters ()
 
void MergeClusters (int32 i, int32 j)
 

Private Attributes

const Matrix< BaseFloat > & costs_
 
BaseFloat threshold_
 
int32 min_clusters_
 
int32 first_pass_max_points_
 
std::vector< int32 > * assignments_
 
int32 num_points_
 
int32 max_cluster_size_
 
int32 count_
 
int32 second_pass_count_
 
QueueType queue_
 
QueueType second_pass_queue_
 
std::unordered_map< uint32, BaseFloatcluster_cost_map_
 
std::unordered_map< int32, AhcCluster * > clusters_map_
 
std::set< int32active_clusters_
 
std::unordered_map< uint32, BaseFloatsecond_pass_cluster_cost_map_
 
std::unordered_map< int32, AhcCluster * > second_pass_clusters_map_
 
std::set< int32second_pass_active_clusters_
 

Detailed Description

The AgglomerativeClusterer class contains the necessary mechanisms for the actual clustering algorithm.

Definition at line 55 of file agglomerative-clustering.h.

Member Typedef Documentation

◆ QueueElement

typedef std::pair<BaseFloat, uint32> QueueElement
private

Definition at line 125 of file agglomerative-clustering.h.

◆ QueueType

typedef std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > QueueType
private

Definition at line 127 of file agglomerative-clustering.h.

Constructor & Destructor Documentation

◆ AgglomerativeClusterer()

AgglomerativeClusterer ( const Matrix< BaseFloat > &  costs,
BaseFloat  threshold,
int32  min_clusters,
int32  first_pass_max_points,
BaseFloat  max_cluster_fraction,
std::vector< int32 > *  assignments_out 
)
inline

Definition at line 57 of file agglomerative-clustering.h.

References rnnlm::i, rnnlm::j, and MatrixBase< Real >::NumRows().

64  : costs_(costs), threshold_(threshold), min_clusters_(min_clusters),
65  first_pass_max_points_(first_pass_max_points),
66  assignments_(assignments_out) {
67  num_points_ = costs.NumRows();
68 
69  // The max_cluster_size_ is a hard limit on the number points in a cluster.
70  // This is useful for handling degenerate cases where some outlier points
71  // form their own clusters and force everything else to be clustered
72  // together, e.g. when min-clusters is provided instead of a threshold.
73  max_cluster_size_ = ceil(num_points_ * max_cluster_fraction);
74 
75  // The count_, which is used for identifying clusters, is initialized to
76  // num_points_ because cluster IDs 1..num_points_ are reserved for input
77  // points, which are the initial set of clusters.
79 
80  // The second_pass_count_, which is used for identifying the initial set of
81  // second pass clusters and initializing count_ before the second pass, is
82  // initialized to 0 and incremented whenever a new cluster is added to the
83  // initial set of second pass clusters.
85  }
const Matrix< BaseFloat > & costs_
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64

Member Function Documentation

◆ AddClustersToSecondPass()

void AddClustersToSecondPass ( )
private

Definition at line 155 of file agglomerative-clustering.cc.

References AgglomerativeClusterer::active_clusters_, AgglomerativeClusterer::cluster_cost_map_, AgglomerativeClusterer::clusters_map_, AgglomerativeClusterer::costs_, count, AgglomerativeClusterer::EncodePair(), AgglomerativeClusterer::second_pass_active_clusters_, AgglomerativeClusterer::second_pass_cluster_cost_map_, AgglomerativeClusterer::second_pass_clusters_map_, AgglomerativeClusterer::second_pass_count_, AgglomerativeClusterer::second_pass_queue_, AhcCluster::size, AgglomerativeClusterer::threshold_, and AhcCluster::utt_ids.

Referenced by AgglomerativeClusterer::ClusterTwoPass().

155  {
156  // This method collects the results of first pass clustering for one subset,
157  // i.e. adds the set of active clusters to the set of second pass active
158  // clusters and computes the costs for the newly formed cluster pairs.
159  std::set<int32>::iterator it1, it2;
161  for (it1 = active_clusters_.begin(); it1 != active_clusters_.end(); ++it1) {
162  AhcCluster *clust1 = clusters_map_[*it1];
163  second_pass_clusters_map_[++count] = clust1;
164 
165  // Compute new cluster pair costs
166  for (it2 = second_pass_active_clusters_.begin();
167  it2 != second_pass_active_clusters_.end(); ++it2) {
168  AhcCluster *clust2 = second_pass_clusters_map_[*it2];
169  uint32 new_key = EncodePair(count, *it2);
170 
171  BaseFloat new_cost = 0.0;
172  std::vector<int32>::iterator utt_it1, utt_it2;
173  for (utt_it1 = clust1->utt_ids.begin();
174  utt_it1 != clust1->utt_ids.end(); ++utt_it1) {
175  for (utt_it2 = clust2->utt_ids.begin();
176  utt_it2 != clust2->utt_ids.end(); ++utt_it2) {
177  new_cost += costs_(*utt_it1, *utt_it2);
178  }
179  }
180 
181  second_pass_cluster_cost_map_[new_key] = new_cost;
182  BaseFloat norm = clust1->size * clust2->size;
183  if (new_cost / norm <= threshold_)
184  second_pass_queue_.push(std::make_pair(new_cost / norm, new_key));
185  }
186 
187  // Copy cluster pair costs that were already computed in the first pass
188  int32 count2 = second_pass_count_;
189  for (it2 = active_clusters_.begin(); it2 != it1; ++it2) {
190  uint32 key = EncodePair(*it1, *it2);
191  BaseFloat cost = cluster_cost_map_[key];
192  BaseFloat norm = clust1->size * (clusters_map_[*it2])->size;
193  uint32 new_key = EncodePair(count, ++count2);
194  second_pass_cluster_cost_map_[new_key] = cost;
195  if (cost / norm <= threshold_)
196  second_pass_queue_.push(std::make_pair(cost / norm, new_key));
197  }
198  }
199  // We update second_pass_count_ and second_pass_active_clusters_ here since
200  // above loop assumes they do not change while the loop is running.
201  while (second_pass_count_ < count)
203 }
std::unordered_map< int32, AhcCluster * > clusters_map_
kaldi::int32 int32
const Matrix< BaseFloat > & costs_
std::unordered_map< uint32, BaseFloat > second_pass_cluster_cost_map_
std::unordered_map< int32, AhcCluster * > second_pass_clusters_map_
const size_t count
float BaseFloat
Definition: kaldi-types.h:29
std::unordered_map< uint32, BaseFloat > cluster_cost_map_

◆ AssignClusters()

void AssignClusters ( )
private

Definition at line 205 of file agglomerative-clustering.cc.

References AgglomerativeClusterer::active_clusters_, AgglomerativeClusterer::assignments_, AgglomerativeClusterer::clusters_map_, AgglomerativeClusterer::num_points_, and AhcCluster::utt_ids.

Referenced by AgglomerativeClusterer::ClusterSinglePass(), and AgglomerativeClusterer::ClusterTwoPass().

205  {
206  assignments_->resize(num_points_);
207  int32 label_id = 0;
208  std::set<int32>::iterator it;
209  // Iterate through the clusters and assign all utterances within the cluster
210  // an ID label unique to the cluster. This is the final output and frees up
211  // the cluster memory accordingly.
212  for (it = active_clusters_.begin(); it != active_clusters_.end(); ++it) {
213  ++label_id;
214  AhcCluster *cluster = clusters_map_[*it];
215  std::vector<int32>::iterator utt_it;
216  for (utt_it = cluster->utt_ids.begin();
217  utt_it != cluster->utt_ids.end(); ++utt_it)
218  (*assignments_)[*utt_it] = label_id;
219  delete cluster;
220  }
221 }
std::unordered_map< int32, AhcCluster * > clusters_map_
kaldi::int32 int32

◆ Cluster()

◆ ClusterSinglePass()

◆ ClusterTwoPass()

void ClusterTwoPass ( )

Definition at line 40 of file agglomerative-clustering.cc.

References AgglomerativeClusterer::active_clusters_, AgglomerativeClusterer::AddClustersToSecondPass(), AgglomerativeClusterer::AssignClusters(), AgglomerativeClusterer::cluster_cost_map_, AgglomerativeClusterer::clusters_map_, AgglomerativeClusterer::ComputeClusters(), AgglomerativeClusterer::count_, AgglomerativeClusterer::first_pass_max_points_, AgglomerativeClusterer::InitializeClusters(), AgglomerativeClusterer::min_clusters_, rnnlm::n, AgglomerativeClusterer::num_points_, AgglomerativeClusterer::queue_, AgglomerativeClusterer::second_pass_active_clusters_, AgglomerativeClusterer::second_pass_cluster_cost_map_, AgglomerativeClusterer::second_pass_clusters_map_, AgglomerativeClusterer::second_pass_count_, and AgglomerativeClusterer::second_pass_queue_.

Referenced by AgglomerativeClusterer::Cluster().

40  {
41  // This is the first pass loop. We divide the input into equal size subsets
42  // making sure each subset has at most first_pass_max_points_ points. Then, we
43  // cluster the points in each subset separately until a stopping criterion is
44  // reached. We set the minimum number of clusters to 10 * min_clusters_ for
45  // each subset to avoid early merging of most clusters that would otherwise be
46  // kept separate in single pass clustering.
47  BaseFloat num_points = static_cast<BaseFloat>(num_points_);
48  int32 num_subsets = ceil(num_points / first_pass_max_points_);
49  int32 subset_size = ceil(num_points / num_subsets);
50  for (int32 n = 0; n < num_points_; n += subset_size) {
51  InitializeClusters(n, std::min(n + subset_size, num_points_));
54  }
55 
56  // We swap the contents of the first and second pass data structures so that
57  // we can use the same method to do second pass clustering.
63 
64  // This is the second pass. It moves through the queue merging clusters
65  // determined in the first pass until a stopping criterion is reached.
67 
69 }
void InitializeClusters(int32 first, int32 last)
std::unordered_map< int32, AhcCluster * > clusters_map_
kaldi::int32 int32
std::unordered_map< uint32, BaseFloat > second_pass_cluster_cost_map_
std::unordered_map< int32, AhcCluster * > second_pass_clusters_map_
float BaseFloat
Definition: kaldi-types.h:29
struct rnnlm::@11::@12 n
void ComputeClusters(int32 min_clusters)
std::unordered_map< uint32, BaseFloat > cluster_cost_map_

◆ ComputeClusters()

void ComputeClusters ( int32  min_clusters)
private

Definition at line 109 of file agglomerative-clustering.cc.

References AgglomerativeClusterer::active_clusters_, AgglomerativeClusterer::clusters_map_, AgglomerativeClusterer::DecodePair(), rnnlm::i, rnnlm::j, AgglomerativeClusterer::max_cluster_size_, AgglomerativeClusterer::MergeClusters(), and AgglomerativeClusterer::queue_.

Referenced by AgglomerativeClusterer::ClusterSinglePass(), and AgglomerativeClusterer::ClusterTwoPass().

109  {
110  while (active_clusters_.size() > min_clusters && !queue_.empty()) {
111  std::pair<BaseFloat, uint32> pr = queue_.top();
112  int32 i, j;
113  std::tie(i, j) = DecodePair(pr.second);
114  queue_.pop();
115  // check to make sure clusters have not already been merged
116  if ((active_clusters_.find(i) != active_clusters_.end()) &&
117  (active_clusters_.find(j) != active_clusters_.end())) {
118  if (clusters_map_[i]->size + clusters_map_[j]->size <= max_cluster_size_)
119  MergeClusters(i, j);
120  }
121  }
122 }
std::unordered_map< int32, AhcCluster * > clusters_map_
kaldi::int32 int32
std::pair< int32, int32 > DecodePair(uint32 key)

◆ DecodePair()

std::pair< int32, int32 > DecodePair ( uint32  key)
private

Definition at line 78 of file agglomerative-clustering.cc.

Referenced by AgglomerativeClusterer::ComputeClusters().

78  {
79  return std::make_pair(static_cast<int32>(key >> 16),
80  static_cast<int32>(key & 0x0000FFFFu));
81 }

◆ EncodePair()

uint32 EncodePair ( int32  i,
int32  j 
)
private

Definition at line 71 of file agglomerative-clustering.cc.

References rnnlm::j.

Referenced by AgglomerativeClusterer::AddClustersToSecondPass(), AgglomerativeClusterer::InitializeClusters(), and AgglomerativeClusterer::MergeClusters().

71  {
72  if (i < j)
73  return (static_cast<uint32>(i) << 16) + static_cast<uint32>(j);
74  else
75  return (static_cast<uint32>(j) << 16) + static_cast<uint32>(i);
76 }

◆ InitializeClusters()

void InitializeClusters ( int32  first,
int32  last 
)
private

Definition at line 83 of file agglomerative-clustering.cc.

References AgglomerativeClusterer::active_clusters_, AgglomerativeClusterer::cluster_cost_map_, AgglomerativeClusterer::clusters_map_, AgglomerativeClusterer::costs_, AgglomerativeClusterer::EncodePair(), rnnlm::i, rnnlm::j, KALDI_ASSERT, AgglomerativeClusterer::queue_, and AgglomerativeClusterer::threshold_.

Referenced by AgglomerativeClusterer::ClusterSinglePass(), and AgglomerativeClusterer::ClusterTwoPass().

83  {
84  KALDI_ASSERT(last > first);
85  clusters_map_.clear();
86  active_clusters_.clear();
87  cluster_cost_map_.clear();
88  queue_ = QueueType(); // priority_queue does not have a clear method
89 
90  for (int32 i = first; i < last; i++) {
91  // create an initial cluster of size 1 for each point
92  std::vector<int32> ids;
93  ids.push_back(i);
94  AhcCluster *c = new AhcCluster(i + 1, -1, -1, ids);
95  clusters_map_[i + 1] = c;
96  active_clusters_.insert(i + 1);
97 
98  // propagate the queue with all pairs from the cost matrix
99  for (int32 j = i + 1; j < last; j++) {
100  BaseFloat cost = costs_(i, j);
101  uint32 key = EncodePair(i + 1, j + 1);
102  cluster_cost_map_[key] = cost;
103  if (cost <= threshold_)
104  queue_.push(std::make_pair(cost, key));
105  }
106  }
107 }
std::unordered_map< int32, AhcCluster * > clusters_map_
kaldi::int32 int32
const Matrix< BaseFloat > & costs_
float BaseFloat
Definition: kaldi-types.h:29
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
std::priority_queue< QueueElement, std::vector< QueueElement >, std::greater< QueueElement > > QueueType
std::unordered_map< uint32, BaseFloat > cluster_cost_map_

◆ MergeClusters()

void MergeClusters ( int32  i,
int32  j 
)
private

Definition at line 124 of file agglomerative-clustering.cc.

References AgglomerativeClusterer::active_clusters_, AgglomerativeClusterer::cluster_cost_map_, AgglomerativeClusterer::clusters_map_, AgglomerativeClusterer::count_, AgglomerativeClusterer::EncodePair(), rnnlm::i, AhcCluster::id, rnnlm::j, AhcCluster::parent1, AhcCluster::parent2, AgglomerativeClusterer::queue_, AhcCluster::size, AgglomerativeClusterer::threshold_, and AhcCluster::utt_ids.

Referenced by AgglomerativeClusterer::ComputeClusters().

124  {
125  AhcCluster *clust1 = clusters_map_[i];
126  AhcCluster *clust2 = clusters_map_[j];
127  // For memory efficiency, the first cluster is updated to contain the new
128  // merged cluster information, and the second cluster is later deleted.
129  clust1->id = ++count_;
130  clust1->parent1 = i;
131  clust1->parent2 = j;
132  clust1->size += clust2->size;
133  clust1->utt_ids.insert(clust1->utt_ids.end(), clust2->utt_ids.begin(),
134  clust2->utt_ids.end());
135  // Remove the merged clusters from the list of active clusters.
136  active_clusters_.erase(i);
137  active_clusters_.erase(j);
138  // Update the queue with all the new costs involving the new cluster
139  std::set<int32>::iterator it;
140  for (it = active_clusters_.begin(); it != active_clusters_.end(); ++it) {
141  // The new cost is the sum of the costs of the new cluster's parents
142  BaseFloat new_cost = cluster_cost_map_[EncodePair(*it, i)] +
144  uint32 new_key = EncodePair(*it, count_);
145  cluster_cost_map_[new_key] = new_cost;
146  BaseFloat norm = clust1->size * (clusters_map_[*it])->size;
147  if (new_cost / norm <= threshold_)
148  queue_.push(std::make_pair(new_cost / norm, new_key));
149  }
150  active_clusters_.insert(count_);
151  clusters_map_[count_] = clust1;
152  delete clust2;
153 }
std::unordered_map< int32, AhcCluster * > clusters_map_
float BaseFloat
Definition: kaldi-types.h:29
std::unordered_map< uint32, BaseFloat > cluster_cost_map_

Member Data Documentation

◆ active_clusters_

◆ assignments_

std::vector<int32>* assignments_
private

Definition at line 116 of file agglomerative-clustering.h.

Referenced by AgglomerativeClusterer::AssignClusters().

◆ cluster_cost_map_

◆ clusters_map_

◆ costs_

◆ count_

◆ first_pass_max_points_

int32 first_pass_max_points_
private

◆ max_cluster_size_

int32 max_cluster_size_
private

Definition at line 119 of file agglomerative-clustering.h.

Referenced by AgglomerativeClusterer::ComputeClusters().

◆ min_clusters_

◆ num_points_

◆ queue_

◆ second_pass_active_clusters_

std::set<int32> second_pass_active_clusters_
private

◆ second_pass_cluster_cost_map_

std::unordered_map<uint32, BaseFloat> second_pass_cluster_cost_map_
private

◆ second_pass_clusters_map_

std::unordered_map<int32, AhcCluster*> second_pass_clusters_map_
private

◆ second_pass_count_

◆ second_pass_queue_

◆ threshold_


The documentation for this class was generated from the following files: