agglomerative-clustering.cc
Go to the documentation of this file.
1 // ivector/agglomerative-clustering.cc
2 
3 // Copyright 2017-2018 Matthew Maciejewski
4 // 2018 David Snyder
5 // 2019 Dogan Can
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #include <algorithm>
24 
25 namespace kaldi {
26 
30  else
32 }
33 
38 }
39 
41  // This is the first pass loop. We divide the input into equal size subsets
42  // making sure each subset has at most first_pass_max_points_ points. Then, we
43  // cluster the points in each subset separately until a stopping criterion is
44  // reached. We set the minimum number of clusters to 10 * min_clusters_ for
45  // each subset to avoid early merging of most clusters that would otherwise be
46  // kept separate in single pass clustering.
47  BaseFloat num_points = static_cast<BaseFloat>(num_points_);
48  int32 num_subsets = ceil(num_points / first_pass_max_points_);
49  int32 subset_size = ceil(num_points / num_subsets);
50  for (int32 n = 0; n < num_points_; n += subset_size) {
51  InitializeClusters(n, std::min(n + subset_size, num_points_));
54  }
55 
56  // We swap the contents of the first and second pass data structures so that
57  // we can use the same method to do second pass clustering.
63 
64  // This is the second pass. It moves through the queue merging clusters
65  // determined in the first pass until a stopping criterion is reached.
67 
69 }
70 
72  if (i < j)
73  return (static_cast<uint32>(i) << 16) + static_cast<uint32>(j);
74  else
75  return (static_cast<uint32>(j) << 16) + static_cast<uint32>(i);
76 }
77 
78 std::pair<int32, int32> AgglomerativeClusterer::DecodePair(uint32 key) {
79  return std::make_pair(static_cast<int32>(key >> 16),
80  static_cast<int32>(key & 0x0000FFFFu));
81 }
82 
84  KALDI_ASSERT(last > first);
85  clusters_map_.clear();
86  active_clusters_.clear();
87  cluster_cost_map_.clear();
88  queue_ = QueueType(); // priority_queue does not have a clear method
89 
90  for (int32 i = first; i < last; i++) {
91  // create an initial cluster of size 1 for each point
92  std::vector<int32> ids;
93  ids.push_back(i);
94  AhcCluster *c = new AhcCluster(i + 1, -1, -1, ids);
95  clusters_map_[i + 1] = c;
96  active_clusters_.insert(i + 1);
97 
98  // propagate the queue with all pairs from the cost matrix
99  for (int32 j = i + 1; j < last; j++) {
100  BaseFloat cost = costs_(i, j);
101  uint32 key = EncodePair(i + 1, j + 1);
102  cluster_cost_map_[key] = cost;
103  if (cost <= threshold_)
104  queue_.push(std::make_pair(cost, key));
105  }
106  }
107 }
108 
110  while (active_clusters_.size() > min_clusters && !queue_.empty()) {
111  std::pair<BaseFloat, uint32> pr = queue_.top();
112  int32 i, j;
113  std::tie(i, j) = DecodePair(pr.second);
114  queue_.pop();
115  // check to make sure clusters have not already been merged
116  if ((active_clusters_.find(i) != active_clusters_.end()) &&
117  (active_clusters_.find(j) != active_clusters_.end())) {
118  if (clusters_map_[i]->size + clusters_map_[j]->size <= max_cluster_size_)
119  MergeClusters(i, j);
120  }
121  }
122 }
123 
125  AhcCluster *clust1 = clusters_map_[i];
126  AhcCluster *clust2 = clusters_map_[j];
127  // For memory efficiency, the first cluster is updated to contain the new
128  // merged cluster information, and the second cluster is later deleted.
129  clust1->id = ++count_;
130  clust1->parent1 = i;
131  clust1->parent2 = j;
132  clust1->size += clust2->size;
133  clust1->utt_ids.insert(clust1->utt_ids.end(), clust2->utt_ids.begin(),
134  clust2->utt_ids.end());
135  // Remove the merged clusters from the list of active clusters.
136  active_clusters_.erase(i);
137  active_clusters_.erase(j);
138  // Update the queue with all the new costs involving the new cluster
139  std::set<int32>::iterator it;
140  for (it = active_clusters_.begin(); it != active_clusters_.end(); ++it) {
141  // The new cost is the sum of the costs of the new cluster's parents
142  BaseFloat new_cost = cluster_cost_map_[EncodePair(*it, i)] +
143  cluster_cost_map_[EncodePair(*it, j)];
144  uint32 new_key = EncodePair(*it, count_);
145  cluster_cost_map_[new_key] = new_cost;
146  BaseFloat norm = clust1->size * (clusters_map_[*it])->size;
147  if (new_cost / norm <= threshold_)
148  queue_.push(std::make_pair(new_cost / norm, new_key));
149  }
150  active_clusters_.insert(count_);
151  clusters_map_[count_] = clust1;
152  delete clust2;
153 }
154 
156  // This method collects the results of first pass clustering for one subset,
157  // i.e. adds the set of active clusters to the set of second pass active
158  // clusters and computes the costs for the newly formed cluster pairs.
159  std::set<int32>::iterator it1, it2;
161  for (it1 = active_clusters_.begin(); it1 != active_clusters_.end(); ++it1) {
162  AhcCluster *clust1 = clusters_map_[*it1];
163  second_pass_clusters_map_[++count] = clust1;
164 
165  // Compute new cluster pair costs
166  for (it2 = second_pass_active_clusters_.begin();
167  it2 != second_pass_active_clusters_.end(); ++it2) {
168  AhcCluster *clust2 = second_pass_clusters_map_[*it2];
169  uint32 new_key = EncodePair(count, *it2);
170 
171  BaseFloat new_cost = 0.0;
172  std::vector<int32>::iterator utt_it1, utt_it2;
173  for (utt_it1 = clust1->utt_ids.begin();
174  utt_it1 != clust1->utt_ids.end(); ++utt_it1) {
175  for (utt_it2 = clust2->utt_ids.begin();
176  utt_it2 != clust2->utt_ids.end(); ++utt_it2) {
177  new_cost += costs_(*utt_it1, *utt_it2);
178  }
179  }
180 
181  second_pass_cluster_cost_map_[new_key] = new_cost;
182  BaseFloat norm = clust1->size * clust2->size;
183  if (new_cost / norm <= threshold_)
184  second_pass_queue_.push(std::make_pair(new_cost / norm, new_key));
185  }
186 
187  // Copy cluster pair costs that were already computed in the first pass
188  int32 count2 = second_pass_count_;
189  for (it2 = active_clusters_.begin(); it2 != it1; ++it2) {
190  uint32 key = EncodePair(*it1, *it2);
191  BaseFloat cost = cluster_cost_map_[key];
192  BaseFloat norm = clust1->size * (clusters_map_[*it2])->size;
193  uint32 new_key = EncodePair(count, ++count2);
194  second_pass_cluster_cost_map_[new_key] = cost;
195  if (cost / norm <= threshold_)
196  second_pass_queue_.push(std::make_pair(cost / norm, new_key));
197  }
198  }
199  // We update second_pass_count_ and second_pass_active_clusters_ here since
200  // above loop assumes they do not change while the loop is running.
201  while (second_pass_count_ < count)
203 }
204 
206  assignments_->resize(num_points_);
207  int32 label_id = 0;
208  std::set<int32>::iterator it;
209  // Iterate through the clusters and assign all utterances within the cluster
210  // an ID label unique to the cluster. This is the final output and frees up
211  // the cluster memory accordingly.
212  for (it = active_clusters_.begin(); it != active_clusters_.end(); ++it) {
213  ++label_id;
214  AhcCluster *cluster = clusters_map_[*it];
215  std::vector<int32>::iterator utt_it;
216  for (utt_it = cluster->utt_ids.begin();
217  utt_it != cluster->utt_ids.end(); ++utt_it)
218  (*assignments_)[*utt_it] = label_id;
219  delete cluster;
220  }
221 }
222 
224  const Matrix<BaseFloat> &costs,
225  BaseFloat threshold,
226  int32 min_clusters,
227  int32 first_pass_max_points,
228  BaseFloat max_cluster_fraction,
229  std::vector<int32> *assignments_out) {
230  KALDI_ASSERT(min_clusters >= 0);
231  KALDI_ASSERT(max_cluster_fraction >= 1.0 / min_clusters);
232  AgglomerativeClusterer ac(costs, threshold, min_clusters,
233  first_pass_max_points, max_cluster_fraction,
234  assignments_out);
235  ac.Cluster();
236 }
237 
238 } // end namespace kaldi.
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void InitializeClusters(int32 first, int32 last)
std::unordered_map< int32, AhcCluster * > clusters_map_
kaldi::int32 int32
const Matrix< BaseFloat > & costs_
std::unordered_map< uint32, BaseFloat > second_pass_cluster_cost_map_
std::unordered_map< int32, AhcCluster * > second_pass_clusters_map_
const size_t count
float BaseFloat
Definition: kaldi-types.h:29
struct rnnlm::@11::@12 n
std::pair< int32, int32 > DecodePair(uint32 key)
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void AgglomerativeCluster(const Matrix< BaseFloat > &costs, BaseFloat threshold, int32 min_clusters, int32 first_pass_max_points, BaseFloat max_cluster_fraction, std::vector< int32 > *assignments_out)
This is the function that is called to perform the agglomerative clustering.
AhcCluster is the cluster object for the agglomerative clustering.
std::priority_queue< QueueElement, std::vector< QueueElement >, std::greater< QueueElement > > QueueType
The AgglomerativeClusterer class contains the necessary mechanisms for the actual clustering algorith...
void ComputeClusters(int32 min_clusters)
std::unordered_map< uint32, BaseFloat > cluster_cost_map_
std::vector< int32 > utt_ids