agglomerative-clustering.h
Go to the documentation of this file.
1 // ivector/agglomerative-clustering.h
2 
3 // Copyright 2017-2018 Matthew Maciejewski
4 // 2018 David Snyder
5 // 2019 Dogan Can
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #ifndef KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_
23 #define KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_
24 
25 #include <vector>
26 #include <queue>
27 #include <set>
28 #include <unordered_map>
29 #include <functional>
30 #include "base/kaldi-common.h"
31 #include "matrix/matrix-lib.h"
32 #include "util/stl-utils.h"
33 
34 namespace kaldi {
35 
41 struct AhcCluster {
43  parent1,
44  parent2,
45  size;
46  std::vector<int32> utt_ids;
47  AhcCluster(int32 id, int32 p1, int32 p2, std::vector<int32> utts)
48  : id(id), parent1(p1), parent2(p2), utt_ids(utts) {
49  size = utts.size();
50  }
51 };
52 
56  public:
58  const Matrix<BaseFloat> &costs,
59  BaseFloat threshold,
60  int32 min_clusters,
61  int32 first_pass_max_points,
62  BaseFloat max_cluster_fraction,
63  std::vector<int32> *assignments_out)
64  : costs_(costs), threshold_(threshold), min_clusters_(min_clusters),
65  first_pass_max_points_(first_pass_max_points),
66  assignments_(assignments_out) {
67  num_points_ = costs.NumRows();
68 
69  // The max_cluster_size_ is a hard limit on the number points in a cluster.
70  // This is useful for handling degenerate cases where some outlier points
71  // form their own clusters and force everything else to be clustered
72  // together, e.g. when min-clusters is provided instead of a threshold.
73  max_cluster_size_ = ceil(num_points_ * max_cluster_fraction);
74 
75  // The count_, which is used for identifying clusters, is initialized to
76  // num_points_ because cluster IDs 1..num_points_ are reserved for input
77  // points, which are the initial set of clusters.
78  count_ = num_points_;
79 
80  // The second_pass_count_, which is used for identifying the initial set of
81  // second pass clusters and initializing count_ before the second pass, is
82  // initialized to 0 and incremented whenever a new cluster is added to the
83  // initial set of second pass clusters.
84  second_pass_count_ = 0;
85  }
86 
87  // Clusters points. Chooses single pass or two pass algorithm.
88  void Cluster();
89 
90  // Clusters points using single pass algorithm.
91  void ClusterSinglePass();
92 
93  // Clusters points using two pass algorithm.
94  void ClusterTwoPass();
95 
96  private:
97  // Encodes cluster pair into a 32bit unsigned integer.
98  uint32 EncodePair(int32 i, int32 j);
99  // Decodes cluster pair from a 32bit unsigned integer.
100  std::pair<int32, int32> DecodePair(uint32 key);
101  // Initializes the clustering queue with singleton clusters
102  void InitializeClusters(int32 first, int32 last);
103  // Does hierarchical agglomerative clustering
104  void ComputeClusters(int32 min_clusters);
105  // Adds clusters created in first pass to second pass clusters
106  void AddClustersToSecondPass();
107  // Assigns points to clusters
108  void AssignClusters();
109  // Merges clusters with IDs i and j and updates cost map and queue
110  void MergeClusters(int32 i, int32 j);
111 
112  const Matrix<BaseFloat> &costs_; // cost matrix
113  BaseFloat threshold_; // stopping criterion threshold
114  int32 min_clusters_; // minimum number of clusters
115  int32 first_pass_max_points_; // maximum number of points in each subset
116  std::vector<int32> *assignments_; // assignments out
117 
118  int32 num_points_; // total number of points to cluster
119  int32 max_cluster_size_; // maximum number of points in a cluster
120  int32 count_; // count of first pass clusters, used for identifying clusters
121  int32 second_pass_count_; // count of second pass clusters
122 
123  // Priority queue using greater (lowest costs are highest priority).
124  // Elements contain pairs of cluster IDs and their cost.
125  typedef std::pair<BaseFloat, uint32> QueueElement;
126  typedef std::priority_queue<QueueElement, std::vector<QueueElement>,
127  std::greater<QueueElement> > QueueType;
128  QueueType queue_, second_pass_queue_;
129 
130  // Map from cluster IDs to cost between them
131  std::unordered_map<uint32, BaseFloat> cluster_cost_map_;
132  // Map from cluster ID to cluster object address
133  std::unordered_map<int32, AhcCluster*> clusters_map_;
134  // Set of unmerged cluster IDs
135  std::set<int32> active_clusters_;
136 
137  // Map from second pass cluster IDs to cost between them
138  std::unordered_map<uint32, BaseFloat> second_pass_cluster_cost_map_;
139  // Map from second pass cluster ID to cluster object address
140  std::unordered_map<int32, AhcCluster*> second_pass_clusters_map_;
141  // Set of unmerged second pass cluster IDs
143 };
144 
181  const Matrix<BaseFloat> &costs,
182  BaseFloat threshold,
183  int32 min_clusters,
184  int32 first_pass_max_points,
185  BaseFloat max_cluster_fraction,
186  std::vector<int32> *assignments_out);
187 
188 } // end namespace kaldi.
189 
190 #endif // KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
std::unordered_map< int32, AhcCluster * > clusters_map_
kaldi::int32 int32
const Matrix< BaseFloat > & costs_
std::unordered_map< uint32, BaseFloat > second_pass_cluster_cost_map_
AhcCluster(int32 id, int32 p1, int32 p2, std::vector< int32 > utts)
std::unordered_map< int32, AhcCluster * > second_pass_clusters_map_
AgglomerativeClusterer(const Matrix< BaseFloat > &costs, BaseFloat threshold, int32 min_clusters, int32 first_pass_max_points, BaseFloat max_cluster_fraction, std::vector< int32 > *assignments_out)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
void AgglomerativeCluster(const Matrix< BaseFloat > &costs, BaseFloat threshold, int32 min_clusters, int32 first_pass_max_points, BaseFloat max_cluster_fraction, std::vector< int32 > *assignments_out)
This is the function that is called to perform the agglomerative clustering.
AhcCluster is the cluster object for the agglomerative clustering.
std::pair< BaseFloat, uint32 > QueueElement
std::priority_queue< QueueElement, std::vector< QueueElement >, std::greater< QueueElement > > QueueType
The AgglomerativeClusterer class contains the necessary mechanisms for the actual clustering algorith...
std::unordered_map< uint32, BaseFloat > cluster_cost_map_
std::vector< int32 > utt_ids