doc/agglomerative-clustering_8cc_source.html

 // ivector/agglomerative-clustering.cc

 // Copyright  2017-2018  Matthew Maciejewski
 //                 2018  David Snyder
 //                 2019  Dogan Can

 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //  http://www.apache.org/licenses/LICENSE-2.0
 //
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.

 #include <algorithm>
 #include "ivector/agglomerative-clustering.h"

 namespace kaldi {

 void AgglomerativeClusterer::Cluster() {
   if (num_points_ > first_pass_max_points_)
     ClusterTwoPass();
   else
     ClusterSinglePass();
 }

 void AgglomerativeClusterer::ClusterSinglePass() {
   InitializeClusters(0, num_points_);
   ComputeClusters(min_clusters_);
   AssignClusters();
 }

 void AgglomerativeClusterer::ClusterTwoPass() {
   // This is the first pass loop. We divide the input into equal size subsets
   // making sure each subset has at most first_pass_max_points_ points. Then, we
   // cluster the points in each subset separately until a stopping criterion is
   // reached. We set the minimum number of clusters to 10 * min_clusters_ for
   // each subset to avoid early merging of most clusters that would otherwise be
   // kept separate in single pass clustering.
   BaseFloat num_points = static_cast<BaseFloat>(num_points_);
   int32 num_subsets = ceil(num_points / first_pass_max_points_);
   int32 subset_size = ceil(num_points / num_subsets);
   for (int32 n = 0; n < num_points_; n += subset_size) {
     InitializeClusters(n, std::min(n + subset_size, num_points_));
     ComputeClusters(min_clusters_ * 10);
     AddClustersToSecondPass();
   }

   // We swap the contents of the first and second pass data structures so that
   // we can use the same method to do second pass clustering.
   clusters_map_.swap(second_pass_clusters_map_);
   active_clusters_.swap(second_pass_active_clusters_);
   cluster_cost_map_.swap(second_pass_cluster_cost_map_);
   queue_.swap(second_pass_queue_);
   count_ = second_pass_count_;

   // This is the second pass. It moves through the queue merging clusters
   // determined in the first pass until a stopping criterion is reached.
   ComputeClusters(min_clusters_);

   AssignClusters();
 }

 uint32 AgglomerativeClusterer::EncodePair(int32 i, int32 j) {
   if (i < j)
     return (static_cast<uint32>(i) << 16) + static_cast<uint32>(j);
   else
     return (static_cast<uint32>(j) << 16) + static_cast<uint32>(i);
 }

 std::pair<int32, int32> AgglomerativeClusterer::DecodePair(uint32 key) {
   return std::make_pair(static_cast<int32>(key >> 16),
                         static_cast<int32>(key & 0x0000FFFFu));
 }

 void AgglomerativeClusterer::InitializeClusters(int32 first, int32 last) {
   KALDI_ASSERT(last > first);
   clusters_map_.clear();
   active_clusters_.clear();
   cluster_cost_map_.clear();
   queue_ = QueueType();  // priority_queue does not have a clear method

   for (int32 i = first; i < last; i++) {
     // create an initial cluster of size 1 for each point
     std::vector<int32> ids;
     ids.push_back(i);
     AhcCluster *c = new AhcCluster(i + 1, -1, -1, ids);
     clusters_map_[i + 1] = c;
     active_clusters_.insert(i + 1);

     // propagate the queue with all pairs from the cost matrix
     for (int32 j = i + 1; j < last; j++) {
       BaseFloat cost = costs_(i, j);
       uint32 key = EncodePair(i + 1, j + 1);
       cluster_cost_map_[key] = cost;
       if (cost <= threshold_)
         queue_.push(std::make_pair(cost, key));
     }
   }
 }

 void AgglomerativeClusterer::ComputeClusters(int32 min_clusters) {
   while (active_clusters_.size() > min_clusters && !queue_.empty()) {
     std::pair<BaseFloat, uint32> pr = queue_.top();
     int32 i, j;
     std::tie(i, j) = DecodePair(pr.second);
     queue_.pop();
     // check to make sure clusters have not already been merged
     if ((active_clusters_.find(i) != active_clusters_.end()) &&
         (active_clusters_.find(j) != active_clusters_.end())) {
       if (clusters_map_[i]->size + clusters_map_[j]->size <= max_cluster_size_)
         MergeClusters(i, j);
     }
   }
 }

 void AgglomerativeClusterer::MergeClusters(int32 i, int32 j) {
   AhcCluster *clust1 = clusters_map_[i];
   AhcCluster *clust2 = clusters_map_[j];
   // For memory efficiency, the first cluster is updated to contain the new
   // merged cluster information, and the second cluster is later deleted.
   clust1->id = ++count_;
   clust1->parent1 = i;
   clust1->parent2 = j;
   clust1->size += clust2->size;
   clust1->utt_ids.insert(clust1->utt_ids.end(), clust2->utt_ids.begin(),
                          clust2->utt_ids.end());
   // Remove the merged clusters from the list of active clusters.
   active_clusters_.erase(i);
   active_clusters_.erase(j);
   // Update the queue with all the new costs involving the new cluster
   std::set<int32>::iterator it;
   for (it = active_clusters_.begin(); it != active_clusters_.end(); ++it) {
     // The new cost is the sum of the costs of the new cluster's parents
     BaseFloat new_cost = cluster_cost_map_[EncodePair(*it, i)] +
                          cluster_cost_map_[EncodePair(*it, j)];
     uint32 new_key = EncodePair(*it, count_);
     cluster_cost_map_[new_key] = new_cost;
     BaseFloat norm = clust1->size * (clusters_map_[*it])->size;
     if (new_cost / norm <= threshold_)
       queue_.push(std::make_pair(new_cost / norm, new_key));
   }
   active_clusters_.insert(count_);
   clusters_map_[count_] = clust1;
   delete clust2;
 }

 void AgglomerativeClusterer::AddClustersToSecondPass() {
   // This method collects the results of first pass clustering for one subset,
   // i.e. adds the set of active clusters to the set of second pass active
   // clusters and computes the costs for the newly formed cluster pairs.
   std::set<int32>::iterator it1, it2;
   int32 count = second_pass_count_;
   for (it1 = active_clusters_.begin(); it1 != active_clusters_.end(); ++it1) {
     AhcCluster *clust1 = clusters_map_[*it1];
     second_pass_clusters_map_[++count] = clust1;

     // Compute new cluster pair costs
     for (it2 = second_pass_active_clusters_.begin();
          it2 != second_pass_active_clusters_.end(); ++it2) {
       AhcCluster *clust2 = second_pass_clusters_map_[*it2];
       uint32 new_key = EncodePair(count, *it2);

       BaseFloat new_cost = 0.0;
       std::vector<int32>::iterator utt_it1, utt_it2;
       for (utt_it1 = clust1->utt_ids.begin();
            utt_it1 != clust1->utt_ids.end(); ++utt_it1) {
          for (utt_it2 = clust2->utt_ids.begin();
               utt_it2 != clust2->utt_ids.end(); ++utt_it2) {
            new_cost += costs_(*utt_it1, *utt_it2);
          }
       }

       second_pass_cluster_cost_map_[new_key] = new_cost;
       BaseFloat norm = clust1->size * clust2->size;
       if (new_cost / norm <= threshold_)
         second_pass_queue_.push(std::make_pair(new_cost / norm, new_key));
     }

     // Copy cluster pair costs that were already computed in the first pass
     int32 count2 = second_pass_count_;
     for (it2 = active_clusters_.begin(); it2 != it1; ++it2) {
       uint32 key = EncodePair(*it1, *it2);
       BaseFloat cost = cluster_cost_map_[key];
       BaseFloat norm = clust1->size * (clusters_map_[*it2])->size;
       uint32 new_key = EncodePair(count, ++count2);
       second_pass_cluster_cost_map_[new_key] = cost;
       if (cost / norm <= threshold_)
         second_pass_queue_.push(std::make_pair(cost / norm, new_key));
     }
   }
   // We update second_pass_count_ and second_pass_active_clusters_ here since
   // above loop assumes they do not change while the loop is running.
   while (second_pass_count_ < count)
     second_pass_active_clusters_.insert(++second_pass_count_);
 }

 void AgglomerativeClusterer::AssignClusters() {
   assignments_->resize(num_points_);
   int32 label_id = 0;
   std::set<int32>::iterator it;
   // Iterate through the clusters and assign all utterances within the cluster
   // an ID label unique to the cluster. This is the final output and frees up
   // the cluster memory accordingly.
   for (it = active_clusters_.begin(); it != active_clusters_.end(); ++it) {
     ++label_id;
     AhcCluster *cluster = clusters_map_[*it];
     std::vector<int32>::iterator utt_it;
     for (utt_it = cluster->utt_ids.begin();
          utt_it != cluster->utt_ids.end(); ++utt_it)
       (*assignments_)[*utt_it] = label_id;
     delete cluster;
   }
 }

 void AgglomerativeCluster(
     const Matrix<BaseFloat> &costs,
     BaseFloat threshold,
     int32 min_clusters,
     int32 first_pass_max_points,
     BaseFloat max_cluster_fraction,
     std::vector<int32> *assignments_out) {
   KALDI_ASSERT(min_clusters >= 0);
   KALDI_ASSERT(max_cluster_fraction >= 1.0 / min_clusters);
   AgglomerativeClusterer ac(costs, threshold, min_clusters,
                             first_pass_max_points, max_cluster_fraction,
                             assignments_out);
   ac.Cluster();
 }

 }  // end namespace kaldi.
kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20

kaldi::AgglomerativeClusterer::InitializeClusters
void InitializeClusters(int32 first, int32 last)
Definition: agglomerative-clustering.cc:83

kaldi::AgglomerativeClusterer::max_cluster_size_
int32 max_cluster_size_
Definition: agglomerative-clustering.h:119

kaldi::AgglomerativeClusterer::clusters_map_
std::unordered_map< int32, AhcCluster * > clusters_map_
Definition: agglomerative-clustering.h:133

rnnlm::j
int j
Definition: mikolov-rnnlm-lib.cc:66

kaldi::AgglomerativeClusterer::ClusterSinglePass
void ClusterSinglePass()
Definition: agglomerative-clustering.cc:34

kaldi::AhcCluster::parent2
int32 parent2
Definition: agglomerative-clustering.h:42

kaldi::AgglomerativeClusterer::second_pass_active_clusters_
std::set< int32 > second_pass_active_clusters_
Definition: agglomerative-clustering.h:142

kaldi::AgglomerativeClusterer::AddClustersToSecondPass
void AddClustersToSecondPass()
Definition: agglomerative-clustering.cc:155

kaldi::int32
kaldi::int32 int32
Definition: online-tcp-source.cc:27

kaldi::AgglomerativeClusterer::costs_
const Matrix< BaseFloat > & costs_
Definition: agglomerative-clustering.h:112

kaldi::Matrix< BaseFloat >

kaldi::AgglomerativeClusterer::Cluster
void Cluster()
Definition: agglomerative-clustering.cc:27

kaldi::AgglomerativeClusterer::assignments_
std::vector< int32 > * assignments_
Definition: agglomerative-clustering.h:116

kaldi::AgglomerativeClusterer::second_pass_cluster_cost_map_
std::unordered_map< uint32, BaseFloat > second_pass_cluster_cost_map_
Definition: agglomerative-clustering.h:138

kaldi::AgglomerativeClusterer::second_pass_clusters_map_
std::unordered_map< int32, AhcCluster * > second_pass_clusters_map_
Definition: agglomerative-clustering.h:140

kaldi::AgglomerativeClusterer::second_pass_count_
int32 second_pass_count_
Definition: agglomerative-clustering.h:121

count
const size_t count
Definition: arpa-file-parser-test.cc:66

kaldi::BaseFloat
float BaseFloat
Definition: kaldi-types.h:29

float

kaldi::AgglomerativeClusterer::AssignClusters
void AssignClusters()
Definition: agglomerative-clustering.cc:205

kaldi::AhcCluster::size
int32 size
Definition: agglomerative-clustering.h:42

rnnlm::n
struct rnnlm::@11::@12 n

kaldi::AgglomerativeClusterer::MergeClusters
void MergeClusters(int32 i, int32 j)
Definition: agglomerative-clustering.cc:124

kaldi::AgglomerativeClusterer::ClusterTwoPass
void ClusterTwoPass()
Definition: agglomerative-clustering.cc:40

kaldi::AgglomerativeClusterer::queue_
QueueType queue_
Definition: agglomerative-clustering.h:128

kaldi::AgglomerativeClusterer::DecodePair
std::pair< int32, int32 > DecodePair(uint32 key)
Definition: agglomerative-clustering.cc:78

rnnlm::i
int i
Definition: mikolov-rnnlm-lib.cc:66

KALDI_ASSERT
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

kaldi::AgglomerativeClusterer::second_pass_queue_
QueueType second_pass_queue_
Definition: agglomerative-clustering.h:128

kaldi::AgglomerativeClusterer::count_
int32 count_
Definition: agglomerative-clustering.h:120

kaldi::AgglomerativeClusterer::num_points_
int32 num_points_
Definition: agglomerative-clustering.h:118

kaldi::AhcCluster::parent1
int32 parent1
Definition: agglomerative-clustering.h:42

kaldi::AgglomerativeCluster
void AgglomerativeCluster(const Matrix< BaseFloat > &costs, BaseFloat threshold, int32 min_clusters, int32 first_pass_max_points, BaseFloat max_cluster_fraction, std::vector< int32 > *assignments_out)
This is the function that is called to perform the agglomerative clustering.
Definition: agglomerative-clustering.cc:223

kaldi::AhcCluster
AhcCluster is the cluster object for the agglomerative clustering.
Definition: agglomerative-clustering.h:41

agglomerative-clustering.h

kaldi::AgglomerativeClusterer::EncodePair
uint32 EncodePair(int32 i, int32 j)
Definition: agglomerative-clustering.cc:71

kaldi::AgglomerativeClusterer::threshold_
BaseFloat threshold_
Definition: agglomerative-clustering.h:113

kaldi::AgglomerativeClusterer::QueueType
std::priority_queue< QueueElement, std::vector< QueueElement >, std::greater< QueueElement > > QueueType
Definition: agglomerative-clustering.h:127

kaldi::AgglomerativeClusterer::active_clusters_
std::set< int32 > active_clusters_
Definition: agglomerative-clustering.h:135

kaldi::AgglomerativeClusterer::first_pass_max_points_
int32 first_pass_max_points_
Definition: agglomerative-clustering.h:115

kaldi::AgglomerativeClusterer
The AgglomerativeClusterer class contains the necessary mechanisms for the actual clustering algorith...
Definition: agglomerative-clustering.h:55

kaldi::AgglomerativeClusterer::ComputeClusters
void ComputeClusters(int32 min_clusters)
Definition: agglomerative-clustering.cc:109

kaldi::AgglomerativeClusterer::cluster_cost_map_
std::unordered_map< uint32, BaseFloat > cluster_cost_map_
Definition: agglomerative-clustering.h:131

kaldi::AgglomerativeClusterer::min_clusters_
int32 min_clusters_
Definition: agglomerative-clustering.h:114

kaldi::AhcCluster::id
int32 id
Definition: agglomerative-clustering.h:42

kaldi::AhcCluster::utt_ids
std::vector< int32 > utt_ids
Definition: agglomerative-clustering.h:46