BottomUpClusterer Class Reference
Collaboration diagram for BottomUpClusterer:

Public Member Functions

 BottomUpClusterer (const std::vector< Clusterable *> &points, BaseFloat max_merge_thresh, int32 min_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out)
 
BaseFloat Cluster ()
 
 ~BottomUpClusterer ()
 

Private Types

typedef std::pair< BaseFloat, std::pair< uint_smaller, uint_smaller > > QueueElement
 
typedef std::priority_queue< QueueElement, std::vector< QueueElement >, std::greater< QueueElement > > QueueType
 

Private Member Functions

void Renumber ()
 
void InitializeAssignments ()
 
void SetInitialDistances ()
 Sets up distances and queue. More...
 
bool CanMerge (int32 i, int32 j, BaseFloat dist)
 CanMerge returns true if i and j are existing clusters, and the distance (negated objf-change) "dist" is accurate (i.e. More...
 
void MergeClusters (int32 i, int32 j)
 Merge j into i and delete j. More...
 
void ReconstructQueue ()
 Reconstructs the priority queue from the distances. More...
 
void SetDistance (int32 i, int32 j)
 
BaseFloatDistance (int32 i, int32 j)
 

Private Attributes

BaseFloat ans_
 
const std::vector< Clusterable * > & points_
 
BaseFloat max_merge_thresh_
 
int32 min_clust_
 
std::vector< Clusterable * > * clusters_
 
std::vector< int32 > * assignments_
 
std::vector< Clusterable * > tmp_clusters_
 
std::vector< int32tmp_assignments_
 
std::vector< BaseFloatdist_vec_
 
int32 nclusters_
 
int32 npoints_
 
QueueType queue_
 

Detailed Description

Definition at line 193 of file cluster-utils.cc.

Member Typedef Documentation

◆ QueueElement

typedef std::pair<BaseFloat, std::pair<uint_smaller, uint_smaller> > QueueElement
private

Definition at line 242 of file cluster-utils.cc.

◆ QueueType

typedef std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > QueueType
private

Definition at line 245 of file cluster-utils.cc.

Constructor & Destructor Documentation

◆ BottomUpClusterer()

BottomUpClusterer ( const std::vector< Clusterable *> &  points,
BaseFloat  max_merge_thresh,
int32  min_clust,
std::vector< Clusterable *> *  clusters_out,
std::vector< int32 > *  assignments_out 
)
inline

Definition at line 195 of file cluster-utils.cc.

References BottomUpClusterer::Cluster(), BottomUpClusterer::dist_vec_, BottomUpClusterer::nclusters_, and BottomUpClusterer::npoints_.

200  : ans_(0.0), points_(points), max_merge_thresh_(max_merge_thresh),
201  min_clust_(min_clust), clusters_(clusters_out != NULL? clusters_out
202  : &tmp_clusters_), assignments_(assignments_out != NULL ?
203  assignments_out : &tmp_assignments_) {
204  nclusters_ = npoints_ = points.size();
205  dist_vec_.resize((npoints_ * (npoints_ - 1)) / 2);
206  }
std::vector< int32 > * assignments_
std::vector< Clusterable * > * clusters_
std::vector< Clusterable * > tmp_clusters_
const std::vector< Clusterable * > & points_
std::vector< BaseFloat > dist_vec_
std::vector< int32 > tmp_assignments_

◆ ~BottomUpClusterer()

Member Function Documentation

◆ CanMerge()

bool CanMerge ( int32  i,
int32  j,
BaseFloat  dist 
)
private

CanMerge returns true if i and j are existing clusters, and the distance (negated objf-change) "dist" is accurate (i.e.

not outdated).

Definition at line 337 of file cluster-utils.cc.

References BottomUpClusterer::clusters_, BottomUpClusterer::dist_vec_, rnnlm::j, KALDI_ASSERT, and BottomUpClusterer::npoints_.

Referenced by BottomUpClusterer::Cluster(), CompartmentalizedBottomUpClusterer::Cluster(), BottomUpClusterer::~BottomUpClusterer(), and CompartmentalizedBottomUpClusterer::~CompartmentalizedBottomUpClusterer().

337  {
338  KALDI_ASSERT(i != j && i < npoints_ && j < npoints_);
339  if ((*clusters_)[i] == NULL || (*clusters_)[j] == NULL)
340  return false;
341  BaseFloat cached_dist = dist_vec_[(i * (i - 1)) / 2 + j];
342  return (std::fabs(cached_dist - dist) <= 1.0e-05 * std::fabs(dist));
343 }
std::vector< Clusterable * > * clusters_
float BaseFloat
Definition: kaldi-types.h:29
std::vector< BaseFloat > dist_vec_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ Cluster()

BaseFloat Cluster ( )

Definition at line 249 of file cluster-utils.cc.

References BottomUpClusterer::ans_, BottomUpClusterer::CanMerge(), rnnlm::i, BottomUpClusterer::InitializeAssignments(), rnnlm::j, KALDI_VLOG, BottomUpClusterer::MergeClusters(), BottomUpClusterer::min_clust_, BottomUpClusterer::nclusters_, BottomUpClusterer::queue_, BottomUpClusterer::Renumber(), and BottomUpClusterer::SetInitialDistances().

Referenced by BottomUpClusterer::BottomUpClusterer(), kaldi::ClusterBottomUp(), and CompartmentalizedBottomUpClusterer::CompartmentalizedBottomUpClusterer().

249  {
250  KALDI_VLOG(2) << "Initializing cluster assignments.";
252  KALDI_VLOG(2) << "Setting initial distances.";
254 
255  KALDI_VLOG(2) << "Clustering...";
256  while (nclusters_ > min_clust_ && !queue_.empty()) {
257  std::pair<BaseFloat, std::pair<uint_smaller, uint_smaller> > pr = queue_.top();
258  BaseFloat dist = pr.first;
259  int32 i = (int32) pr.second.first, j = (int32) pr.second.second;
260  queue_.pop();
261  if (CanMerge(i, j, dist)) MergeClusters(i, j);
262  }
263  KALDI_VLOG(2) << "Renumbering clusters to contiguous numbers.";
264  Renumber();
265  return ans_;
266 }
bool CanMerge(int32 i, int32 j, BaseFloat dist)
CanMerge returns true if i and j are existing clusters, and the distance (negated objf-change) "dist"...
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
void SetInitialDistances()
Sets up distances and queue.
void MergeClusters(int32 i, int32 j)
Merge j into i and delete j.
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156

◆ Distance()

BaseFloat& Distance ( int32  i,
int32  j 
)
inlineprivate

Definition at line 224 of file cluster-utils.cc.

References BottomUpClusterer::dist_vec_, rnnlm::j, KALDI_ASSERT, and BottomUpClusterer::npoints_.

224  {
225  KALDI_ASSERT(i < npoints_ && j < i);
226  return dist_vec_[(i * (i - 1)) / 2 + j];
227  }
std::vector< BaseFloat > dist_vec_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ InitializeAssignments()

void InitializeAssignments ( )
private

Definition at line 316 of file cluster-utils.cc.

References BottomUpClusterer::assignments_, BottomUpClusterer::clusters_, rnnlm::i, BottomUpClusterer::npoints_, and BottomUpClusterer::points_.

Referenced by BottomUpClusterer::Cluster(), CompartmentalizedBottomUpClusterer::Cluster(), BottomUpClusterer::~BottomUpClusterer(), and CompartmentalizedBottomUpClusterer::~CompartmentalizedBottomUpClusterer().

316  {
317  clusters_->resize(npoints_);
318  assignments_->resize(npoints_);
319  for (int32 i = 0; i < npoints_; i++) { // initialize as 1-1 mapping.
320  (*clusters_)[i] = points_[i]->Copy();
321  (*assignments_)[i] = i;
322  }
323 }
std::vector< int32 > * assignments_
kaldi::int32 int32
std::vector< Clusterable * > * clusters_
const std::vector< Clusterable * > & points_

◆ MergeClusters()

void MergeClusters ( int32  i,
int32  j 
)
private

Merge j into i and delete j.

Definition at line 345 of file cluster-utils.cc.

References BottomUpClusterer::ans_, BottomUpClusterer::clusters_, BottomUpClusterer::dist_vec_, rnnlm::i, rnnlm::j, KALDI_ASSERT, BottomUpClusterer::nclusters_, BottomUpClusterer::npoints_, and BottomUpClusterer::SetDistance().

Referenced by BottomUpClusterer::Cluster(), CompartmentalizedBottomUpClusterer::Cluster(), BottomUpClusterer::~BottomUpClusterer(), and CompartmentalizedBottomUpClusterer::~CompartmentalizedBottomUpClusterer().

345  {
346  KALDI_ASSERT(i != j && i < npoints_ && j < npoints_);
347  (*clusters_)[i]->Add(*((*clusters_)[j]));
348  delete (*clusters_)[j];
349  (*clusters_)[j] = NULL;
350  // note that we may have to follow the chain within "assignment_" to get
351  // final assignments.
352  (*assignments_)[j] = i;
353  // subtract negated objective function change, i.e. add objective function
354  // change.
355  ans_ -= dist_vec_[(i * (i - 1)) / 2 + j];
356  nclusters_--;
357  // Now update "distances".
358  for (int32 k = 0; k < npoints_; k++) {
359  if (k != i && (*clusters_)[k] != NULL) {
360  if (k < i)
361  SetDistance(i, k); // SetDistance requires k < i.
362  else
363  SetDistance(k, i);
364  }
365  }
366 }
kaldi::int32 int32
std::vector< Clusterable * > * clusters_
std::vector< BaseFloat > dist_vec_
void SetDistance(int32 i, int32 j)
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ ReconstructQueue()

void ReconstructQueue ( )
private

Reconstructs the priority queue from the distances.

Definition at line 368 of file cluster-utils.cc.

References BottomUpClusterer::clusters_, BottomUpClusterer::dist_vec_, rnnlm::i, rnnlm::j, BottomUpClusterer::max_merge_thresh_, BottomUpClusterer::npoints_, BottomUpClusterer::queue_, and kaldi::swap().

Referenced by CompartmentalizedBottomUpClusterer::MergeClusters(), BottomUpClusterer::SetDistance(), BottomUpClusterer::~BottomUpClusterer(), and CompartmentalizedBottomUpClusterer::~CompartmentalizedBottomUpClusterer().

368  {
369  // empty queue [since there is no clear()]
370  {
371  QueueType tmp;
372  std::swap(tmp, queue_);
373  }
374  for (int32 i = 0; i < npoints_; i++) {
375  if ((*clusters_)[i] != NULL) {
376  for (int32 j = 0; j < i; j++) {
377  if ((*clusters_)[j] != NULL) {
378  BaseFloat dist = dist_vec_[(i * (i - 1)) / 2 + j];
379  if (dist <= max_merge_thresh_) {
380  queue_.push(std::make_pair(dist, std::make_pair(
381  static_cast<uint_smaller>(i), static_cast<uint_smaller>(j))));
382  }
383  }
384  }
385  }
386  }
387 }
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
kaldi::int32 int32
std::vector< Clusterable * > * clusters_
float BaseFloat
Definition: kaldi-types.h:29
std::vector< BaseFloat > dist_vec_
std::priority_queue< QueueElement, std::vector< QueueElement >, std::greater< QueueElement > > QueueType

◆ Renumber()

void Renumber ( )
private

Definition at line 268 of file cluster-utils.cc.

References BottomUpClusterer::assignments_, BottomUpClusterer::clusters_, BottomUpClusterer::dist_vec_, rnnlm::i, KALDI_ASSERT, KALDI_VLOG, BottomUpClusterer::nclusters_, and BottomUpClusterer::npoints_.

Referenced by BottomUpClusterer::Cluster(), CompartmentalizedBottomUpClusterer::Cluster(), BottomUpClusterer::~BottomUpClusterer(), and CompartmentalizedBottomUpClusterer::~CompartmentalizedBottomUpClusterer().

268  {
269  KALDI_VLOG(2) << "Freeing up distance vector.";
270  {
271  vector<BaseFloat> tmp;
272  tmp.swap(dist_vec_);
273  }
274 
275 // Commented the following out since it was causing the process to take up too
276 // much memory with larger models. While the swap() method of STL types swaps
277 // the data pointers, std::swap() creates a temporary copy. -Arnab
278 // KALDI_VLOG(2) << "Freeing up the queue";
279 // // first free up memory by getting rid of queue. this is a special trick
280 // // to force STL to free memory.
281 // {
282 // QueueType tmp;
283 // std::swap(tmp, queue_);
284 // }
285 
286  // called after clustering, renumbers to make clusters contiguously
287  // numbered. also processes assignments_ to remove chains of references.
288  KALDI_VLOG(2) << "Creating new copy of non-NULL clusters.";
289  std::vector<uint_smaller> mapping(npoints_, static_cast<uint_smaller> (-1)); // mapping from intermediate to final clusters.
290  std::vector<Clusterable*> new_clusters(nclusters_);
291  int32 clust = 0;
292  for (int32 i = 0; i < npoints_; i++) {
293  if ((*clusters_)[i] != NULL) {
294  KALDI_ASSERT(clust < nclusters_);
295  new_clusters[clust] = (*clusters_)[i];
296  mapping[i] = clust;
297  clust++;
298  }
299  }
300  KALDI_ASSERT(clust == nclusters_);
301 
302  KALDI_VLOG(2) << "Creating new copy of assignments.";
303  std::vector<int32> new_assignments(npoints_);
304  for (int32 i = 0; i < npoints_; i++) { // now reprocess assignments_.
305  int32 ii = i;
306  while ((*assignments_)[ii] != ii)
307  ii = (*assignments_)[ii]; // follow the chain.
308  KALDI_ASSERT((*clusters_)[ii] != NULL); // cannot have assignment to nonexistent cluster.
309  KALDI_ASSERT(mapping[ii] != static_cast<uint_smaller>(-1));
310  new_assignments[i] = mapping[ii];
311  }
312  clusters_->swap(new_clusters);
313  assignments_->swap(new_assignments);
314 }
std::vector< int32 > * assignments_
kaldi::int32 int32
std::vector< Clusterable * > * clusters_
std::vector< BaseFloat > dist_vec_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156

◆ SetDistance()

void SetDistance ( int32  i,
int32  j 
)
private

Definition at line 389 of file cluster-utils.cc.

References BottomUpClusterer::clusters_, BottomUpClusterer::dist_vec_, rnnlm::i, rnnlm::j, KALDI_ASSERT, BottomUpClusterer::max_merge_thresh_, BottomUpClusterer::npoints_, BottomUpClusterer::queue_, and BottomUpClusterer::ReconstructQueue().

Referenced by BottomUpClusterer::MergeClusters(), CompartmentalizedBottomUpClusterer::MergeClusters(), CompartmentalizedBottomUpClusterer::ReconstructQueue(), CompartmentalizedBottomUpClusterer::SetInitialDistances(), BottomUpClusterer::~BottomUpClusterer(), and CompartmentalizedBottomUpClusterer::~CompartmentalizedBottomUpClusterer().

389  {
390  KALDI_ASSERT(i < npoints_ && j < i && (*clusters_)[i] != NULL
391  && (*clusters_)[j] != NULL);
392  BaseFloat dist = (*clusters_)[i]->Distance(*((*clusters_)[j]));
393  dist_vec_[(i * (i - 1)) / 2 + j] = dist; // set the distance in the array.
394  if (dist < max_merge_thresh_) {
395  queue_.push(std::make_pair(dist, std::make_pair(static_cast<uint_smaller>(i),
396  static_cast<uint_smaller>(j))));
397  }
398  // every time it's at least twice the maximum possible size.
399  if (queue_.size() >= static_cast<size_t> (npoints_ * npoints_)) {
400  // Control memory use by getting rid of orphaned queue entries
402  }
403 }
std::vector< Clusterable * > * clusters_
void ReconstructQueue()
Reconstructs the priority queue from the distances.
float BaseFloat
Definition: kaldi-types.h:29
std::vector< BaseFloat > dist_vec_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ SetInitialDistances()

void SetInitialDistances ( )
private

Sets up distances and queue.

Definition at line 325 of file cluster-utils.cc.

References BottomUpClusterer::clusters_, BottomUpClusterer::dist_vec_, rnnlm::i, rnnlm::j, BottomUpClusterer::max_merge_thresh_, BottomUpClusterer::npoints_, and BottomUpClusterer::queue_.

Referenced by BottomUpClusterer::Cluster(), CompartmentalizedBottomUpClusterer::Cluster(), BottomUpClusterer::~BottomUpClusterer(), and CompartmentalizedBottomUpClusterer::~CompartmentalizedBottomUpClusterer().

325  {
326  for (int32 i = 0; i < npoints_; i++) {
327  for (int32 j = 0; j < i; j++) {
328  BaseFloat dist = (*clusters_)[i]->Distance(*((*clusters_)[j]));
329  dist_vec_[(i * (i - 1)) / 2 + j] = dist;
330  if (dist <= max_merge_thresh_)
331  queue_.push(std::make_pair(dist, std::make_pair(static_cast<uint_smaller>(i),
332  static_cast<uint_smaller>(j))));
333  }
334  }
335 }
kaldi::int32 int32
std::vector< Clusterable * > * clusters_
float BaseFloat
Definition: kaldi-types.h:29
std::vector< BaseFloat > dist_vec_

Member Data Documentation

◆ ans_

◆ assignments_

◆ clusters_

◆ dist_vec_

◆ max_merge_thresh_

◆ min_clust_

int32 min_clust_
private

◆ nclusters_

◆ npoints_

◆ points_

◆ queue_

◆ tmp_assignments_

std::vector<int32> tmp_assignments_
private

Definition at line 237 of file cluster-utils.cc.

◆ tmp_clusters_

std::vector<Clusterable*> tmp_clusters_
private

Definition at line 236 of file cluster-utils.cc.

Referenced by BottomUpClusterer::~BottomUpClusterer().


The documentation for this class was generated from the following file: