TreeClusterer Class Reference
Collaboration diagram for TreeClusterer:

Classes

struct  Node
 

Public Member Functions

 TreeClusterer (const std::vector< Clusterable *> &points, int32 max_clust, TreeClusterOptions cfg)
 
BaseFloat Cluster (std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out)
 
 ~TreeClusterer ()
 

Private Member Functions

void CreateOutput (std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out)
 
int32 NonleafOutputIndex (int32 index)
 
void CreateAssignmentsOutput (std::vector< int32 > *assignments_out)
 
void CreateClustAssignmentsOutput (std::vector< int32 > *clust_assignments_out)
 
void CreateClustersOutput (std::vector< Clusterable *> *clusters_out)
 
void DoSplit (Node *node)
 
void FindBestSplit (Node *node)
 
void Init ()
 

Private Attributes

std::vector< Node * > leaf_nodes_
 
std::vector< Node * > nonleaf_nodes_
 
const std::vector< Clusterable * > & points_
 
int32 max_clust_
 
BaseFloat ans_
 
std::priority_queue< std::pair< BaseFloat, Node * > > queue_
 
TreeClusterOptions cfg_
 

Detailed Description

Definition at line 1032 of file cluster-utils.cc.

Constructor & Destructor Documentation

◆ TreeClusterer()

TreeClusterer ( const std::vector< Clusterable *> &  points,
int32  max_clust,
TreeClusterOptions  cfg 
)
inline

Definition at line 1034 of file cluster-utils.cc.

References KALDI_ASSERT.

1036  :
1037  points_(points), max_clust_(max_clust), ans_(0.0), cfg_(cfg)
1038  {
1040  Init();
1041  }
const std::vector< Clusterable * > & points_
TreeClusterOptions cfg_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ ~TreeClusterer()

~TreeClusterer ( )
inline

Definition at line 1057 of file cluster-utils.cc.

References kaldi::DeletePointers().

1057  {
1058  for (int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size());leaf++) {
1059  delete leaf_nodes_[leaf]->node_total;
1060  DeletePointers(&(leaf_nodes_[leaf]->leaf.clusters));
1061  delete leaf_nodes_[leaf];
1062  }
1063  for (int32 nonleaf = 0; nonleaf < static_cast<int32>(nonleaf_nodes_.size()); nonleaf++) {
1064  delete nonleaf_nodes_[nonleaf]->node_total;
1065  delete nonleaf_nodes_[nonleaf];
1066  }
1067  }
void DeletePointers(std::vector< A *> *v)
Deletes any non-NULL pointers in the vector v, and sets the corresponding entries of v to NULL...
Definition: stl-utils.h:184
std::vector< Node * > leaf_nodes_
kaldi::int32 int32
std::vector< Node * > nonleaf_nodes_

Member Function Documentation

◆ Cluster()

BaseFloat Cluster ( std::vector< Clusterable *> *  clusters_out,
std::vector< int32 > *  assignments_out,
std::vector< int32 > *  clust_assignments_out,
int32 num_leaves_out 
)
inline

Definition at line 1042 of file cluster-utils.cc.

References BottomUpClusterer::ans_, and BottomUpClusterer::queue_.

Referenced by kaldi::TreeCluster().

1045  {
1046  while (static_cast<int32>(leaf_nodes_.size()) < max_clust_ && !queue_.empty()) {
1047  std::pair<BaseFloat, Node*> pr = queue_.top();
1048  queue_.pop();
1049  ans_ += pr.first;
1050  DoSplit(pr.second);
1051  }
1052  CreateOutput(clusters_out, assignments_out, clust_assignments_out,
1053  num_leaves_out);
1054  return ans_;
1055  }
void CreateOutput(std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out)
std::priority_queue< std::pair< BaseFloat, Node * > > queue_
std::vector< Node * > leaf_nodes_
void DoSplit(Node *node)

◆ CreateAssignmentsOutput()

void CreateAssignmentsOutput ( std::vector< int32 > *  assignments_out)
inlineprivate

Definition at line 1106 of file cluster-utils.cc.

References rnnlm::i, KALDI_ASSERT, and BottomUpClusterer::points_.

1106  {
1107  assignments_out->clear();
1108  assignments_out->resize(points_.size(), (int32)(-1)); // fill with -1.
1109  for (int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size()); leaf++) {
1110  std::vector<int32> &indices = leaf_nodes_[leaf]->leaf.point_indices;
1111  for (int32 i = 0; i < static_cast<int32>(indices.size()); i++) {
1112  KALDI_ASSERT(static_cast<size_t>(indices[i]) < points_.size());
1113  KALDI_ASSERT((*assignments_out)[indices[i]] == (int32)(-1)); // check we're not assigning twice.
1114  (*assignments_out)[indices[i]] = leaf;
1115  }
1116  }
1117 #ifdef KALDI_PARANOID
1118  for (size_t i = 0;i<assignments_out->size();i++) KALDI_ASSERT((*assignments_out)[i] != (int32)(-1));
1119 #endif
1120  }
const std::vector< Clusterable * > & points_
std::vector< Node * > leaf_nodes_
kaldi::int32 int32
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ CreateClustAssignmentsOutput()

void CreateClustAssignmentsOutput ( std::vector< int32 > *  clust_assignments_out)
inlineprivate

Definition at line 1121 of file cluster-utils.cc.

References KALDI_ASSERT.

1121  {
1122  clust_assignments_out->resize(leaf_nodes_.size() + nonleaf_nodes_.size());
1123  for (int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size()); leaf++) {
1124  int32 parent_index;
1125  if (leaf_nodes_[leaf]->parent == NULL) { // tree with only one node.
1126  KALDI_ASSERT(leaf_nodes_.size() == 1&&nonleaf_nodes_.size() == 0 && leaf == 0);
1127  parent_index = 0;
1128  } else {
1129  if (leaf_nodes_[leaf]->parent->is_leaf) parent_index = leaf_nodes_[leaf]->parent->index;
1130  else parent_index = NonleafOutputIndex(leaf_nodes_[leaf]->parent->index);
1131  }
1132  (*clust_assignments_out)[leaf] = parent_index;
1133  }
1134  for (int32 nonleaf = 0; nonleaf < static_cast<int32>(nonleaf_nodes_.size()); nonleaf++) {
1135  int32 index = NonleafOutputIndex(nonleaf);
1136  int32 parent_index;
1137  if (nonleaf_nodes_[nonleaf]->parent == NULL) parent_index = index; // top node. make it own parent.
1138  else {
1139  KALDI_ASSERT(! nonleaf_nodes_[nonleaf]->parent->is_leaf); // parent is nonleaf since child is nonleaf.
1140  parent_index = NonleafOutputIndex(nonleaf_nodes_[nonleaf]->parent->index);
1141  }
1142  (*clust_assignments_out)[index] = parent_index;
1143  }
1144  }
int32 NonleafOutputIndex(int32 index)
std::vector< Node * > leaf_nodes_
kaldi::int32 int32
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
std::vector< Node * > nonleaf_nodes_

◆ CreateClustersOutput()

void CreateClustersOutput ( std::vector< Clusterable *> *  clusters_out)
inlineprivate

Definition at line 1145 of file cluster-utils.cc.

1145  {
1146  clusters_out->resize(leaf_nodes_.size() + nonleaf_nodes_.size());
1147  for (int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size()); leaf++) {
1148  (*clusters_out)[leaf] = leaf_nodes_[leaf]->node_total;
1149  leaf_nodes_[leaf]->node_total = NULL; // suppress delete.
1150  }
1151  for (int32 nonleaf = 0; nonleaf < static_cast<int32>(nonleaf_nodes_.size()); nonleaf++) {
1152  int32 index = NonleafOutputIndex(nonleaf);
1153  (*clusters_out)[index] = nonleaf_nodes_[nonleaf]->node_total;
1154  nonleaf_nodes_[nonleaf]->node_total = NULL; // suppress delete.
1155  }
1156  }
int32 NonleafOutputIndex(int32 index)
std::vector< Node * > leaf_nodes_
kaldi::int32 int32
std::vector< Node * > nonleaf_nodes_

◆ CreateOutput()

void CreateOutput ( std::vector< Clusterable *> *  clusters_out,
std::vector< int32 > *  assignments_out,
std::vector< int32 > *  clust_assignments_out,
int32 num_leaves_out 
)
inlineprivate

Definition at line 1088 of file cluster-utils.cc.

1091  {
1092  if (num_leaves_out) *num_leaves_out = leaf_nodes_.size();
1093  if (assignments_out)
1094  CreateAssignmentsOutput(assignments_out);
1095  if (clust_assignments_out)
1096  CreateClustAssignmentsOutput(clust_assignments_out);
1097  if (clusters_out)
1098  CreateClustersOutput(clusters_out);
1099  }
void CreateAssignmentsOutput(std::vector< int32 > *assignments_out)
std::vector< Node * > leaf_nodes_
void CreateClustAssignmentsOutput(std::vector< int32 > *clust_assignments_out)
void CreateClustersOutput(std::vector< Clusterable *> *clusters_out)

◆ DoSplit()

void DoSplit ( Node node)
inlineprivate

Definition at line 1157 of file cluster-utils.cc.

References TreeClusterer::Node::assignments, TreeClusterer::Node::best_split, TreeClusterer::Node::children, TreeClusterer::Node::clusters, rnnlm::i, TreeClusterer::Node::index, TreeClusterer::Node::is_leaf, KALDI_ASSERT, TreeClusterer::Node::leaf, TreeClusterer::Node::node_total, TreeClusterer::Node::parent, TreeClusterer::Node::point_indices, and TreeClusterer::Node::points.

1157  {
1158  KALDI_ASSERT(node->is_leaf && node->leaf.best_split > cfg_.thresh*0.999); // 0.999 is to avoid potential floating-point weirdness under compiler optimizations.
1159  KALDI_ASSERT(node->children.size() == 0);
1160  node->children.resize(cfg_.branch_factor);
1161  for (int32 i = 0;i < cfg_.branch_factor;i++) {
1162  Node *child = new Node;
1163  node->children[i] = child;
1164  child->is_leaf = true;
1165  child->parent = node;
1166  child->node_total = node->leaf.clusters[i];
1167  if (i == 0) {
1168  child->index = node->index; // assign node's own index in leaf_nodes_ to 1st child.
1169  leaf_nodes_[child->index] = child;
1170  } else {
1171  child->index = leaf_nodes_.size(); // generate new indices for other children.
1172  leaf_nodes_.push_back(child);
1173  }
1174  }
1175 
1176  KALDI_ASSERT(node->leaf.assignments.size() == node->leaf.points.size());
1177  KALDI_ASSERT(node->leaf.point_indices.size() == node->leaf.points.size());
1178  for (int32 i = 0; i < static_cast<int32>(node->leaf.points.size()); i++) {
1179  int32 child_index = node->leaf.assignments[i];
1180  KALDI_ASSERT(child_index < static_cast<int32>(cfg_.branch_factor));
1181  node->children[child_index]->leaf.points.push_back(node->leaf.points[i]);
1182  node->children[child_index]->leaf.point_indices.push_back(node->leaf.point_indices[i]);
1183  }
1184  node->leaf.points.clear();
1185  node->leaf.point_indices.clear();
1186  node->leaf.clusters.clear(); // already assigned pointers to children.
1187  node->leaf.assignments.clear();
1188  node->is_leaf = false;
1189  node->index = nonleaf_nodes_.size(); // new index at end of nonleaf_nodes_.
1190  nonleaf_nodes_.push_back(node);
1191  for (int32 i = 0;i < static_cast<int32>(cfg_.branch_factor);i++)
1192  FindBestSplit(node->children[i]);
1193  }
std::vector< Node * > leaf_nodes_
kaldi::int32 int32
TreeClusterOptions cfg_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void FindBestSplit(Node *node)
std::vector< Node * > nonleaf_nodes_

◆ FindBestSplit()

void FindBestSplit ( Node node)
inlineprivate

Definition at line 1194 of file cluster-utils.cc.

References TreeClusterer::Node::assignments, TreeClusterer::Node::best_split, kaldi::ClusterKMeans(), TreeClusterer::Node::clusters, TreeClusterer::Node::is_leaf, KALDI_ASSERT, KALDI_WARN, TreeClusterer::Node::leaf, TreeClusterer::Node::points, and BottomUpClusterer::queue_.

1194  {
1195  // takes a leaf node that has just been set up, and does ClusterKMeans with k = cfg_branch_factor.
1196  KALDI_ASSERT(node->is_leaf);
1197  if (node->leaf.points.size() == 0) {
1198  KALDI_WARN << "Warning: tree clustering: leaf with no data";
1199  node->leaf.best_split = 0; return;
1200  }
1201  if (node->leaf.points.size()<=1) { node->leaf.best_split = 0; return; }
1202  else {
1203  // use kmeans.
1204  BaseFloat impr = ClusterKMeans(node->leaf.points,
1206  &node->leaf.clusters,
1207  &node->leaf.assignments,
1208  cfg_.kmeans_cfg);
1209  node->leaf.best_split = impr;
1210  if (impr > cfg_.thresh)
1211  queue_.push(std::make_pair(impr, node));
1212  }
1213  }
std::priority_queue< std::pair< BaseFloat, Node * > > queue_
BaseFloat ClusterKMeans(const std::vector< Clusterable *> &points, int32 num_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, ClusterKMeansOptions cfg)
ClusterKMeans is a K-means-like clustering algorithm.
float BaseFloat
Definition: kaldi-types.h:29
#define KALDI_WARN
Definition: kaldi-error.h:150
TreeClusterOptions cfg_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
ClusterKMeansOptions kmeans_cfg

◆ Init()

void Init ( )
inlineprivate

Definition at line 1214 of file cluster-utils.cc.

References rnnlm::i, TreeClusterer::Node::index, TreeClusterer::Node::is_leaf, TreeClusterer::Node::leaf, TreeClusterer::Node::node_total, TreeClusterer::Node::parent, TreeClusterer::Node::point_indices, TreeClusterer::Node::points, BottomUpClusterer::points_, and kaldi::SumClusterable().

1214  { // Initializes top node.
1215  Node *top_node = new Node;
1216  top_node->index = leaf_nodes_.size(); // == 0 currently.
1217  top_node->parent = NULL; // no parent since root of tree.
1218  top_node->is_leaf = true;
1219  leaf_nodes_.push_back(top_node);
1220  top_node->leaf.points = points_;
1221  top_node->node_total = SumClusterable(points_);
1222  top_node->leaf.point_indices.resize(points_.size());
1223  for (size_t i = 0;i<points_.size();i++) top_node->leaf.point_indices[i] = i;
1224  FindBestSplit(top_node); // this should always be called when new node is created.
1225  }
const std::vector< Clusterable * > & points_
std::vector< Node * > leaf_nodes_
void FindBestSplit(Node *node)
Clusterable * SumClusterable(const std::vector< Clusterable *> &vec)
Sums stats (ptrs may be NULL). Returns NULL if no non-NULL stats present.

◆ NonleafOutputIndex()

int32 NonleafOutputIndex ( int32  index)
inlineprivate

Definition at line 1103 of file cluster-utils.cc.

1103  {
1104  return leaf_nodes_.size() + nonleaf_nodes_.size() - 1 - index;
1105  }
std::vector< Node * > leaf_nodes_
std::vector< Node * > nonleaf_nodes_

Member Data Documentation

◆ ans_

BaseFloat ans_
private

Definition at line 1232 of file cluster-utils.cc.

◆ cfg_

TreeClusterOptions cfg_
private

Definition at line 1236 of file cluster-utils.cc.

◆ leaf_nodes_

std::vector<Node*> leaf_nodes_
private

Definition at line 1227 of file cluster-utils.cc.

◆ max_clust_

int32 max_clust_
private

Definition at line 1231 of file cluster-utils.cc.

◆ nonleaf_nodes_

std::vector<Node*> nonleaf_nodes_
private

Definition at line 1228 of file cluster-utils.cc.

◆ points_

const std::vector<Clusterable*>& points_
private

Definition at line 1230 of file cluster-utils.cc.

◆ queue_

std::priority_queue<std::pair<BaseFloat, Node*> > queue_
private

Definition at line 1234 of file cluster-utils.cc.


The documentation for this class was generated from the following file: