41 for (
size_t i = 0;
i < vec.size();
i++) {
56 for (
size_t i = 0;
i < vec.size();
i++) {
71 for (
size_t i = 0;
i < vec.size();
i++) {
84 std::vector<Clusterable*>::iterator itr = stats->begin(), end = stats->end();
85 if (itr == end)
return;
87 for (; itr != end; ++itr) {
89 nonNullExample = *itr;
93 if (nonNullExample == NULL) {
100 for (; itr != end; ++itr) {
102 *itr = nonNullExampleCopy->
Copy();
105 delete nonNullExampleCopy;
109 const std::vector<int32> &assignments,
110 std::vector<Clusterable*> *clusters) {
112 int32 size = stats.size();
113 if (size == 0)
return;
115 int32 max_assignment = *std::max_element(assignments.begin(),
117 if (static_cast<int32> (clusters->size()) <= max_assignment)
118 clusters->resize(max_assignment + 1, NULL);
120 if (stats[
i] != NULL) {
121 if ((*clusters)[assignments[
i]] == NULL)
122 (*clusters)[assignments[
i]] = stats[
i]->Copy();
124 (*clusters)[assignments[
i]]->Add(*(stats[
i]));
131 const std::vector<int32> &assignments,
133 std::vector<Clusterable*> *clusters) {
134 #ifdef KALDI_PARANOID 144 int32 size = stats.size();
145 if (size == 0)
return;
147 int32 num_clust = 1 + *std::max_element(assignments.begin(),
149 if (static_cast<int32> (clusters->size()) < num_clust)
150 clusters->resize(num_clust, NULL);
151 std::vector<int32> num_stats_for_cluster(num_clust, 0);
152 int32 num_total_stats = 0;
154 if (stats[
i] != NULL) {
156 num_stats_for_cluster[assignments[
i]]++;
159 if (num_total_stats == 0)
return;
162 int32 subtract_index = -1;
163 for (
int32 c = 0; c < num_clust; c++) {
164 if (num_stats_for_cluster[c] > num_total_stats - num_stats_for_cluster[c]) {
166 if ((*clusters)[c] == NULL)
167 (*clusters)[c] = total.
Copy();
169 (*clusters)[c]->
Add(total);
175 if (stats[
i] != NULL) {
176 int32 assignment = assignments[
i];
177 if (assignment != (
int32) subtract_index) {
178 if ((*clusters)[assignment] == NULL)
179 (*clusters)[assignment] = stats[
i]->Copy();
181 (*clusters)[assignment]->Add(*(stats[
i]));
183 if (subtract_index != -1 && assignment != subtract_index)
184 (*clusters)[subtract_index]->Sub(*(stats[
i]));
198 std::vector<Clusterable*> *clusters_out,
199 std::vector<int32> *assignments_out)
242 typedef std::pair<BaseFloat, std::pair<uint_smaller, uint_smaller> >
QueueElement;
244 typedef std::priority_queue<QueueElement, std::vector<QueueElement>,
250 KALDI_VLOG(2) <<
"Initializing cluster assignments.";
252 KALDI_VLOG(2) <<
"Setting initial distances.";
257 std::pair<BaseFloat, std::pair<uint_smaller, uint_smaller> > pr =
queue_.top();
263 KALDI_VLOG(2) <<
"Renumbering clusters to contiguous numbers.";
269 KALDI_VLOG(2) <<
"Freeing up distance vector.";
271 vector<BaseFloat> tmp;
288 KALDI_VLOG(2) <<
"Creating new copy of non-NULL clusters.";
289 std::vector<uint_smaller> mapping(
npoints_, static_cast<uint_smaller> (-1));
290 std::vector<Clusterable*> new_clusters(
nclusters_);
295 new_clusters[clust] = (*clusters_)[
i];
302 KALDI_VLOG(2) <<
"Creating new copy of assignments.";
303 std::vector<int32> new_assignments(npoints_);
307 ii = (*assignments_)[ii];
309 KALDI_ASSERT(mapping[ii] != static_cast<uint_smaller>(-1));
310 new_assignments[
i] = mapping[ii];
321 (*assignments_)[
i] =
i;
331 queue_.push(std::make_pair(dist, std::make_pair(static_cast<uint_smaller>(i),
332 static_cast<uint_smaller>(j))));
342 return (std::fabs(cached_dist - dist) <= 1.0e-05 * std::fabs(dist));
348 delete (*clusters_)[
j];
349 (*clusters_)[
j] = NULL;
352 (*assignments_)[
j] =
i;
380 queue_.push(std::make_pair(dist, std::make_pair(
381 static_cast<uint_smaller>(i), static_cast<uint_smaller>(
j))));
395 queue_.push(std::make_pair(dist, std::make_pair(static_cast<uint_smaller>(i),
396 static_cast<uint_smaller>(j))));
410 std::vector<Clusterable*> *clusters_out,
411 std::vector<int32> *assignments_out) {
412 KALDI_ASSERT(max_merge_thresh >= 0.0 && min_clust >= 0);
414 int32 npoints = points.size();
417 npoints < static_cast<int32>(static_cast<uint_smaller>(-1)));
419 KALDI_VLOG(2) <<
"Initializing clustering object.";
420 BottomUpClusterer bc(points, max_merge_thresh, min_clust, clusters_out, assignments_out);
435 : dist(d), compartment(comp), point1(i), point2(j) {}
445 const vector< vector<Clusterable*> > &points,
BaseFloat max_merge_thresh,
449 ncompartments_ = points.size();
452 for (
int32 comp = 0; comp < ncompartments_; comp++) {
453 npoints_[comp] = points[comp].size();
458 vector< vector<int32> > *assignments_out);
460 for (vector< vector<Clusterable*> >::iterator itr =
clusters_.begin(),
461 end =
clusters_.end(); itr != end; ++itr)
481 const vector< vector<Clusterable*> > &
points_;
490 typedef std::priority_queue< CompBotClustElem, std::vector<CompBotClustElem>,
496 vector< vector<Clusterable*> > *clusters_out,
497 vector< vector<int32> > *assignments_out) {
509 for (
int32 comp = 0; comp < ncompartments_; comp++)
511 if (clusters_out != NULL) clusters_out->swap(
clusters_);
512 if (assignments_out != NULL) assignments_out->swap(
assignments_);
513 return total_obj_change;
525 int32 clusts_in_compartment = 0;
528 clusts_in_compartment++;
533 vector<uint_smaller> mapping(npoints_[comp], static_cast<uint_smaller> (-1));
534 vector<Clusterable*> new_clusters(clusts_in_compartment);
537 clusts_in_compartment = 0;
538 for (
int32 i = 0;
i < npoints_[comp];
i++) {
540 new_clusters[clusts_in_compartment] =
clusters_[comp][
i];
541 mapping[
i] = clusts_in_compartment;
542 clusts_in_compartment++;
547 std::vector<int32> new_assignments(npoints_[comp]);
548 for (
int32 i = 0;
i < npoints_[comp];
i++) {
554 KALDI_ASSERT(mapping[ii] != static_cast<uint_smaller>(-1));
555 new_assignments[
i] = mapping[ii];
564 for (
int32 comp = 0; comp < ncompartments_; comp++) {
576 for (
int32 comp = 0; comp < ncompartments_; comp++) {
590 return (std::fabs(cached_dist - dist) <= 1.0e-05 * std::fabs(dist));
607 if (k != i &&
clusters_[comp][k] != NULL) {
628 for (
int32 comp = 0; comp < ncompartments_; comp++) {
644 dist_vec_[comp][(i * (i - 1)) / 2 +
j] = dist;
647 static_cast<uint_smaller>(j)));
654 const std::vector< std::vector<Clusterable*> > &points,
BaseFloat thresh,
655 int32 min_clust, std::vector< std::vector<Clusterable*> > *clusters_out,
656 std::vector< std::vector<int32> > *assignments_out) {
658 int32 npoints = 0, num_non_empty_compartments = 0;
659 for (vector< vector<Clusterable*> >::const_iterator itr = points.begin(),
660 end = points.end(); itr != end; ++itr) {
662 npoints += itr->size();
663 if (itr->size() > 0) num_non_empty_compartments++;
668 npoints < static_cast<int32>(static_cast<uint_smaller>(-1)));
673 for (vector< vector<Clusterable*> >::iterator itr = clusters_out->begin(),
674 end = clusters_out->end(); itr != end; ++itr) {
696 std::vector<Clusterable*> *clusters,
697 std::vector<int32> *assignments,
703 num_clust_ =
static_cast<int32> (clusters->size());
706 if (cfg_.top_n > (
int32) num_clust_) cfg_.top_n
707 = static_cast<int32> (num_clust_);
708 KALDI_ASSERT(cfg_.top_n == static_cast<int32>(static_cast<ClustIndexInt>(cfg_.top_n)));
710 my_clust_index_.resize(num_points_);
712 clust_time_.resize(num_clust_, 0);
713 clust_objf_.resize(num_clust_);
714 for (
int32 i = 0;
i < num_clust_;
i++)
716 info_.resize(num_points_ * cfg_.top_n);
722 if (cfg_.top_n <= 1)
return 0.0;
732 std::vector<std::pair<BaseFloat, LocalInt> > distances;
733 distances.reserve(num_clust_-1);
734 int32 my_clust = (*assignments_)[point];
737 for (
int32 clust = 0;clust < num_clust_;clust++) {
738 if (clust != my_clust) {
741 BaseFloat other_clust_objf = clust_objf_[clust];
742 BaseFloat other_clust_plus_me_objf = (*clusters_)[clust]->ObjfPlus(* (
points_[point]));
744 BaseFloat distance = other_clust_objf-other_clust_plus_me_objf;
745 distances.push_back(std::make_pair(distance, (LocalInt)clust));
749 if ((cfg_.top_n-1-1) >= 0) {
750 std::nth_element(distances.begin(), distances.begin()+(cfg_.top_n-1-1), distances.end());
755 for (
int32 index = 0;index < cfg_.top_n-1;index++) {
757 int32 clust = distances[index].second;
759 BaseFloat distance = distances[index].first;
760 BaseFloat other_clust_objf = clust_objf_[clust];
761 BaseFloat other_clust_plus_me_objf = -(distance - other_clust_objf);
762 info.
objf = other_clust_plus_me_objf;
766 point_info &info = GetInfo(point, cfg_.top_n-1);
767 info.
clust = my_clust;
769 info.
objf = (*clusters_)[my_clust]->ObjfMinus(*(
points_[point]));
770 my_clust_index_[point] = cfg_.top_n-1;
775 for (
int32 p = 0;p < num_points_;p++) InitPoint(p);
778 int32 iter, num_iters = cfg_.num_iters;
779 for (iter = 0;iter < num_iters;iter++) {
781 for (
int32 point = 0;point < num_points_;point++) {
783 KALDI_WARN <<
"Stopping iterating at int32 moves";
789 if (t_ == cur_t)
break;
795 int32 old_index = my_clust_index_[point];
797 KALDI_ASSERT(new_index < cfg_.top_n && new_index != old_index);
798 point_info &old_info = GetInfo(point, old_index),
799 &new_info = GetInfo(point, new_index);
800 my_clust_index_[point] = new_index;
802 int32 old_clust = old_info.
clust, new_clust = new_info.clust;
804 (*assignments_)[point] = new_clust;
805 (*clusters_)[old_clust]->Sub( *(
points_[point]) );
806 (*clusters_)[new_clust]->Add( *(
points_[point]) );
807 UpdateClust(old_clust);
808 UpdateClust(new_clust);
812 clust_objf_[clust] = (*clusters_)[clust]->Objf();
813 clust_time_[clust] = t_;
820 int32 self_index = my_clust_index_[point];
821 point_info &self_info = GetInfo(point, self_index);
824 UpdateInfo(point, self_index);
826 float own_clust_objf = clust_objf_[self_clust];
827 float own_clust_minus_me_objf = self_info.
objf;
829 for (
int32 index = 0;index < cfg_.top_n;index++) {
830 if (index != self_index) {
831 UpdateInfo(point, index);
832 point_info &other_info = GetInfo(point, index);
835 BaseFloat impr = other_clust_plus_me_objf + own_clust_minus_me_objf
836 - other_clust_objf - own_clust_objf;
839 MovePoint(point, index);
849 if (pinfo.
time < clust_time_[pinfo.
clust]) {
851 if (idx == my_clust_index_[point]) {
870 int32 i = point*cfg_.top_n + idx;
896 std::vector<Clusterable*> *clusters,
897 std::vector<int32> *assignments,
899 #ifndef KALDI_PARANOID // don't do this check in "paranoid" mode as we want to expose bugs. 920 std::vector<Clusterable*> *clusters_out,
921 std::vector<int32> *assignments_out,
923 std::vector<int32> my_assignments;
924 int32 num_points = points.size();
930 clusters_out->resize(num_clust, (
Clusterable*)NULL);
931 assignments_out->resize(num_points);
938 if (num_points == 1) {
941 skip = 1 + (
Rand() % (num_points-1));
942 while (
Gcd(skip, num_points) != 1) {
943 if (skip == num_points-1) skip = 0;
949 for (i = 0, j = 0; count != num_points;i = (i+skip)%num_points, j = (j+1)%num_clust, count++) {
953 if ((*clusters_out)[
j] == NULL) (*clusters_out)[
j] = points[
i]->Copy();
954 else (*clusters_out)[
j]->Add(*(points[i]));
955 (*assignments_out)[
i] =
j;
966 if (ans < -0.01 && ans < -0.01 * fabs(all_stats->
Objf())) {
967 KALDI_WARN <<
"ClusterKMeans: objective function after random assignment to clusters is worse than in single cluster: "<< (all_stats->
Objf()) <<
" changed by " << ans <<
". Perhaps your stats class has the wrong properties?";
980 KALDI_LOG <<
"ClusterKMeans: on iteration "<<(iter)<<
", objf before = "<<(objf_before)<<
", impr = "<<(impr)<<
", objf after = "<<(objf_after)<<
", normalized by "<<(normalizer)<<
" = "<<(objf_after/normalizer);
981 if (impr == 0)
break;
988 std::vector<Clusterable*> *clusters_out,
989 std::vector<int32> *assignments_out,
991 if (points.size() == 0) {
993 if (assignments_out) assignments_out->clear();
999 std::vector<int32> assignments;
1000 return ClusterKMeansOnce(points, num_clust, clusters_out, (assignments_out != NULL?assignments_out:&assignments), cfg);
1007 std::vector<Clusterable*> clusters_tmp;
1008 std::vector<int32> assignments_tmp;
1011 if (
i == 0 || ans > best_ans) {
1015 *clusters_out = clusters_tmp;
1016 clusters_tmp.clear();
1018 if (assignments_out) *assignments_out = assignments_tmp;
1037 points_(points), max_clust_(max_clust),
ans_(0.0), cfg_(cfg)
1043 std::vector<int32> *assignments_out,
1044 std::vector<int32> *clust_assignments_out,
1045 int32 *num_leaves_out) {
1046 while (static_cast<int32>(leaf_nodes_.size()) < max_clust_ && !
queue_.empty()) {
1047 std::pair<BaseFloat, Node*> pr =
queue_.top();
1052 CreateOutput(clusters_out, assignments_out, clust_assignments_out,
1058 for (
int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size());leaf++) {
1059 delete leaf_nodes_[leaf]->node_total;
1061 delete leaf_nodes_[leaf];
1063 for (
int32 nonleaf = 0; nonleaf < static_cast<int32>(nonleaf_nodes_.size()); nonleaf++) {
1064 delete nonleaf_nodes_[nonleaf]->node_total;
1065 delete nonleaf_nodes_[nonleaf];
1089 std::vector<int32> *assignments_out,
1090 std::vector<int32> *clust_assignments_out,
1091 int32 *num_leaves_out) {
1092 if (num_leaves_out) *num_leaves_out = leaf_nodes_.size();
1093 if (assignments_out)
1094 CreateAssignmentsOutput(assignments_out);
1095 if (clust_assignments_out)
1096 CreateClustAssignmentsOutput(clust_assignments_out);
1098 CreateClustersOutput(clusters_out);
1104 return leaf_nodes_.size() + nonleaf_nodes_.size() - 1 - index;
1107 assignments_out->clear();
1109 for (
int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size()); leaf++) {
1110 std::vector<int32> &indices = leaf_nodes_[leaf]->leaf.point_indices;
1111 for (
int32 i = 0; i < static_cast<int32>(indices.size());
i++) {
1114 (*assignments_out)[indices[
i]] = leaf;
1117 #ifdef KALDI_PARANOID 1118 for (
size_t i = 0;
i<assignments_out->size();
i++)
KALDI_ASSERT((*assignments_out)[
i] != (
int32)(-1));
1122 clust_assignments_out->resize(leaf_nodes_.size() + nonleaf_nodes_.size());
1123 for (
int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size()); leaf++) {
1125 if (leaf_nodes_[leaf]->parent == NULL) {
1126 KALDI_ASSERT(leaf_nodes_.size() == 1&&nonleaf_nodes_.size() == 0 && leaf == 0);
1129 if (leaf_nodes_[leaf]->parent->is_leaf) parent_index = leaf_nodes_[leaf]->parent->index;
1130 else parent_index = NonleafOutputIndex(leaf_nodes_[leaf]->parent->index);
1132 (*clust_assignments_out)[leaf] = parent_index;
1134 for (
int32 nonleaf = 0; nonleaf < static_cast<int32>(nonleaf_nodes_.size()); nonleaf++) {
1135 int32 index = NonleafOutputIndex(nonleaf);
1137 if (nonleaf_nodes_[nonleaf]->parent == NULL) parent_index = index;
1139 KALDI_ASSERT(! nonleaf_nodes_[nonleaf]->parent->is_leaf);
1140 parent_index = NonleafOutputIndex(nonleaf_nodes_[nonleaf]->parent->index);
1142 (*clust_assignments_out)[index] = parent_index;
1146 clusters_out->resize(leaf_nodes_.size() + nonleaf_nodes_.size());
1147 for (
int32 leaf = 0; leaf < static_cast<int32>(leaf_nodes_.size()); leaf++) {
1148 (*clusters_out)[leaf] = leaf_nodes_[leaf]->node_total;
1149 leaf_nodes_[leaf]->node_total = NULL;
1151 for (
int32 nonleaf = 0; nonleaf < static_cast<int32>(nonleaf_nodes_.size()); nonleaf++) {
1152 int32 index = NonleafOutputIndex(nonleaf);
1153 (*clusters_out)[index] = nonleaf_nodes_[nonleaf]->node_total;
1154 nonleaf_nodes_[nonleaf]->node_total = NULL;
1160 node->
children.resize(cfg_.branch_factor);
1161 for (
int32 i = 0;
i < cfg_.branch_factor;
i++) {
1169 leaf_nodes_[child->
index] = child;
1171 child->
index = leaf_nodes_.size();
1172 leaf_nodes_.push_back(child);
1180 KALDI_ASSERT(child_index < static_cast<int32>(cfg_.branch_factor));
1189 node->
index = nonleaf_nodes_.size();
1190 nonleaf_nodes_.push_back(node);
1191 for (
int32 i = 0;i < static_cast<int32>(cfg_.branch_factor);
i++)
1198 KALDI_WARN <<
"Warning: tree clustering: leaf with no data";
1210 if (impr > cfg_.thresh)
1211 queue_.push(std::make_pair(impr, node));
1216 top_node->
index = leaf_nodes_.size();
1219 leaf_nodes_.push_back(top_node);
1224 FindBestSplit(top_node);
1234 std::priority_queue<std::pair<BaseFloat, Node*> >
queue_;
1242 std::vector<Clusterable*> *clusters_out,
1243 std::vector<int32> *assignments_out,
1244 std::vector<int32> *clust_assignments_out,
1245 int32 *num_leaves_out,
1247 if (points.size() == 0) {
1248 if (clusters_out) clusters_out->clear();
1249 if (assignments_out) assignments_out->clear();
1250 if (clust_assignments_out) clust_assignments_out->clear();
1251 if (num_leaves_out) *num_leaves_out = 0;
1255 BaseFloat ans = tc.
Cluster(clusters_out, assignments_out, clust_assignments_out, num_leaves_out);
1263 std::vector<Clusterable*> *clusters_out,
1264 std::vector<int32> *assignments_out,
1266 int32 num_leaves = 0;
1267 BaseFloat ans =
TreeCluster(points, max_clust, clusters_out, assignments_out, NULL, &num_leaves, cfg);
1268 if (clusters_out != NULL) {
1269 for (
size_t j = num_leaves;
j<clusters_out->size();
j++)
delete (*clusters_out)[
j];
1270 clusters_out->resize(num_leaves);
1277 WriteToken(os, binary,
"<RefineClustersOptions>");
1280 WriteToken(os, binary,
"</RefineClustersOptions>");
1284 ExpectToken(is, binary,
"<RefineClustersOptions>");
1287 ExpectToken(is, binary,
"</RefineClustersOptions>");
virtual void Sub(const Clusterable &other)=0
Subtract other stats.
std::vector< ClustIndexInt > my_clust_index_
BaseFloat max_merge_thresh_
bool operator>(const CompBotClustElem &a, const CompBotClustElem &b)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
void CreateOutput(std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out)
std::vector< int32 > * assignments_
RefineClusterer(const std::vector< Clusterable *> &points, std::vector< Clusterable *> *clusters, std::vector< int32 > *assignments, RefineClustersOptions cfg)
void ProcessPoint(int32 point)
virtual void Add(const Clusterable &other)=0
Add other stats.
void Write(std::ostream &os, bool binary) const
std::vector< point_info > info_
void DeletePointers(std::vector< A *> *v)
Deletes any non-NULL pointers in the vector v, and sets the corresponding entries of v to NULL...
const std::vector< Clusterable * > & points_
void ReconstructQueue()
Reconstructs the priority queue from the distances.
std::vector< BaseFloat > clust_objf_
void CreateAssignmentsOutput(std::vector< int32 > *assignments_out)
std::vector< int32 > point_indices
std::priority_queue< std::pair< BaseFloat, Node * > > queue_
int32 NonleafOutputIndex(int32 index)
BaseFloat RefineClusters(const std::vector< Clusterable *> &points, std::vector< Clusterable *> *clusters, std::vector< int32 > *assignments, RefineClustersOptions cfg)
RefineClusters is mainly used internally by other clustering algorithms.
std::vector< Node * > leaf_nodes_
RefineClustersOptions cfg_
void Read(std::istream &is, bool binary)
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
virtual void SetZero()=0
Set stats to empty.
virtual BaseFloat Objf() const =0
Return the objective function associated with the stats [assuming ML estimation]. ...
std::pair< BaseFloat, std::pair< uint_smaller, uint_smaller > > QueueElement
bool CanMerge(int32 i, int32 j, BaseFloat dist)
CanMerge returns true if i and j are existing clusters, and the distance (negated objf-change) "dist"...
BaseFloat SumClusterableNormalizer(const std::vector< Clusterable *> &vec)
Returns the total normalizer (usually count) of the cluster (pointers may be NULL).
BaseFloat ClusterKMeans(const std::vector< Clusterable *> &points, int32 num_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, ClusterKMeansOptions cfg)
ClusterKMeans is a K-means-like clustering algorithm.
std::priority_queue< CompBotClustElem, std::vector< CompBotClustElem >, std::greater< CompBotClustElem > > QueueType
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
std::vector< Clusterable * > * clusters_
vector< vector< int32 > > assignments_
BaseFloat ClusterBottomUpCompartmentalized(const std::vector< std::vector< Clusterable *> > &points, BaseFloat thresh, int32 min_clust, std::vector< std::vector< Clusterable *> > *clusters_out, std::vector< std::vector< int32 > > *assignments_out)
This is a bottom-up clustering where the points are pre-clustered in a set of compartments, such that only points in the same compartment are clustered together.
bool ContainsNullPointers(const std::vector< A *> &v)
Returns true if the vector of pointers contains NULL pointers.
BaseFloat ClusterKMeansOnce(const std::vector< Clusterable *> &points, int32 num_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, ClusterKMeansOptions &cfg)
ClusterKMeansOnce is called internally by ClusterKMeans; it is equivalent to calling ClusterKMeans wi...
void InitializeAssignments()
std::vector< Clusterable * > tmp_clusters_
std::vector< Clusterable * > clusters
const std::vector< Clusterable * > & points_
virtual Clusterable * Copy() const =0
Return a copy of this object.
const vector< vector< Clusterable * > > & points_
void ReconstructQueue()
Reconstructs the priority queue from the distances.
BaseFloat & Distance(int32 i, int32 j)
CompBotClustElem(BaseFloat d, int32 comp, int32 i, int32 j)
std::vector< LocalInt > clust_time_
void EnsureClusterableVectorNotNull(std::vector< Clusterable *> *stats)
Fills in any (NULL) holes in "stats" vector, with empty stats, because certain algorithms require non...
void ExpectToken(std::istream &is, bool binary, const char *token)
ExpectToken tries to read in the given token, and throws an exception on failure. ...
vector< vector< Clusterable * > > clusters_
BaseFloat ClusterBottomUp(const std::vector< Clusterable *> &points, BaseFloat max_merge_thresh, int32 min_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out)
A bottom-up clustering algorithm.
void CreateClustAssignmentsOutput(std::vector< int32 > *clust_assignments_out)
std::vector< Clusterable * > points
std::vector< int32 > assignments
void MovePoint(int32 point, int32 new_index)
void UpdateInfo(int32 point, int32 idx)
void AddToClusters(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, std::vector< Clusterable *> *clusters)
Given stats and a vector "assignments" of the same size (that maps to cluster indices), sums the stats up into "clusters." It will add to any stats already present in "clusters" (although typically "clusters" will be empty when called), and it will extend with NULL pointers for any unseen indices.
CompartmentalizedBottomUpClusterer(const vector< vector< Clusterable *> > &points, BaseFloat max_merge_thresh, int32 min_clust)
bool CanMerge(int32 compartment, int32 i, int32 j, BaseFloat dist)
CanMerge returns true if i and j are existing clusters, and the distance (negated objf-change) "dist"...
#define KALDI_PARANOID_ASSERT(cond)
void SetInitialDistances()
Sets up distances and queue.
void AddToClustersOptimized(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, const Clusterable &total, std::vector< Clusterable *> *clusters)
AddToClustersOptimized does the same as AddToClusters (it sums up the stats within each cluster...
std::vector< int32 > * assignments_
BaseFloat ClusterTopDown(const std::vector< Clusterable *> &points, int32 max_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, TreeClusterOptions cfg)
A clustering algorithm that internally uses TreeCluster, but does not give you the information about ...
uint_smaller ClustIndexInt
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
std::vector< BaseFloat > dist_vec_
void CreateClustersOutput(std::vector< Clusterable *> *clusters_out)
void UpdateClust(int32 clust)
int Rand(struct RandomState *state)
void MergeClusters(int32 i, int32 j)
Merge j into i and delete j.
virtual BaseFloat Normalizer() const =0
Return the normalizer (typically, count) associated with the stats.
const std::vector< Clusterable * > & points_
std::priority_queue< QueueElement, std::vector< QueueElement >, std::greater< QueueElement > > QueueType
std::vector< int32 > tmp_assignments_
BaseFloat MergeClusters(int32 compartment, int32 i, int32 j)
Merge j into i and delete j. Returns obj function change.
void SetDistance(int32 i, int32 j)
#define KALDI_ASSERT(cond)
void InitializeAssignments()
point_info & GetInfo(int32 point, int32 idx)
static void AssertEqual(float a, float b, float relative_tolerance=0.001)
assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))
void FindBestSplit(Node *node)
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
void SetInitialDistances()
Sets up distances and queue.
void InitPoint(int32 point)
void SetDistance(int32 compartment, int32 i, int32 j)
std::vector< Node * > children
BaseFloat SumClusterableObjf(const std::vector< Clusterable *> &vec)
Returns the total objective function after adding up all the statistics in the vector (pointers may b...
std::vector< Clusterable * > * clusters_
TreeClusterer(const std::vector< Clusterable *> &points, int32 max_clust, TreeClusterOptions cfg)
BaseFloat TreeCluster(const std::vector< Clusterable *> &points, int32 max_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out, TreeClusterOptions cfg)
TreeCluster is a top-down clustering algorithm, using a binary tree (not necessarily balanced)...
vector< vector< BaseFloat > > dist_vec_
BaseFloat max_merge_thresh_
std::vector< Node * > nonleaf_nodes_
~CompartmentalizedBottomUpClusterer()
BottomUpClusterer(const std::vector< Clusterable *> &points, BaseFloat max_merge_thresh, int32 min_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out)
struct kaldi::TreeClusterer::Node::@7 leaf
Clusterable * SumClusterable(const std::vector< Clusterable *> &vec)
Sums stats (ptrs may be NULL). Returns NULL if no non-NULL stats present.
RefineClustersOptions refine_cfg
BaseFloat Cluster(std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out)
BaseFloat Cluster(vector< vector< Clusterable *> > *clusters_out, vector< vector< int32 > > *assignments_out)
void Renumber(int32 compartment)