cluster-utils.h
Go to the documentation of this file.
1 // tree/cluster-utils.h
2 
3 // Copyright 2012 Arnab Ghoshal
4 // Copyright 2009-2011 Microsoft Corporation; Saarland University
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #ifndef KALDI_TREE_CLUSTER_UTILS_H_
22 #define KALDI_TREE_CLUSTER_UTILS_H_
23 
24 #include <vector>
25 #include "matrix/matrix-lib.h"
26 #include "itf/clusterable-itf.h"
27 
28 namespace kaldi {
29 
32 
// Returns the total objective function after adding up all the statistics in
// "vec".
// NOTE(review): the summary text for this function is cut off after "pointers
// may b..."; presumably entries of "vec" may be NULL, as for
// SumClusterableNormalizer -- confirm against the implementation.
35 BaseFloat SumClusterableObjf(const std::vector<Clusterable*> &vec);
36 
// Returns the total normalizer (usually the count) of the cluster described by
// "vec". Pointers in "vec" may be NULL.
38 BaseFloat SumClusterableNormalizer(const std::vector<Clusterable*> &vec);
39 
// Sums the stats in "vec" (pointers may be NULL).
// Returns NULL if no non-NULL stats are present.
// NOTE(review): ownership of the returned object is not visible from this
// declaration; presumably the caller is responsible for it -- confirm.
41 Clusterable *SumClusterable(const std::vector<Clusterable*> &vec);
42 
// Fills in any NULL holes in the "stats" vector with empty stats, because
// certain algorithms require it.
// NOTE(review): the summary text is truncated after "require non..."
// (presumably "non-NULL stats"); how the empty stats objects are created is
// not visible here.
47 void EnsureClusterableVectorNotNull(std::vector<Clusterable*> *stats);
48 
49 
// Given "stats" and a vector "assignments" of the same size (mapping each
// entry to a cluster index), sums the stats up into "clusters".
//  - Adds to any stats already present in "clusters" (typically "clusters"
//    is empty when this is called).
//  - Extends "clusters" with NULL pointers for any unseen cluster indices.
58 void AddToClusters(const std::vector<Clusterable*> &stats,
59  const std::vector<int32> &assignments,
60  std::vector<Clusterable*> *clusters);
61 
62 
// Does the same as AddToClusters: sums up the stats within each cluster, using
// "assignments" to map each entry of "stats" to a cluster index.
// NOTE(review): the summary text is truncated, so the role of "total" is not
// documented here; presumably it is the sum of all of "stats" and enables a
// cheaper computation -- confirm against the implementation.
70 void AddToClustersOptimized(const std::vector<Clusterable*> &stats,
71  const std::vector<int32> &assignments,
72  const Clusterable &total,
73  std::vector<Clusterable*> *clusters);
74 
76 
79 
80 // Note: the algorithms below assume that every pointer in the input "points"
81 // (a std::vector<Clusterable*>) is non-NULL.
82 
// A bottom-up clustering algorithm.
//  points:          input points to cluster.
//  thresh:          the signature summary elsewhere in this page names this
//                   parameter "max_merge_thresh"; presumably a limit on the
//                   cost of a merge -- TODO confirm.
//  min_clust:       presumably the minimum number of clusters to keep --
//                   TODO confirm.
//  clusters_out:    output clusters.
//  assignments_out: output assignments (presumably point -> cluster index).
// NOTE(review): the meaning of the BaseFloat return value is not visible here.
109 BaseFloat ClusterBottomUp(const std::vector<Clusterable*> &points,
110  BaseFloat thresh,
111  int32 min_clust,
112  std::vector<Clusterable*> *clusters_out,
113  std::vector<int32> *assignments_out);
114 
// Bottom-up clustering where the points are pre-clustered into a set of
// compartments, such that only points in the same compartment are clustered
// together. "points", "clusters_out" and "assignments_out" are vectors of
// vectors (presumably indexed by compartment first -- TODO confirm).
// NOTE(review): the opening line of this declaration is missing from this
// listing; the signature summary elsewhere in this page gives it as
// "BaseFloat ClusterBottomUpCompartmentalized(".
124  const std::vector< std::vector<Clusterable*> > &points, BaseFloat thresh,
125  int32 min_clust, std::vector< std::vector<Clusterable*> > *clusters_out,
126  std::vector< std::vector<int32> > *assignments_out);
127 
128 
// Members of RefineClustersOptions, the configuration type that RefineClusters
// takes as its "cfg" argument.
// NOTE(review): the line opening this struct is missing from this listing; the
// type name is taken from the constructors below.
130  int32 num_iters; // must be >= 0. If zero, does nothing.
131  int32 top_n; // must be >= 2.
// Default configuration: 100 iterations, top_n of 5.
132  RefineClustersOptions() : num_iters(100), top_n(5) {}
// Explicit configuration; the arguments are stored as given (no range check
// is performed in this constructor).
133  RefineClustersOptions(int32 num_iters_in, int32 top_n_in)
134  : num_iters(num_iters_in), top_n(top_n_in) {}
135  // include Write and Read functions because this object gets written/read as
136  // part of the QuestionsForKeyOptions class.
137  void Write(std::ostream &os, bool binary) const;
138  void Read(std::istream &is, bool binary);
139 };
140 
// RefineClusters is mainly used internally by other clustering algorithms.
// "clusters" and "assignments" must both be non-NULL.
// NOTE(review): the final parameter line of this declaration is missing from
// this listing; the signature summary elsewhere in this page shows a trailing
// "RefineClustersOptions cfg" parameter. Whether it has a default argument
// cannot be seen here.
156 BaseFloat RefineClusters(const std::vector<Clusterable*> &points,
157  std::vector<Clusterable*> *clusters /*non-NULL*/,
158  std::vector<int32> *assignments /*non-NULL*/,
160 
// Tail of an options struct, presumably ClusterKMeansOptions (the "cfg" type
// of ClusterKMeans) -- TODO confirm.
// NOTE(review): the struct header, the declarations of "refine_cfg" and
// "num_iters", and the constructor name are missing from this listing.
164  int32 num_tries; // if >1, try whole procedure >once and pick best.
165  bool verbose;
// Constructor initializer list: refine_cfg is default-constructed, num_iters
// is 20, num_tries is 2 and verbose is true.
167  : refine_cfg(), num_iters(20), num_tries(2), verbose(true) {}
168 };
169 
// ClusterKMeans is a K-means-like clustering algorithm.
// It clusters "points" into exactly "num_clust" clusters; "clusters_out" and
// "assignments_out" are optional outputs and may be NULL.
// NOTE(review): the final parameter line of this declaration is missing from
// this listing; the signature summary elsewhere in this page shows a trailing
// "ClusterKMeansOptions cfg" parameter. Whether it has a default argument
// cannot be seen here.
203 BaseFloat ClusterKMeans(const std::vector<Clusterable*> &points,
204  int32 num_clust, // exact number of clusters
205  std::vector<Clusterable*> *clusters_out, // may be NULL
206  std::vector<int32> *assignments_out, // may be NULL
208 
// Tail of an options struct, presumably TreeClusterOptions (the "cfg" type of
// TreeCluster and ClusterTopDown) -- TODO confirm.
// NOTE(review): the struct header, the declarations of "kmeans_cfg" and
// "branch_factor", and the constructor name are missing from this listing.
212  BaseFloat thresh; // Objf change: if >0, may be used to control number of leaves.
// Constructor: kmeans_cfg is default-constructed, branch_factor is 2 and
// thresh is 0.
214  : kmeans_cfg(), branch_factor(2), thresh(0) {
// Turn off verbose output in the nested k-means configuration.
215  kmeans_cfg.verbose = false;
216  }
217 };
218 
// TreeCluster is a top-down clustering algorithm, using a binary tree (not
// necessarily balanced). "max_clust" caps the number of leaf-level clusters.
// NOTE(review): the summary text for this function is truncated, so the exact
// contents of "clusters_out", "assignments_out", "clust_assignments_out" and
// "num_leaves_out" are not documented here -- see the implementation.
// NOTE(review): the final parameter line of this declaration is missing from
// this listing; the signature summary elsewhere in this page shows a trailing
// "TreeClusterOptions cfg" parameter. Whether it has a default argument
// cannot be seen here.
252 BaseFloat TreeCluster(const std::vector<Clusterable*> &points,
253  int32 max_clust, // max number of leaf-level clusters.
254  std::vector<Clusterable*> *clusters_out,
255  std::vector<int32> *assignments_out,
256  std::vector<int32> *clust_assignments_out,
257  int32 *num_leaves_out,
259 
260 
// A clustering algorithm that internally uses TreeCluster, but does not expose
// all of the information TreeCluster provides (the summary text is truncated,
// so exactly what is omitted is not stated here). "max_clust" caps the number
// of clusters.
// NOTE(review): the final parameter line of this declaration is missing from
// this listing; the signature summary elsewhere in this page shows a trailing
// "TreeClusterOptions cfg" parameter. Whether it has a default argument
// cannot be seen here.
281 BaseFloat ClusterTopDown(const std::vector<Clusterable*> &points,
282  int32 max_clust, // max number of clusters.
283  std::vector<Clusterable*> *clusters_out,
284  std::vector<int32> *assignments_out,
286 
288 
289 } // end namespace kaldi.
290 
291 #endif // KALDI_TREE_CLUSTER_UTILS_H_
RefineClustersOptions(int32 num_iters_in, int32 top_n_in)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void Write(std::ostream &os, bool binary) const
BaseFloat RefineClusters(const std::vector< Clusterable *> &points, std::vector< Clusterable *> *clusters, std::vector< int32 > *assignments, RefineClustersOptions cfg)
RefineClusters is mainly used internally by other clustering algorithms.
void Read(std::istream &is, bool binary)
BaseFloat SumClusterableNormalizer(const std::vector< Clusterable *> &vec)
Returns the total normalizer (usually count) of the cluster (pointers may be NULL).
BaseFloat ClusterKMeans(const std::vector< Clusterable *> &points, int32 num_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, ClusterKMeansOptions cfg)
ClusterKMeans is a K-means-like clustering algorithm.
kaldi::int32 int32
BaseFloat ClusterBottomUpCompartmentalized(const std::vector< std::vector< Clusterable *> > &points, BaseFloat thresh, int32 min_clust, std::vector< std::vector< Clusterable *> > *clusters_out, std::vector< std::vector< int32 > > *assignments_out)
This is a bottom-up clustering where the points are pre-clustered in a set of compartments, such that only points in the same compartment are clustered together.
float BaseFloat
Definition: kaldi-types.h:29
void EnsureClusterableVectorNotNull(std::vector< Clusterable *> *stats)
Fills in any (NULL) holes in "stats" vector, with empty stats, because certain algorithms require non...
BaseFloat ClusterBottomUp(const std::vector< Clusterable *> &points, BaseFloat max_merge_thresh, int32 min_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out)
A bottom-up clustering algorithm.
void AddToClusters(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, std::vector< Clusterable *> *clusters)
Given stats and a vector "assignments" of the same size (that maps to cluster indices), sums the stats up into "clusters." It will add to any stats already present in "clusters" (although typically "clusters" will be empty when called), and it will extend with NULL pointers for any unseen indices.
void AddToClustersOptimized(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, const Clusterable &total, std::vector< Clusterable *> *clusters)
AddToClustersOptimized does the same as AddToClusters (it sums up the stats within each cluster...
BaseFloat ClusterTopDown(const std::vector< Clusterable *> &points, int32 max_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, TreeClusterOptions cfg)
A clustering algorithm that internally uses TreeCluster, but does not give you the information about ...
BaseFloat SumClusterableObjf(const std::vector< Clusterable *> &vec)
Returns the total objective function after adding up all the statistics in the vector (pointers may b...
BaseFloat TreeCluster(const std::vector< Clusterable *> &points, int32 max_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out, TreeClusterOptions cfg)
TreeCluster is a top-down clustering algorithm, using a binary tree (not necessarily balanced)...
ClusterKMeansOptions kmeans_cfg
Clusterable * SumClusterable(const std::vector< Clusterable *> &vec)
Sums stats (ptrs may be NULL). Returns NULL if no non-NULL stats present.
RefineClustersOptions refine_cfg