27 size_t dim = 1 +
Rand() % 10;
28 size_t nGauss = 1 +
Rand() % 10;
29 std::vector< GaussClusterable * > v(nGauss);
30 for (
size_t i = 0;
i < nGauss;
i++) {
33 for (
size_t i = 0;
i < nGauss;
i++) {
34 size_t nPoints = 1 +
Rand() % 30;
35 for (
size_t j = 0;
j < nPoints;
j++) {
38 for (
size_t k = 0;k < dim;k++) vec(k) =
RandGauss();
39 v[
i]->AddStats(vec, post);
42 for (
size_t i = 0;
i+1 < nGauss;
i++) {
43 BaseFloat like_before = (v[
i]->Objf() + v[
i+1]->Objf()) / (v[
i]->Normalizer() + v[
i+1]->Normalizer());
47 KALDI_LOG <<
"Like_before = " << like_before <<
", after = "<<like_after <<
" over "<<tmp->
Normalizer()<<
" frames.";
52 for (
size_t i = 0;
i < nGauss;
i++)
57 size_t dim = 2 +
Rand() % 10;
58 size_t num_vectors = 1 +
Rand() % 10;
59 std::vector<VectorClusterable*> v(num_vectors);
60 for (
size_t i = 0;
i < num_vectors;
i++) {
68 *tmp2 = static_cast<VectorClusterable*>(v[
i]->
Copy());
80 for (
size_t i = 0;
i+1 < num_vectors;
i++) {
81 BaseFloat like_before = (v[
i]->Objf() + v[
i+1]->Objf()) / (v[
i]->Normalizer() + v[
i+1]->Normalizer());
85 KALDI_LOG <<
"Like_before = " << like_before <<
", after = "<<like_after <<
" over "<<tmp->
Normalizer()<<
" frames.";
90 for (
size_t i = 0;
i < num_vectors;
i++)
99 AssertEqual( a.ObjfPlus(b), -0.5 * (1.0-2.5)*(1.0-2.5));
101 a.Write(std::cerr,
false);
102 std::cerr <<
"\nBinary Output:\n";
103 a.Write(std::cerr,
true);
114 AssertEqual(a.ObjfMinus(b), -0.5 * (1.0-2.5)*(1.0-2.5));
130 std::vector<Clusterable*> vec;
139 std::vector<Clusterable*> vec;
149 std::vector<Clusterable*> vec(4);
150 vec[1] = a.Copy(); vec[3] = a.Copy();
152 KALDI_ASSERT(vec[0] != NULL && vec[2] != NULL && vec[0]->Objf() == 0 && vec[2]->Objf() == 0 && vec[0] != vec[2] && vec[0] != vec[1]);
158 std::vector<Clusterable*> stats(3);
159 stats[0] = a.Copy(); stats[1] = b.Copy(); stats[2] = c.
Copy();
160 std::vector<int32> assignments(3);
161 assignments[0] = 1; assignments[1] = 1; assignments[2] = 4;
162 std::vector<Clusterable*> clusters;
163 std::vector<Clusterable*> clusters2;
170 KALDI_ASSERT(clusters[0] == NULL && clusters[1] != NULL && clusters[4] != NULL);
171 for (
size_t i = 0;
i < 5;
i++) {
172 if (clusters[
i] != NULL) {
184 for (
size_t p = 0;p < 100;p++) {
185 size_t n_stats =
Rand() % 5;
186 n_stats = n_stats * n_stats;
187 std::vector<Clusterable*> stats(n_stats);
188 for (
size_t i = 0;
i < n_stats;
i++) {
189 if (
Rand() % 5 < 4) {
191 if (
Rand() % 2 == 0) ptr->
Add(*ptr);
193 }
else stats[
i] = NULL;
195 size_t n_clust = 1 +
Rand() % 4;
196 std::vector<int32> assignments(n_stats);
197 for (
size_t i = 0;
i < assignments.size();
i++)
198 assignments[
i] =
Rand() % n_clust;
199 std::vector<Clusterable*> clusts1;
200 std::vector<Clusterable*> clusts2;
216 for (
size_t i = 0;
i < clusts1.size();
i++) {
217 if (clusts1[
i] != NULL || clusts2[
i] != NULL) {
219 AssertEqual(clusts1[
i]->Normalizer(), clusts2[
i]->Normalizer());
233 for (
size_t i = 0;
i < 10;
i++) {
234 size_t n_clust =
Rand() % 10;
235 std::vector<Clusterable*> points;
236 for (
size_t j = 0;
j < n_clust;
j++) {
237 size_t n_points = 1 +
Rand() % 5;
243 size_t min_clust =
Rand() % 10;
244 std::vector<Clusterable*> clusters;
245 std::vector<int32> assignments;
247 for (
size_t i = 0;
i < points.size();
i++) {
248 size_t j =
Rand() % points.size();
253 float ans =
ClusterBottomUp(points, max_merge_thresh, min_clust, &clusters, &assignments);
256 KALDI_LOG <<
"Objf change from bottom-up clustering is "<<ans<<
'\n';
261 for (
size_t i = 0;
i < points.size();
i++) {
264 for (
size_t i = 0;
i < clusters.size();
i++) {
269 KALDI_ASSERT(clusters.size() == std::max(n_clust, std::min(points.size(), min_clust)));
271 for (
size_t i = 0;
i < points.size();
i++) {
272 size_t j =
Rand() % points.size();
275 if (fabs(xi-xj) < 0.011) {
276 if (clusters.size() == n_clust)
KALDI_ASSERT(assignments[i] == assignments[j]);
287 for (
size_t n = 0;
n < 4;
n++) {
290 size_t n_clust =
Rand() % 10;
291 std::vector<Clusterable*> points;
292 for (
size_t j = 0;
j < n_clust;
j++) {
293 size_t n_points = 1 +
Rand() % 5;
297 std::vector<Clusterable*> clusters(n_clust);
298 std::vector<int32> assignments(points.size());
301 for (
size_t i = 0;
i < points.size();
i++) {
302 assignments[
i] =
Rand() % n_clust;
303 clusters[assignments[
i]]->Add(*(points[
i]));
310 (std::abs(points_objf)+std::abs(clust_objf_before))*0.001);
318 KALDI_LOG <<
"TestRefineClusters: objfs are: "<<points_objf<<
" "<<clust_objf_before<<
" "<<clust_objf_after<<
", impr = "<<impr<<
'\n';
322 AssertEqual(clust_objf_after - clust_objf_before, impr);
330 size_t n_points_tot = 0, n_wrong_tot = 0;
331 for (
size_t n = 0;
n < 3;
n++) {
334 size_t n_clust =
Rand() % 10;
335 std::vector<Clusterable*> points;
336 std::vector<int32> assignments_ref;
337 for (
size_t j = 0;
j < n_clust;
j++) {
338 size_t n_points = 1 +
Rand() % 5;
340 for (
size_t k = 0;k < n_points;k++) {
342 assignments_ref.push_back(
j);
345 std::vector<Clusterable*> clusters;
346 std::vector<int32> assignments;
355 KALDI_LOG <<
"TestClusterKmeans: objf after clustering is: "<<clust_objf<<
", impr is: "<<ans<<
'\n';
357 if (clusters.size() != n_clust) {
358 KALDI_LOG <<
"Warning: unexpected number of clusters "<<clusters.size()<<
" vs. "<<n_clust<<
"";
362 if (clust_objf < -1.0 * points.size()) {
363 KALDI_LOG <<
"Warning: ClusterKMeans did not work quite as well as expected";
367 for (
size_t i = 0;
i < points.size();
i++) {
368 size_t j =
Rand() % points.size();
369 if (assignments_ref[
i] == assignments_ref[j]) {
370 if (assignments[
i] != assignments[j]) num_wrong++;
372 if (assignments[
i] == assignments[j]) num_wrong++;
374 KALDI_LOG <<
"num_wrong = "<<num_wrong<<
'\n';
376 n_points_tot += points.size();
377 n_wrong_tot += num_wrong;
382 if (n_wrong_tot*4 > n_points_tot) {
383 KALDI_LOG <<
"Got too many wrong in k-means test [may not be fatal, but check it out.";
389 size_t n_points_tot = 0, n_wrong_tot = 0;
390 for (
size_t n = 0;
n < 3;
n++) {
391 std::vector<int32> assignments_ref;
395 size_t n_clust =
Rand() % 10;
396 std::vector<Clusterable*> points;
397 for (
size_t j = 0;
j < n_clust;
j++) {
398 size_t n_points = 1 +
Rand() % 5;
402 for (
size_t k = 0; k < n_points; k++) {
406 point.
AddVec(1.0, clust_center);
409 assignments_ref.push_back(
j);
412 std::vector<Clusterable*> clusters;
413 std::vector<int32> assignments;
423 KALDI_LOG <<
"TestClusterKmeans: objf after clustering is: "<<clust_objf<<
", impr is: "<<ans<<
'\n';
425 if (clusters.size() != n_clust) {
426 KALDI_LOG <<
"Warning: unexpected number of clusters "<<clusters.size()<<
" vs. "<<n_clust<<
"";
430 if (clust_objf < -1.0 * points.size()) {
431 KALDI_LOG <<
"Warning: ClusterKMeans did not work quite as well as expected";
436 for (
size_t i = 0;
i < points.size();
i++) {
437 size_t j =
Rand() % points.size();
438 if (assignments_ref[
i] == assignments_ref[j]) {
439 if (assignments[
i] != assignments[j]) num_wrong++;
441 if (assignments[
i] == assignments[j]) num_wrong++;
444 n_points_tot += points.size();
445 n_wrong_tot += num_wrong;
447 KALDI_LOG <<
"num_wrong = " << num_wrong <<
", num-points-tot = " 453 if (n_wrong_tot*4 > n_points_tot) {
454 KALDI_LOG <<
"Got too many wrong in k-means test [may not be fatal, but check it out.";
462 size_t n_points_tot = 0, n_wrong_tot = 0;
463 for (
size_t n = 0;
n < 10;
n++) {
466 std::vector<Clusterable*> points;
472 std::vector<Clusterable*> clusters_ext;
473 std::vector<int32> assignments;
474 std::vector<int32> clust_assignments;
477 int32 num_leaves = 0;
478 BaseFloat ans =
TreeCluster(points, n_clust, &clusters_ext, &assignments, &clust_assignments, &num_leaves, tcfg);
480 if (
n < 3)
TreeCluster(points, n_clust, NULL, NULL, NULL, NULL, tcfg);
483 KALDI_ASSERT(clusters_ext.size() >=
static_cast<size_t>(n_clust));
484 std::vector<Clusterable*> clusters(clusters_ext);
485 clusters.resize(n_clust);
488 KALDI_LOG <<
"TreeCluster: objf after clustering is: "<<clust_objf<<
", impr is: "<<ans<<
'\n';
491 KALDI_LOG <<
"Num nodes is "<<clusters_ext.size() <<
", leaves "<<num_leaves;
492 for (
int32 i = 0;i<static_cast<int32>(clusters_ext.size());
i++) {
495 KALDI_ASSERT(clust_assignments[i]>i || (i+1 == static_cast<int32>(clusters_ext.size()) && clust_assignments[i] == i));
496 if (i == static_cast<int32>(clusters_ext.size())-1)
502 if (n_wrong_tot*4 > n_points_tot) {
503 KALDI_LOG <<
"Got too many wrong in k-means test [may not be fatal, but check it out.";
510 size_t n_points_tot = 0, n_wrong_tot = 0;
511 for (
size_t n = 0;
n < 10;
n++) {
513 size_t n_clust =
Rand() % 10;
514 std::vector<Clusterable*> points;
515 for (
size_t j = 0;
j < n_clust;
j++) {
516 size_t n_points = 1 +
Rand() % 5;
520 std::vector<Clusterable*> clusters;
521 std::vector<int32> assignments;
532 KALDI_LOG <<
"ClusterTopDown: objf after clustering is: "<<clust_objf<<
", impr is: "<<ans<<
'\n';
535 KALDI_LOG <<
"Num nodes is "<<clusters.size()<<
'\n';
536 for (
size_t i = 0;
i < clusters.size();
i++) {
545 if (n_wrong_tot*4 > n_points_tot) {
546 KALDI_LOG <<
"Got too many wrong in k-means test [may not be fatal, but check it out.";
556 using namespace kaldi;
558 for (
size_t i = 0;
i < 2;
i++) {
static void TestClusterUtils()
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
static void TestClusterTopDown()
virtual void Add(const Clusterable &other)=0
Add other stats.
void DeletePointers(std::vector< A *> *v)
Deletes any non-NULL pointers in the vector v, and sets the corresponding entries of v to NULL...
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
BaseFloat RefineClusters(const std::vector< Clusterable *> &points, std::vector< Clusterable *> *clusters, std::vector< int32 > *assignments, RefineClustersOptions cfg)
RefineClusters is mainly used internally by other clustering algorithms.
static void TestEnsureClusterableVectorNotNull()
virtual BaseFloat Objf() const =0
Return the objective function associated with the stats [assuming ML estimation]. ...
BaseFloat SumClusterableNormalizer(const std::vector< Clusterable *> &vec)
Returns the total normalizer (usually count) of the cluster (pointers may be NULL).
BaseFloat ClusterKMeans(const std::vector< Clusterable *> &points, int32 num_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, ClusterKMeansOptions cfg)
ClusterKMeans is a K-means-like clustering algorithm.
static void TestRefineClusters()
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
float RandGauss(struct RandomState *state=NULL)
VectorClusterable wraps vectors in a form accessible to generic clustering algorithms.
virtual void Add(const Clusterable &other_in)
Add other stats.
virtual Clusterable * Copy() const =0
Return a copy of this object.
virtual void Add(const Clusterable &other_in)
Add other stats.
static void TestObjfMinus()
void EnsureClusterableVectorNotNull(std::vector< Clusterable *> *stats)
Fills in any (NULL) holes in "stats" vector, with empty stats, because certain algorithms require non...
static void TestDistance()
BaseFloat ClusterBottomUp(const std::vector< Clusterable *> &points, BaseFloat max_merge_thresh, int32 min_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out)
A bottom-up clustering algorithm.
void AddToClusters(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, std::vector< Clusterable *> *clusters)
Given stats and a vector "assignments" of the same size (that maps to cluster indices), sums the stats up into "clusters." It will add to any stats already present in "clusters" (although typically "clusters" will be empty when called), and it will extend with NULL pointers for any unseen indices.
static void TestAddToClustersOptimized()
virtual Clusterable * Copy() const
Return a copy of this object.
void AddToClustersOptimized(const std::vector< Clusterable *> &stats, const std::vector< int32 > &assignments, const Clusterable &total, std::vector< Clusterable *> *clusters)
AddToClustersOptimized does the same as AddToClusters (it sums up the stats within each cluster...
BaseFloat ClusterTopDown(const std::vector< Clusterable *> &points, int32 max_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, TreeClusterOptions cfg)
A clustering algorithm that internally uses TreeCluster, but does not give you the information about ...
void Scale(Real alpha)
Multiplies all elements by this constant.
static void TestObjfPlus()
static void TestSumObjfAndSumNormalizer()
int Rand(struct RandomState *state)
void SetRandn()
Set vector to random normally-distributed noise.
virtual BaseFloat Normalizer() const =0
Return the normalizer (typically, count) associated with the stats.
static void TestTreeCluster()
A class representing a vector.
static void TestClusterBottomUp()
#define KALDI_ASSERT(cond)
static void TestClusterUtilsVector()
virtual BaseFloat Objf() const
Return the objective function associated with the stats [assuming ML estimation]. ...
static void TestAddToClusters()
static void TestClusterKMeansVector()
static void AssertEqual(float a, float b, float relative_tolerance=0.001)
assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))
virtual BaseFloat Objf() const
Return the objective function associated with the stats [assuming ML estimation]. ...
static void TestClusterKMeans()
BaseFloat SumClusterableObjf(const std::vector< Clusterable *> &vec)
Returns the total objective function after adding up all the statistics in the vector (pointers may b...
BaseFloat TreeCluster(const std::vector< Clusterable *> &points, int32 max_clust, std::vector< Clusterable *> *clusters_out, std::vector< int32 > *assignments_out, std::vector< int32 > *clust_assignments_out, int32 *num_leaves_out, TreeClusterOptions cfg)
TreeCluster is a top-down clustering algorithm, using a binary tree (not necessarily balanced)...
GaussClusterable wraps Gaussian statistics in a form accessible to generic clustering algorithms...
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
Clusterable * SumClusterable(const std::vector< Clusterable *> &vec)
Sums stats (ptrs may be NULL). Returns NULL if no non-NULL stats present.
void Copy(const CuMatrixBase< Real > &src, const CuArray< int32 > &copy_from_indices, CuMatrixBase< Real > *tgt)
Copies elements from src into tgt as given by copy_from_indices.
ScalarClusterable clusters scalars with x^2 loss.