X-Git-Url: https://www.fleuret.org/cgi-bin/gitweb/gitweb.cgi?p=clueless-kmeans.git;a=blobdiff_plain;f=clusterer.h;h=6fa538273c22c039831ed295ce5079cb9b72c323;hp=88c168a488511fc1f2d5e41f916d5b04641f7004;hb=HEAD;hpb=2455f83ba251602d5e04640067094f09f03aaa3d diff --git a/clusterer.h b/clusterer.h index 88c168a..6fa5382 100644 --- a/clusterer.h +++ b/clusterer.h @@ -1,17 +1,17 @@ /* - * clueless-kmean is a variant of k-mean which enforces balanced + * clueless-kmeans is a variant of k-means which enforces balanced * distribution of classes in every cluster * * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ * Written by Francois Fleuret * - * This file is part of clueless-kmean. + * This file is part of clueless-kmeans. * - * clueless-kmean is free software: you can redistribute it and/or + * clueless-kmeans is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * version 3 as published by the Free Software Foundation. * - * clueless-kmean is distributed in the hope that it will be useful, + * clueless-kmeans is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. @@ -29,35 +29,65 @@ class Clusterer { public: + + enum { + // Standard k-mean + STANDARD_ASSOCIATION, + // Same, implemented as a LP problem for sanity check + STANDARD_LP_ASSOCIATION, + // Criterion forcing to have the same distribution of classes in + // all clusters + UNINFORMATIVE_LP_ASSOCIATION, + // Criterion forcing to have the same number of samples of each + // class in all clusters + UNINFORMATIVE_LP_ASSOCIATION_ABSOLUTE + }; + const static int max_nb_iterations = 10; const static scalar_t min_iteration_improvement = 0.999; + const static scalar_t min_cluster_variance = 0.01f; int _nb_clusters; int _dim; + scalar_t **_cluster_means, **_cluster_var; + scalar_t distance_to_centroid(scalar_t *x, int k); + void initialize_clusters(int nb_points, scalar_t **points); + // Standard hard k-means association + scalar_t baseline_cluster_association(int nb_points, scalar_t **points, int nb_classes, int *labels, scalar_t **gamma); + // Standard k-means association implemented as an LP optimization + scalar_t baseline_lp_cluster_association(int nb_points, scalar_t **points, int nb_classes, int *labels, scalar_t **gamma); + // Association under the constraint that each cluster gets the same + // class proportions as the overall training set + scalar_t uninformative_lp_cluster_association(int nb_points, scalar_t **points, int nb_classes, int *labels, - scalar_t **gamma); + scalar_t **gamma, + int absolute_proportion); - void baseline_update_clusters(int nb_points, scalar_t **points, scalar_t **gamma); + void update_clusters(int nb_points, scalar_t **points, scalar_t **gamma); public: Clusterer(); ~Clusterer(); - void train(int nb_clusters, int dim, + + void train(int mode, + int nb_clusters, int dim, int nb_points, scalar_t **points, int nb_classes, int *labels, + // This last array returns for each sample to what + // cluster it was associated. It can be null. int *cluster_associations); int cluster(scalar_t *point);