clusterer.h

   1 /*
   2  *  clueless-kmeans is a variant of k-means which enforces balanced
   3  *  distribution of classes in every cluster
   4  *
   5  *  Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/
   6  *  Written by Francois Fleuret <francois.fleuret@idiap.ch>
   7  *
   8  *  This file is part of clueless-kmeans.
   9  *
  10  *  clueless-kmeans is free software: you can redistribute it and/or
  11  *  modify it under the terms of the GNU General Public License
  12  *  version 3 as published by the Free Software Foundation.
  13  *
  14  *  clueless-kmeans is distributed in the hope that it will be useful,
  15  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  *  General Public License for more details.
  18  *
  19  *  You should have received a copy of the GNU General Public License
  20  *  along with selector.  If not, see <http://www.gnu.org/licenses/>.
  21  *
  22  */
  23
  24 #ifndef CLUSTERER_H
  25 #define CLUSTERER_H
  26
  27 #include "misc.h"
  28 #include "arrays.h"
  29
  30 class Clusterer {
  31 public:
  32
  33   enum {
  34     // Standard k-mean
  35     STANDARD_ASSOCIATION,
  36     // Same, implemented as a LP problem for sanity check
  37     STANDARD_LP_ASSOCIATION,
  38     // Criterion forcing to have the same distribution of classes in
  39     // all clusters
  40     UNINFORMATIVE_LP_ASSOCIATION,
  41     // Criterion forcing to have the same number of samples of each
  42     // class in all clusters
  43     UNINFORMATIVE_LP_ASSOCIATION_ABSOLUTE
  44   };
  45
  46   const static int max_nb_iterations = 10;
  47   const static scalar_t min_iteration_improvement = 0.999;
  48   const static scalar_t min_cluster_variance = 0.01f;
  49
  50   int _nb_clusters;
  51   int _dim;
  52
  53   scalar_t **_cluster_means, **_cluster_var;
  54
  55   scalar_t distance_to_centroid(scalar_t *x, int k);
  56
  57   void initialize_clusters(int nb_points, scalar_t **points);
  58
  59   // Standard hard k-means association
  60
  61   scalar_t baseline_cluster_association(int nb_points, scalar_t **points,
  62                                         int nb_classes, int *labels,
  63                                         scalar_t **gamma);
  64
  65   // Standard k-means association implemented as an LP optimization
  66
  67   scalar_t baseline_lp_cluster_association(int nb_points, scalar_t **points,
  68                                            int nb_classes, int *labels,
  69                                            scalar_t **gamma);
  70
  71   // Association under the constraint that each cluster gets the same
  72   // class proportions as the overall training set
  73
  74   scalar_t uninformative_lp_cluster_association(int nb_points, scalar_t **points,
  75                                                 int nb_classes, int *labels,
  76                                                 scalar_t **gamma,
  77                                                 int absolute_proportion);
  78
  79   void update_clusters(int nb_points, scalar_t **points, scalar_t **gamma);
  80
  81 public:
  82   Clusterer();
  83   ~Clusterer();
  84
  85   void train(int mode,
  86              int nb_clusters, int dim,
  87              int nb_points, scalar_t **points,
  88              int nb_classes, int *labels,
  89              // This last array returns for each sample to what
  90              // cluster it was associated. It can be null.
  91              int *cluster_associations);
  92
  93   int cluster(scalar_t *point);
  94 };
  95
  96 #endif