kmean -> kmeans.

[clueless-kmeans.git] / clueless-kmeans.cc
diff --git a/clueless-kmeans.cc b/clueless-kmeans.cc

new file mode 100644 (file)

index 0000000..557f0d8
--- /dev/null
+++ b/clueless-kmeans.cc
@@ -0,0 +1,130 @@
+/*
+ *  clueless-kmean is a variant of k-means which enforces a balanced
+ *  distribution of classes in every cluster
+ *
+ *  Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/
+ *  Written by Francois Fleuret <francois.fleuret@idiap.ch>
+ *
+ *  This file is part of clueless-kmean.
+ *
+ *  clueless-kmean is free software: you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  version 3 as published by the Free Software Foundation.
+ *
+ *  clueless-kmean is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with clueless-kmean.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+#include <glpk.h>
+
+using namespace std;
+
+#include "misc.h"
+#include "arrays.h"
+#include "sample_set.h"
+#include "clusterer.h"
+
+void generate_toy_problem(SampleSet *sample_set) {
+  int dim = 2;
+  int nb_points = 1000;
+
+  sample_set->resize(dim, nb_points);
+  sample_set->nb_classes = 2;
+
+  for(int n = 0; n < nb_points; n++) {
+    sample_set->labels[n] = int(drand48() * 2);
+    if(sample_set->labels[n] == 0) {
+      sample_set->points[n][0] = (2 * drand48()  - 1) * 0.8;
+      sample_set->points[n][1] = - 0.6 + (2 * drand48()  - 1) * 0.4;
+    } else {
+      sample_set->points[n][0] = (2 * drand48()  - 1) * 0.4;
+      sample_set->points[n][1] =   0.6 + (2 * drand48()  - 1) * 0.4;
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  SampleSet sample_set;
+  Clusterer clusterer;
+  int nb_clusters = 3;
+
+  generate_toy_problem(&sample_set);
+
+  {
+    ofstream out("points.dat");
+    for(int n = 0; n < sample_set.nb_points; n++) {
+      out << sample_set.labels[n];
+      for(int d = 0; d < sample_set.dim; d++) {
+        out << " " << sample_set.points[n][d];
+      }
+      out << endl;
+    }
+  }
+
+  int *associated_clusters = new int[sample_set.nb_points];
+
+  glp_term_out(0);
+
+  int mode;
+
+  if(argc == 2) {
+    if(strcmp(argv[1], "standard") == 0) {
+      mode = Clusterer::STANDARD_LP_ASSOCIATION;
+    } else if(strcmp(argv[1], "clueless") == 0) {
+      mode = Clusterer::UNINFORMATIVE_LP_ASSOCIATION;
+    } else {
+      cerr << "Unknown association mode " << argv[1] << endl;
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    cerr << "Usage: " << argv[0] << " standard|clueless" << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  clusterer.train(mode,
+                  nb_clusters,
+                  sample_set.dim,
+                  sample_set.nb_points, sample_set.points,
+                  sample_set.nb_classes, sample_set.labels,
+                  associated_clusters);
+
+  {
+    ofstream out("associated_clusters.dat");
+    for(int n = 0; n < sample_set.nb_points; n++) {
+      out << associated_clusters[n];
+      for(int d = 0; d < sample_set.dim; d++) {
+        out << " " << sample_set.points[n][d];
+      }
+      out << endl;
+    }
+  }
+
+  {
+    ofstream out("clusters.dat");
+    for(int k = 0 ; k < clusterer._nb_clusters; k++) {
+      out << k;
+      for(int d = 0; d < sample_set.dim; d++) {
+        out << " " << clusterer._cluster_means[k][d];
+      }
+      for(int d = 0; d < sample_set.dim; d++) {
+        out << " " << 2 * sqrt(clusterer._cluster_var[k][d]);
+      }
+      out << endl;
+    }
+  }
+
+  delete[] associated_clusters;
+
+  glp_free_env(); // I do not want valgrind to complain
+}