X-Git-Url: https://www.fleuret.org/cgi-bin/gitweb/gitweb.cgi?p=clueless-kmeans.git;a=blobdiff_plain;f=clueless-kmeans.cc;fp=clueless-kmeans.cc;h=557f0d8d2a161fd789aac9d2d2d28c57fe20629d;hp=0000000000000000000000000000000000000000;hb=8f8e8f2fb669aa421c245eada82095fb3fdcadc9;hpb=056eef1e23b3f6e5218b7bc3800f6412a3f97bfc diff --git a/clueless-kmeans.cc b/clueless-kmeans.cc new file mode 100644 index 0000000..557f0d8 --- /dev/null +++ b/clueless-kmeans.cc @@ -0,0 +1,130 @@ +/* + * clueless-kmean is a variant of k-mean which enforces balanced + * distribution of classes in every cluster + * + * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ + * Written by Francois Fleuret + * + * This file is part of clueless-kmean. + * + * clueless-kmean is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * clueless-kmean is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with clueless-kmean. If not, see <http://www.gnu.org/licenses/>.
+ * + */ + +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "misc.h" +#include "arrays.h" +#include "sample_set.h" +#include "clusterer.h" + +void generate_toy_problem(SampleSet *sample_set) { + int dim = 2; + int nb_points = 1000; + + sample_set->resize(dim, nb_points); + sample_set->nb_classes = 2; + + for(int n = 0; n < nb_points; n++) { + sample_set->labels[n] = int(drand48() * 2); + if(sample_set->labels[n] == 0) { + sample_set->points[n][0] = (2 * drand48() - 1) * 0.8; + sample_set->points[n][1] = - 0.6 + (2 * drand48() - 1) * 0.4; + } else { + sample_set->points[n][0] = (2 * drand48() - 1) * 0.4; + sample_set->points[n][1] = 0.6 + (2 * drand48() - 1) * 0.4; + } + } +} + +int main(int argc, char **argv) { + SampleSet sample_set; + Clusterer clusterer; + int nb_clusters = 3; + + generate_toy_problem(&sample_set); + + { + ofstream out("points.dat"); + for(int n = 0; n < sample_set.nb_points; n++) { + out << sample_set.labels[n]; + for(int d = 0; d < sample_set.dim; d++) { + out << " " << sample_set.points[n][d]; + } + out << endl; + } + } + + int *associated_clusters = new int[sample_set.nb_points]; + + glp_term_out(0); + + int mode; + + if(argc == 2) { + if(strcmp(argv[1], "standard") == 0) { + mode = Clusterer::STANDARD_LP_ASSOCIATION; + } else if(strcmp(argv[1], "clueless") == 0) { + mode = Clusterer::UNINFORMATIVE_LP_ASSOCIATION; + } else { + cerr << "Unknown association mode " << argv[1] << endl; + exit(EXIT_FAILURE); + } + } else { + cerr << "Usage: " << argv[0] << " standard|clueless" << endl; + exit(EXIT_FAILURE); + } + + clusterer.train(mode, + nb_clusters, + sample_set.dim, + sample_set.nb_points, sample_set.points, + sample_set.nb_classes, sample_set.labels, + associated_clusters); + + { + ofstream out("associated_clusters.dat"); + for(int n = 0; n < sample_set.nb_points; n++) { + out << associated_clusters[n]; + for(int d = 0; d < sample_set.dim; d++) { + out << " " << sample_set.points[n][d]; + } + out 
<< endl; + } + } + + { + ofstream out("clusters.dat"); + for(int k = 0 ; k < clusterer._nb_clusters; k++) { + out << k; + for(int d = 0; d < sample_set.dim; d++) { + out << " " << clusterer._cluster_means[k][d]; + } + for(int d = 0; d < sample_set.dim; d++) { + out << " " << 2 * sqrt(clusterer._cluster_var[k][d]); + } + out << endl; + } + } + + delete[] associated_clusters; + + glp_free_env(); // I do not want valgrind to complain +}