3 * data-tool is a command line tool to do simple statistical
4 * processing on numerical data.
6 * Copyright (c) 2009 Francois Fleuret
7 * Written by Francois Fleuret <francois@fleuret.org>
9 * This file is part of data-tool.
11 * data-tool is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 3 as
13 * published by the Free Software Foundation.
15 * data-tool is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with data-tool. If not, see <http://www.gnu.org/licenses/>.
37 int compare_couple(const void *a, const void *b) {
38 if(((Couple *) a)->value < ((Couple *) b)->value) return -1;
39 else if(((Couple *) a)->value > ((Couple *) b)->value) return 1;
43 double *inflate_array(double *x, int current_size, int new_size) {
44 double *xx = new double[new_size];
45 for(int n = 0; n < current_size; n++) xx[n] = x[n];
50 char *next_word(char *buffer, char *r, int buffer_size) {
57 while((*r != '"') && (*r != '\0') &&
58 (s<buffer+buffer_size-1))
62 while((*r != '\r') && (*r != '\n') && (*r != '\0') &&
63 (*r != '\t') && (*r != ' ') && (*r != ',') &&
64 (s<buffer+buffer_size-1))
68 while((*r == ' ') || (*r == '\t') || (*r == ',')) r++;
69 if((*r == '\0') || (*r=='\r') || (*r=='\n')) r = NULL;
75 void check_opt(int argc, char **argv, int n_opt, int n, const char *help) {
76 if(n_opt + n >= argc) {
77 cerr << "ERROR: Missing argument for " << argv[n_opt] << ". Expecting " << help << "." << endl;
82 void print_help_and_exit(int e) {
83 cout << "Simple data processing tool. Written by Francois Fleuret." << endl
85 << "This application takes data from the standard input and prints" << endl
86 << "the result on the standard output. It expects either a list of" << endl
87 << "float values (to produce histograms, cumulative distribution functions" << endl
88 << "or the mean, variance, etc.) or a list of couples of values of the form" << endl
89 << "x y on each line (where the sign of x tells the class and y the parameter" << endl
90 << "value) to compute the ROC curve or the ROC curve surface.\n" << endl
91 << "The options are:" << endl
94 << " --roc-surface" << endl
95 << " --normalize" << endl
99 << " --auto-extrema" << endl
100 << " --xbounds <float: xmin> <float: xmax>" << endl
101 << " --ybounds <float: ymin> <float: ymax>" << endl
102 << " --nb-bins <int: number of bins>" << endl;
106 void check_single_processing(bool unknown_processing) {
107 if(!unknown_processing) {
108 cerr << "ERROR: You can't do two different processings." << endl;
113 int main(int argc, char **argv) {
114 double xmin = 0, xmax = 1, ymin = 0, ymax = 1;
116 const int buffer_size = 1024;
118 char line[buffer_size], token[buffer_size];
119 bool auto_extrema = false;
120 bool normalize = false;
124 enum { UNKNOWN, ROC, ROC_SURFACE, HISTO, CUMUL, MISC } processing = UNKNOWN;
126 // Parsing the command line arguments ////////////////////////////////
130 if(argc == 1 || strcmp(argv[i], "--help") == 0) print_help_and_exit(0);
132 else if(strcmp(argv[i], "--roc") == 0) {
133 check_single_processing(processing == UNKNOWN);
138 else if(strcmp(argv[i], "--roc-surface") == 0) {
139 check_single_processing(processing == UNKNOWN);
140 processing = ROC_SURFACE;
144 else if(strcmp(argv[i], "--cumul") == 0) {
145 check_single_processing(processing == UNKNOWN);
150 else if(strcmp(argv[i], "--normalize") == 0) {
155 else if(strcmp(argv[i], "--histo") == 0) {
156 check_single_processing(processing == UNKNOWN);
161 else if(strcmp(argv[i], "--misc") == 0) {
162 check_single_processing(processing == UNKNOWN);
167 else if(strcmp(argv[i], "--auto-extrema") == 0) {
172 else if(strcmp(argv[i], "--xbounds") == 0) {
173 check_opt(argc, argv, i, 2, "<float: xmin> <float: xmax>");
174 xmin = atof(argv[i+1]);
175 xmax = atof(argv[i+2]);
177 cerr << "ERROR: Incorrect bounds." << endl;
183 else if(strcmp(argv[i], "--ybounds") == 0) {
184 check_opt(argc, argv, i, 2, "<float: ymin> <float: ymax>");
185 ymin = atof(argv[i+1]);
186 ymax = atof(argv[i+2]);
188 cerr << "ERROR: Incorrect bounds." << endl;
194 else if(strcmp(argv[i], "--nb-bins") == 0) {
195 check_opt(argc, argv, i, 1, "<int: number of bins>");
196 nb_bins = atoi(argv[i+1]);
198 cerr << "ERROR: Incorrect number of bins." << endl;
205 cerr << "ERROR: Unknown option " << argv[i] << endl;
206 print_help_and_exit(1);
210 // Processing the data ///////////////////////////////////////////////
217 int nb_samples = 0, nb_samples_max = 50000;
218 double *x = new double[nb_samples_max];
221 if(nb_samples == nb_samples_max) {
222 x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
223 nb_samples_max = 2 * nb_samples_max;
226 cin.getline(line, buffer_size);
230 s = next_word(token, s, buffer_size);
231 x[nb_samples] = atof(token);
236 Couple tmp[nb_samples];
237 for(int n = 0; n < nb_samples; n++) {
242 qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
244 for(int n = 0; n < nb_samples; n++)
245 cout << tmp[n].value << " " << double(n)/double(nb_samples) << endl;
257 int nb_samples = 0, nb_samples_max = 1000;
258 double *x = new double[nb_samples_max], *y = new double[nb_samples_max];
261 if(nb_samples == nb_samples_max) {
262 x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
263 y = inflate_array(y, nb_samples_max, 2 * nb_samples_max);
264 nb_samples_max = 2 * nb_samples_max;
267 cin.getline(line, buffer_size);
271 s = next_word(token, s, buffer_size);
272 x[nb_samples] = atof(token);
273 s = next_word(token, s, buffer_size);
274 y[nb_samples] = atof(token);
279 Couple tmp[nb_samples];
280 int nb_rn = 0, nb_rp = 0, nb_fp = 0, nb_fn = 0;
283 for(int n = 0; binary && n < nb_samples; n++) binary &= (x[n] == 0 || x[n] == 1);
285 cerr << "WARNING: your classes are binary, I process them accordingly." << endl;
286 for(int n = 0; n < nb_samples; n++) x[n] = 2 * x[n] - 1;
289 for(int n = 0; n < nb_samples; n++) {
292 if(x[n] >= 0) nb_rp++;
293 else { nb_rn++; nb_fp++; }
296 if(nb_rp == 0) cerr << "WARNING: No true positive." << endl;
297 if(nb_rn == 0) cerr << "WARNING: No true negative." << endl;
299 qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
301 if(processing == ROC) {
302 for(int n = 0; n < nb_samples - 1; n++) {
303 if(x[tmp[n].index] >= 0) nb_fn++;
305 if(tmp[n].value < tmp[n+1].value) {
306 cout << double(nb_fp)/double(nb_rn) << " "
307 << 1 - double(nb_fn) / double(nb_rp) << " "
308 << (tmp[n].value + tmp[n+1].value)/2 << " "
314 double cx = double(nb_fp)/double(nb_rn), cy = 1 - double(nb_fn) / double(nb_rp);
315 for(int n = 0; n < nb_samples - 1; n++) {
316 if(x[tmp[n].index] >= 0) nb_fn++;
318 if(tmp[n].value < tmp[n+1].value) {
319 double ncx = double(nb_fp)/double(nb_rn), ncy = 1 - double(nb_fn) / double(nb_rp);
320 surface += (cx - ncx) * cy;
324 cout << surface << endl;
327 delete[] x; delete[] y;
336 int nb_samples = 0, nb_samples_max = 1000;
337 double *x = new double[nb_samples_max];
340 if(nb_samples == nb_samples_max) {
341 x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
342 nb_samples_max = 2 * nb_samples_max;
345 cin.getline(line, buffer_size);
349 s = next_word(token, s, buffer_size);
350 x[nb_samples] = atof(token);
352 if(nb_samples == 0 || x[nb_samples] > xmax) xmax = x[nb_samples];
353 if(nb_samples == 0 || x[nb_samples] < xmin) xmin = x[nb_samples];
360 for(int n = 0; n < nb_bins; n++) nb[n] = 0;
363 for(int s = 0; s < nb_samples; s++) {
364 int n = int((x[s] - xmin)/(xmax - xmin) * nb_bins);
365 if(n >= 0 && n < nb_bins) nb[n]++;
367 cerr << "WARNING: value " << x[s] << " is out of histogram." << endl;
373 for(int n = 0; n < nb_bins; n++)
374 cout << xmin + ((xmax - xmin) * n) / double(nb_bins) << " "
375 << (nb[n] / double(nb_total))/((xmax - xmin) / double(nb_bins)) << endl;
377 for(int n = 0; n < nb_bins; n++)
378 cout << xmin + ((xmax - xmin) * n) / double(nb_bins) << " "
379 << nb[n] / double(nb_total) << endl;
388 int nb_samples = 0, nb_samples_max = 1000;
389 double *x = new double[nb_samples_max];
391 double min = 0, max = 0;
392 double sum = 0, sumsq = 0;
395 if(nb_samples == nb_samples_max) {
396 x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
397 nb_samples_max = 2 * nb_samples_max;
400 cin.getline(line, buffer_size);
403 s = next_word(token, s, buffer_size);
404 x[nb_samples] = atof(token);
406 double x = atof(token);
407 if(nb == 0 || x > max) max = x;
408 if(nb == 0 || x < min) min = x;
415 Couple tmp[nb_samples];
416 for(int n = 0; n < nb_samples; n++) {
421 qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
425 double mu = sum / double(nb);
426 double sigma = (sumsq - sum * mu) / double(nb - 1);
427 double stdd = sqrt(sigma);
429 cout << "MIN " << min
432 << " SIGMA " << sigma
435 << " MEDIAN " << tmp[nb_samples/2].value
436 << " QUANTILE0.1 " << tmp[int(nb_samples * 0.1)].value
437 << " QUANTILE0.9 " << tmp[int(nb_samples * 0.9)].value
445 cerr << "ERROR: You must choose a processing type." << endl;