neural.cc

   1 /*
   2  *  mlp-mnist is an implementation of a multi-layer neural network.
   3  *
   4  *  Copyright (c) 2006 École Polytechnique Fédérale de Lausanne,
   5  *  http://www.epfl.ch
   6  *
   7  *  Written by Francois Fleuret <francois@fleuret.org>
   8  *
   9  *  This file is part of mlp-mnist.
  10  *
  11  *  mlp-mnist is free software: you can redistribute it and/or modify
  12  *  it under the terms of the GNU General Public License version 3 as
  13  *  published by the Free Software Foundation.
  14  *
  15  *  mlp-mnist is distributed in the hope that it will be useful, but
  16  *  WITHOUT ANY WARRANTY; without even the implied warranty of
  17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  *  General Public License for more details.
  19  *
  20  *  You should have received a copy of the GNU General Public License
  21  *  along with mlp-mnist.  If not, see <http://www.gnu.org/licenses/>.
  22  *
  23  */
  24
  25 #include <string.h>
  26
  27 #include "neural.h"
  28
  29 const scalar_t MultiLayerPerceptron::output_amplitude = 0.95;
  30
  31 // I won't comment the natural elegance of C++ in that kind of
  32 // situation. IT SUCKS.
  33
  34 MultiLayerPerceptron::MultiLayerPerceptron(const MultiLayerPerceptron &mlp) {
  35   _nb_layers = mlp._nb_layers;
  36   _layer_sizes = new int[_nb_layers];
  37   _frozen_layers = new bool[_nb_layers-1];
  38   memcpy((void *) _frozen_layers, (void *) mlp._frozen_layers, _nb_layers * sizeof(bool));
  39   _activations_index = new int[_nb_layers];
  40   _weights_index = new int[_nb_layers];
  41
  42   _nb_activations = 0;
  43   _nb_weights = 0;
  44
  45   for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = mlp._layer_sizes[n];
  46
  47   for(int n = 0; n < _nb_layers; n++) {
  48     _activations_index[n] = _nb_activations;
  49     _nb_activations += _layer_sizes[n];
  50     if(n < _nb_layers - 1) {
  51       _frozen_layers[n] = false;
  52       _weights_index[n] = _nb_weights;
  53       _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
  54     }
  55   }
  56
  57   _activations = new scalar_t[_nb_activations];
  58   _pre_sigma_activations = new scalar_t[_nb_activations];
  59
  60   _weights = new scalar_t[_nb_weights];
  61   memcpy((void *) _weights, (void *) mlp._weights, _nb_weights * sizeof(scalar_t));
  62 }
  63
  64 MultiLayerPerceptron::MultiLayerPerceptron(int nb_layers, int *layer_sizes) {
  65   _nb_layers = nb_layers;
  66   _layer_sizes = new int[_nb_layers];
  67   _frozen_layers = new bool[_nb_layers-1];
  68   _activations_index = new int[_nb_layers];
  69   _weights_index = new int[_nb_layers];
  70
  71   _nb_activations = 0;
  72   _nb_weights = 0;
  73
  74   for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = layer_sizes[n];
  75
  76   for(int n = 0; n < _nb_layers; n++) {
  77     _activations_index[n] = _nb_activations;
  78     _nb_activations += _layer_sizes[n];
  79     if(n < _nb_layers - 1) {
  80       _frozen_layers[n] = false;
  81       _weights_index[n] = _nb_weights;
  82       _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
  83     }
  84   }
  85
  86   _activations = new scalar_t[_nb_activations];
  87   _pre_sigma_activations = new scalar_t[_nb_activations];
  88
  89   _weights = new scalar_t[_nb_weights];
  90 }
  91
  92 MultiLayerPerceptron::MultiLayerPerceptron(istream &is) {
  93   is >> _nb_layers;
  94
  95   _layer_sizes = new int[_nb_layers];
  96   _frozen_layers = new bool[_nb_layers - 1];
  97   _activations_index = new int[_nb_layers];
  98   _weights_index = new int[_nb_layers];
  99
 100   _nb_activations = 0;
 101   _nb_weights = 0;
 102
 103   for(int n = 0; n < _nb_layers; n++) is >> _layer_sizes[n];
 104
 105   for(int n = 0; n < _nb_layers; n++) {
 106     _activations_index[n] = _nb_activations;
 107     _nb_activations += _layer_sizes[n];
 108     if(n < _nb_layers - 1) {
 109       _frozen_layers[n] = false;
 110       _weights_index[n] = _nb_weights;
 111       _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
 112     }
 113   }
 114
 115   _activations = new scalar_t[_nb_activations];
 116   _pre_sigma_activations = new scalar_t[_nb_activations];
 117
 118   _weights = new scalar_t[_nb_weights];
 119
 120   for(int l = 0; l < _nb_layers - 1; l++) {
 121     int ll;
 122     is >> ll;
 123     if(l != ll) {
 124       cerr << "Inconsistent layer number during loading!\n";
 125       cerr.flush();
 126       exit(1);
 127     }
 128     for(int j = 0; j < _layer_sizes[l]; j++)
 129       for(int i = 0; i < _layer_sizes[l+1]; i++)
 130         is >> _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j];
 131   }
 132 }
 133
 134 MultiLayerPerceptron::~MultiLayerPerceptron() {
 135   delete[] _weights;
 136   delete[] _activations;
 137   delete[] _pre_sigma_activations;
 138
 139   delete[] _layer_sizes;
 140   delete[] _frozen_layers;
 141   delete[] _weights_index;
 142   delete[] _activations_index;
 143 }
 144
 145 void MultiLayerPerceptron::save(ostream &os) {
 146   os << _nb_layers << "\n";
 147   for(int n = 0; n < _nb_layers; n++) os << _layer_sizes[n] << (n < _nb_layers - 1 ? " " : "\n");
 148   for(int l = 0; l < _nb_layers - 1; l++) {
 149     os << l << "\n";
 150     for(int j = 0; j < _layer_sizes[l]; j++) {
 151       for(int i = 0; i < _layer_sizes[l+1]; i++)
 152         os << _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j] << (i < _layer_sizes[l+1] - 1 ? " " : "\n");
 153     }
 154   }
 155 }
 156
 157 void MultiLayerPerceptron::save_data() {
 158   for(int i = 0; i < _layer_sizes[1]; i++) {
 159     char buffer[256];
 160     sprintf(buffer, "/tmp/hidden_%03d.dat", i);
 161     ofstream stream(buffer);
 162     for(int j = 0; j < _layer_sizes[0]; j++) {
 163       if(j%28 == 0) stream << "\n";
 164       stream << j%28 << " " << j/28 << " " << _weights[i * (_layer_sizes[0] + 1) + j] << "\n";
 165     }
 166   }
 167 }
 168
 169 void MultiLayerPerceptron::init_random_weights(scalar_t stdd) {
 170   for(int w = 0; w < _nb_weights; w++) _weights[w] = normal_sample() * stdd;
 171 }
 172
 173 void MultiLayerPerceptron::compute_gradient_1s(ImageSet *is, int p, scalar_t *gradient_1s) {
 174   scalar_t dactivations[_nb_activations];
 175
 176   compute_activations_1s(is, p);
 177
 178   int nb_unfrozen = 0;
 179   for(int l = 0; l < _nb_layers - 1; l++) if(!_frozen_layers[l]) nb_unfrozen++;
 180
 181   for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
 182     scalar_t correct;
 183     if(is->label(p) == i) correct =   output_amplitude;
 184     else                  correct = - output_amplitude;
 185     dactivations[_activations_index[_nb_layers - 1] + i] = 2 * (_activations[_activations_index[_nb_layers - 1] + i] - correct);
 186   }
 187
 188   for(int l = _nb_layers - 2; (l >= 0) && (nb_unfrozen > 0); l--) {
 189     int ai0 = _activations_index[l], ai1 = _activations_index[l+1],
 190       wi0 = _weights_index[l], ls0p1 = _layer_sizes[l] + 1;
 191
 192     int j;
 193     for(j = 0; j < _layer_sizes[l]; j++) {
 194       scalar_t s = 0.0;
 195       for(int i = 0; i < _layer_sizes[l+1]; i++) {
 196         scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
 197         s += alpha * _weights[wi0 + i * ls0p1 + j];
 198         gradient_1s[wi0 + i * ls0p1 + j] = alpha * _activations[ai0 + j];
 199       }
 200       dactivations[ai0 + j] = s;
 201     }
 202
 203     for(int i = 0; i < _layer_sizes[l+1]; i++) {
 204       scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
 205       gradient_1s[wi0 + i * ls0p1 + j] = alpha;
 206     }
 207     if(!_frozen_layers[l]) nb_unfrozen--;
 208   }
 209 }
 210
 211 void MultiLayerPerceptron::compute_gradient(ImageSet *is, scalar_t *gradient) {
 212   scalar_t gradient_1s[_nb_weights];
 213   for(int w = 0; w < _nb_weights; w++) gradient[w] = 0.0;
 214   for(int p = 0; p < is->nb_pics(); p++) {
 215     compute_gradient_1s(is, p, gradient_1s);
 216     for(int w = 0; w < _nb_weights; w++) gradient[w] += gradient_1s[w];
 217   }
 218 }
 219
 220 void MultiLayerPerceptron::compute_numerical_gradient(ImageSet *is, scalar_t *gradient) {
 221   const scalar_t eps = 1e-3;
 222   scalar_t error_plus, error_minus, ref;
 223   for(int w = 0; w < _nb_weights; w++) {
 224     ref = _weights[w];
 225     _weights[w] = ref + eps;
 226     error_plus = error(is);
 227     _weights[w] = ref - eps;
 228     error_minus = error(is);
 229     _weights[w] = ref;
 230     gradient[w] = (error_plus - error_minus) / (2 * eps);
 231   }
 232 }
 233
 234 void MultiLayerPerceptron::print_gradient(ostream &os, scalar_t *gradient) {
 235   for(int w = 0; w < _nb_weights; w++) os << gradient[w] << "\n";
 236 }
 237
 238 void MultiLayerPerceptron::move_on_line(scalar_t *origin, scalar_t *gradient, scalar_t lambda) {
 239   for(int l = 0; l < _nb_layers-1; l++) if(!_frozen_layers[l]) {
 240     for(int i = 0; i < (_layer_sizes[l] + 1) * _layer_sizes[l+1]; i++)
 241       _weights[_weights_index[l] + i] =
 242         origin[_weights_index[l] + i] + lambda * gradient[_weights_index[l] + i];
 243   }
 244 }
 245
 246 void MultiLayerPerceptron::one_step_basic_gradient(ImageSet *is, scalar_t dt) {
 247   scalar_t gradient_1s[_nb_weights];
 248   for(int p = 0; p < is->nb_pics(); p++) {
 249     if(p%1000 == 0) {
 250       int n = (p*50)/is->nb_pics();
 251       int j;
 252       for(j = 0; j < n; j++) cout << "X";
 253       for(; j < 50; j++) cout << ((j%5 == 0) ? "+" : "-");
 254       cout << "\r";
 255       cout.flush();
 256     }
 257     compute_gradient_1s(is, p, gradient_1s);
 258     move_on_line(_weights, gradient_1s, -dt);
 259   }
 260   cout << "                                                            \r"; cout.flush();
 261 }
 262
 263 void MultiLayerPerceptron::one_step_global_gradient(ImageSet *is, scalar_t *xi, scalar_t *g, scalar_t *h) {
 264   scalar_t origin[_nb_weights];
 265   for(int w = 0; w < _nb_weights; w++) origin[w] = _weights[w];
 266
 267   scalar_t l = 1e-8;
 268   scalar_t e, pe;
 269
 270   e = error(is);
 271
 272   do {
 273     pe = e;
 274     l *= 2;
 275     move_on_line(origin, xi, l);
 276     e = error(is);
 277   } while(e < pe);
 278
 279   scalar_t dl = - l / 4;
 280
 281   while(abs(dl) > 1e-6) {
 282     move_on_line(origin, xi, l);
 283     e = error(is);
 284     do {
 285       pe = e;
 286       l  += dl;
 287       move_on_line(origin, xi, l);
 288       e = error(is);
 289     } while(e < pe);
 290     dl = - dl / 4;
 291   }
 292
 293   compute_gradient(is, xi);
 294
 295   scalar_t gg = 0, gxi = 0, xixi = 0;
 296
 297   // Polak-Ribiere
 298
 299   for(int w = 0; w < _nb_weights; w++) {
 300     gg += sq(g[w]);
 301     gxi += g[w] * xi[w];
 302     xixi += sq(xi[w]);
 303   }
 304
 305   scalar_t gamma = (xixi + gxi)/gg;
 306
 307   // Set gamma to 0 for standard gradient descente
 308   //   gamma = 0.0;
 309
 310   for(int w = 0; w < _nb_weights; w++) {
 311     g[w] = -xi[w];
 312     h[w] = g[w] + gamma * h[w];
 313     xi[w] = h[w];
 314   }
 315 }
 316
 317 void MultiLayerPerceptron::train(ImageSet *training_set, ImageSet *validation_set) {
 318   scalar_t prev_validation_error = 1.0, validation_error = 1.0, training_error;
 319   int l = 0, nb_increases = 0;
 320   do {
 321 // #warning horrible debugging
 322 //     {
 323 //       char buffer[1024];
 324 //       sprintf(buffer, "tmp_%04d.mlp", l);
 325 //       ofstream stream(buffer);
 326 //       save(stream);
 327 //       stream.flush();
 328 //     }
 329     prev_validation_error = validation_error;
 330     //     one_step_global_gradient(training_set, xi, g, h);
 331     one_step_basic_gradient(training_set, 1e-2);
 332     training_error = error(training_set);
 333     validation_error = error(validation_set);
 334     cout << l
 335          << " TRAINING " << training_error << " (" << classification_error(training_set)*100 << "%)"
 336          << " VALIDATION " << validation_error << " (" << classification_error(validation_set)*100 << "%)";
 337     if(l > 0 && validation_error >= prev_validation_error) {
 338       nb_increases++;
 339       cout << " [" << nb_increases << "]";
 340     }
 341     cout << "\n";
 342     cout.flush();
 343     l++;
 344   } while(nb_increases < 10);
 345
 346 }
 347
 348 void MultiLayerPerceptron::compute_activations_1s(ImageSet *is, int p) {
 349   ASSERT(_layer_sizes[0] == is->width() * is->height(), "The first layer has to have as many units as there are pixels!");
 350
 351   scalar_t *a = _activations;
 352   scalar_t *w = _weights;
 353
 354   for(int y = 0; y < is->height(); y++) for(int x = 0; x < is->width(); x++)
 355     *(a++) = 2.0 * (scalar_t(is->pixel(p, x, y)) / 255.0) - 1.0;
 356
 357   scalar_t *pa = _pre_sigma_activations + _activations_index[1];
 358   scalar_t *b = _activations, *b2 = 0;
 359
 360   for(int l = 0; l < _nb_layers - 1; l++) {
 361     for(int i = 0; i < _layer_sizes[l+1]; i++) {
 362       scalar_t s = 0;
 363       b2 = b;
 364       for(int j = 0; j < _layer_sizes[l]; j++) s += *(w++) * *(b2++);
 365       s += *(w++);
 366       *(pa++) = s;
 367       *(a++) = sigma(s);
 368     }
 369     b = b2;
 370   }
 371 }
 372
 373 void MultiLayerPerceptron::test(ImageSet *is, scalar_t *responses) {
 374   for(int p = 0; p < is->nb_pics(); p++) {
 375     compute_activations_1s(is, p);
 376     for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++)
 377       responses[p * _layer_sizes[_nb_layers - 1] + i] = _activations[_activations_index[_nb_layers - 1] + i];
 378   }
 379 }
 380
 381 scalar_t MultiLayerPerceptron::error(ImageSet *is) {
 382   scalar_t error = 0;
 383
 384   for(int p = 0; p < is->nb_pics(); p++) {
 385     compute_activations_1s(is, p);
 386     for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
 387       scalar_t correct;
 388       if(is->label(p) == i) correct =   output_amplitude;
 389       else                  correct = - output_amplitude;
 390       error += sq(_activations[_activations_index[_nb_layers - 1] + i] - correct);
 391     }
 392   }
 393
 394   return error;
 395 }
 396
 397 scalar_t MultiLayerPerceptron::classification_error(ImageSet *is) {
 398   int nb_error = 0;
 399
 400   for(int p = 0; p < is->nb_pics(); p++) {
 401     compute_activations_1s(is, p);
 402     scalar_t max = -1;
 403     int i_max = -1;
 404     for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
 405       if(i_max < 0 || _activations[_activations_index[_nb_layers - 1] + i] > max) {
 406         i_max = i;
 407         max = _activations[_activations_index[_nb_layers - 1] + i];
 408       }
 409     }
 410     if(is->label(p) != i_max) nb_error++;
 411   }
 412
 413   return scalar_t(nb_error)/scalar_t(is->nb_pics());
 414 }