diff --git a/neural.cc b/neural.cc
new file mode 100644
index 0000000..275ad7e
--- /dev/null
+++ b/neural.cc
@@ -0,0 +1,412 @@
+/*
+ *  mlp-mnist is an implementation of a multi-layer neural network.
+ *
+ *  Copyright (c) 2008 Idiap Research Institute, http://www.idiap.ch/
+ *  Written by Francois Fleuret <francois.fleuret@idiap.ch>
+ *
+ *  This file is part of mlp-mnist.
+ *
+ *  mlp-mnist is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 3 as
+ *  published by the Free Software Foundation.
+ *
+ *  mlp-mnist is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with mlp-mnist. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <cmath>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "neural.h"
+
+const scalar_t MultiLayerPerceptron::output_amplitude = 0.95;
+
+// I won't comment the natural elegance of C++ in that kind of
+// situation. IT SUCKS.
+
+MultiLayerPerceptron::MultiLayerPerceptron(const MultiLayerPerceptron &mlp) {
+  _nb_layers = mlp._nb_layers;
+  _layer_sizes = new int[_nb_layers];
+  _frozen_layers = new bool[_nb_layers - 1];
+  memcpy((void *) _frozen_layers, (void *) mlp._frozen_layers, (_nb_layers - 1) * sizeof(bool));
+  _activations_index = new int[_nb_layers];
+  _weights_index = new int[_nb_layers];
+
+  _nb_activations = 0;
+  _nb_weights = 0;
+
+  for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = mlp._layer_sizes[n];
+
+  for(int n = 0; n < _nb_layers; n++) {
+    _activations_index[n] = _nb_activations;
+    _nb_activations += _layer_sizes[n];
+    if(n < _nb_layers - 1) {
+      // _frozen_layers was already copied above, do not reset it here
+      _weights_index[n] = _nb_weights;
+      _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
+    }
+  }
+
+  _activations = new scalar_t[_nb_activations];
+  _pre_sigma_activations = new scalar_t[_nb_activations];
+
+  _weights = new scalar_t[_nb_weights];
+  memcpy((void *) _weights, (void *) mlp._weights, _nb_weights * sizeof(scalar_t));
+}
+
+MultiLayerPerceptron::MultiLayerPerceptron(int nb_layers, int *layer_sizes) {
+  _nb_layers = nb_layers;
+  _layer_sizes = new int[_nb_layers];
+  _frozen_layers = new bool[_nb_layers - 1];
+  _activations_index = new int[_nb_layers];
+  _weights_index = new int[_nb_layers];
+
+  _nb_activations = 0;
+  _nb_weights = 0;
+
+  for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = layer_sizes[n];
+
+  for(int n = 0; n < _nb_layers; n++) {
+    _activations_index[n] = _nb_activations;
+    _nb_activations += _layer_sizes[n];
+    if(n < _nb_layers - 1) {
+      _frozen_layers[n] = false;
+      _weights_index[n] = _nb_weights;
+      _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
+    }
+  }
+
+  _activations = new scalar_t[_nb_activations];
+  _pre_sigma_activations = new scalar_t[_nb_activations];
+
+  _weights = new scalar_t[_nb_weights];
+}
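
The constructors above flatten all the weights into a single array: for layer l, the weights feeding output unit i occupy a contiguous run of (_layer_sizes[l] + 1) entries starting at _weights_index[l] + i * (_layer_sizes[l] + 1), with the bias stored in the last slot. A minimal standalone sketch of that indexing convention; the layer sizes are made up for illustration, and none of this code is part of neural.cc:

#include <iostream>

int main() {
  const int nb_layers = 3;
  int layer_sizes[nb_layers] = { 4, 3, 2 };

  // Same bookkeeping as the constructors above
  int weights_index[nb_layers], nb_weights = 0;
  for(int n = 0; n < nb_layers - 1; n++) {
    weights_index[n] = nb_weights;
    nb_weights += (layer_sizes[n] + 1) * layer_sizes[n+1];
  }

  // Index of the weight from unit j of layer l to unit i of layer l+1;
  // j == layer_sizes[l] addresses the bias of unit i
  int l = 1, i = 0, j = layer_sizes[l];
  std::cout << "total weights: " << nb_weights << "\n"
            << "bias index:    " << weights_index[l] + i * (layer_sizes[l] + 1) + j << "\n";
  return 0;
}
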
+
+MultiLayerPerceptron::MultiLayerPerceptron(istream &is) {
+  is >> _nb_layers;
+
+  _layer_sizes = new int[_nb_layers];
+  _frozen_layers = new bool[_nb_layers - 1];
+  _activations_index = new int[_nb_layers];
+  _weights_index = new int[_nb_layers];
+
+  _nb_activations = 0;
+  _nb_weights = 0;
+
+  for(int n = 0; n < _nb_layers; n++) is >> _layer_sizes[n];
+
+  for(int n = 0; n < _nb_layers; n++) {
+    _activations_index[n] = _nb_activations;
+    _nb_activations += _layer_sizes[n];
+    if(n < _nb_layers - 1) {
+      _frozen_layers[n] = false;
+      _weights_index[n] = _nb_weights;
+      _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
+    }
+  }
+
+  _activations = new scalar_t[_nb_activations];
+  _pre_sigma_activations = new scalar_t[_nb_activations];
+
+  _weights = new scalar_t[_nb_weights];
+
+  for(int l = 0; l < _nb_layers - 1; l++) {
+    int ll;
+    is >> ll;
+    if(l != ll) {
+      cerr << "Inconsistent layer number during loading!\n";
+      cerr.flush();
+      exit(1);
+    }
+    // j == _layer_sizes[l] is the bias weight of each output unit
+    for(int j = 0; j <= _layer_sizes[l]; j++)
+      for(int i = 0; i < _layer_sizes[l+1]; i++)
+        is >> _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j];
+  }
+}
+
+MultiLayerPerceptron::~MultiLayerPerceptron() {
+  delete[] _weights;
+  delete[] _activations;
+  delete[] _pre_sigma_activations;
+
+  delete[] _layer_sizes;
+  delete[] _frozen_layers;
+  delete[] _weights_index;
+  delete[] _activations_index;
+}
+
+void MultiLayerPerceptron::save(ostream &os) {
+  os << _nb_layers << "\n";
+  for(int n = 0; n < _nb_layers; n++) os << _layer_sizes[n] << (n < _nb_layers - 1 ? " " : "\n");
+  for(int l = 0; l < _nb_layers - 1; l++) {
+    os << l << "\n";
+    // j == _layer_sizes[l] is the bias weight of each output unit
+    for(int j = 0; j <= _layer_sizes[l]; j++) {
+      for(int i = 0; i < _layer_sizes[l+1]; i++)
+        os << _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j] << (i < _layer_sizes[l+1] - 1 ? " " : "\n");
+    }
+  }
+}
+
+// Dumps the input weights of every first-hidden-layer unit as a
+// gnuplot-readable map (assumes 28x28 input images, hence the
+// hard-coded 28)
+void MultiLayerPerceptron::save_data() {
+  for(int i = 0; i < _layer_sizes[1]; i++) {
+    char buffer[256];
+    sprintf(buffer, "/tmp/hidden_%03d.dat", i);
+    ofstream stream(buffer);
+    for(int j = 0; j < _layer_sizes[0]; j++) {
+      if(j%28 == 0) stream << "\n";
+      stream << j%28 << " " << j/28 << " " << _weights[i * (_layer_sizes[0] + 1) + j] << "\n";
+    }
+  }
+}
+
+void MultiLayerPerceptron::init_random_weights(scalar_t stdd) {
+  for(int w = 0; w < _nb_weights; w++) _weights[w] = normal_sample() * stdd;
+}
+
+// Backpropagation on one sample: computes the gradient of the squared
+// error for sample p with respect to every weight, and leaves it in
+// gradient_1s
+void MultiLayerPerceptron::compute_gradient_1s(ImageSet *is, int p, scalar_t *gradient_1s) {
+  scalar_t dactivations[_nb_activations];
+
+  compute_activations_1s(is, p);
+
+  int nb_unfrozen = 0;
+  for(int l = 0; l < _nb_layers - 1; l++) if(!_frozen_layers[l]) nb_unfrozen++;
+
+  // Derivative of the squared error with respect to the output activations
+  for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
+    scalar_t correct;
+    if(is->label(p) == i) correct = output_amplitude;
+    else correct = - output_amplitude;
+    dactivations[_activations_index[_nb_layers - 1] + i] = 2 * (_activations[_activations_index[_nb_layers - 1] + i] - correct);
+  }
+
+  for(int l = _nb_layers - 2; (l >= 0) && (nb_unfrozen > 0); l--) {
+    int ai0 = _activations_index[l], ai1 = _activations_index[l+1],
+      wi0 = _weights_index[l], ls0p1 = _layer_sizes[l] + 1;
+
+    int j;
+    for(j = 0; j < _layer_sizes[l]; j++) {
+      scalar_t s = 0.0;
+      for(int i = 0; i < _layer_sizes[l+1]; i++) {
+        scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
+        s += alpha * _weights[wi0 + i * ls0p1 + j];
+        gradient_1s[wi0 + i * ls0p1 + j] = alpha * _activations[ai0 + j];
+      }
+      dactivations[ai0 + j] = s;
+    }
+
+    // Here j == _layer_sizes[l]: gradient with respect to the biases
+    for(int i = 0; i < _layer_sizes[l+1]; i++) {
+      scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
+      gradient_1s[wi0 + i * ls0p1 + j] = alpha;
+    }
+    if(!_frozen_layers[l]) nb_unfrozen--;
+  }
+}
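
compute_gradient_1s relies on only two properties of sigma() and dsigma(), which are defined elsewhere in the project and not shown in this file: dsigma must be the derivative of sigma, and sigma's range must contain the targets +/- output_amplitude. A tanh-based pair, assumed here purely for illustration and not to be read as the project's actual definition, satisfies both; the small main() checks dsigma against a central finite difference, mirroring the numerical gradient check used later in the file:

#include <cmath>
#include <cstdio>

typedef double scalar_t; // stand-in; the real scalar_t comes from the project headers

// Hypothetical activation pair: dsigma is the derivative of sigma, and
// sigma's range (-1, 1) contains +/- output_amplitude = +/- 0.95
scalar_t sigma(scalar_t x)  { return tanh(x); }
scalar_t dsigma(scalar_t x) { scalar_t t = tanh(x); return 1 - t * t; }

int main() {
  // Check that dsigma matches sigma's slope at an arbitrary point
  scalar_t x = 0.3, eps = 1e-6;
  printf("dsigma(x)   = %f\n", dsigma(x));
  printf("finite diff = %f\n", (sigma(x + eps) - sigma(x - eps)) / (2 * eps));
  return 0;
}
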
+
+void MultiLayerPerceptron::compute_gradient(ImageSet *is, scalar_t *gradient) {
+  scalar_t gradient_1s[_nb_weights];
+  for(int w = 0; w < _nb_weights; w++) gradient[w] = 0.0;
+  for(int p = 0; p < is->nb_pics(); p++) {
+    compute_gradient_1s(is, p, gradient_1s);
+    for(int w = 0; w < _nb_weights; w++) gradient[w] += gradient_1s[w];
+  }
+}
+
+// Estimates the gradient with central finite differences, for checking
+// the backpropagation
+void MultiLayerPerceptron::compute_numerical_gradient(ImageSet *is, scalar_t *gradient) {
+  const scalar_t eps = 1e-3;
+  scalar_t error_plus, error_minus, ref;
+  for(int w = 0; w < _nb_weights; w++) {
+    ref = _weights[w];
+    _weights[w] = ref + eps;
+    error_plus = error(is);
+    _weights[w] = ref - eps;
+    error_minus = error(is);
+    _weights[w] = ref;
+    gradient[w] = (error_plus - error_minus) / (2 * eps);
+  }
+}
+
+void MultiLayerPerceptron::print_gradient(ostream &os, scalar_t *gradient) {
+  for(int w = 0; w < _nb_weights; w++) os << gradient[w] << "\n";
+}
+
+// Sets the weights of the unfrozen layers to origin + lambda * gradient
+void MultiLayerPerceptron::move_on_line(scalar_t *origin, scalar_t *gradient, scalar_t lambda) {
+  for(int l = 0; l < _nb_layers - 1; l++) if(!_frozen_layers[l]) {
+    for(int i = 0; i < (_layer_sizes[l] + 1) * _layer_sizes[l+1]; i++)
+      _weights[_weights_index[l] + i] =
+        origin[_weights_index[l] + i] + lambda * gradient[_weights_index[l] + i];
+  }
+}
+
+// One pass of stochastic gradient descent over the set, with a textual
+// progress bar
+void MultiLayerPerceptron::one_step_basic_gradient(ImageSet *is, scalar_t dt) {
+  scalar_t gradient_1s[_nb_weights];
+  for(int p = 0; p < is->nb_pics(); p++) {
+    if(p%1000 == 0) {
+      int n = (p*50)/is->nb_pics();
+      int j;
+      for(j = 0; j < n; j++) cout << "X";
+      for(; j < 50; j++) cout << ((j%5 == 0) ? "+" : "-");
+      cout << "\r";
+      cout.flush();
+    }
+    compute_gradient_1s(is, p, gradient_1s);
+    move_on_line(_weights, gradient_1s, -dt);
+  }
+  cout << " \r"; cout.flush();
+}
+
+// One step of conjugate gradient descent: a doubling-then-refining line
+// search along xi, followed by a Polak-Ribiere update of the search
+// direction
+void MultiLayerPerceptron::one_step_global_gradient(ImageSet *is, scalar_t *xi, scalar_t *g, scalar_t *h) {
+  scalar_t origin[_nb_weights];
+  for(int w = 0; w < _nb_weights; w++) origin[w] = _weights[w];
+
+  scalar_t l = 1e-8;
+  scalar_t e, pe;
+
+  e = error(is);
+
+  do {
+    pe = e;
+    l *= 2;
+    move_on_line(origin, xi, l);
+    e = error(is);
+  } while(e < pe);
+
+  scalar_t dl = - l / 4;
+
+  while(fabs(dl) > 1e-6) {
+    move_on_line(origin, xi, l);
+    e = error(is);
+    do {
+      pe = e;
+      l += dl;
+      move_on_line(origin, xi, l);
+      e = error(is);
+    } while(e < pe);
+    dl = - dl / 4;
+  }
+
+  compute_gradient(is, xi);
+
+  scalar_t gg = 0, gxi = 0, xixi = 0;
+
+  // Polak-Ribiere
+
+  for(int w = 0; w < _nb_weights; w++) {
+    gg += sq(g[w]);
+    gxi += g[w] * xi[w];
+    xixi += sq(xi[w]);
+  }
+
+  scalar_t gamma = (xixi + gxi)/gg;
+
+  // Set gamma to 0 for standard gradient descent
+  // gamma = 0.0;
+
+  for(int w = 0; w < _nb_weights; w++) {
+    g[w] = -xi[w];
+    h[w] = g[w] + gamma * h[w];
+    xi[w] = h[w];
+  }
+}
+
+// Early stopping: trains until the validation error has failed to
+// decrease ten times, printing the training and validation errors at
+// every epoch
+void MultiLayerPerceptron::train(ImageSet *training_set, ImageSet *validation_set) {
+  scalar_t prev_validation_error = 1.0, validation_error = 1.0, training_error;
+  int l = 0, nb_increases = 0;
+  do {
+//     #warning horrible debugging
+//     {
+//       char buffer[1024];
+//       sprintf(buffer, "tmp_%04d.mlp", l);
+//       ofstream stream(buffer);
+//       save(stream);
+//       stream.flush();
+//     }
+    prev_validation_error = validation_error;
+    // one_step_global_gradient(training_set, xi, g, h);
+    one_step_basic_gradient(training_set, 1e-2);
+    training_error = error(training_set);
+    validation_error = error(validation_set);
+    cout << l
+         << " TRAINING " << training_error << " (" << classification_error(training_set)*100 << "%)"
+         << " VALIDATION " << validation_error << " (" << classification_error(validation_set)*100 << "%)";
+    if(l > 0 && validation_error >= prev_validation_error) {
+      nb_increases++;
+      cout << " [" << nb_increases << "]";
+    }
+    cout << "\n";
+    cout.flush();
+    l++;
+  } while(nb_increases < 10);
+}
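
A natural use of compute_numerical_gradient is a sanity check of the backpropagation in compute_gradient. The helper below is hypothetical (nothing like it appears in this file): it assumes the includes at the top of neural.cc and an ImageSet loaded elsewhere, and takes the number of weights as a parameter since _nb_weights is not exposed publicly:

// Hypothetical helper: compares backpropagation against central finite
// differences; nb_weights must match the network's weight count
void check_gradient(MultiLayerPerceptron &mlp, ImageSet *is, int nb_weights) {
  scalar_t *ga = new scalar_t[nb_weights]; // analytical (backpropagation)
  scalar_t *gn = new scalar_t[nb_weights]; // numerical (finite differences)
  mlp.compute_gradient(is, ga);
  mlp.compute_numerical_gradient(is, gn);
  scalar_t max_diff = 0;
  for(int w = 0; w < nb_weights; w++)
    if(fabs(ga[w] - gn[w]) > max_diff) max_diff = fabs(ga[w] - gn[w]);
  cout << "max |backprop - numerical| = " << max_diff << "\n";
  delete[] gn;
  delete[] ga;
}
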
+
+void MultiLayerPerceptron::compute_activations_1s(ImageSet *is, int p) {
+  ASSERT(_layer_sizes[0] == is->width() * is->height(), "The first layer has to have as many units as there are pixels!");
+
+  scalar_t *a = _activations;
+  scalar_t *w = _weights;
+
+  // Map the pixel values from [0, 255] to [-1.0, 1.0]
+  for(int y = 0; y < is->height(); y++) for(int x = 0; x < is->width(); x++)
+    *(a++) = 2.0 * (scalar_t(is->pixel(p, x, y)) / 255.0) - 1.0;
+
+  scalar_t *pa = _pre_sigma_activations + _activations_index[1];
+  scalar_t *b = _activations, *b2 = 0;
+
+  // Forward pass: every unit sums its weighted inputs plus a bias, and
+  // applies sigma
+  for(int l = 0; l < _nb_layers - 1; l++) {
+    for(int i = 0; i < _layer_sizes[l+1]; i++) {
+      scalar_t s = 0;
+      b2 = b;
+      for(int j = 0; j < _layer_sizes[l]; j++) s += *(w++) * *(b2++);
+      s += *(w++); // bias
+      *(pa++) = s;
+      *(a++) = sigma(s);
+    }
+    b = b2;
+  }
+}
+
+void MultiLayerPerceptron::test(ImageSet *is, scalar_t *responses) {
+  for(int p = 0; p < is->nb_pics(); p++) {
+    compute_activations_1s(is, p);
+    for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++)
+      responses[p * _layer_sizes[_nb_layers - 1] + i] = _activations[_activations_index[_nb_layers - 1] + i];
+  }
+}
+
+// Total squared error over the set, with targets +/- output_amplitude
+scalar_t MultiLayerPerceptron::error(ImageSet *is) {
+  scalar_t error = 0;
+
+  for(int p = 0; p < is->nb_pics(); p++) {
+    compute_activations_1s(is, p);
+    for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
+      scalar_t correct;
+      if(is->label(p) == i) correct = output_amplitude;
+      else correct = - output_amplitude;
+      error += sq(_activations[_activations_index[_nb_layers - 1] + i] - correct);
+    }
+  }
+
+  return error;
+}
+
+// Fraction of the set on which the most active output unit is not the
+// true label
+scalar_t MultiLayerPerceptron::classification_error(ImageSet *is) {
+  int nb_error = 0;
+
+  for(int p = 0; p < is->nb_pics(); p++) {
+    compute_activations_1s(is, p);
+    scalar_t max = -1;
+    int i_max = -1;
+    for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
+      if(i_max < 0 || _activations[_activations_index[_nb_layers - 1] + i] > max) {
+        i_max = i;
+        max = _activations[_activations_index[_nb_layers - 1] + i];
+      }
+    }
+    if(is->label(p) != i_max) nb_error++;
+  }
+
+  return scalar_t(nb_error)/scalar_t(is->nb_pics());
+}
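
Taken together, the class is used by building a network, drawing random initial weights, training with early stopping, and measuring the classification error. A hypothetical driver fragment, assuming ImageSet pointers training_set, validation_set and test_set loaded elsewhere from 28x28 images with 10 classes; the hidden-layer size of 200 is an arbitrary choice, not something prescribed by this file:

int layer_sizes[] = { 28 * 28, 200, 10 };
MultiLayerPerceptron mlp(3, layer_sizes);
mlp.init_random_weights(1e-1);           // stdd of the normal samples
mlp.train(training_set, validation_set); // stops after ten validation increases
cout << "test error: " << mlp.classification_error(test_set) * 100 << "%\n";

ofstream out("mnist.mlp");               // reloadable via the istream constructor
mlp.save(out);
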