#!/usr/bin/env python

# Any copyright is dedicated to the Public Domain.
# https://creativecommons.org/publicdomain/zero/1.0/

# Written by Francois Fleuret <francois@fleuret.org>

import os, sys
import torch, torchvision
from torch import nn

lr, nb_epochs, batch_size = 2e-3, 50, 100

data_dir = os.environ.get("PYTORCH_DATA_DIR") or "./data/"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

######################################################################

train_set = torchvision.datasets.MNIST(root=data_dir, train=True, download=True)
train_input = train_set.data.view(-1, 1, 28, 28).float()
train_targets = train_set.targets

test_set = torchvision.datasets.MNIST(root=data_dir, train=False, download=True)
test_input = test_set.data.view(-1, 1, 28, 28).float()
test_targets = test_set.targets

train_input, train_targets = train_input.to(device), train_targets.to(device)
test_input, test_targets = test_input.to(device), test_targets.to(device)

mu, std = train_input.mean(), train_input.std()

# Standardize both sets with the training-set statistics
train_input.sub_(mu).div_(std)
test_input.sub_(mu).div_(std)

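# Quick sanity check (illustrative addition, not part of the original script):
# after standardization the training inputs have near-zero mean and unit
# variance.
assert train_input.mean().abs() < 1e-3
assert (train_input.std() - 1).abs() < 1e-3
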
######################################################################


class QLinear(nn.Module):
    """Linear layer whose weights and biases are quantized on the fly
    to the ternary values {-1, 0, +1}."""

    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.w = nn.Parameter(torch.randn(dim_out, dim_in))
        self.b = nn.Parameter(torch.randn(dim_out) * 1e-1)

    def quantize(self, z):
        epsilon = 1e-3
        # Normalize by the mean absolute value, then round to {-1, 0, +1}
        zr = z / (z.abs().mean() + epsilon)
        zq = -(zr <= -0.5).long() + (zr >= 0.5).long()
        if self.training:
            # Straight-through estimator: the value is zq, but the gradient
            # flows through z as if the quantization were the identity
            return zq + z - z.detach()
        else:
            return zq.float()

    def forward(self, x):
        return x @ self.quantize(self.w).t() + self.quantize(self.b)

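# A minimal sanity check (illustrative addition, not part of the original
# script): in eval mode the quantizer emits only values in {-1, 0, +1}, and
# in train mode the straight-through estimator lets gradients reach the
# underlying full-precision parameters.
_demo = QLinear(4, 3)
_demo.eval()
assert set(_demo.quantize(torch.randn(100, 4)).unique().tolist()) <= {-1.0, 0.0, 1.0}
_demo.train()
_demo(torch.randn(2, 4)).sum().backward()
assert _demo.w.grad is not None  # gradients flow despite the hard rounding
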
######################################################################

for nb_hidden in [16, 32, 64, 128, 256, 512, 1024]:
    for linear_layer in [nn.Linear, QLinear]:
        # The model: a one-hidden-layer MLP

        model = nn.Sequential(
            nn.Flatten(),
            linear_layer(784, nb_hidden),
            nn.ReLU(),
            linear_layer(nb_hidden, 10),
        ).to(device)

        nb_parameters = sum(p.numel() for p in model.parameters())

        print(f"nb_hidden {nb_hidden} layer {linear_layer.__name__} nb_parameters {nb_parameters}")
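        # Sanity check (illustrative addition, not in the original): with
        # biases, a 784-h-10 MLP has 784*h + h + h*10 + 10 = 795*h + 10
        # parameters, for both nn.Linear and QLinear.
        assert nb_parameters == 795 * nb_hidden + 10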

        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        #

        for k in range(nb_epochs):
            # Train

            model.train()

            acc_train_loss = 0.0

            for input, targets in zip(
                train_input.split(batch_size), train_targets.split(batch_size)
            ):
                output = model(input)
                loss = torch.nn.functional.cross_entropy(output, targets)
                acc_train_loss += loss.item() * input.size(0)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Test

            model.eval()

            nb_test_errors = 0
            with torch.no_grad():  # no gradients needed during evaluation
                for input, targets in zip(
                    test_input.split(batch_size), test_targets.split(batch_size)
                ):
                    wta = model(input).argmax(1)
                    nb_test_errors += (wta != targets).long().sum().item()
            test_error = nb_test_errors / test_input.size(0)

            if (k + 1) % 10 == 0:
                print(
                    f"loss {k+1} {acc_train_loss/train_input.size(0)} {test_error*100:.02f}%"
                )
                sys.stdout.flush()

        ######################################################################

        print(
            f"final_loss {nb_hidden} {linear_layer.__name__} {acc_train_loss/train_input.size(0)} {test_error*100:.02f}%"
        )
        sys.stdout.flush()
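
######################################################################

# A post-training inspection (illustrative addition, not part of the original
# script): report how many ternary weights of the last model trained above
# (the 1024-hidden-unit QLinear MLP) were rounded to zero.
for m in model.modules():
    if isinstance(m, QLinear):
        q = m.quantize(m.w)  # the model was left in eval mode, so q is ternary
        zero_frac = (q == 0).float().mean().item()
        print(f"quantized layer {tuple(m.w.shape)}: {zero_frac*100:.1f}% zeros")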