#!/usr/bin/env python

# Any copyright is dedicated to the Public Domain.
# https://creativecommons.org/publicdomain/zero/1.0/

# Written by Francois Fleuret <francois@fleuret.org>

# This is an implementation from scratch of a "GPT", that is, a model
# composed of several causal self-attention blocks. It is equipped
# with a caching mechanism for keys and values to avoid an O(N^3) cost
# for auto-regression.

# This implementation is equipped with RNN layers that can replace the MHA.

import math, warnings

import torch, einops

from torch import nn
from torch.nn import functional as F

import ffutils

# import memload

######################################################################

# A BracketedSequence is a BxTx... tensor together with the index of
# the first time step to compute (first) and the number of time steps
# to compute (nb).

# Modules able to process it expect that they will have to process a
# first bracket starting at t=0, followed by a succession of brackets
# that move forward in time, do not overlap, and cover the axis T with
# no holes.
#
# Although it is more general, for a classical prompt-conditioned
# auto-regressive process it will be a first bracket starting at 0 and
# of arbitrary length for the "prompt", followed by brackets of length
# 1 for the successive tokens.
#
# Modules able to process brackets may implement a cache that is
# reset when init_cache is True.


class BracketedSequence:
    def __init__(self, x, first=None, nb=None, init_cache=None):
        self.x = x
        assert (first is None and nb is None and init_cache is None) or (
            first is not None and nb is not None and init_cache is not None
        )

        self.first = 0 if first is None else first
        self.nb = x.size(1) if nb is None else nb
        self.init_cache = True if init_cache is None else init_cache

    def slice(self):
        return self.x[:, self.first : self.first + self.nb]

    def complete(self):
        return self.first == 0 and self.nb == self.x.size(1)


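# As an illustration of the protocol above, the sketch below (not part
# of the original module) drives a module m in the standard
# prompt-then-generate fashion: one bracket covering the prompt with
# init_cache=True, then length-1 brackets for the subsequent time steps.


def demo_bracketed_autoregression(m, x, prompt_length):
    # Process the prompt in a single bracket and reset the cache
    bs = m(BracketedSequence(x, 0, prompt_length, True))
    # Then process the remaining time steps one at a time
    for t in range(prompt_length, x.size(1)):
        bs = m(BracketedSequence(x, t, 1, False))
    return bs.x

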
######################################################################


class CacheWrapper(nn.Module):
    def __init__(self, *f):
        super().__init__()
        self.f = f[0] if len(f) == 1 else nn.Sequential(*f)

    def forward(self, bs):
        if bs.init_cache:
            y = self.f(bs.slice())
            self.cache_y = y.new(*((y.size(0), bs.x.size(1)) + y.size()[2:]))
            self.cache_y[:, bs.first : bs.first + bs.nb] = y
        else:
            assert tuple(bs.x.size()[:2]) == tuple(self.cache_y.size()[:2])
            assert bs.first + bs.nb <= self.cache_y.size(1)
            self.cache_y[:, bs.first : bs.first + bs.nb] = self.f(bs.slice())

        return BracketedSequence(self.cache_y, bs.first, bs.nb, bs.init_cache)


##############################


class WithResidual(nn.Module):
    def __init__(self, *f):
        super().__init__()
        self.f = f[0] if len(f) == 1 else nn.Sequential(*f)

    def forward(self, bs):
        return BracketedSequence(bs.x + self.f(bs).x, bs.first, bs.nb, bs.init_cache)


##############################


class AddPositionalEncoding(nn.Module):
    def __init__(self, len_max):
        super().__init__()
        self.len_max = len_max

    # [Vaswani et al 2017] PE_{t,2i} = sin(t/(L^{2i/D})), PE_{t,2i+1} = cos(t/(L^{2i/D}))

    def forward(self, bs):
        if bs.init_cache:
            t = torch.arange(bs.x.size(1), dtype=bs.x.dtype, device=bs.x.device)[
                :, None
            ]
            j = torch.arange(bs.x.size(2), dtype=bs.x.dtype, device=bs.x.device)[
                None, :
            ]
            k = j % 2
            self.pe = torch.sin(
                t / (self.len_max ** ((j - k) / bs.x.size(2))) + math.pi / 2 * k
            )
            self.cache_y = bs.x.new(bs.x.size())

        self.cache_y[:, bs.first : bs.first + bs.nb] = (
            bs.slice() + self.pe[bs.first : bs.first + bs.nb]
        )

        return BracketedSequence(self.cache_y, bs.first, bs.nb, bs.init_cache)


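# A small self-check of the encoding above (illustrative sketch, not
# part of the original module): since sin(x + pi/2) = cos(x), the
# single sin expression covers both the even (sin) and odd (cos)
# channels of the formula quoted above.


def demo_check_positional_encoding(T=5, D=6, len_max=1e5):
    ape = AddPositionalEncoding(len_max)
    pe = ape(BracketedSequence(torch.zeros(1, T, D))).x[0]
    t = torch.arange(T)[:, None]
    i = torch.arange(0, D, 2)[None, :]  # the 2i of the formula
    ref = torch.empty(T, D)
    ref[:, 0::2] = torch.sin(t / len_max ** (i / D))
    ref[:, 1::2] = torch.cos(t / len_max ** (i / D))
    return (pe - ref).abs().max()

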
import pscan

# X is /.../xTxD   A is /.../xT   Y_init is /.../xD


def pscan_dim(A, X, Y_init, dim=-2):
    s = X.size()
    a, T, b = s[:dim].numel(), s[dim], s[dim + 1 :].numel()

    A = A.reshape(a, T, *s[dim + 1 : -1])
    X = X.reshape(a, T, *s[dim + 1 : -1], -1)

    if Y_init is None:
        Y_init = X.new_zeros(a, *s[dim + 1 : -1], X.size(-1))
    else:
        Y_init = Y_init.reshape(a, *s[dim + 1 : -1], -1)

    Y = pscan.pscan(A, X, Y_init).reshape(s)

    return Y

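
# For reference, pscan.pscan computes, along the time axis, the
# first-order linear recurrence Y[:, t] = A[:, t, None] * Y[:, t - 1] + X[:, t]
# starting from Y_init. The naive sequential version below is only a
# sketch for checking small cases, assuming A is NxT, X is NxTxD, and
# Y_init is NxD.


def naive_scan(A, X, Y_init):
    Y, result = Y_init, []
    for t in range(X.size(1)):
        Y = A[:, t, None] * Y + X[:, t]
        result.append(Y)
    return torch.stack(result, dim=1)
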

def pscan_rgrad(grad_Y, A, X, Y_init, dim=-2, eps=1e-2):
    with torch.no_grad():
        s_A, s_X = 0, 0
        for t in range(X.size(dim) - 1, 0, -1):
            delta = (grad_Y[t] - s_A) / A[t].grad
            s_A += A[t].grad * delta
            A[t].grad = delta
            delta = (grad_Y[t] - s_X) / X[t].grad
            s_X += X[t].grad * delta
            X[t].grad = delta


def pscan_shape(A, X, Y_init):
    s = X.size()
    A = A.reshape(-1, s[-2])
    X = X.reshape(-1, s[-2], s[-1])

    if Y_init is None:
        Y_init = X.new_zeros(X.size(0), s[-1])
    else:
        Y_init = Y_init.reshape(-1, s[-1])

    Y = pscan.pscan(A, X, Y_init).reshape(s)

    return Y


def nsum_shape(X, Y_init):
    s = X.size()
    X = X.reshape(-1, s[-2], s[-1])  # ntd

    Y = 0 if Y_init is None else Y_init.reshape(-1, s[-1])
    result = []

    for k in range(X.size(1)):
        Y = Y + X[:, k]
        Y = Y / Y.norm(dim=-1, keepdim=True).clamp(min=1)
        result.append(Y)

    return torch.cat(result, dim=1).reshape(s)


##############################


class DumbRec(nn.Module):
    def __init__(
        self,
        dim_model,
        dim_qk,
        dim_v,
        nb_heads,
        nb_lines,
        attention_dropout=0.0,
        len_max=1e5,
        logger=print,
        **kwargs,
    ):
        super().__init__()

        def randw(*d):
            return nn.Parameter(torch.randn(*d) / math.sqrt(d[-1]))

        self.nb_lines = nb_lines
        self.attention_dropout = attention_dropout

        self.k_star = randw(nb_lines, dim_qk)

        self.w_qw = randw(nb_heads, dim_qk, dim_model)
        self.w_qr = randw(nb_heads, dim_qk, dim_model)
        # self.w_k = randw(nb_heads, dim_qk, dim_model)
        self.w_v = randw(nb_heads, dim_v, dim_model)
        self.w_o = randw(dim_v * nb_heads, dim_model)

    def reset_inner_loss(self):
        self.acc_attention = 0
        self.acc_nb = 0

    def get_inner_loss(self):
        warnings.warn("l2 regularization", RuntimeWarning)
        return (self.acc_attention / self.acc_nb).pow(2).sum()
        # return torch.tensor([0], device=self.w_qw.device)

    def forward(self, bs):
        x_q, t0, t1 = bs.x, bs.first, bs.first + bs.nb

        if bs.init_cache:
            self.rec_v = x_q.new_zeros(
                x_q.size(0), self.nb_lines, x_q.size(1), self.w_v.size(1)
            )
            # self.rec_k = x_q.new_zeros(
            # x_q.size(0), self.nb_lines, x_q.size(1), self.w_k.size(1)
            # )
            self.cache_y = x_q.new_zeros(x_q.size(0), x_q.size(1), self.w_o.size(1))

        ######################################################################
        # Prepare the keys

        k_star = self.k_star[:, None, :].expand(-1, t1 - t0, -1)

        warnings.warn("rotating key barrel", RuntimeWarning)
        k_star = self.k_star[:, None, :].expand(-1, x_q.size(1), -1)
        t_barrel = torch.arange(t0, t1, device=k_star.device)
        t_barrel = t_barrel[None, :].expand(k_star.size(0), t1 - t0)
        l_barrel = (
            torch.arange(k_star.size(0), device=k_star.device)[:, None] + t_barrel
        ) % k_star.size(0)
        k_star = k_star[l_barrel, t_barrel]

        ######################################################################
        # Compute the recurrent state

        qw = torch.einsum("ntc,hdc->nhtd", x_q[:, t0:t1], self.w_qw)

        v = torch.einsum("ntc,hdc->nhtd", x_q[:, t0:t1], self.w_v)
        # k = torch.einsum("ntc,hdc->nhtd", x_q[:, t0:t1], self.w_k)

        aw = torch.einsum(
            "nhtd,ltd->nhlt",
            qw,
            k_star,
        ) / math.sqrt(self.w_qw.size(1))

        aw = aw.softmax(dim=2)  # nhlt

        if self.training:
            self.acc_attention += aw.sum(dim=(0, 1, 3))
            self.acc_nb += aw.size(0) * aw.size(1) * aw.size(3)

        aw = F.dropout(aw, self.attention_dropout, self.training)

        A = 1 - aw.sum(dim=1)  # nlt

        V = torch.einsum("nhlt,nhtd->nltd", aw, v).contiguous()
        # K = torch.einsum("nhlt,nhtd->nltd", aw, k).contiguous()

        if t0 == 0:
            V0 = None
            # K0 = None
        else:
            V0 = self.rec_v[:, :, t0 - 1]
            # K0 = self.rec_k[:, :, t0 - 1]

        self.rec_v[:, :, t0:t1] = pscan_shape(A, V, V0)
        # self.rec_k[:, :, t0:t1] = pscan_shape(A, K, K0)

        ######################################################################
        # compute the readout

        qr = torch.einsum("ntc,hdc->nhtd", x_q[:, t0:t1], self.w_qr)

        ar = torch.einsum(
            "nhtd,ld->nhlt",
            qr,
            # self.rec_k[:, :, t0:t1],
            self.k_star,
        ) / math.sqrt(self.w_qr.size(1))

        ar = ar.softmax(dim=2)  # nhlt

        ar = F.dropout(ar, self.attention_dropout, self.training)

        y = torch.einsum(
            "nhlt,nltd->nthd",
            ar,
            self.rec_v[:, :, t0:t1],
        ).flatten(2)

        self.cache_y[:, t0:t1] = y @ self.w_o

        return BracketedSequence(self.cache_y, t0, t1 - t0, bs.init_cache)


##############################


class KVRec(nn.Module):
    def __init__(
        self,
        dim_model,
        dim_qk,
        dim_v,
        nb_heads,
        nb_lines,
        attention_dropout=0.0,
        len_max=1e5,
        logger=print,
        **kwargs,
    ):
        super().__init__()

        def randw(*d):
            return nn.Parameter(torch.randn(*d) / math.sqrt(d[-1]))

        self.nb_lines = nb_lines
        self.attention_dropout = attention_dropout

        self.k_star = randw(nb_lines, dim_qk)

        self.w_qw = randw(nb_heads, dim_qk, dim_model)
        self.w_qr = randw(nb_heads, dim_qk, dim_model)
        self.w_k = randw(nb_heads, dim_qk, dim_model)
        self.w_v = randw(nb_heads, dim_v, dim_model)
        self.w_o = randw(dim_v * nb_heads, dim_model)

    def reset_inner_loss(self):
        self.acc_attention = 0
        self.acc_nb = 0

    def get_inner_loss(self):
        warnings.warn("l2 regularization", RuntimeWarning)
        return (self.acc_attention / self.acc_nb).pow(2).sum()
        # return torch.tensor([0], device=self.w_qw.device)
        # warnings.warn("side regularization", RuntimeWarning)
        # return (
        # (0.5 / self.nb_lines - self.acc_attention / self.acc_nb).clamp(min=0).sum()
        # )
        # return torch.tensor([0], device=self.w_qw.device)

    def forward(self, bs):
        x_q, t0, t1 = bs.x, bs.first, bs.first + bs.nb

        if bs.init_cache:
            self.rec_v = x_q.new_zeros(
                x_q.size(0), self.nb_lines, x_q.size(1), self.w_v.size(1)
            )
            self.rec_k = x_q.new_zeros(
                x_q.size(0), self.nb_lines, x_q.size(1), self.w_k.size(1)
            )
            self.cache_y = x_q.new_zeros(x_q.size(0), x_q.size(1), self.w_o.size(1))

        ######################################################################
        # Prepare the keys

        k_star = self.k_star[:, None, :].expand(-1, t1 - t0, -1)

        warnings.warn("rotating key barrel", RuntimeWarning)
        k_star = self.k_star[:, None, :].expand(-1, x_q.size(1), -1)
        t_barrel = torch.arange(t0, t1, device=k_star.device)
        t_barrel = t_barrel[None, :].expand(k_star.size(0), t1 - t0)
        l_barrel = (
            torch.arange(k_star.size(0), device=k_star.device)[:, None] + t_barrel
        ) % k_star.size(0)
        k_star = k_star[l_barrel, t_barrel]

        ######################################################################
        # Compute the recurrent state

        qw = torch.einsum("ntc,hdc->nhtd", x_q[:, t0:t1], self.w_qw)

        v = torch.einsum("ntc,hdc->nhtd", x_q[:, t0:t1], self.w_v)
        k = torch.einsum("ntc,hdc->nhtd", x_q[:, t0:t1], self.w_k)

        aw = torch.einsum(
            "nhtd,ltd->nhlt",
            qw,
            k_star,
        ) / math.sqrt(self.w_qw.size(1))

        aw = aw.softmax(dim=2)  # nhlt

        if self.training:
            # We want all the memory lines to be used similarly
            self.acc_attention += aw.sum(dim=(0, 1, 3))  # Sum across NxHx_xT
            self.acc_nb += aw.size(0) * aw.size(1) * aw.size(3)

        aw = F.dropout(aw, self.attention_dropout, self.training)

        A = 1 - aw.sum(dim=1)  # nlt

        V = torch.einsum("nhlt,nhtd->nltd", aw, v).contiguous()
        K = torch.einsum("nhlt,nhtd->nltd", aw, k).contiguous()

        if t0 == 0:
            V0 = None
            K0 = None
        else:
            V0 = self.rec_v[:, :, t0 - 1]
            K0 = self.rec_k[:, :, t0 - 1]

        self.rec_v[:, :, t0:t1] = pscan_shape(A, V, V0)
        self.rec_k[:, :, t0:t1] = pscan_shape(A, K, K0)

        ######################################################################
        # compute the readout

        qr = torch.einsum("ntc,hdc->nhtd", x_q[:, t0:t1], self.w_qr)

        ar = torch.einsum(
            "nhtd,nltd->nhlt",
            qr,
            self.rec_k[:, :, t0:t1],
        ) / math.sqrt(self.w_qr.size(1))

        ar = ar.softmax(dim=2)  # nhlt

        ar = F.dropout(ar, self.attention_dropout, self.training)

        y = torch.einsum(
            "nhlt,nltd->nthd",
            ar,
            self.rec_v[:, :, t0:t1],
        ).flatten(2)

        self.cache_y[:, t0:t1] = y @ self.w_o

        return BracketedSequence(self.cache_y, t0, t1 - t0, bs.init_cache)


##############################


# Returns a view of x with an additional index at rank win_dim that
# moves along the same dimension as dim, over the domain
# {0...win_size-1}, while dim is restricted to a domain reduced by
# win_size-1 values.


def moving_window(x, dim, win_dim, win_size):
    size, stride = x.size(), x.stride()
    size = size[:dim] + (size[dim] - win_size + 1,) + size[dim + 1 :]
    size = size[:win_dim] + (win_size,) + size[win_dim:]
    stride = stride[:win_dim] + (stride[dim],) + stride[win_dim:]

    return x.as_strided(size=size, stride=stride)

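
# A small example of what moving_window produces (illustrative only):
# with x of shape (2, 5, 3) and dim=1, win_dim=2, win_size=3, the
# result w has shape (2, 3, 3, 3) and w[n, t, k] == x[n, t + k], i.e.
# the new index k runs over the window and t over the 5 - 3 + 1 = 3
# valid window positions.


def demo_moving_window():
    x = torch.arange(2 * 5 * 3).reshape(2, 5, 3)
    w = moving_window(x, dim=1, win_dim=2, win_size=3)
    assert w.size() == (2, 3, 3, 3)
    assert (w[1, 2, 1] == x[1, 3]).all()
    return w
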

##############################


class Caterpillar(nn.Module):
    def __init__(
        self,
        dim_model,
        dim_qk,
        dim_v,
        nb_heads,
        caterpillar_length,
        caterpillar_height,
        attention_dropout=0.0,
        len_max=1e5,
        logger=print,
        **kwargs,
    ):
        super().__init__()

        warnings.warn("Caterpillar", RuntimeWarning)

        def randw(*d, amplitude=None):
            if amplitude is None:
                amplitude = 1 / math.sqrt(d[-1])
            return nn.Parameter(amplitude * torch.randn(*d))

        self.caterpillar_length = caterpillar_length
        self.caterpillar_height = caterpillar_height
        self.attention_dropout = attention_dropout

        ######################################################################
        # sup_args

        x = kwargs.get("gate_dropout")
        if x is None:
            self.proba_gate_dropout = 0.0
        else:
            self.proba_gate_dropout = float(x)

        logger(f"self.proba_gate_dropout {self.proba_gate_dropout}")

        x = kwargs.get("default_bg")
        if x is None:
            default_bg = -math.log(caterpillar_height - 1)
        else:
            default_bg = float(x)

        logger(f"default_bg {default_bg}")

        ######################################################################

        self.w_G = randw(nb_heads, caterpillar_height, dim_model)
        self.b_G = nn.Parameter(torch.full((nb_heads, caterpillar_height), default_bg))

        self.w_K = randw(nb_heads, dim_qk, dim_model)
        self.w_V = randw(nb_heads, dim_v, dim_model)
        self.w_Q = randw(nb_heads, dim_qk, dim_model)
        self.w_O = randw(dim_v * nb_heads, dim_model)

        self.init_K_rec = randw(
            caterpillar_height,
            caterpillar_length,
            dim_qk,
        )
        self.init_V_rec = randw(
            caterpillar_height,
            caterpillar_length,
            dim_v,
        )

    def reset_inner_loss(self):
        self.acc_attention = 0
        self.acc_nb = 0

    def get_inner_loss(self):
        # warnings.warn("l2 regularization", RuntimeWarning)
        # return (self.acc_attention / self.acc_nb).pow(2).sum()
        return torch.tensor([0], device=self.w_Q.device)

    def forward(self, bs):
        # Name the dimensions to make the source a bit clearer

        X, t0, t1 = bs.slice(), bs.first, bs.first + bs.nb

        N = bs.x.size(0)
        T = bs.x.size(1)
        H = self.w_V.size(0)
        DV = self.w_V.size(1)
        DK = self.w_K.size(1)
        DM = self.w_O.size(1)
        R = self.caterpillar_height
        L = self.caterpillar_length

        assert (
            t0 >= L and (t1 - t0) % L == 0
        ), "bs.first should be at least caterpillar_length, and bs.nb should be a multiple of caterpillar_length"

        # We cache values to deal efficiently with auto-regression

        if bs.init_cache:
            self.rec_V = X.new_zeros(N, R, T, DV)
            self.rec_K = X.new_zeros(N, R, T, DK)
            # We start the recurrent sequences with optimizable
            # initial values. No idea if it helps.
            self.rec_V[:, :, t0 - L : t0, :] = self.init_V_rec[None, :, :, :]
            self.rec_K[:, :, t0 - L : t0, :] = self.init_K_rec[None, :, :, :]

            self.cache_Y = X.new_zeros(N, T, DM)

        V = torch.einsum("ntc,hdc->nhtd", X, self.w_V)
        K = torch.einsum("ntc,hdc->nhtd", X, self.w_K)

        ######################################################################
        # Compute the recurrent state

        # This is the Gating sequence that modulates the storing of
        # the new key and value in the R pairs of the current
        # stack. There are R independent gating values, which means
        # that the current K/V may be stored in multiple pairs of the
        # recurrent state, or not at all.

        G = (
            torch.einsum("ntc,hrc->nhrt", X, self.w_G) + self.b_G[None, :, :, None]
        ).sigmoid()

        # Clip the gating to avoid values greater than 1 when several
        # heads hit the same row

        G = G / G.sum(1, keepdim=True).clamp(min=1)

        ######################################################################

        def recurrence(G, V, K):
            # We prepare the arguments for the parallel scan

            A = 1 - G.sum(1)

            gated_V = torch.einsum("nhrt,nhtd->nrtd", G, V)
            gated_K = torch.einsum("nhrt,nhtd->nrtd", G, K)

            # We start from cached values, which matters in inference

            init_rec_V = self.rec_V[:, :, t0 - L : t0]
            init_rec_K = self.rec_K[:, :, t0 - L : t0]

            # Here there is a trick: Since the stack at position t is
            # computed by updating that at position t-L, the parallel
            # scan operates with a period of L. To do so we split the
            # sequence indexing in two axes, the second of size L, and
            # run the parallel scan using the first as the sequence index.

            A = A.unflatten(2, (-1, L))
            gated_V = gated_V.unflatten(2, (-1, L))
            gated_K = gated_K.unflatten(2, (-1, L))

            next_V = pscan_dim(A, gated_V, init_rec_V, dim=2)
            next_K = pscan_dim(A, gated_K, init_rec_K, dim=2)

            next_V = next_V.flatten(2, 3)
            next_K = next_K.flatten(2, 3)

            return next_V, next_K

        #################################################################

        next_V, next_K = recurrence(G, V, K)

        if self.training and self.proba_gate_dropout > 0.0:
            # G is NxHxRxT where r is the caterpillar's row.

            warnings.warn("gate dropout", RuntimeWarning)

            # kill = (
            # torch.rand(G.size(), device=G.device) <= self.proba_gate_dropout
            # ).float()

            kill = (
                torch.rand(N, H, R, t1 - t0, device=G.device).sort(dim=3).indices == 0
            ).cumsum(dim=3)
            kill = kill * (
                torch.rand(N, H, R, 1, device=G.device) <= self.proba_gate_dropout
            )

            mask = 1 - kill

            masked_next_V, masked_next_K = recurrence(G * mask, V, K)

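            # The expressions below keep the un-masked values in the
            # forward pass, but let the gradient flow through the
            # masked recurrence, rescaled by 1 / (1 - proba_gate_dropout)
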
            next_V = next_V.detach() + (masked_next_V - masked_next_V.detach()) / (
                1 - self.proba_gate_dropout
            )
            next_K = next_K.detach() + (masked_next_K - masked_next_K.detach()) / (
                1 - self.proba_gate_dropout
            )

        self.rec_V[:, :, t0:t1] = next_V
        self.rec_K[:, :, t0:t1] = next_K

        ######################################################################
        # compute the readout

        Q = torch.einsum("ntc,hdc->nhtd", X, self.w_Q)

        # We build tensors NxHxTxFxL where N is the sample index, H
        # the head, T the time, F the row in the caterpillar, and L
        # the column in the caterpillar

        windowed_V = moving_window(
            self.rec_V[:, :, t0 - L + 1 : t1], dim=2, win_dim=3, win_size=L
        )

        windowed_K = moving_window(
            self.rec_K[:, :, t0 - L + 1 : t1], dim=2, win_dim=3, win_size=L
        )

        # We have an attention score for each of the RxL values

        ar = torch.einsum(
            "nhtd,nftld->nhtfl",
            Q,
            windowed_K,
        ) / math.sqrt(DK)

        # softmax can operate only on one dimension, hence the
        # flattening

        ar = ar.flatten(3).softmax(dim=3).view(ar.size())

        ar = F.dropout(ar, self.attention_dropout, self.training)

        # Compute the output for each head, flatten to concatenate

        Y = torch.einsum(
            "nhtfl,nftld->nthd",
            ar,
            windowed_V,
        ).flatten(2)

        # Compute the final output

        self.cache_Y[:, t0:t1] = Y @ self.w_O

        return BracketedSequence(self.cache_Y, t0, t1 - t0, bs.init_cache)


##############################


class QKVAttention(nn.Module):
    def __init__(
        self,
        dim_model,
        dim_qk,
        dim_v,
        nb_heads=1,
        causal=False,
        attention_dropout=0.0,
        logger=print,
        **kwargs,
    ):
        super().__init__()

        def randw(*d):
            return nn.Parameter(torch.randn(*d) / math.sqrt(d[-1]))

        self.causal = causal
        self.attention_dropout = attention_dropout
        self.record_attention = False

        self.w_q = randw(nb_heads, dim_qk, dim_model)
        self.w_k = randw(nb_heads, dim_qk, dim_model)
        self.w_v = randw(nb_heads, dim_v, dim_model)
        self.w_o = randw(dim_v * nb_heads, dim_model)

    def forward(self, bs):
        x_q = bs.x

        assert (
            self.causal or bs.complete()
        ), "Partial evaluation is only possible for causal models"

        if bs.init_cache:
            self.cache_k = x_q.new_zeros(
                x_q.size(0), self.w_k.size(0), x_q.size(1), self.w_k.size(1)
            )
            self.cache_v = x_q.new_zeros(
                x_q.size(0), self.w_v.size(0), x_q.size(1), self.w_v.size(1)
            )
            self.cache_y = x_q.new_zeros(x_q.size(0), x_q.size(1), self.w_o.size(1))

        q = torch.einsum("ntc,hdc->nhtd", x_q[:, bs.first : bs.first + bs.nb], self.w_q)

        self.cache_k[:, :, bs.first : bs.first + bs.nb] = torch.einsum(
            "ntc,hdc->nhtd", x_q[:, bs.first : bs.first + bs.nb], self.w_k
        )
        self.cache_v[:, :, bs.first : bs.first + bs.nb] = torch.einsum(
            "ntc,hdc->nhtd", x_q[:, bs.first : bs.first + bs.nb], self.w_v
        )

        a = torch.einsum(
            "nhtd,nhsd->nhts", q, self.cache_k[:, :, : bs.first + bs.nb]
        ) / math.sqrt(self.w_q.size(1))

        if self.causal:
            if bs.init_cache:
                self.cache_attzero = (
                    torch.arange(x_q.size(1), device=q.device)[None, None, :, None]
                    < torch.arange(x_q.size(1), device=q.device)[None, None, None, :]
                )
            a = a.masked_fill(
                self.cache_attzero[
                    :, :, bs.first : bs.first + bs.nb, : bs.first + bs.nb
                ],
                float("-inf"),
            )

        a = a.softmax(dim=3)

        if self.record_attention:
            self.a = a

        a = F.dropout(a, self.attention_dropout, self.training)

        y = torch.einsum(
            "nhts,nhsd->nthd", a, self.cache_v[:, :, : bs.first + bs.nb]
        ).flatten(2)

        self.cache_y[:, bs.first : bs.first + bs.nb] = y @ self.w_o

        return BracketedSequence(self.cache_y, bs.first, bs.nb, bs.init_cache)


##############################


class MyGPT(nn.Module):
    def __init__(
        self,
        vocabulary_size,
        dim_model,
        dim_keys,
        dim_hidden,
        nb_heads,
        nb_blocks,
        nb_lines=None,
        caterpillar_height=None,
        causal=False,
        dropout=0.0,
        len_max=1e5,
        attention_layer="kvrec",
        logger=print,
        **kwargs,
    ):
        super().__init__()

        assert attention_layer in {
            "mha",
            "dumbrec",
            "kvrec",
            "caterpillar",
        }, f"Unknown attention operator {attention_layer}."

        if attention_layer == "caterpillar":
            assert nb_lines % caterpillar_height == 0
            self.caterpillar_length = nb_lines // caterpillar_height
            self.caterpillar_height = caterpillar_height
        else:
            self.caterpillar_length = -1
            self.caterpillar_height = -1

        assert dim_model % nb_heads == 0

        self.embedding = nn.Sequential(
            CacheWrapper(nn.Embedding(vocabulary_size, dim_model), nn.Dropout(dropout)),
            AddPositionalEncoding(len_max),
        )

        trunk_blocks = []

        def attlayer():
            if attention_layer == "mha":
                return QKVAttention(
                    dim_model=dim_model,
                    dim_qk=dim_keys,
                    dim_v=dim_model // nb_heads,
                    nb_heads=nb_heads,
                    causal=causal,
                    attention_dropout=dropout,
                    logger=logger,
                    **kwargs,
                )
            elif attention_layer == "dumbrec":
                return DumbRec(
                    dim_model=dim_model,
                    dim_qk=dim_keys,
                    dim_v=dim_model // nb_heads,
                    nb_heads=nb_heads,
                    nb_lines=nb_lines,
                    attention_dropout=dropout,
                    logger=logger,
                    **kwargs,
                )
            elif attention_layer == "kvrec":
                return KVRec(
                    dim_model=dim_model,
                    dim_qk=dim_keys,
                    dim_v=dim_model // nb_heads,
                    nb_heads=nb_heads,
                    nb_lines=nb_lines,
                    attention_dropout=dropout,
                    logger=logger,
                    **kwargs,
                )
            elif attention_layer == "caterpillar":
                return Caterpillar(
                    dim_model=dim_model,
                    dim_qk=dim_keys,
                    dim_v=dim_model // nb_heads,
                    nb_heads=nb_heads,
                    caterpillar_length=self.caterpillar_length,
                    caterpillar_height=self.caterpillar_height,
                    attention_dropout=dropout,
                    logger=logger,
                    **kwargs,
                )
            else:
                raise ValueError(f"Unknown attention type {attention_layer}.")

        for b in range(nb_blocks):
            trunk_blocks += [
                WithResidual(
                    CacheWrapper(nn.LayerNorm((dim_model,))),
                    attlayer(),
                ),
                WithResidual(
                    CacheWrapper(
                        nn.LayerNorm((dim_model,)),
                        nn.Linear(in_features=dim_model, out_features=dim_hidden),
                        nn.ReLU(),
                        nn.Linear(in_features=dim_hidden, out_features=dim_model),
                        nn.Dropout(dropout),
                    ),
                ),
            ]

        self.trunk = nn.Sequential(*trunk_blocks)

        self.readout = CacheWrapper(
            nn.Linear(in_features=dim_model, out_features=vocabulary_size)
        )

        with torch.no_grad():
            for m in self.modules():
                if isinstance(m, nn.Embedding):
                    m.weight.normal_(mean=0, std=2e-2)
                elif isinstance(m, nn.LayerNorm):
                    m.bias.zero_()
                    m.weight.fill_(1.0)

        self.reset_inner_loss()

    def forward(self, bs):
        bs = BracketedSequence(F.pad(bs.x, (1, -1)), bs.first, bs.nb, bs.init_cache)

        # To make the code simpler in the Caterpillar layer, we pad
        # here. It is unclear how much it hurts computationally by
        # increasing the sequence length for the other layers

        if self.caterpillar_length > 0:
            original_nb = bs.nb
            if bs.nb % self.caterpillar_length > 0:
                bs.nb += self.caterpillar_length - bs.nb % self.caterpillar_length

            bs = BracketedSequence(
                F.pad(bs.x, (self.caterpillar_length, self.caterpillar_length)),
                bs.first + self.caterpillar_length,
                bs.nb,
                bs.init_cache,
            )

        bs = self.embedding(bs)
        bs = self.trunk(bs)
        bs = self.readout(bs)

        if self.caterpillar_length > 0:
            bs = BracketedSequence(
                F.pad(bs.x, (0, 0, -self.caterpillar_length, -self.caterpillar_length)),
                bs.first - self.caterpillar_length,
                original_nb,
                bs.init_cache,
            )

        return bs

    # ar_mask is a tensor of 0s and 1s, of the same shape as the input,
    # with 1s where tokens should be generated. The other tokens are
    # kept unchanged.

    def masked_inplace_autoregression(
        self,
        input_src,
        ar_mask_src,
        forbidden_tokens=None,
        deterministic_synthesis=False,
    ):
        input = input_src.to(self.readout.f.weight.device)
        ar_mask = ar_mask_src.to(self.readout.f.weight.device)
        to_generate = (ar_mask.sum(0) > 0).nonzero()
        if to_generate.min() > 0:
            self(
                BracketedSequence(input, 0, to_generate.min(), True)
            )  # Needed to initialize the model's cache
        for s in range(to_generate.min(), to_generate.max() + 1):
            output = self(BracketedSequence(input, s, 1, s == 0)).x
            logits = output[:, s]
            if forbidden_tokens is not None:
                logits = logits.masked_fill(forbidden_tokens, float("-inf"))
            if deterministic_synthesis:
                t_next = logits.argmax(1)
            else:
                dist = torch.distributions.categorical.Categorical(logits=logits)
                t_next = dist.sample()
            input[:, s] = ar_mask[:, s] * t_next + (1 - ar_mask[:, s]) * input[:, s]

        input_src.copy_(input)

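    # A usage sketch (illustrative only; `model`, `prompt` and
    # `prompt_length` are hypothetical): keep the first prompt_length
    # tokens of each sequence and re-generate the rest in place.
    #
    #   input = torch.zeros(2, 16, dtype=torch.int64)
    #   input[:, :prompt_length] = prompt
    #   ar_mask = torch.ones(input.size(), dtype=torch.int64)
    #   ar_mask[:, :prompt_length] = 0
    #   model.masked_inplace_autoregression(input, ar_mask)
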
    def reset_inner_loss(self):
        for m in self.modules():
            if m is not self and hasattr(m, "reset_inner_loss"):
                m.reset_inner_loss()

    def get_inner_loss(self):
        l = torch.tensor([0.0], device=self.readout.f.weight.device)
        for m in self.modules():
            if m is not self and hasattr(m, "get_inner_loss"):
                l += m.get_inner_loss()
        return l

    def record_attention(self, v=True):
        for m in self.modules():
            if isinstance(m, QKVAttention):
                m.record_attention = v

    def retrieve_attention(self):
        a = []
        for m in self.modules():
            if isinstance(m, QKVAttention):
                a.append(m.a)
        return a


######################################################################

if __name__ == "__main__":
    print("Basic check.")

    m = Caterpillar(
        dim_model=4,
        dim_qk=3,
        dim_v=7,
        nb_heads=1,
        caterpillar_length=7,
        caterpillar_height=3,
        attention_dropout=0.0,
    )

    m.reset_inner_loss()
    x = torch.randn(1, 21 + 2 * 7, 4)
    y1 = m(BracketedSequence(x, first=7, nb=21, init_cache=True)).x[:, 7:28]
    y2 = m(BracketedSequence(x, first=7, nb=21, init_cache=True)).x[:, 7:28]
    y3a = m(BracketedSequence(x, first=7, nb=14, init_cache=True)).x[:, 7:21]
    y3b = m(BracketedSequence(x, first=21, nb=7, init_cache=False)).x[:, 21:28]
    print((y1 - y2).abs().max())
    print((y1 - torch.cat([y3a, y3b], dim=1)).abs().max())
    exit(0)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    vocabulary_size = 128
    x = torch.randint(vocabulary_size, (6, 1024))

    model = MyGPT(
        vocabulary_size=vocabulary_size,
        dim_model=512,
        dim_keys=64,
        dim_hidden=2048,
        nb_heads=8,
        nb_lines=128,
        nb_blocks=12,
        dropout=0.1,
        causal=True,
    )

    x = x.to(device)
    model.to(device)

    import time, sys

    # import torchvision.models as models
    # from torch.profiler import profile, record_function, ProfilerActivity

    # with profile(activities=[ProfilerActivity.CPU,  ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof:
    # with record_function("model_inference"):

    model.eval()
    for i in range(3):
        start_time = time.perf_counter()
        for k in range(10):
            model(BracketedSequence(x))
        duration = time.perf_counter() - start_time
        print(duration)
        sys.stdout.flush()

    # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
    # print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

    # print("##############################################################")
    # y2 = torch.randn_like(y1)
    # for s in range(x.size(1)):
    # z = model(BracketedSequence(x, s, 1))
    # y2[:, s : s + 1] = z.slice()

    # print(f"error={((y1 - y2).norm() / (y1.norm() + y2.norm())).item()}")

######################################################################