From 3e4af6d54fb3d7bd6794035cb79e30ecdcadeb6f Mon Sep 17 00:00:00 2001
From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= <francois@fleuret.org>
Date: Wed, 10 Jan 2024 08:46:29 +0100
Subject: [PATCH] Update.

---
 fridge   | 41 +++++++++++++++++++++++++++++++++++++++++
 mygpt.py | 39 ++-------------------------------------
 2 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/fridge b/fridge
index bb6f46e..dcaac19 100644
--- a/fridge
+++ b/fridge
@@ -125,3 +125,44 @@ def insert_flash_back(rec_V, V, rec_K, K, t0, t1, CL, proba):
         # That was a bad idea
         # G = F.dropout(G, self.attention_dropout, self.training)
 
+
+######################################################################
+
+2024 Jan 10 08:46:13 (from mygpt.py)
+
+        #################################################################
+        # Flashbacks. This version sucks, about to replace it.
+        if self.training and self.proba_flashback > 0.0:
+            warnings.warn("flash back", RuntimeWarning)
+            # This piece of code makes the assumption that there is
+            # nothing informative before t0, otherwise we'd have to
+            # implement a cache for V and K too. This should not be
+            # too much of a problem since this is used only during
+            # training, where full sequences are available.
+
+            n = torch.arange(N, device=X.device)[:, None, None, None]
+            t = torch.arange(t0, t1, device=X.device)[None, None, :, None]
+            dv = torch.arange(DV, device=X.device)[None, None, None, :]
+            dk = torch.arange(DK, device=X.device)[None, None, None, :]
+
+            u = (
+                torch.rand(N, CH, t1 - t0, 1, device=X.device).mul(t).long() // CL
+            ) * CL
+
+            src_time = t - u - t0
+            src_head = torch.randint(H, (N, CH, t1 - t0, 1), device=X.device)
+
+            mask = (
+                torch.rand(N, CH, t1 - t0, DV, device=X.device) <= self.proba_flashback
+            ).long()
+
+            self.rec_V[:, :, t0:t1] = (
+                mask * V[n, src_head, src_time, dv]
+                + (1 - mask) * self.rec_V[:, :, t0:t1]
+            )
+
+            self.rec_K[:, :, t0:t1] = (
+                mask * K[n, src_head, src_time, dk]
+                + (1 - mask) * self.rec_K[:, :, t0:t1]
+            )
+
diff --git a/mygpt.py b/mygpt.py
index ed4b2a7..d8fd227 100755
--- a/mygpt.py
+++ b/mygpt.py
@@ -483,7 +483,6 @@ class Caterpillar(nn.Module):
         self.caterpillar_height = caterpillar_height
         self.attention_dropout = attention_dropout
 
-        self.proba_flashback = 0.0
         self.proba_gate_dropout = 0.0
 
         self.w_G = randw(nb_heads, caterpillar_height, dim_model)
@@ -572,6 +571,8 @@ class Caterpillar(nn.Module):
         init_rec_V = self.rec_V[:, :, t0 - CL : t0]
         init_rec_K = self.rec_K[:, :, t0 - CL : t0]
 
+        ######################################################################
+
         if self.training and self.proba_gate_dropout > 0.0:
             warnings.warn("gate dropout", RuntimeWarning)
             epsilon = 0.5
@@ -595,42 +596,6 @@ class Caterpillar(nn.Module):
         self.rec_V[:, :, t0:t1] = next_V.flatten(2, 3)
         self.rec_K[:, :, t0:t1] = next_K.flatten(2, 3)
 
-        #################################################################
-
-        if self.training and self.proba_flashback > 0.0:
-            warnings.warn("flash back", RuntimeWarning)
-            # This piece of code makes the assumption that there is
-            # nothing informative before t0, otherwise we'd have to
-            # implement a cache for V and K too. This should not be
-            # too much of a problem since this is used only during
-            # train, where full sequence are available
-
-            n = torch.arange(N, device=X.device)[:, None, None, None]
-            t = torch.arange(t0, t1, device=X.device)[None, None, :, None]
-            dv = torch.arange(DV, device=X.device)[None, None, None, :]
-            dk = torch.arange(DK, device=X.device)[None, None, None, :]
-
-            u = (
-                torch.rand(N, CH, t1 - t0, 1, device=X.device).mul(t).long() // CL
-            ) * CL
-
-            src_time = t - u - t0
-            src_head = torch.randint(H, (N, CH, t1 - t0, 1), device=X.device)
-
-            mask = (
-                torch.rand(N, CH, t1 - t0, DV, device=X.device) <= self.proba_flashback
-            ).long()
-
-            self.rec_V[:, :, t0:t1] = (
-                mask * V[n, src_head, src_time, dv]
-                + (1 - mask) * self.rec_V[:, :, t0:t1]
-            )
-
-            self.rec_K[:, :, t0:t1] = (
-                mask * K[n, src_head, src_time, dk]
-                + (1 - mask) * self.rec_K[:, :, t0:t1]
-            )
-
         ######################################################################
         # compute the readout
 
-- 
2.39.5