######################################################################

2024 Jan 07 21:37:48 (from mygpt.py)


# This is one order of magnitude more complicated than I expected, not
# elegant, slow, hopefully not buggy


def flash_back_time_src(N, H, t0, t1, CL, CH, proba, device):
    """Sample, for every position, where a "flash back" copies from.

    A flash back starts at a position with probability `proba`, lasts CL
    consecutive steps, and never starts in the first or last CL steps of
    the window. Starts that have another start within the CL preceding
    steps are discarded.

    Returns a pair of long tensors of shape (N, CH, t1 - t0):
      * src_time: source time index (relative to the window) for each
        position inside a flash back, -1 everywhere else
      * src_head: head index drawn in [0, H) for each flash back, held
        constant over its CL steps, 0 everywhere else
    """

    def windowed_sum(x, width):
        # Sum of x over the sliding window of the last `width` steps,
        # computed as the cumulated sum of x minus its shifted copy
        y = x.clone()
        y[:, :, width:] -= x[:, :, :-width]
        return y.cumsum(dim=2)

    # Candidate starting points, none in the first and last CL steps
    starts = (torch.rand(N, CH, t1 - t0, device=device) <= proba).long()
    starts[:, :, -CL:] = 0
    starts[:, :, :CL] = 0

    # Keep a start only if it is the single one among itself and the
    # CL steps before it, so that flash backs never overlap
    starts = starts * (windowed_sum(starts, CL + 1) == 1)

    # Origin of the chunk to copy: the current time minus a random
    # non-zero multiple of CL, to stay consistent with the "rolling"
    # caterpillar
    steps = torch.arange(starts.size(2), device=starts.device)[None, None, :]
    noise = torch.rand(starts.size(), device=starts.device)
    nb_jumps = 1 + (noise * (steps // CL - 1)).long()
    origin = starts * (steps - CL * nb_jumps)

    # Hold the origin over the CL steps of each flash back
    src_time = windowed_sum(origin, CL)

    # Same for a source head picked at random for each flash back
    picked_head = starts * torch.randint(H, starts.size(), device=starts.device)
    src_head = windowed_sum(picked_head, CL)

    # Offset inside the flash back: the double cumulated sum counts
    # 1..CL along a flash back and goes back to 0 right after it, hence
    # 0..CL-1 inside and -1 outside once 1 is subtracted
    ramp = windowed_sum(starts, CL)
    ramp[:, :, CL:] -= CL * starts[:, :, :-CL]
    src_time += ramp.cumsum(dim=2) - 1

    return src_time, src_head


def insert_flash_back(rec_V, V, rec_K, K, t0, t1, CL, proba):
    """Overwrite in place parts of rec_V / rec_K with flash backs.

    Source times and heads are sampled once with flash_back_time_src
    and shared by the values and the keys. Wherever the sampled source
    time is non-negative, rec_*[:, :, t0:t1] receives the corresponding
    entry of the t0:t1 slice of V / K; elsewhere it is left unchanged.

    The indexing below requires V and K to be 4-d, with the batch on
    axis 0, the heads on axis 1 and the time on axis 2.
    """
    N, H, CH = V.size(0), V.size(1), rec_V.size(1)

    src_t, src_h = flash_back_time_src(N, H, t0, t1, CL, CH, proba, rec_V.device)

    # Trailing singleton axis to broadcast over the feature dimension
    src_t = src_t[:, :, :, None]
    src_h = src_h[:, :, :, None]

    inside = src_t >= 0
    outside = src_t < 0
    # Negative times are clamped only to keep the gather valid, the
    # gathered values at these positions are masked out below
    safe_t = src_t.clamp(min=0)

    for rec, src in ((rec_V, V), (rec_K, K)):
        batch_idx = torch.arange(src.size(0), device=src.device)[:, None, None, None]
        feat_idx = torch.arange(src.size(3), device=src.device)[None, None, None, :]
        copied = src[:, :, t0:t1][batch_idx, src_h, safe_t, feat_idx]
        rec[:, :, t0:t1] = copied * inside + rec[:, :, t0:t1] * outside


######################################################################

######################################################################

2024 Jan 07 21:38:11 (from mygpt.py)

            # insert_flash_back(self.rec_V,V,self.rec_K,K,t0,t1,CL,proba=self.proba_flashback / CL,)


######################################################################

2024 Jan 09 14:24:42 (from mygpt.py)

            # This piece of code makes the assumption that there is
            # nothing informative before t0, otherwise we'd have to
            # implement a cache for V and K too. This should not be
            # too much of a problem since this is used only during
            # training, where full sequences are available

            # n = torch.arange(N, device=X.device)[:, None, None, None]
            # t = torch.arange(t0, t1, device=X.device)[None, None, :, None]
            # dv = torch.arange(DV, device=X.device)[None, None, None, :]
            # dk = torch.arange(DK, device=X.device)[None, None, None, :]

            # u = (
                # torch.rand(N, CH, t1 - t0, 1, device=X.device).mul(t).long() // CL
            # ) * CL

            # src_time = t - u - t0
            # src_head = torch.randint(H, (N, CH, t1 - t0, 1), device=X.device)

            # mask = (
                # torch.rand(N, CH, t1 - t0, DV, device=X.device) <= self.proba_flashback
            # ).long()

            # self.rec_V[:, :, t0:t1] = (
                # mask * V[n, src_head, src_time, dv]
                # + (1 - mask) * self.rec_V[:, :, t0:t1]
            # )

            # self.rec_K[:, :, t0:t1] = (
                # mask * K[n, src_head, src_time, dk]
                # + (1 - mask) * self.rec_K[:, :, t0:t1]
            # )

######################################################################

2024 Jan 10 08:10:39 (from mygpt.py)

        # That was a bad idea
        # G = F.dropout(G, self.attention_dropout, self.training)


######################################################################

2024 Jan 10 08:46:13 (from mygpt.py)

        #################################################################
        # Flashbacks. This version sucks, about to replace it 
        # Training-time flashback: each entry of rec_V / rec_K in [t0, t1)
        # is replaced, with probability proba_flashback, by the entry of
        # V / K of a random head, taken at a time shifted back by a random
        # multiple of CL.
        if self.training and self.proba_flashback > 0.0:
            warnings.warn("flash back", RuntimeWarning)
            # This piece of code makes the assumption that there is
            # nothing informative before t0, otherwise we'd have to
            # implement a cache for V and K too. This should not be
            # too much of a problem since this is used only during
            # training, where full sequences are available

            # Index grids, shaped to broadcast against each other in the
            # advanced indexing of V and K below
            n = torch.arange(N, device=X.device)[:, None, None, None]
            t = torch.arange(t0, t1, device=X.device)[None, None, :, None]
            dv = torch.arange(DV, device=X.device)[None, None, None, :]
            dk = torch.arange(DK, device=X.device)[None, None, None, :]

            # Random backward shift: a uniform fraction of t, rounded
            # down to a multiple of CL
            u = (
                torch.rand(N, CH, t1 - t0, 1, device=X.device).mul(t).long() // CL
            ) * CL

            # Source time relative to t0, and a source head drawn
            # uniformly in [0, H)
            src_time = t - u - t0
            src_head = torch.randint(H, (N, CH, t1 - t0, 1), device=X.device)

            # 1 where the entry is replaced. The mask is drawn
            # independently for every feature of every time step.
            # NOTE(review): its last dimension is DV but it is also
            # applied to K, whose last index runs over DK -- this looks
            # like it requires DV == DK, to be confirmed
            mask = (
                torch.rand(N, CH, t1 - t0, DV, device=X.device) <= self.proba_flashback
            ).long()

            self.rec_V[:, :, t0:t1] = (
                mask * V[n, src_head, src_time, dv]
                + (1 - mask) * self.rec_V[:, :, t0:t1]
            )

            self.rec_K[:, :, t0:t1] = (
                mask * K[n, src_head, src_time, dk]
                + (1 - mask) * self.rec_K[:, :, t0:t1]
            )


######################################################################

2024 Jan 13 13:38:31 (from mygpt.py)

        # Debugging aid: print (1 - sigmoid(b_G)) ** T and terminate the
        # program right away
        g= F.sigmoid(self.b_G)
        a=1-g

        print(f"\n\nSANITY {a**T}\n")
        exit(0)


######################################################################

2024 Jan 14 13:39:37 (from mygpt.py)

            # The gate is forced to 1 - epsilon at the "head" position
            epsilon = 0.5

            # One-hot mask along the last axis: the sort indices of
            # uniform noise form a random permutation, so exactly one
            # position per (n, h) has index 0. It is then replicated
            # to the shape of G
            dropout_head = (
                (torch.rand(N, H, 1, t1 - t0, device=G.device).sort(dim=3).indices == 0)
                .expand_as(G)
                .float()
            )

            # 1 at every position strictly after the head, 0 elsewhere
            dropout_tail = dropout_head.cumsum(dim=3) - dropout_head

            # Per-sample switch, on with probability proba_gate_dropout
            dropout_active = (
                torch.rand(N, 1, 1, 1, device=G.device) < self.proba_gate_dropout
            ).long()

            dropout_head *= dropout_active
            dropout_tail *= dropout_active

            # Change the value of G to 1 - epsilon at the head and to 0
            # on the tail. The corrections use G.detach(), so they do
            # not contribute any gradient of their own
            G = (
                G
                + dropout_head * (1 - epsilon - G.detach())
                - dropout_tail * G.detach()
            )

######################################################################

2024 Jan 18 07:39:29 (from mygpt.py)

class Calibrator:
    """Running per-feature mean / standard deviation of batches of rows.

    `update` accumulates the sum and the sum of squares over the first
    axis of the tensors it is given. `normalize` uses the resulting
    moments to shift the optional bias `b` and rescale the optional
    weight `w` in place, then resets the accumulators.
    """

    def __init__(self, w=None, b=None):
        # Optional tensors modified in place by normalize()
        self.w = w
        self.b = b
        # Running sum, sum of squares, and number of rows seen
        self.s, self.s_sq, self.n = 0, 0, 0
        # Moments computed at the previous call to normalize()
        self.mean, self.std = 0, 0

    def update(self, X):
        """Accumulate the rows of X (reduced over axis 0), without grad."""
        X = X.detach()
        self.s += X.sum(dim=0)
        self.s_sq += X.pow(2).sum(dim=0)
        self.n += X.size(0)

    def moments(self):
        """Return (mean, std) of everything accumulated since the last reset.

        Raises ZeroDivisionError if nothing was accumulated.
        """
        mean = self.s / self.n
        # E[x^2] - E[x]^2 can come out slightly negative because of
        # rounding when the data is (nearly) constant, which would make
        # the square root NaN, hence the clamp
        var = (self.s_sq / self.n - mean * mean).clamp(min=0)
        return mean, var.sqrt()

    def normalize(self):
        """Apply the current moments to b and w in place and reset.

        Subtracts the mean from `b` and divides `w` by the standard
        deviation, when they are not None. Returns the change of
        (mean, std) with respect to the previous call.
        """
        mean, std = self.moments()
        if self.b is not None:
            self.b.sub_(mean)
        if self.w is not None:
            self.w.div_(std)
        result = mean - self.mean, std - self.std
        self.mean, self.std = mean, std
        self.s, self.s_sq, self.n = 0, 0, 0
        return result


######################################################################

2024 Jan 18 07:39:34 (from mygpt.py)

        # self.calibrator_G = Calibrator()
        # self.calibrator_rec_V = Calibrator()
        # self.calibrator_rec_K = Calibrator()


######################################################################

2024 Jan 18 07:39:37 (from mygpt.py)

        # self.calibrator_G.update(G.reshape(-1, G.size(-1)))


######################################################################

2024 Jan 18 07:39:42 (from mygpt.py)

        # self.calibrator_rec_V.update(
        # next_V.permute(0, 1, 3, 2).reshape(-1, next_V.size(2))
        # )
        # self.calibrator_rec_K.update(
        # next_K.permute(0, 1, 3, 2).reshape(-1, next_K.size(2))
        # )


######################################################################

2024 Jan 18 07:47:12 (from mygpt.py)

        ######################################################################
        # Roll the gating indexes

        # warnings.warn("rotating barrel", RuntimeWarning)

        # r_barrel = torch.arange(R, device=G.device)[None, None, :, None]
        # t_barrel = torch.arange(t1 - t0, device=G.device)[None, None, None, :]
        # r_barrel = (r_barrel + (t_barrel + t0) // L) % R
        # G = G.gather(dim=2, index=r_barrel.expand_as(G))


######################################################################

2024 Jan 18 07:47:25 (from mygpt.py)

        # warnings.warn("harmonic recurrence", RuntimeWarning)
        # har = torch.arange(t0, t1, device = G.device).float() + 1
        # A = har / (har + 1)
        # G = G / har


######################################################################

2024 Jan 18 08:46:18 (from mygpt.py)

        # warnings.warn("softmax gating", RuntimeWarning)

        # G = (
        # torch.einsum("ntc,hrc->nhrt", X, self.w_G) + self.b_G[None, :, :, None]
        # ).softmax(dim=2)

######################################################################

2024 Jan 21 16:55:24 (from main.py)

        # NOTE(review): unfinished snippet, not valid Python as written:
        # the first `for` lacks its `in`, the next line ends with a stray
        # `]`, and the final `if` has no body. Nothing is written to `f`
        # yet -- TODO complete or delete.
        with open("test.dat", "a") as f:
            for m filter(lambda m: isinstance(m,mygpt.Catenn.Linear),model.modules()):
                for p in m.parameters() ]


        for m in model.modules():
            if isinstance(m, mygpt.Caterpillar):
                

######################################################################

2024 Feb 13 22:53:52 (from mygpt.py)

        ######################################################################
        # Prepare the keys

        # NOTE(review): this first assignment is dead, k_star is
        # reassigned right after the warning
        k_star = self.k_star[:, None, :].expand(-1, t1 - t0, -1)

        warnings.warn("rotating key barrel", RuntimeWarning)
        # Replicate every key along a new axis 1 of size x_q.size(1)
        k_star = self.k_star[:, None, :].expand(-1, x_q.size(1), -1)
        # Time indexes t0..t1-1, repeated for every row of k_star
        t_barrel = torch.arange(t0, t1, device=k_star.device)
        t_barrel = t_barrel[None, :].expand(k_star.size(0), t1 - t0)
        # Row index shifted by the time index, modulo the number of rows
        l_barrel = (
            torch.arange(k_star.size(0), device=k_star.device)[:, None] + t_barrel
        ) % k_star.size(0)
        # Row l at time t receives the key of row (l + t) % nb_rows.
        # NOTE(review): t_barrel also indexes axis 1, which presumably
        # requires t1 <= x_q.size(1) -- to be confirmed
        k_star = k_star[l_barrel, t_barrel]


######################################################################

2024 Feb 15 23:10:50 (from main.py)


def add_memex_v4(batches, memex_proba, marker_token, caterpillar_length=None):
    """Yield the batches, some of them preceded by a "memex" version.

    For each 2-d batch of shape (B, T), with probability `memex_proba`
    an additional batch of shape (B, 2T) is yielded first. Each of its
    rows is made of
      * the first u0 tokens of the original row, u0 uniform in [0, T)
      * a gap filled with `marker_token`, of random length between 1 and
        `caterpillar_length`
      * a copy of the original row from its beginning, truncated at 2T
      * `marker_token` until the end, if there is room left
    The original batch is always yielded, unchanged.

    Args:
        batches: iterable of 2-d integer tensors.
        memex_proba: probability to generate the additional batch.
        marker_token: token used to fill the positions without source.
        caterpillar_length: maximum length of the gap. When None, it is
            computed from the module-level `args` as
            args.nb_lines // args.caterpillar_height.
    """
    for batch in batches:
        if torch.rand(1).item() < memex_proba:
            nb_rows, length = batch.size(0), batch.size(1)

            # Resolved lazily so that the global args is only needed
            # when a memex batch is actually generated
            if caterpillar_length is None:
                caterpillar_length = args.nb_lines // args.caterpillar_height

            t = (
                torch.arange(2 * length, device=batch.device)[None, :]
                .expand(nb_rows, -1)
                .clone()
            )

            # u0 is where the prefix stops, u1 where the copy starts
            u0 = torch.randint(length, (nb_rows, 1), device=batch.device)
            u1 = (
                u0
                + torch.randint(caterpillar_length, (nb_rows, 1), device=batch.device)
                + 1
            )

            # m0 selects the prefix, m1 the copy of the full row
            m0 = (t < u0).long()
            m1 = (t >= u1).long() * (t < u1 + length).long()

            # Source index of every position, -1 where there is none
            t = t * m0 + ((-1) * (1 - m0) * (1 - m1)) + (t - u1) * m1
            m = (t < 0).long()
            n = torch.arange(nb_rows, device=batch.device)[:, None].expand(
                -1, t.size(1)
            )

            # The clamp only keeps the gather valid, positions without
            # source are overwritten with the marker
            new_input = batch[n, t.clamp(min=0)]
            new_input = (1 - m) * new_input + m * (marker_token)

            yield new_input

        yield batch


######################################################################

2024 Feb 16 17:07:48 (from main.py)

                # ||gn + lambda * gm|| = max(||gn||,||gm||)
                # ||gn||^2 + 2*lambda<gn,gm> + lambda^2||gm||^2 = max(||gn||^2,||gm||^2)
                # A = ||gm||^2 B = 2<gn,gm> C = ||gn||^2 - max(||gn||^2, ||gm||^2)

######################################################################

2024 Feb 16 17:07:51 (from main.py)

                # A,B,C = gmgm, 2*gngm, gngn - max(gngn,gmgm)
                # Delta = B*B - 4*A*C
                # if(Delta >= 0):
                    # l = ( -B - sqrt(Delta))/(2*A)
                # ||gn||+l*rho*||gm|| = max(||gn||,rho*||gm||)