Update.

author François Fleuret <francois@fleuret.org>

Mon, 25 Mar 2024 08:59:19 +0000 (09:59 +0100)

committer François Fleuret <francois@fleuret.org>

Mon, 25 Mar 2024 08:59:19 +0000 (09:59 +0100)
author François Fleuret <francois@fleuret.org>
Mon, 25 Mar 2024 08:59:19 +0000 (09:59 +0100)
committer François Fleuret <francois@fleuret.org>
Mon, 25 Mar 2024 08:59:19 +0000 (09:59 +0100)
diff --git a/mygpt.py b/mygpt.py

index 0cf70e0..77c29ce 100755 (executable)
--- a/mygpt.py
+++ b/mygpt.py
@@ -275,7 +275,12 @@ class MyGPT(nn.Module):
      # unchanged.
  
      def masked_inplace_autoregression(
      # unchanged.
  
      def masked_inplace_autoregression(
-        self, input, ar_mask, forbidden_tokens=None, deterministic_synthesis=False
+        self,
+        input,
+        ar_mask,
+        deterministic_synthesis=False,
+        forbidden_tokens=None,
+        forced_biases=None,
      ):
          to_generate = (ar_mask.sum(0) > 0).nonzero()
          if to_generate.min() > 0:
      ):
          to_generate = (ar_mask.sum(0) > 0).nonzero()
          if to_generate.min() > 0:
@@ -287,6 +292,8 @@ class MyGPT(nn.Module):
              logits = output[:, s]
              if forbidden_tokens is not None:
                  logits = logits.masked_fill(forbidden_tokens, float("-inf"))
              logits = output[:, s]
              if forbidden_tokens is not None:
                  logits = logits.masked_fill(forbidden_tokens, float("-inf"))
+            if forced_biases is not None:
+                logits = logits + forced_biases[None, :]
              if deterministic_synthesis:
                  t_next = logits.argmax(1)
              else:
              if deterministic_synthesis:
                  t_next = logits.argmax(1)
              else:
diff --git a/tasks.py b/tasks.py

index 5153836..6b6b8f2 100755 (executable)
--- a/tasks.py
+++ b/tasks.py
@@ -27,6 +27,7 @@ def masked_inplace_autoregression(
      ar_mask,
      deterministic_synthesis,
      forbidden_tokens=None,
      ar_mask,
      deterministic_synthesis,
      forbidden_tokens=None,
+    logit_biases=None,
      progress_bar_desc="autoregression",
      device=torch.device("cpu"),
  ):
      progress_bar_desc="autoregression",
      device=torch.device("cpu"),
  ):
@@ -48,7 +49,11 @@ def masked_inplace_autoregression(
  
          for input, ar_mask in batches:
              model.masked_inplace_autoregression(
  
          for input, ar_mask in batches:
              model.masked_inplace_autoregression(
-                input, ar_mask, forbidden_tokens, deterministic_synthesis
+                input,
+                ar_mask,
+                deterministic_synthesis,
+                forbidden_tokens,
+                logit_biases,
              )
  
          model.train(t)
              )
  
          model.train(t)
@@ -1917,9 +1922,12 @@ class Escape(Task):
          t = torch.arange(result.size(1), device=result.device)[None, :]
  
          state_len = self.height * self.width
          t = torch.arange(result.size(1), device=result.device)[None, :]
  
          state_len = self.height * self.width
+        index_action = state_len
+        index_reward = state_len + 1
+        index_lookahead_reward = state_len + 2
          it_len = state_len + 3  # state / action / reward / lookahead_reward
  
          it_len = state_len + 3  # state / action / reward / lookahead_reward
  
-        def ar(result, ar_mask):
+        def ar(result, ar_mask, logit_biases=None):
              ar_mask = ar_mask.expand_as(result)
              result *= 1 - ar_mask
              masked_inplace_autoregression(
              ar_mask = ar_mask.expand_as(result)
              result *= 1 - ar_mask
              masked_inplace_autoregression(
@@ -1927,47 +1935,36 @@ class Escape(Task):
                  self.batch_size,
                  result,
                  ar_mask,
                  self.batch_size,
                  result,
                  ar_mask,
-                deterministic_synthesis,
+                deterministic_synthesis=deterministic_synthesis,
+                logit_biases=logit_biases,
                  device=self.device,
                  progress_bar_desc=None,
              )
  
          # Generate iteration after iteration
  
                  device=self.device,
                  progress_bar_desc=None,
              )
  
          # Generate iteration after iteration
  
+        optimistic_bias = result.new_zeros(self.nb_codes, device=result.device)
+        optimistic_bias[(-1) + escape.first_lookahead_rewards_code + 1] = math.log(1e-1)
+        optimistic_bias[(1) + escape.first_lookahead_rewards_code + 1] = math.log(1e1)
+
          for u in tqdm.tqdm(
              range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"
          ):
          for u in tqdm.tqdm(
              range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"
          ):
-            # Put the lookahead reward to either 0 or -1 for the
-            # current iteration, with a proba that depends with the
-            # sequence index, so that we have diverse examples, sample
-            # the next state
-            s = -(
-                torch.rand(result.size(0), device=result.device)
-                <= torch.linspace(0, 1, result.size(0), device=result.device)
-            ).long()
-            result[:, u - 1] = s + 1 + escape.first_lookahead_rewards_code
+            # Generate the lookahead_reward pessimistically
+            ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
+            ar(result, ar_mask, logit_biases=-optimistic_bias)
+
+            # Generate the state
              ar_mask = (t >= u).long() * (t < u + state_len).long()
              ar(result, ar_mask)
  
              ar_mask = (t >= u).long() * (t < u + state_len).long()
              ar(result, ar_mask)
  
-            # Put the lookahead reward to +1 for the current
-            # iteration, sample the action and reward
-            s = 1
-            result[:, u - 1] = s + 1 + escape.first_lookahead_rewards_code
-            ar_mask = (t >= u + state_len).long() * (t < u + state_len + 2).long()
-            ar(result, ar_mask)
+            # Generate the lookahead_reward optimistically
+            ar_mask = (t < u).long() * (t % it_len == index_lookahead_reward).long()
+            ar(result, ar_mask, logit_biases=optimistic_bias)
  
  
-            # Fix the previous lookahead rewards in a consistant state
-            for v in range(0, u, it_len):
-                # Extract the rewards
-                r = result[:, range(v + state_len + 1 + it_len, u + it_len - 1, it_len)]
-                r = r - escape.first_rewards_code - 1
-                r = r.clamp(min=-1, max=1)  # the reward is predicted hence can be weird
-                a = r.min(dim=1).values
-                b = r.max(dim=1).values
-                s = (a < 0).long() * a + (a >= 0).long() * b
-                result[:, v + state_len + 2] = (
-                    s + 1 + escape.first_lookahead_rewards_code
-                )
+            # Generate the action and reward
+            ar_mask = (t >= u + index_action).long() * (t <= u + index_reward).long()
+            ar(result, ar_mask)
  
          # Saving the generated sequences
  
  
          # Saving the generated sequences
author	François Fleuret <francois@fleuret.org>
	Mon, 25 Mar 2024 08:59:19 +0000 (09:59 +0100)
committer	François Fleuret <francois@fleuret.org>
	Mon, 25 Mar 2024 08:59:19 +0000 (09:59 +0100)
mygpt.py		patch | blob | history
tasks.py		patch | blob | history