From e39282eef52a7f5ab6654b999009127569b1b599 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= <francois@fleuret.org>
Date: Sun, 24 Mar 2024 18:15:08 +0100
Subject: [PATCH] Update.

---
 tasks.py | 54 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/tasks.py b/tasks.py
index 38c85ed..829eb24 100755
--- a/tasks.py
+++ b/tasks.py
@@ -1913,11 +1913,14 @@ class Escape(Task):
         self, n_epoch, model, result_dir, logger, deterministic_synthesis, nmax=1000
     ):
         result = self.test_input[:100].clone()
-        t = torch.arange(result.size(1), device=result.device)
+        t = torch.arange(result.size(1), device=result.device)[None, :]
+
         state_len = self.height * self.width
-        iteration_len = state_len + 3
+        it_len = state_len + 3  # state / action / reward / lookahead_reward
 
-        def ar():
+        def ar(result, ar_mask):
+            ar_mask = ar_mask.expand_as(result)
+            result *= 1 - ar_mask
             masked_inplace_autoregression(
                 model,
                 self.batch_size,
@@ -1925,26 +1928,39 @@ class Escape(Task):
                 ar_mask,
                 deterministic_synthesis,
                 device=self.device,
+                progress_bar_desc=None,
             )
 
-        for u in range(
-            iteration_len, result.size(1) - iteration_len + 1, iteration_len
+        # Generate iteration after iteration
+
+        for u in tqdm.tqdm(
+            range(it_len, result.size(1) - it_len + 1, it_len), desc="thinking"
         ):
-            # Put a lookahead reward to -1, sample the next state
-            result[:, u - 1] = (-1) + 1 + escape.first_lookahead_rewards_code
+            # Put the lookahead reward to -1 for the current iteration,
+            # sample the next state
+            s = -1
+            result[:, u - 1] = s + 1 + escape.first_lookahead_rewards_code
             ar_mask = (t >= u).long() * (t < u + state_len).long()
-            ar_mask = ar_mask[None, :]
-            ar_mask = ar_mask.expand_as(result)
-            result *= 1 - ar_mask
-            ar()
-
-            # Put a lookahead reward to +1, sample the action and reward
-            result[:, u - 1] = (1) + 1 + escape.first_lookahead_rewards_code
-            ar_mask = (t >= state_len).long() * (t < state_len + 2).long()
-            ar_mask = ar_mask[None, :]
-            ar_mask = ar_mask.expand_as(result)
-            result *= 1 - ar_mask
-            ar()
+            ar(result, ar_mask)
+
+            # Put the lookahead reward to +1 for the current
+            # iteration, sample the action and reward
+            s = 1
+            result[:, u - 1] = s + 1 + escape.first_lookahead_rewards_code
+            ar_mask = (t >= u + state_len).long() * (t < u + state_len + 2).long()
+            ar(result, ar_mask)
+
+            # Fix the previous lookahead rewards in a consistent state
+            for v in range(0, u, it_len):
+                # Extract the rewards
+                r = result[:, range(v + state_len + 1 + it_len, u + it_len - 1, it_len)]
+                r = r - escape.first_lookahead_rewards_code - 1
+                a = r.min(dim=1).values
+                b = r.max(dim=1).values
+                s = (a < 0).long() * a + (a >= 0).long() * b
+                result[:, v + state_len + 2] = (
+                    s + 1 + escape.first_lookahead_rewards_code
+                )
 
         # Saving the generated sequences
 
-- 
2.20.1