From 81dc2aca01f2ab64ecc03cbae473a346a4832347 Mon Sep 17 00:00:00 2001
From: Johnny Fernandes <up202402612@up.pt>
Date: Thu, 23 Apr 2026 19:22:39 +0100
Subject: [PATCH] Raise MAX_SHEEP to 10, add wall hard-stop clamp, rework curriculum

---
 controllers/sheep/sheep.py |   9 +++
 training/herding_env.py    |  10 ++-
 training/train.py          | 155 +++++++++++++++++++++++--------------
 3 files changed, 113 insertions(+), 61 deletions(-)

diff --git a/controllers/sheep/sheep.py b/controllers/sheep/sheep.py
index 3cb3326..a0c115f 100644
--- a/controllers/sheep/sheep.py
+++ b/controllers/sheep/sheep.py
@@ -204,6 +204,15 @@ while robot.step(timestep) != -1:
             fx += math.cos(wander_angle) * 0.5
             fy += math.sin(wander_angle) * 0.5
 
+    # Hard-stop clamp: within 0.5 m of a wall, zero any force component that
+    # would push further into it.  Prevents the flee force from pinning a sheep
+    # against the boundary when the dog approaches from outside.
+    HS = 0.5
+    if x < X_MIN + HS and fx < 0: fx = 0.0
+    if x > X_MAX - HS and fx > 0: fx = 0.0
+    if y < Y_MIN + HS and fy < 0: fy = 0.0
+    if y > Y_MAX - HS and fy > 0: fy = 0.0
+
     heading = math.atan2(fy, fx)
     mag     = math.hypot(fx, fy)
     speed   = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))
diff --git a/training/herding_env.py b/training/herding_env.py
index 90d3aa9..ce56cc3 100644
--- a/training/herding_env.py
+++ b/training/herding_env.py
@@ -30,7 +30,7 @@ class HerdingEnv(gym.Env):
     # -----------------------------------------------------------------------
     # World constants — must match Webots world file
     # -----------------------------------------------------------------------
-    MAX_SHEEP  = 5
+    MAX_SHEEP  = 10
     FIELD      = 15.0                         # half-size; positions ∈ [-FIELD, FIELD]
     PEN_X      = (10.0, 13.0)
     PEN_Y      = (-15.0, -8.0)
@@ -344,6 +344,14 @@ class HerdingEnv(gym.Env):
         if pos[1] < -F + m: fy += ((-F + m - pos[1]) / m) * 6.0
         if pos[1] >  F - m: fy -= ((pos[1] - (F - m)) / m) * 6.0
 
+        # Hard-stop clamp: mirrors sheep.py — zero any force driving further
+        # into the wall within 0.5 m so the flee force cannot pin the sheep.
+        HS = 0.5
+        if pos[0] < -F + HS and fx < 0: fx = 0.0
+        if pos[0] >  F - HS and fx > 0: fx = 0.0
+        if pos[1] < -F + HS and fy < 0: fy = 0.0
+        if pos[1] >  F - HS and fy > 0: fy = 0.0
+
         # Wander — suppressed while fleeing
         if not fleeing:
             if self.np_random.random() < 0.02:
diff --git a/training/train.py b/training/train.py
index 44abce1..c5b05ab 100644
--- a/training/train.py
+++ b/training/train.py
@@ -3,13 +3,17 @@ PPO training script for the herding task.
 
 Usage examples
 --------------
-# Start fresh with curriculum (1 → 5 sheep):
-    python train.py --curriculum
+# Step-based curriculum (1 → 5 sheep), advancing every 1 M steps:
+    python train.py --curriculum --steps-per-stage 1000000 --total-steps 5000000
 
-# Resume from checkpoint, skip directly to 3 sheep:
-    python train.py --resume runs/ppo_herding/ckpt_200000_steps.zip --n-sheep 3
+# Success-rate curriculum (advances at 70 % success over the last 100 episodes, min. 50 seen):
+    python train.py --curriculum --threshold 0.70
 
-# Quick smoke-test (no curriculum, single env):
+# Resume from a checkpoint, restarting the curriculum at 3 sheep:
+    python train.py --resume runs/ppo_herding/ckpt_3000000_steps.zip --n-sheep 3 \
+                    --curriculum --steps-per-stage 1000000 --total-steps 5000000
+
+# Quick smoke-test:
     python train.py --n-envs 1 --total-steps 50000
 """
 
@@ -35,39 +39,64 @@ from herding_env import HerdingEnv
 
 class CurriculumCallback(BaseCallback):
     """
-    Advances the curriculum (number of active sheep) when the rolling mean
-    episode success rate exceeds a threshold.
+    Advances n_sheep on the training envs and, when given, the eval envs.
 
-    Success = episode terminated (all sheep penned) rather than truncated.
+    Two modes (steps_per_stage takes precedence when it is not None):
+      steps_per_stage — advance every N environment steps regardless of
+                        success rate (recommended for reliability).
+      threshold       — advance once the rolling success rate reaches this
+                        value (the curriculum stalls if the policy never does).
     """
 
-    THRESHOLD   = 0.75   # success rate to graduate
-    WINDOW      = 100    # episodes to average over
-    MIN_EPISODES = 50    # don't graduate before seeing this many episodes
-
-    def __init__(self, start_sheep: int, max_sheep: int, verbose: int = 1):
+    def __init__(self, start_sheep: int, max_sheep: int,
+                 eval_env=None,
+                 steps_per_stage: int = None,
+                 threshold: float = 0.75,
+                 window: int = 100,
+                 min_episodes: int = 50,
+                 verbose: int = 1):
         super().__init__(verbose)
-        self.max_sheep  = max_sheep
-        self._successes = []
-        self._cur_sheep = start_sheep
+        self.max_sheep       = max_sheep
+        self.eval_env        = eval_env
+        self.steps_per_stage = steps_per_stage
+        self.threshold       = threshold
+        self.window          = window
+        self.min_episodes    = min_episodes
+        self._cur_sheep      = start_sheep
+        self._successes      = []
+        self._stage_start    = 0
+
+    def _advance(self):
+        self._cur_sheep += 1
+        self.training_env.env_method("set_n_sheep", self._cur_sheep)
+        if self.eval_env is not None:
+            self.eval_env.env_method("set_n_sheep", self._cur_sheep)
+        self._stage_start = self.num_timesteps
+        self._successes.clear()
+        if self.verbose:
+            print(f"\n[Curriculum] → {self._cur_sheep} sheep "
+                  f"at step {self.num_timesteps:,}\n")
 
     def _on_step(self) -> bool:
-        for info, done in zip(self.locals["infos"], self.locals["dones"]):
-            if done:
-                truncated = info.get("TimeLimit.truncated", False)
-                self._successes.append(0 if truncated else 1)
-                if len(self._successes) > self.WINDOW:
-                    self._successes.pop(0)
+        if self._cur_sheep >= self.max_sheep:
+            return True
 
-        if (self._cur_sheep < self.max_sheep
-                and len(self._successes) >= self.MIN_EPISODES
-                and np.mean(self._successes) >= self.THRESHOLD):
-            self._cur_sheep += 1
-            self.training_env.env_method("set_n_sheep", self._cur_sheep)
-            self._successes.clear()
-            if self.verbose:
-                print(f"\n[Curriculum] Advanced to {self._cur_sheep} sheep "
-                      f"at step {self.num_timesteps}\n")
+        if self.steps_per_stage is not None:
+            # Time-based: advance every steps_per_stage env steps
+            if self.num_timesteps - self._stage_start >= self.steps_per_stage:
+                self._advance()
+        else:
+            # Success-rate based
+            for info, done in zip(self.locals["infos"], self.locals["dones"]):
+                if done:
+                    truncated = info.get("TimeLimit.truncated", False)
+                    self._successes.append(0 if truncated else 1)
+                    if len(self._successes) > self.window:
+                        self._successes.pop(0)
+
+            if (len(self._successes) >= self.min_episodes
+                    and np.mean(self._successes) >= self.threshold):
+                self._advance()
 
         return True
 
@@ -90,36 +119,35 @@ def make_env(n_sheep: int, seed: int, max_steps: int):
 
 def parse_args():
     p = argparse.ArgumentParser()
-    p.add_argument("--n-sheep",     type=int,   default=1,
-                   help="Starting number of sheep (or fixed count if no curriculum)")
-    p.add_argument("--max-sheep",   type=int,   default=5,
-                   help="Maximum sheep for curriculum (ignored without --curriculum)")
-    p.add_argument("--n-envs",      type=int,   default=8,
-                   help="Number of parallel environments")
-    p.add_argument("--total-steps", type=int,   default=5_000_000,
-                   help="Total environment steps to train for")
-    p.add_argument("--max-steps",   type=int,   default=2000,
-                   help="Episode step limit inside each env")
-    p.add_argument("--curriculum",  action="store_true",
-                   help="Enable automatic curriculum advancement")
-    p.add_argument("--resume",      type=str,   default=None,
-                   help="Path to a .zip checkpoint to resume training from")
-    p.add_argument("--run-dir",     type=str,   default="runs/ppo_herding",
-                   help="Output directory for checkpoints and logs")
-    p.add_argument("--save-freq",   type=int,   default=100_000,
-                   help="Checkpoint every N steps (per-env, not total)")
-    p.add_argument("--eval-freq",   type=int,   default=50_000,
-                   help="Evaluate every N steps")
-    p.add_argument("--eval-eps",    type=int,   default=20,
-                   help="Episodes per evaluation run")
+    p.add_argument("--n-sheep",          type=int,   default=1,
+                   help="Starting sheep count")
+    p.add_argument("--max-sheep",        type=int,   default=5,
+                   help="Final sheep count for curriculum")
+    p.add_argument("--n-envs",           type=int,   default=8,
+                   help="Parallel training environments")
+    p.add_argument("--total-steps",      type=int,   default=5_000_000)
+    p.add_argument("--max-steps",        type=int,   default=2000,
+                   help="Episode step limit")
+    p.add_argument("--curriculum",       action="store_true",
+                   help="Enable curriculum advancement")
+    p.add_argument("--steps-per-stage",  type=int,   default=None,
+                   help="Advance curriculum every N steps (overrides --threshold)")
+    p.add_argument("--threshold",        type=float, default=0.75,
+                   help="Success-rate threshold to advance (used without --steps-per-stage)")
+    p.add_argument("--resume",           type=str,   default=None,
+                   help="Checkpoint .zip to resume from")
+    p.add_argument("--run-dir",          type=str,   default="runs/ppo_herding")
+    p.add_argument("--save-freq",        type=int,   default=100_000)
+    p.add_argument("--eval-freq",        type=int,   default=50_000)
+    p.add_argument("--eval-eps",         type=int,   default=20)
     return p.parse_args()
 
 
 def main():
     args = parse_args()
     os.makedirs(args.run_dir, exist_ok=True)
-    ckpt_dir = os.path.join(args.run_dir, "checkpoints")
-    best_dir = os.path.join(args.run_dir, "best_model")
+    ckpt_dir  = os.path.join(args.run_dir, "checkpoints")
+    best_dir  = os.path.join(args.run_dir, "best_model")
     norm_path = os.path.join(args.run_dir, "vecnorm.pkl")
     os.makedirs(ckpt_dir, exist_ok=True)
 
@@ -130,13 +158,13 @@ def main():
     ])
     if args.resume and os.path.exists(norm_path):
         train_env = VecNormalize.load(norm_path, train_env)
-        train_env.training = True
+        train_env.training    = True
         train_env.norm_reward = True
     else:
         train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True,
                                  clip_obs=10.0)
 
-    # Eval env (no reward normalisation, deterministic)
+    # Eval env — starts at the same n_sheep; CurriculumCallback advances it with --curriculum
     eval_env = SubprocVecEnv([
         make_env(args.n_sheep, seed=1000 + i, max_steps=args.max_steps)
         for i in range(2)
@@ -161,9 +189,17 @@ def main():
         verbose=1,
     )
     callbacks = [checkpoint_cb, eval_cb]
+
     if args.curriculum:
-        callbacks.append(CurriculumCallback(start_sheep=args.n_sheep,
-                                            max_sheep=args.max_sheep))
+        cur_cb = CurriculumCallback(
+            start_sheep=args.n_sheep,
+            max_sheep=args.max_sheep,
+            eval_env=eval_env,
+            steps_per_stage=args.steps_per_stage,
+            threshold=args.threshold,
+        )
+        callbacks.append(cur_cb)
+
     callback_list = CallbackList(callbacks)
 
     # Model
@@ -201,7 +237,6 @@ def main():
         tb_log_name="ppo",
     )
 
-    # Save final artefacts
     model.save(os.path.join(args.run_dir, "final_model"))
     train_env.save(norm_path)
     print(f"\nTraining complete. Artefacts saved to {args.run_dir}/")