Checkpoint 2

2026-05-07 22:00:10 +01:00
parent 90aa3bbcb4
commit 1bb9415414
37 changed files with 3068 additions and 2912 deletions
@@ -4,18 +4,22 @@

 # Python
 __pycache__/
-
-# Training
-training/**/events.out.tfevents.*
-training/**/checkpoints/
-training/runs/**
-!training/runs/.gitkeep
-
-# Controller runtime artefacts
-controllers/shepherd_dog_rl/debug*.csv
-controllers/shepherd_dog_rl/debug_out*/
-controllers/shepherd_dog_rl/final_model*.zip
-controllers/shepherd_dog_rl/vecnorm*.pkl
+*.pyc
+.venv/

 # Optional env parity debug
 dog_debug.csv
+
+# Webots controller scratch
+controllers/shepherd_dog/dog_behavior_log.csv
+
+# Training artefacts
+training/runs/*
+!training/runs/.gitkeep
+*.zip
+*.pkl
+
+# TensorBoard
+events.out.tfevents.*
+worlds/field_test.wbt
+herding_runtime.cfg
@@ -1,45 +1,36 @@
-"""
-Sheep flocking controller (Webots, Reynolds boids variant).
+"""Sheep flocking controller (Webots).

 Each sheep broadcasts its GPS position every 3 steps on channel 1 and
-listens for the dog and peer sheep positions.  Peers are keyed by robot
-name so each neighbour has exactly one current entry in the dict.
+listens for the dog and peer sheep positions. The behavioural step is
+delegated to ``herding.flocking_sim.compute_heading_speed`` so the
+training environment and Webots run identical sheep dynamics.

-Force stack each step (summed then converted to a heading + speed):
-    flee       — away from dog, quadratic ramp, dominant when close
-    cohesion   — toward flock centre, halved while fleeing
-    separation — inverse-distance push, prevents physical overlap
-    walls      — linear repulsion from field boundary
-    wander     — small persistent drift for natural idle motion
-
-Pen behaviour: on first entry into the quarantine pen the sheep latches
-permanently — it turns pink (via the exposed woolColor PROTO field) and
-the normal force stack is replaced by pen-confinement forces only.
+Pen behaviour: a sheep latches to ``penned`` the first time it crosses
+the south-wall gate plane into the gate corridor. Once latched it turns
+pink (via the exposed ``woolColor`` PROTO field) and the force stack
+switches to in-pen containment.
 """

-import random
 import math
+import os
+import random
+import sys
+
+# --- Make the shared herding/ package importable from this controller dir ---
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
+if _PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, _PROJECT_ROOT)
+
 from controller import Supervisor

-# ---------------------------------------------------------------------------
-# Tuning constants
-# ---------------------------------------------------------------------------
+from herding.diffdrive import heading_speed_to_wheels
+from herding.flocking_sim import MAX_SPEED, compute_heading_speed
+from herding.geometry import (
+    SHEEP_MAX_WHEEL_OMEGA,
+    is_penned_position,
+)

-MAX_SPEED    = 22.0   # rad/s hard clamp on both motors
-FLEE_SPEED   = 20.0   # rad/s upper bound while panicking
-WANDER_SPEED =  3.0   # rad/s lower bound during calm wandering
-
-X_MIN, X_MAX = -14.5, 14.5   # stone wall inner edges (metres)
-Y_MIN, Y_MAX = -14.5, 14.5
-WALL_MARGIN  =  3.5           # avoidance starts this far from the wall
-
-FLEE_DIST       = 7.0   # dog within this radius triggers flee (metres)
-SEPARATION_DIST = 2.5   # inverse-distance push active inside this radius
-COHESION_DIST   = 8.0   # pull toward flock centre active inside this radius
-
-PEN_X_MIN, PEN_X_MAX = 10.0, 13.0   # quarantine pen extents (metres)
-PEN_Y_MIN, PEN_Y_MAX = -15.0, -8.0  # open entrance at y=-8, gate at y=-15
-PEN_MARGIN = 0.8                     # confinement force starts this far from pen wall

 # ---------------------------------------------------------------------------
 # Device setup
@@ -56,178 +47,102 @@ left_motor.setPosition(float("inf"))
 right_motor.setPosition(float("inf"))
 left_motor.setVelocity(0.0)
 right_motor.setVelocity(0.0)
+MOTOR_MAX = min(left_motor.getMaxVelocity(), SHEEP_MAX_WHEEL_OMEGA)

 gps = robot.getDevice("gps");           gps.enable(timestep)
 compass = robot.getDevice("compass");   compass.enable(timestep)
 receiver = robot.getDevice("receiver"); receiver.enable(timestep)
 emitter = robot.getDevice("emitter")

+
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------

-def norm_angle(a):
-    return math.atan2(math.sin(a), math.cos(a))
-
-
 def bearing():
    # Compass returns north direction in sensor frame; for this Z-up world
    # with north = +Y, atan2(n[0], n[1]) gives the standard math angle
-    # (0 = east, π/2 = north) matching atan2(fy, fx) used for heading.
+    # (0 = east, π/2 = north) matching atan2(fy, fx) used for headings.
    n = compass.getValues()
    return math.atan2(n[0], n[1])


-def drive(heading, speed):
-    err = norm_angle(heading - bearing())
-    # Scale forward component by cos(err): at 90° error fwd→0 so the robot
-    # spins in place to realign rather than driving sideways at full speed.
-    fwd = speed * max(0.0, math.cos(err))
-    k = 4.0
-    left_motor.setVelocity( max(-MAX_SPEED, min(MAX_SPEED, fwd - k * err)))
-    right_motor.setVelocity(max(-MAX_SPEED, min(MAX_SPEED, fwd + k * err)))
+def drive(heading, speed_motor):
+    left_w, right_w = heading_speed_to_wheels(
+        heading, min(speed_motor, MAX_SPEED), bearing(), MOTOR_MAX, k_turn=4.0
+    )
+    left_motor.setVelocity(left_w)
+    right_motor.setVelocity(right_w)


 def paint_pink():
    # woolColor is declared as a PROTO field with IS binding to the DEF WOOL
-    # PBRAppearance baseColor.  Changing it here propagates to every USE WOOL
-    # shape on the body.  Direct field access avoids PROTO-internal opacity.
+    # PBRAppearance baseColor; setting it propagates to every USE WOOL shape.
    self_node.getField("woolColor").setSFColor([1.0, 0.55, 0.72])

+
 # ---------------------------------------------------------------------------
 # State
 # ---------------------------------------------------------------------------

 wander_angle = random.uniform(-math.pi, math.pi)
-step   = 0
-dog_x  = None
-dog_y  = None
+step_count = 0
+dog_x, dog_y = None, None
 peers = {}        # name → (x, y), one entry per neighbour, cleared every 30 steps
 penned = False

+# Stuck detection: differential-drive sheep can pin against a wall. If the
+# per-step displacement stays below STUCK_DIST for STUCK_STEPS consecutive
+# steps, that step's command is overridden to head for the origin at MAX_SPEED.
+_prev_x, _prev_y = None, None
+_stuck_count = 0
+STUCK_STEPS = 20
+STUCK_DIST = 0.05
+
+
 # ---------------------------------------------------------------------------
 # Main loop
 # ---------------------------------------------------------------------------

 while robot.step(timestep) != -1:
-    step += 1
+    step_count += 1
    pos = gps.getValues()
    x, y = pos[0], pos[1]

-    # Pen entry: one-way latch, never unset
-    if not penned and PEN_X_MIN < x < PEN_X_MAX and PEN_Y_MIN < y < PEN_Y_MAX:
+    # Pen entry: one-way latch. Penned sheep get pink wool and switch behaviour.
+    if not penned and is_penned_position(x, y):
        penned = True
        paint_pink()

-    # Refresh peer table (clear before receiving so fresh data is never lost)
-    if step % 30 == 0:
+    # Refresh peer table — clear before receiving so fresh data is never lost.
+    if step_count % 30 == 0:
        peers.clear()
    while receiver.getQueueLength() > 0:
        msg = receiver.getString()
        receiver.nextPacket()
-        p = msg.split(":")
-        if p[0] == "dog" and len(p) >= 3:
-            dog_x, dog_y = float(p[1]), float(p[2])
-        elif p[0] == "sheep" and len(p) >= 4 and p[1] != name:
-            peers[p[1]] = (float(p[2]), float(p[3]))
+        parts = msg.split(":")
+        if parts[0] == "dog" and len(parts) >= 3:
+            dog_x, dog_y = float(parts[1]), float(parts[2])
+        elif parts[0] == "sheep" and len(parts) >= 4 and parts[1] != name:
+            peers[parts[1]] = (float(parts[2]), float(parts[3]))

-    fx, fy = 0.0, 0.0
+    dog_xy = (dog_x, dog_y) if dog_x is not None and dog_y is not None else None
+    heading, speed, wander_angle = compute_heading_speed(
+        x=x, y=y, penned=penned, dog_xy=dog_xy, peers=peers,
+        wander_angle=wander_angle,
+    )

-    # Repel unpenned sheep from the exterior of the pen's side walls so they
-    # don't get pinned by flee forces. Only fires when strictly outside the pen
-    # (x < PEN_X_MIN or x > PEN_X_MAX) at pen height (y in pen y-range).
-    # Entrance is open on the north (y > PEN_Y_MAX) — no force there.
-    PEN_EXT_MARGIN = 0.8
-    if not penned and PEN_Y_MIN < y < PEN_Y_MAX:
-        if PEN_X_MIN - PEN_EXT_MARGIN < x < PEN_X_MIN:
-            fx -= ((x - (PEN_X_MIN - PEN_EXT_MARGIN)) / PEN_EXT_MARGIN) * 6.0
-        if PEN_X_MAX < x < PEN_X_MAX + PEN_EXT_MARGIN:
-            fx += ((PEN_X_MAX + PEN_EXT_MARGIN - x) / PEN_EXT_MARGIN) * 6.0
+    # Stuck detection — safety net for differential-drive wall pinning.
+    if _prev_x is not None:
+        moved = math.hypot(x - _prev_x, y - _prev_y)
+        _stuck_count = _stuck_count + 1 if moved < STUCK_DIST else 0
+    if _stuck_count >= STUCK_STEPS:
+        heading = math.atan2(-y, -x)   # always points away from the boundary
+        speed = MAX_SPEED
+        _stuck_count = 0
+    _prev_x, _prev_y = x, y

-    if penned:
-        # Inside pen: wander freely, strong boundary forces prevent exit,
-        # separation still active to avoid collisions with other penned sheep.
-
-        pm = PEN_MARGIN
-        if x < PEN_X_MIN + pm: fx += ((PEN_X_MIN + pm - x) / pm) * 15.0
-        if x > PEN_X_MAX - pm: fx -= ((x - (PEN_X_MAX - pm)) / pm) * 15.0
-        if y < PEN_Y_MIN + pm: fy += ((PEN_Y_MIN + pm - y) / pm) * 15.0
-        if y > PEN_Y_MAX - pm: fy -= ((y - (PEN_Y_MAX - pm)) / pm) * 15.0
-
-        for px, py in peers.values():
-            dx, dy = px - x, py - y
-            d = math.hypot(dx, dy)
-            if 0.05 < d < SEPARATION_DIST:
-                push = (SEPARATION_DIST - d) / d
-                fx -= (dx / d) * push * 2.5
-                fy -= (dy / d) * push * 2.5
-
-        if random.random() < 0.02:
-            wander_angle += random.uniform(-0.6, 0.6)
-        fx += math.cos(wander_angle) * 0.5
-        fy += math.sin(wander_angle) * 0.5
-
-    else:
-        fleeing = False
-
-        # Flee — quadratic ramp so force grows rapidly as the dog closes in
-        if dog_x is not None:
-            dx   = dog_x - x
-            dy   = dog_y - y
-            dist = math.hypot(dx, dy)
-            if 0.01 < dist < FLEE_DIST:
-                fleeing = True
-                t = 1.0 - dist / FLEE_DIST
-                s = t * t * 20.0
-                fx -= (dx / dist) * s
-                fy -= (dy / dist) * s
-
-        # Cohesion — halved while fleeing to reduce mid-panic collisions
-        cx, cy, cn = 0.0, 0.0, 0
-        for px, py in peers.values():
-            d = math.hypot(px - x, py - y)
-            if 0.3 < d < COHESION_DIST:
-                cx += px; cy += py; cn += 1
-        if cn > 0:
-            w = 0.08 if fleeing else 0.15
-            fx += (cx / cn - x) * w
-            fy += (cy / cn - y) * w
-
-        # Separation — inverse-distance: huge when nearly overlapping, fades quickly
-        for px, py in peers.values():
-            dx, dy = px - x, py - y
-            d = math.hypot(dx, dy)
-            if 0.05 < d < SEPARATION_DIST:
-                push = (SEPARATION_DIST - d) / d
-                fx -= (dx / d) * push * 2.5
-                fy -= (dy / d) * push * 2.5
-
-        # Walls
-        if x < X_MIN + WALL_MARGIN: fx += ((X_MIN + WALL_MARGIN - x) / WALL_MARGIN) * 6.0
-        if x > X_MAX - WALL_MARGIN: fx -= ((x - (X_MAX - WALL_MARGIN)) / WALL_MARGIN) * 6.0
-        if y < Y_MIN + WALL_MARGIN: fy += ((Y_MIN + WALL_MARGIN - y) / WALL_MARGIN) * 6.0
-        if y > Y_MAX - WALL_MARGIN: fy -= ((y - (Y_MAX - WALL_MARGIN)) / WALL_MARGIN) * 6.0
-
-        # Wander — suppressed while fleeing so drift cannot deflect the flee heading
-        if not fleeing:
-            if random.random() < 0.02:
-                wander_angle += random.uniform(-0.6, 0.6)
-            fx += math.cos(wander_angle) * 0.5
-            fy += math.sin(wander_angle) * 0.5
-
-    # Hard-stop clamp: within 0.5 m of a wall, zero any force component that
-    # would push further into it.  Prevents the flee force from pinning a sheep
-    # against the boundary when the dog approaches from outside.
-    HS = 0.5
-    if x < X_MIN + HS and fx < 0: fx = 0.0
-    if x > X_MAX - HS and fx > 0: fx = 0.0
-    if y < Y_MIN + HS and fy < 0: fy = 0.0
-    if y > Y_MAX - HS and fy > 0: fy = 0.0
-
-    heading = math.atan2(fy, fx)
-    mag     = math.hypot(fx, fy)
-    speed   = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))
    drive(heading, speed)

-    if step % 3 == 0:
+    if step_count % 3 == 0:
        emitter.send(f"sheep:{name}:{x:.4f}:{y:.4f}")
@@ -0,0 +1,78 @@
+"""Lazy loader for the SB3 PPO policy used by the dog controller.
+
+Importing stable-baselines3 inside the Webots Python interpreter is only
+needed when ``HERDING_MODE=rl``; the Strömbom mode runs without it. This
+loader keeps SB3 out of the import path until you actually ask for the RL
+policy, so users without SB3 installed can still run the Strömbom
+baseline.
+
+The policy + VecNormalize statistics are saved together by
+``training/train_ppo.py``:
+
+    runs/<name>/best/best_model.zip     # SB3 PPO checkpoint
+    runs/<name>/best/vecnormalize.pkl   # observation-normaliser stats
+
+Pass either the directory or the explicit zip path.
+"""
+
+import os
+from pathlib import Path
+
+
+class PolicyHandle:
+    """Wrap a loaded PPO policy + VecNormalize so the controller can call
+    ``predict(obs)`` without thinking about either."""
+
+    def __init__(self, model, vecnorm):
+        self.model = model
+        self.vecnorm = vecnorm
+
+    def predict(self, obs):
+        # VecNormalize expects a batched obs of shape (n_envs, obs_dim).
+        if self.vecnorm is not None:
+            import numpy as np
+            obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1)
+            obs_b = self.vecnorm.normalize_obs(obs_b)
+        else:
+            import numpy as np
+            obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1)
+        action, _ = self.model.predict(obs_b, deterministic=True)
+        return action[0]
+
+
+def load(model_path: str, vecnorm_path: str | None = None) -> PolicyHandle:
+    """Load a PPO model (and optional VecNormalize) from disk.
+
+    ``model_path`` may be the .zip checkpoint or a directory containing
+    ``best_model.zip`` (and optionally ``vecnormalize.pkl``).
+    """
+    p = Path(model_path)
+    if p.is_dir():
+        zip_candidates = [p / "best_model.zip", p / "final.zip", p / "policy.zip"]
+        zip_path = next((z for z in zip_candidates if z.exists()), None)
+        if zip_path is None:
+            raise FileNotFoundError(
+                f"No PPO zip found in {p} (looked for best_model.zip, final.zip, policy.zip)"
+            )
+        if vecnorm_path is None:
+            vn = p / "vecnormalize.pkl"
+            if vn.exists():
+                vecnorm_path = str(vn)
+    else:
+        zip_path = p
+
+    # Imports deferred so the Strömbom path doesn't require SB3.
+    from stable_baselines3 import PPO
+    from stable_baselines3.common.vec_env import VecNormalize
+
+    model = PPO.load(str(zip_path), device="auto")
+    vecnorm = None
+    if vecnorm_path and os.path.exists(vecnorm_path):
+        # VecNormalize.load needs a venv to attach to; we only need its stats
+        # at inference, so unpickle the wrapper directly (trusted files only).
+        import pickle
+        with open(vecnorm_path, "rb") as f:
+            vecnorm = pickle.load(f)
+        vecnorm.training = False
+        vecnorm.norm_reward = False
+    return PolicyHandle(model=model, vecnorm=vecnorm)
@@ -1,14 +1,182 @@
-"""
-Shepherd Dog controller (Webots, manual keyboard control).
+"""Shepherd Dog controller (Webots).

-WASD / arrow keys drive the robot.  +/- adjust speed in 10 % increments.
-GPS position is broadcast every step on channel 1 so sheep controllers
-can compute flee forces.  Ears wag continuously via sinusoidal position
-targets — purely cosmetic.
+Runs in one of three modes selected by ``HERDING_MODE`` (environment
+variable, else ``herding_runtime.cfg``; default ``rl``):
+
+    HERDING_MODE=rl         → load an SB3 PPO policy from the directory
+                              chosen by ``_resolve_policy_dir`` and use
+                              its (vx, vy) action each step.
+    HERDING_MODE=strombom   → use the analytic Strömbom collect/drive
+                              heuristic. Also the fallback if the RL
+                              policy can't be loaded (e.g. SB3 not
+                              installed in the Webots Python env, or no
+                              checkpoint yet) or the mode is unknown.
+    HERDING_MODE=sequential → use ``herding.sequential.compute_action_debug``.
+
+All modes share the same low-level differential-drive controller
+(``herding.diffdrive.velocity_to_wheels`` + clamped forward speed), so
+switching modes does not retune the actuation layer.
+
+A safety supervisor enforces the "dog stays out of the pen" invariant:
+if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is
+overridden with a north-driving correction. This is a hard guarantee
+the policy cannot escape.
 """

 import math
-from controller import Robot, Keyboard
+import os
+import sys
+
+# --- Make the shared herding/ package importable from this controller dir ---
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
+if _PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, _PROJECT_ROOT)
+
+from controller import Robot
+
+from herding.diffdrive import velocity_to_wheels
+from herding.geometry import (
+    DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA,
+    DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS,
+    PEN_ENTRY,
+)
+from herding.obs import build_obs
+from herding.sequential import compute_action_debug as sequential_action_debug
+from herding.strombom import compute_action_debug as strombom_action_debug
+
+
+# ---------------------------------------------------------------------------
+# Mode selection
+# ---------------------------------------------------------------------------
+
+def _load_runtime_config():
+    """Read mode + policy_dir overrides from a runtime config file.
+
+    Webots strips HERDING_* env vars in some configurations, so the
+    launcher writes a tiny ``herding_runtime.cfg`` (key=value lines)
+    in the project root and the controller reads it here. Env vars
+    win if both are present; the file is the fallback.
+    """
+    cfg_path = os.path.join(_PROJECT_ROOT, "herding_runtime.cfg")
+    if not os.path.exists(cfg_path):
+        return {}
+    out = {}
+    try:
+        with open(cfg_path) as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("#") or "=" not in line:
+                    continue
+                k, _, v = line.partition("=")
+                out[k.strip().upper()] = v.strip()
+    except OSError:
+        return {}
+    return out
+
+
+_runtime_cfg = _load_runtime_config()
+MODE = (os.environ.get("HERDING_MODE")
+        or _runtime_cfg.get("HERDING_MODE")
+        or "rl").lower()
+
+
+def _resolve_policy_dir() -> str:
+    """Where to look for the trained policy.
+
+    Priority:
+      1. HERDING_POLICY_DIR env var, else herding_runtime.cfg (if a real dir)
+      2. training/runs/bc_pretrained/  (BC-only checkpoint)
+      3. training/runs/bc_ppo/best/    (PPO fine-tuned best)
+      4. training/runs/latest/best/    (legacy default)
+    """
+    env_dir = (os.environ.get("HERDING_POLICY_DIR")
+               or _runtime_cfg.get("HERDING_POLICY_DIR"))
+    if env_dir and os.path.isdir(env_dir):
+        return env_dir
+    candidates = [
+        os.path.join(_PROJECT_ROOT, "training", "runs", "bc_pretrained"),
+        os.path.join(_PROJECT_ROOT, "training", "runs", "bc_ppo", "best"),
+        os.path.join(_PROJECT_ROOT, "training", "runs", "latest", "best"),
+    ]
+    for c in candidates:
+        if os.path.isdir(c):
+            return c
+    # Last resort — return the configured path (else first candidate) so the error is informative.
+    return env_dir or candidates[0]
+
+
+POLICY_DIR = _resolve_policy_dir()
+
+policy_handle = None
+if MODE == "rl":
+    print(f"[dog] HERDING_MODE={MODE}  HERDING_POLICY_DIR(env)="
+          f"{os.environ.get('HERDING_POLICY_DIR', '<unset>')}")
+    print(f"[dog] resolved POLICY_DIR={POLICY_DIR}  exists="
+          f"{os.path.isdir(POLICY_DIR)}")
+    if os.path.isdir(POLICY_DIR):
+        try:
+            entries = sorted(os.listdir(POLICY_DIR))
+        except OSError:
+            entries = []
+        print(f"[dog] dir contents: {entries}")
+    try:
+        from policy_loader import load as _load_policy
+        policy_handle = _load_policy(POLICY_DIR)
+        print(f"[dog] RL policy loaded from {POLICY_DIR}")
+    except Exception as exc:
+        print(f"[dog] RL policy load failed ({exc!r}); falling back to Strömbom.")
+        MODE = "strombom"
+if MODE not in ("rl", "strombom", "sequential"):
+    print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
+    MODE = "strombom"
+print(f"[dog] running in mode={MODE}")
+
+
+# ---------------------------------------------------------------------------
+# Action smoothing + safety supervisor
+# ---------------------------------------------------------------------------
+
+ACTION_SMOOTH = 0.35
+prev_action = (0.0, 0.0)
+
+
+def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
+    """If the dog is near the south barrier and the action would push it
+    further south, override with a northward action. Hard invariant: the
+    dog never enters the pen."""
+    if dog_y < DOG_SOUTH_LIMIT and vy < 0.0:
+        return (0.0, 1.0)
+    if dog_y < DOG_SOUTH_LIMIT + 0.5 and vy < -0.2:
+        return (vx * 0.5, max(0.0, vy + 0.5))
+    return (vx, vy)
+
+
+# ---------------------------------------------------------------------------
+# Driving
+# ---------------------------------------------------------------------------
+
+def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float):
+    if math.hypot(vx, vy) < 1e-3:
+        left_motor.setVelocity(0.0)
+        right_motor.setVelocity(0.0)
+        return
+    n = compass.getValues()
+    h = math.atan2(n[0], n[1])
+    left, right = velocity_to_wheels(
+        vx, vy, h,
+        max_linear=DOG_MAX_LINEAR,
+        wheel_radius=DOG_WHEEL_RADIUS,
+        max_wheel_omega=motor_max,
+        k_turn=4.0,
+    )
+    left_motor.setVelocity(left)
+    right_motor.setVelocity(right)
+
+
+# ---------------------------------------------------------------------------
+# Webots devices
+# ---------------------------------------------------------------------------

 robot = Robot()
 timestep = int(robot.getBasicTimeStep())
@@ -19,70 +187,97 @@ left_motor.setPosition(float("inf"))
 right_motor.setPosition(float("inf"))
 left_motor.setVelocity(0.0)
 right_motor.setVelocity(0.0)
-
-lidar = robot.getDevice("lidar")
-lidar.enable(timestep)
-lidar.enablePointCloud()
+MOTOR_MAX = min(left_motor.getMaxVelocity(), DOG_MAX_WHEEL_OMEGA)

 gps = robot.getDevice("gps");           gps.enable(timestep)
 compass = robot.getDevice("compass");   compass.enable(timestep)
-emitter = robot.getDevice("emitter")
 receiver = robot.getDevice("receiver"); receiver.enable(timestep)
+emitter = robot.getDevice("emitter")

+# Cosmetic ear motors — ignored by control logic but keep them animated.
 left_ear = robot.getDevice("left ear motor")
 right_ear = robot.getDevice("right ear motor")
 left_ear.setPosition(float("inf"))
 right_ear.setPosition(float("inf"))
 left_ear.setVelocity(0.0)
 right_ear.setVelocity(0.0)
-
-keyboard = robot.getKeyboard()
-keyboard.enable(timestep)
-
-MOTOR_MAX   = left_motor.getMaxVelocity()
-speed_level = 0.5   # fraction of MOTOR_MAX; adjusted by +/-
-
-EAR_AMPLITUDE = 0.35   # rad, peak ear deflection
-EAR_RATE      = 8.0    # rad/s, how fast the ears are driven
 ear_phase = 0.0
+EAR_AMPLITUDE = 0.35
+EAR_RATE = 8.0
+
+
+# ---------------------------------------------------------------------------
+# Main loop
+# ---------------------------------------------------------------------------
+
+# {name: (x, y)} — kept across all sheep ever heard from. Names of sheep
+# seen at a penned position are latched in ``penned_set``; the RL
+# observation and the periodic status line use it to mark active sheep.
+sheep_positions: dict = {}
+penned_set: set = set()
+step_count = 0
+
+from herding.geometry import is_penned_position

 while robot.step(timestep) != -1:
-    speed = MOTOR_MAX * speed_level
-    turn  = speed * 0.6   # differential turn radius
+    step_count += 1

-    left_vel  = 0.0
-    right_vel = 0.0
-    key = keyboard.getKey()
-    while key > 0:
-        if   key in (ord('W'), Keyboard.UP):
-            left_vel  = speed
-            right_vel = speed
-        elif key in (ord('S'), Keyboard.DOWN):
-            left_vel  = -speed
-            right_vel = -speed
-        elif key in (ord('A'), Keyboard.LEFT):
-            left_vel  = -turn
-            right_vel =  turn
-        elif key in (ord('D'), Keyboard.RIGHT):
-            left_vel  =  turn
-            right_vel = -turn
-        elif key in (ord('+'), ord('=')):
-            speed_level = min(1.0, speed_level + 0.1)
-            print(f"Speed: {speed_level:.0%} ({MOTOR_MAX * speed_level:.1f} rad/s)")
-        elif key in (ord('-'), ord('_')):
-            speed_level = max(0.1, speed_level - 0.1)
-            print(f"Speed: {speed_level:.0%} ({MOTOR_MAX * speed_level:.1f} rad/s)")
-        key = keyboard.getKey()
-
-    left_motor.setVelocity(left_vel)
-    right_motor.setVelocity(right_vel)
+    while receiver.getQueueLength() > 0:
+        msg = receiver.getString()
+        receiver.nextPacket()
+        parts = msg.split(":")
+        if len(parts) == 4 and parts[0] == "sheep":
+            try:
+                x, y = float(parts[2]), float(parts[3])
+            except ValueError:
+                continue
+            sheep_positions[parts[1]] = (x, y)
+            if parts[1] not in penned_set and is_penned_position(x, y):
+                penned_set.add(parts[1])

    pos = gps.getValues()
-    emitter.send(f"dog:{pos[0]}:{pos[1]}")
+    dog_xy = (pos[0], pos[1])
+    n = compass.getValues()
+    dog_heading = math.atan2(n[0], n[1])

+    # ---- Action selection ----
+    if MODE == "rl" and policy_handle is not None:
+        sheep_xy_list = list(sheep_positions.values())
+        sheep_names = list(sheep_positions.keys())
+        sheep_penned_list = [s in penned_set for s in sheep_names]
+        obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list)
+        action = policy_handle.predict(obs)
+        vx, vy = float(action[0]), float(action[1])
+    elif MODE == "sequential":
+        vx, vy, _mode_str, _dbg = sequential_action_debug(
+            dog_xy, sheep_positions, PEN_ENTRY,
+        )
+    else:
+        # Strömbom (canonical baseline).
+        vx, vy, _mode_str, _dbg = strombom_action_debug(
+            dog_xy, sheep_positions, PEN_ENTRY,
+        )
+
+    # EMA smoothing — reduces oscillation from policy or Strömbom flips.
+    vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx
+    vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy
+
+    # Safety: dog must never enter the pen.
+    vx, vy = safety_clamp(vx, vy, dog_xy[0], dog_xy[1])
+    prev_action = (vx, vy)
+
+    drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX)
+    emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}")
+
+    # Cosmetic ear wiggle — purely visual.
    ear_phase += 0.12
    ear_pos = EAR_AMPLITUDE * math.sin(ear_phase)
    left_ear.setVelocity(EAR_RATE)
    right_ear.setVelocity(EAR_RATE)
-    left_ear.setPosition( ear_pos)
+    left_ear.setPosition(ear_pos)
    right_ear.setPosition(-ear_pos)
+
+    if step_count % 200 == 0:
+        n_active = sum(1 for s in sheep_positions if s not in penned_set)
+        print(f"[dog mode={MODE}] step={step_count} known={len(sheep_positions)} "
+              f"penned={len(penned_set)} active={n_active} action=({vx:+.2f}, {vy:+.2f})")
@@ -1,153 +0,0 @@
-"""
-Render Webots-side debug trajectory from debug.csv.
-
-The shepherd_dog_rl controller writes per-step state to debug.csv when
-DOG_DEBUG=1. This script reads it and produces:
-
-  trajectory.png   — dog path + sheep paths overlaid on the field
-  obs_drift.png    — normalized observation distribution over time
-  actions.png      — vx, vy time series
-
-Run:
-    python plot_debug.py                    # uses debug.csv next to this file
-    python plot_debug.py --csv path/to.csv --out-dir somewhere/
-"""
-import argparse
-import csv
-import os
-import sys
-
-import matplotlib
-matplotlib.use("Agg")
-import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
-import numpy as np
-
-
-def load_csv(path):
-    rows = []
-    with open(path) as f:
-        rd = csv.DictReader(f)
-        for r in rd:
-            rows.append(r)
-    if not rows:
-        sys.exit(f"empty CSV: {path}")
-    return rows
-
-
-def parse_floats(s):
-    return [float(x) for x in s.split(";") if x]
-
-
-def plot_trajectory(rows, out_path):
-    fig, ax = plt.subplots(figsize=(7, 7))
-    ax.set_xlim(-16, 16); ax.set_ylim(-16, 16); ax.set_aspect("equal")
-    ax.set_facecolor("#dcedc8")
-    ax.add_patch(mpatches.Rectangle((-15, -15), 30, 30,
-                 fill=False, edgecolor="#795548", lw=2))
-    ax.add_patch(mpatches.Rectangle((10, -15), 3, 7,
-                 facecolor="#ffe082", edgecolor="#795548", lw=2))
-    ax.text(11.5, -11.5, "pen", ha="center", va="center", fontsize=8)
-
-    dog_x = [float(r["dog_x"]) for r in rows]
-    dog_y = [float(r["dog_y"]) for r in rows]
-    ax.plot(dog_x, dog_y, color="#4e342e", lw=1.5, alpha=0.7, label="dog")
-    ax.plot(dog_x[0], dog_y[0], "s", color="#4e342e", ms=10)
-    ax.plot(dog_x[-1], dog_y[-1], "D", color="#4e342e", ms=10)
-
-    # Sheep — re-shape into per-sheep tracks
-    sx_all = [parse_floats(r["sheep_xs"]) for r in rows]
-    sy_all = [parse_floats(r["sheep_ys"]) for r in rows]
-    if sx_all and sx_all[-1]:
-        n_sheep = len(sx_all[-1])
-        palette = ["#e41a1c","#377eb8","#4daf4a","#984ea3","#ff7f00",
-                   "#a65628","#f781bf","#999999","#66c2a5","#fc8d62"]
-        for i in range(n_sheep):
-            xs = [r[i] if i < len(r) else None for r in sx_all]
-            ys = [r[i] if i < len(r) else None for r in sy_all]
-            xs = [x for x in xs if x is not None]
-            ys = [y for y in ys if y is not None]
-            if xs:
-                c = palette[i % len(palette)]
-                ax.plot(xs, ys, color=c, lw=0.8, alpha=0.6, label=f"sheep {i+1}")
-                ax.plot(xs[0], ys[0], "o", color=c, ms=6)
-                ax.plot(xs[-1], ys[-1], "*", color=c, ms=10)
-
-    n_in_pen = int(rows[-1]["n_penned"])
-    ax.set_title(f"Webots trajectory  {len(rows)} steps  penned={n_in_pen}",
-                 fontsize=12)
-    ax.legend(loc="upper left", fontsize=7, ncol=2)
-    plt.tight_layout()
-    fig.savefig(out_path, dpi=120)
-    plt.close(fig)
-
-
-def plot_actions(rows, out_path):
-    t = np.arange(len(rows))
-    vx = np.array([float(r["vx"]) for r in rows])
-    vy = np.array([float(r["vy"]) for r in rows])
-    mag = np.sqrt(vx ** 2 + vy ** 2)
-
-    fig, axes = plt.subplots(3, 1, figsize=(12, 7), sharex=True)
-    axes[0].plot(t, vx, color="tab:red", lw=0.8); axes[0].set_ylabel("vx")
-    axes[0].axhline(0, color="black", lw=0.4); axes[0].set_ylim(-1.1, 1.1)
-    axes[1].plot(t, vy, color="tab:blue", lw=0.8); axes[1].set_ylabel("vy")
-    axes[1].axhline(0, color="black", lw=0.4); axes[1].set_ylim(-1.1, 1.1)
-    axes[2].plot(t, mag, color="tab:purple", lw=0.8); axes[2].set_ylabel("||action||")
-    axes[2].axhline(np.sqrt(2), color="orange", ls="--", lw=1, label="saturated √2")
-    axes[2].axhline(1.0, color="gray", ls="--", lw=1)
-    axes[2].set_xlabel("step"); axes[2].legend(fontsize=8)
-    fig.suptitle("Webots action time series")
-    plt.tight_layout()
-    fig.savefig(out_path, dpi=120)
-    plt.close(fig)
-
-
-def plot_obs(rows, out_path):
-    norm = np.array([parse_floats(r["norm_obs"]) for r in rows])
-    raw  = np.array([parse_floats(r["raw_obs"])  for r in rows])
-    if norm.size == 0:
-        return
-    n_dims = norm.shape[1]
-    labels = [
-        "dog_x", "dog_y", "com-dog_x", "com-dog_y",
-        "far1-com_x", "far1-com_y", "far2-com_x", "far2-com_y",
-        "far3-com_x", "far3-com_y", "pen-com_x", "pen-com_y",
-        "pen-far1_x", "pen-far1_y", "radius", "frac_active",
-    ][:n_dims]
-
-    t = np.arange(norm.shape[0])
-    fig, axes = plt.subplots(n_dims, 1, figsize=(11, 1.0 * n_dims), sharex=True)
-    if n_dims == 1: axes = [axes]
-    for i in range(n_dims):
-        axes[i].plot(t, raw[:, i], color="tab:gray", lw=0.6, alpha=0.6, label="raw")
-        axes[i].plot(t, norm[:, i], color="tab:red", lw=0.8, label="normalised")
-        axes[i].set_ylabel(labels[i], fontsize=8)
-        axes[i].tick_params(labelsize=7)
-        if i == 0:
-            axes[i].legend(fontsize=7, loc="upper right")
-    axes[-1].set_xlabel("step")
-    fig.suptitle("Observation values over time (raw vs VecNormalize-normalised)")
-    plt.tight_layout()
-    fig.savefig(out_path, dpi=110)
-    plt.close(fig)
-
-
-def main():
-    p = argparse.ArgumentParser()
-    here = os.path.dirname(os.path.abspath(__file__))
-    p.add_argument("--csv", default=os.path.join(here, "debug.csv"))
-    p.add_argument("--out-dir", default=os.path.join(here, "debug_out"))
-    args = p.parse_args()
-
-    rows = load_csv(args.csv)
-    os.makedirs(args.out_dir, exist_ok=True)
-    print(f"loaded {len(rows)} rows from {args.csv}")
-    plot_trajectory(rows, os.path.join(args.out_dir, "trajectory.png"))
-    plot_actions(rows,    os.path.join(args.out_dir, "actions.png"))
-    plot_obs(rows,        os.path.join(args.out_dir, "obs.png"))
-    print(f"saved trajectory.png + actions.png + obs.png to {args.out_dir}/")
-
-
-if __name__ == "__main__":
-    main()
@@ -1,285 +0,0 @@
-"""
-Shepherd Dog RL controller — runs a trained SB3 PPO policy inside Webots.
-
-Setup
-----
-1. Copy your trained files into this directory:
-       controllers/shepherd_dog_rl/final_model.zip
-       controllers/shepherd_dog_rl/vecnorm.pkl
-
-2. In field.wbt, set the ShepherdDog robot's controller field to
-   "shepherd_dog_rl".  You can do this in the Webots GUI:
-       click the robot → Controller → shepherd_dog_rl
-
-3. Optional: set controllerArgs to ["5"] (number of sheep) if it differs
-   from the default of 5.
-
-The controller reads GPS (dog position) and Receiver (sheep broadcasts),
-builds the same 16-dim flock observation the training env used, normalises
-it with the saved VecNormalize stats, and converts the (vx, vy) policy
-output into differential wheel speeds.
-
-Debug logging
-------------
-Set env var DOG_DEBUG=1 to write a per-step CSV (dog pos, sheep positions,
-raw obs, normalised obs, action) to debug.csv alongside this script. Use
-plot_debug.py to render trajectories from it.
-"""
-
-import sys
-import os
-import math
-import struct
-import numpy as np
-
-# ── make training code importable ───────────────────────────────────────────
-_HERE = os.path.dirname(os.path.abspath(__file__))
-_TRAINING = os.path.join(_HERE, "..", "..", "training")
-sys.path.insert(0, _TRAINING)
-
-from controller import Robot
-from stable_baselines3 import PPO
-from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
-from herding_env import HerdingEnv
-
-# ── constants (must match herding_env.py) ───────────────────────────────────
-FIELD      = 15.0
-PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
-PEN_X      = (10.0, 13.0)
-PEN_Y      = (-15.0, -8.0)
-DOG_SPEED  = 2.5         # m/s
-WHEEL_R    = 0.038       # wheel radius (metres) — from ShepherdDog.proto
-K_TURN     = 4.0         # heading-error gain (rad/s per rad)
-EAR_AMPLITUDE = 0.35
-EAR_RATE      = 8.0
-
-# ── model paths ─────────────────────────────────────────────────────────────
-MODEL_PATH   = os.path.join(_HERE, "final_model.zip")
-VECNORM_PATH = os.path.join(_HERE, "vecnorm.pkl")
-DEBUG_CSV    = os.path.join(_HERE, "debug.csv")
-DEBUG_ENABLED = True   # set False to disable debug.csv logging
-
-# ── action smoothing ─────────────────────────────────────────────────────────
-# EMA on policy output to suppress the rapid oscillation (vx/vy flipping
-# between -1 and +1 every step) that stalls the physical dog.  0 = no
-# smoothing (raw policy), 1 = frozen.  0.3 keeps ~30% of previous action.
-ACTION_SMOOTH = 0.3
-prev_action   = np.zeros(2, dtype=np.float32)
-
-
-def norm_angle(a: float) -> float:
-    while a >  math.pi: a -= 2 * math.pi
-    while a < -math.pi: a += 2 * math.pi
-    return a
-
-
-def in_pen(x: float, y: float) -> bool:
-    return PEN_X[0] < x < PEN_X[1] and PEN_Y[0] < y < PEN_Y[1]
-
-
-def build_obs(dog_pos: np.ndarray,
-              sheep_dict: dict,
-              n_sheep: int,
-              dog_heading: float = 0.0) -> np.ndarray:
-    """
-    Build the 18-dim flock observation — identical to HerdingEnv._obs().
-
-    sheep_dict: {name: (x, y)} for ALL known sheep (penned or not).
-    dog_heading: dog's current world-frame heading in radians.
-    """
-    D = 2 * FIELD
-
-    # Split active vs penned
-    active_pos = np.array(
-        [v for v in sheep_dict.values() if not in_pen(*v)],
-        dtype=np.float32
-    )
-    n_active = len(active_pos)
-
-    if n_active > 0:
-        com        = active_pos.mean(axis=0)
-        d_from_com = np.linalg.norm(active_pos - com, axis=1)
-        sorted_idx = np.argsort(d_from_com)[::-1]
-        radius     = float(d_from_com[sorted_idx[0]])
-        def nth(n):
-            return active_pos[sorted_idx[n]] if len(sorted_idx) > n else com
-        far1, far2, far3 = nth(0), nth(1), nth(2)
-    else:
-        com = PEN_CENTER.copy()
-        radius = 0.0
-        far1 = far2 = far3 = PEN_CENTER.copy()
-
-    frac_active = n_active / max(n_sheep, 1)
-
-    return np.array([
-        dog_pos[0] / FIELD,  dog_pos[1] / FIELD,
-        (com[0]  - dog_pos[0]) / D, (com[1]  - dog_pos[1]) / D,
-        (far1[0] - com[0]) / D, (far1[1] - com[1]) / D,
-        (far2[0] - com[0]) / D, (far2[1] - com[1]) / D,
-        (far3[0] - com[0]) / D, (far3[1] - com[1]) / D,
-        (PEN_CENTER[0] - com[0])  / D, (PEN_CENTER[1] - com[1])  / D,
-        (PEN_CENTER[0] - far1[0]) / D, (PEN_CENTER[1] - far1[1]) / D,
-        radius / D,
-        frac_active,
-        math.cos(dog_heading), math.sin(dog_heading),
-    ], dtype=np.float32)
-
-
-# ── Webots setup ─────────────────────────────────────────────────────────────
-robot    = Robot()
-timestep = int(robot.getBasicTimeStep())
-
-# Drive motors
-left_motor  = robot.getDevice("left wheel motor")
-right_motor = robot.getDevice("right wheel motor")
-left_motor.setPosition(float("inf"))
-right_motor.setPosition(float("inf"))
-left_motor.setVelocity(0.0)
-right_motor.setVelocity(0.0)
-MOTOR_MAX = left_motor.getMaxVelocity()
-
-# Sensors
-gps      = robot.getDevice("gps");      gps.enable(timestep)
-compass  = robot.getDevice("compass");  compass.enable(timestep)
-receiver = robot.getDevice("receiver"); receiver.enable(timestep)
-emitter  = robot.getDevice("emitter")
-
-# Cosmetic
-left_ear  = robot.getDevice("left ear motor")
-right_ear = robot.getDevice("right ear motor")
-left_ear.setPosition(float("inf"));  right_ear.setPosition(float("inf"))
-left_ear.setVelocity(0.0);           right_ear.setVelocity(0.0)
-ear_phase = 0.0
-
-# Number of sheep (from controllerArgs or default)
-try:
-    n_sheep = int(sys.argv[1])
-except (IndexError, ValueError):
-    n_sheep = 3
-
-# ── Load model ───────────────────────────────────────────────────────────────
-print(f"[RL dog] Loading model from {MODEL_PATH}")
-print(f"[RL dog] Loading vecnorm from {VECNORM_PATH}")
-
-dummy_env = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep)])
-vecnorm   = VecNormalize.load(VECNORM_PATH, dummy_env)
-vecnorm.training    = False
-vecnorm.norm_reward = False
-
-model = PPO.load(MODEL_PATH, device="cpu")
-print(f"[RL dog] Model loaded — running with n_sheep={n_sheep}")
-
-# ── Runtime state ─────────────────────────────────────────────────────────────
-sheep_positions: dict = {}   # {name: (x, y)} — updated every step from receiver
-step_count = 0
-
-# Debug CSV — written every step when DOG_DEBUG=1
-debug_file = None
-if DEBUG_ENABLED:
-    import csv
-    debug_file = open(DEBUG_CSV, "w", newline="")
-    debug_writer = csv.writer(debug_file)
-    debug_writer.writerow([
-        "step", "dog_x", "dog_y", "heading",
-        "sheep_xs", "sheep_ys", "n_active", "n_penned",
-        "raw_obs", "norm_obs", "vx", "vy",
-    ])
-    print(f"[RL dog] DEBUG logging to {DEBUG_CSV}")
-
-
-def bearing() -> float:
-    """Current robot heading in world frame (radians)."""
-    n = compass.getValues()
-    return math.atan2(n[0], n[1])
-
-
-def drive(action_vx: float, action_vy: float) -> None:
-    """Convert (vx, vy) policy action to differential wheel speeds."""
-    speed_ms = math.sqrt(action_vx ** 2 + action_vy ** 2) * DOG_SPEED
-    if speed_ms < 0.05:
-        left_motor.setVelocity(0.0)
-        right_motor.setVelocity(0.0)
-        return
-
-    target_heading = math.atan2(action_vy, action_vx)
-    err = norm_angle(target_heading - bearing())
-
-    fwd_ms  = speed_ms * max(0.0, math.cos(err))
-    fwd_rad = fwd_ms / WHEEL_R
-    turn    = K_TURN * err    # rad/s correction
-
-    l = max(-MOTOR_MAX, min(MOTOR_MAX, fwd_rad - turn))
-    r = max(-MOTOR_MAX, min(MOTOR_MAX, fwd_rad + turn))
-    left_motor.setVelocity(l)
-    right_motor.setVelocity(r)
-
-
-# ── Main loop ─────────────────────────────────────────────────────────────────
-while robot.step(timestep) != -1:
-    step_count += 1
-
-    # 1. Drain receiver — update sheep position table
-    while receiver.getQueueLength() > 0:
-        try:
-            msg = receiver.getString()
-            parts = msg.split(":")
-            if parts[0] == "sheep" and len(parts) == 4:
-                sheep_positions[parts[1]] = (float(parts[2]), float(parts[3]))
-        except Exception:
-            pass
-        receiver.nextPacket()
-
-    # 2. Dog GPS
-    gps_vals = gps.getValues()
-    dog_pos  = np.array([gps_vals[0], gps_vals[1]], dtype=np.float32)
-
-    # 3. Build and normalise observation (heading from compass)
-    raw_obs  = build_obs(dog_pos, sheep_positions, n_sheep,
-                         dog_heading=bearing())
-    obs_norm = vecnorm.normalize_obs(raw_obs[np.newaxis])  # (1, 13)
-
-    # 4. Policy inference + smoothing
-    action, _ = model.predict(obs_norm, deterministic=True)
-    raw_a = np.array([float(action[0][0]), float(action[0][1])], dtype=np.float32)
-    if ACTION_SMOOTH > 0:
-        smoothed = ACTION_SMOOTH * prev_action + (1.0 - ACTION_SMOOTH) * raw_a
-        prev_action[:] = smoothed
-        vx, vy = float(smoothed[0]), float(smoothed[1])
-    else:
-        vx, vy = float(raw_a[0]), float(raw_a[1])
-
-    # 5. Drive
-    drive(vx, vy)
-
-    # 6. Broadcast dog position so sheep can compute flee forces
-    emitter.send(f"dog:{dog_pos[0]:.4f}:{dog_pos[1]:.4f}")
-
-    # 7. Ear animation
-    ear_phase += 0.12
-    ep = EAR_AMPLITUDE * math.sin(ear_phase)
-    left_ear.setVelocity(EAR_RATE);  right_ear.setVelocity(EAR_RATE)
-    left_ear.setPosition( ep);        right_ear.setPosition(-ep)
-
-    # Periodic status
-    if step_count % 100 == 0:
-        n_in_pen = sum(1 for x, y in sheep_positions.values() if in_pen(x, y))
-        print(f"[RL dog] step={step_count}  known_sheep={len(sheep_positions)}"
-              f"  penned={n_in_pen}/{n_sheep}  dog=({dog_pos[0]:.2f},{dog_pos[1]:.2f})"
-              f"  action=({vx:.2f}, {vy:.2f})")
-
-    # Debug CSV row
-    if debug_file is not None:
-        n_active = sum(1 for x, y in sheep_positions.values() if not in_pen(x, y))
-        n_in_pen = len(sheep_positions) - n_active
-        debug_writer.writerow([
-            step_count, f"{dog_pos[0]:.4f}", f"{dog_pos[1]:.4f}",
-            f"{bearing():.4f}",
-            ";".join(f"{v[0]:.3f}" for v in sheep_positions.values()),
-            ";".join(f"{v[1]:.3f}" for v in sheep_positions.values()),
-            n_active, n_in_pen,
-            ";".join(f"{x:.4f}" for x in raw_obs),
-            ";".join(f"{x:.4f}" for x in obs_norm[0]),
-            f"{vx:.4f}", f"{vy:.4f}",
-        ])
-        if step_count % 200 == 0:
-            debug_file.flush()
@@ -6,28 +6,28 @@
 - Nelson Neto <up202108117@up.pt>

 ## (i) Title and General objectives
-**RL-Based Autonomous Shepherd Robot for Livestock Herding**
+**Autonomous Shepherd Robot for Livestock Herding (Strömbom)**

 - Implement effective herding behaviors through proximity and movement strategies
 - Build a 3D environment with realistic robot dynamics and LIDAR-based perception
- Develop a mobile robot capable of autonomously guiding a flock of sheep into a designated target area using Reinforcement Learning
+- Develop a mobile robot capable of autonomously guiding a flock of sheep into a designated target area using the Strömbom heuristic approach


 # Group G25 - (ii) Intermediate Goals

 ## Intermediate goals
 - Set up the Webots simulation environment with an open field and target zone
- Implement lightweight Gymnasium-based 2D herding environment
+- Implement a lightweight 2D herding environment for algorithm evaluation
 - Design a Sheep and Dog robot
- Implement a sheep flocking model for fast RL iteration
+- Implement a sheep flocking model for fast Strömbom iteration
 - Validate LiDAR sensor feedback for sheep detection and distance estimation


 # Group G25 - Course Project (Final) Goals

 ## (iii) Main goals
- State-of-the-art survey on shepherding algorithms and multi-agent RL herding
- Train the robot using PPO to successfully herd a single sheep into the goal
+- State-of-the-art survey on shepherding algorithms with focus on Strömbom herding
+- Implement and tune a Strömbom controller to successfully herd a single sheep into the goal
 - Achieve fully autonomous herding of multiple sheep and a full flock into the target area
 - Optimize robot trajectory to minimize the time required to group the flock
 - Ensure zero collisions between the robot and the sheep during the task
@@ -35,7 +35,7 @@
 - Article, demo video, and final presentation

 ## (iv) Extra Merit
- Curriculum Learning (scaling from 1 sheep to a flock)
+- Progressive evaluation (scaling from 1 sheep to a flock)
 - Comparison of performance between Differential Drive and Mecanum wheels
 - Robustness testing under sensor noise or varying sheep speeds, configurations and parameters
 - Multi-shepherd cooperative mode: 2 dogs learn role specialization (collector vs. driver)
@@ -46,11 +46,10 @@

 ## (v) Tools
 - Webots for 3D physics simulation with ROS2 integration via `webots_ros2` package
- Stable-Baselines3 for the PPO algorithm implementation
- Gymnasium (OpenAI) for the RL environment wrapper (lightweight 2D herding env for fast RL training)
+- Gymnasium (OpenAI) for the simulation wrapper and evaluation tooling
 - Python as the primary programming language (sheep flocking model, reward shaping, evaluation)

 ## (vi) Limitations
- Computational Power: Training time might be high for complex flock behaviors
+- Computational Power: Large batch evaluation and parameter sweeps can still be time-consuming
 - Sim-to-Real Gap: No real-world validation of the herding controller; project is simulation-only (2D + Webots 3D)
 - Model Complexity: Simplified sheep behavior (scripted) may not account for all biological livestock nuances
@@ -0,0 +1,8 @@
+"""Shared core for the shepherd herding project.
+
+This package is the single source of truth for world geometry, sheep
+flocking dynamics, differential-drive kinematics, observation building,
+and the Strömbom heuristic. It is imported both by the Webots
+controllers (for inference) and by the Gymnasium training environment
+(for fast PPO rollouts), so the two paths cannot drift apart.
+"""
@@ -0,0 +1,70 @@
+"""Differential-drive kinematics matching the Webots robot specs.
+
+The Webots controllers and the training env both use these helpers so the
+sim and the real (Webots) physics agree to first order. They do not model
+slip, wheel acceleration limits, or contact forces — Webots does that for
+us at inference time. The training env has to be close enough that a
+policy trained against this kinematic model still works when handed off
+to ODE physics.
+"""
+
+import math
+
+
+def kinematics_step(x, y, h, w_left, w_right, wheel_radius, wheel_base, dt):
+    """Integrate one step of differential-drive forward kinematics.
+
+    Inputs
+    ------
+    x, y : robot position (m)
+    h    : robot heading (rad), 0 = +x axis
+    w_left, w_right : wheel angular velocities (rad/s)
+    wheel_radius, wheel_base : robot dimensions (m)
+    dt   : timestep (s)
+
+    Returns (new_x, new_y, new_h).
+    """
+    v = (w_right + w_left) * wheel_radius * 0.5
+    omega = (w_right - w_left) * wheel_radius / wheel_base
+    new_x = x + v * math.cos(h) * dt
+    new_y = y + v * math.sin(h) * dt
+    new_h = math.atan2(math.sin(h + omega * dt), math.cos(h + omega * dt))
+    return new_x, new_y, new_h
+
+
+def velocity_to_wheels(vx, vy, h, max_linear, wheel_radius, max_wheel_omega,
+                       k_turn=4.0):
+    """Convert a desired (vx, vy) intent in [-1, 1]^2 to wheel speeds.
+
+    Mirrors ``drive_action`` in controllers/shepherd_dog/shepherd_dog.py:
+    forward speed scales by ``cos(err)`` (clamped to ±90°), and a P
+    controller on heading error contributes the wheel-rate differential.
+    """
+    speed_ms = math.hypot(vx, vy) * max_linear
+    if speed_ms < 1e-3:
+        return 0.0, 0.0
+    target_h = math.atan2(vy, vx)
+    err = math.atan2(math.sin(target_h - h), math.cos(target_h - h))
+    clamped_err = max(-math.pi / 2, min(math.pi / 2, err))
+    fwd_ms = speed_ms * math.cos(clamped_err)
+    fwd_rad = fwd_ms / wheel_radius
+    turn = k_turn * err
+    left = max(-max_wheel_omega, min(max_wheel_omega, fwd_rad - turn))
+    right = max(-max_wheel_omega, min(max_wheel_omega, fwd_rad + turn))
+    return left, right
+
+
+def heading_speed_to_wheels(heading, speed_motor, h, max_wheel_omega,
+                            k_turn=4.0):
+    """Sheep variant: speed already expressed in motor (wheel rad/s) units.
+
+    Matches the existing sheep controller (``controllers/sheep/sheep.py``)
+    where ``speed = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))`` and
+    these constants are wheel angular velocities, not linear m/s.
+    """
+    err = math.atan2(math.sin(heading - h), math.cos(heading - h))
+    fwd = max(0.0, math.cos(err)) * speed_motor
+    turn = k_turn * err
+    left = max(-max_wheel_omega, min(max_wheel_omega, fwd - turn))
+    right = max(-max_wheel_omega, min(max_wheel_omega, fwd + turn))
+    return left, right
@@ -0,0 +1,178 @@
+"""Reynolds-style sheep flocking dynamics.
+
+This is the per-sheep behavioural step used both by the Webots sheep
+controller (scalar, one sheep at a time) and by the training environment
+(loop over sheep). The numerics are adapted from the original
+``controllers/sheep/flocking.py`` and retuned for the new external-pen
+layout: the south stone wall is intact except in the gate column, so
+sheep can only reach the pen by walking through that 3-m corridor.
+
+Force stack each step (summed → heading + speed):
+    flee       — quadratic ramp away from dog within FLEE_DIST
+    cohesion   — drift toward flock centre, strengthened while fleeing
+    separation — inverse-distance push from peers
+    walls      — soft repulsion + hard escape band against field walls,
+                 except inside the gate column where the south wall is
+                 absent
+    wander     — small persistent drift for natural idle motion
+
+A sheep latches to ``penned`` the first time it crosses the gate plane
+into the gate column (handled by callers via ``geometry.is_penned_position``);
+once latched, ``penned=True`` is passed in here and the force stack
+switches to in-pen containment + jitter.
+"""
+
+import math
+import random
+
+from herding.geometry import (
+    FIELD_X, FIELD_Y,
+    PEN_X, PEN_Y,
+    GATE_X,
+)
+
+# --- Speed and force constants ---
+# All speeds here are in wheel rad/s (motor units), matching the existing
+# sheep controller. Conversion to m/s = speed * SHEEP_WHEEL_RADIUS.
+MAX_SPEED = 22.0
+FLEE_SPEED = 20.0
+WANDER_SPEED = 3.0
+
+WALL_MARGIN = 5.0
+WALL_HARD_MARGIN = 1.0
+WALL_HARD_GAIN = 50.0
+
+FLEE_DIST = 7.0
+SEPARATION_DIST = 2.5
+COHESION_DIST = 8.0
+
+PEN_MARGIN = 0.8
+
+
+def _peers_iter(peers):
+    """Accept either a {name: (x, y)} dict or an iterable of (x, y) tuples."""
+    if isinstance(peers, dict):
+        return list(peers.values())
+    return list(peers)
+
+
+def compute_heading_speed(x, y, penned, dog_xy, peers, wander_angle, rng=None):
+    """Return ``(heading, speed, new_wander_angle)`` for one sheep step.
+
+    ``speed`` is in wheel rad/s (motor units), bounded by ``[WANDER_SPEED,
+    FLEE_SPEED]``. ``heading`` is the world-frame target heading the sheep
+    should aim for (atan2 convention).
+
+    ``rng`` is an optional ``random.Random``-compatible object used for
+    the wander-jitter. If ``None``, falls back to Python's global module
+    (matches Webots controller usage). Pass an env-owned RNG to make
+    rollouts deterministic given a seed.
+    """
+    fx, fy = 0.0, 0.0
+    peer_list = _peers_iter(peers)
+    rnd = rng if rng is not None else random
+
+    if penned:
+        # --- Pen containment: bounce off the four pen walls ---
+        pm = PEN_MARGIN
+        if x < PEN_X[0] + pm:
+            fx += ((PEN_X[0] + pm - x) / pm) * 15.0
+        if x > PEN_X[1] - pm:
+            fx -= ((x - (PEN_X[1] - pm)) / pm) * 15.0
+        if y < PEN_Y[0] + pm:
+            fy += ((PEN_Y[0] + pm - y) / pm) * 15.0
+        if y > PEN_Y[1] - pm:
+            fy -= ((y - (PEN_Y[1] - pm)) / pm) * 15.0
+
+        # Mild peer separation — penned sheep crowd the corner otherwise.
+        for px, py in peer_list:
+            dx, dy = px - x, py - y
+            d = math.hypot(dx, dy)
+            if 0.05 < d < SEPARATION_DIST:
+                push = (SEPARATION_DIST - d) / d
+                fx -= (dx / d) * push * 2.5
+                fy -= (dy / d) * push * 2.5
+
+        if rnd.random() < 0.02:
+            wander_angle += rnd.uniform(-0.6, 0.6)
+        fx += math.cos(wander_angle) * 0.5
+        fy += math.sin(wander_angle) * 0.5
+
+    else:
+        # --- Free-roaming sheep in the field ---
+        fleeing = False
+        if dog_xy is not None:
+            ddx = dog_xy[0] - x
+            ddy = dog_xy[1] - y
+            dist = math.hypot(ddx, ddy)
+            if 0.01 < dist < FLEE_DIST:
+                fleeing = True
+                t = 1.0 - dist / FLEE_DIST
+                s = t * t * 20.0
+                fx -= (ddx / dist) * s
+                fy -= (ddy / dist) * s
+
+        # Cohesion — drift toward flock CoM (peers within COHESION_DIST).
+        # Cohesion is *stronger* under flee than at rest (the
+        # predator-confusion / safety-in-numbers effect — sheep huddle when
+        # threatened). This is what makes shepherding work: the flock stays
+        # as one unit through the narrow gate instead of fragmenting.
+        cx, cy, cn = 0.0, 0.0, 0
+        for px, py in peer_list:
+            d = math.hypot(px - x, py - y)
+            if 0.3 < d < COHESION_DIST:
+                cx += px
+                cy += py
+                cn += 1
+        if cn > 0:
+            # Cohesion needs to be comparable to flee at close range to keep
+            # the flock together through narrow obstacles like the 3m gate.
+            # Flee at 2m has magnitude ~10; cohesion at peer-distance 5m
+            # with w=1.5 contributes ~7.5 — same order, so the flock
+            # translates as a unit instead of fragmenting under pressure.
+            w = 1.5 if fleeing else 0.6
+            fx += (cx / cn - x) * w
+            fy += (cy / cn - y) * w
+
+        # Separation — inverse-distance push from peers.
+        for px, py in peer_list:
+            ddx, ddy = px - x, py - y
+            d = math.hypot(ddx, ddy)
+            if 0.05 < d < SEPARATION_DIST:
+                push = (SEPARATION_DIST - d) / d
+                fx -= (ddx / d) * push * 2.5
+                fy -= (ddy / d) * push * 2.5
+
+        # Wall soft repulsion. The south wall is absent inside the gate
+        # column so sheep can be driven through it by the dog.
+        if x < FIELD_X[0] + WALL_MARGIN:
+            fx += ((FIELD_X[0] + WALL_MARGIN - x) / WALL_MARGIN) * 6.0
+        if x > FIELD_X[1] - WALL_MARGIN:
+            fx -= ((x - (FIELD_X[1] - WALL_MARGIN)) / WALL_MARGIN) * 6.0
+        if y > FIELD_Y[1] - WALL_MARGIN:
+            fy -= ((y - (FIELD_Y[1] - WALL_MARGIN)) / WALL_MARGIN) * 6.0
+        if y < FIELD_Y[0] + WALL_MARGIN and not (GATE_X[0] <= x <= GATE_X[1]):
+            fy += ((FIELD_Y[0] + WALL_MARGIN - y) / WALL_MARGIN) * 6.0
+
+        if not fleeing:
+            if rnd.random() < 0.02:
+                wander_angle += rnd.uniform(-0.6, 0.6)
+            fx += math.cos(wander_angle) * 0.5
+            fy += math.sin(wander_angle) * 0.5
+
+    # --- Hard escape band — overrides everything when very close to a wall ---
+    m, g = WALL_HARD_MARGIN, WALL_HARD_GAIN
+    if x - FIELD_X[0] < m:
+        fx = max(fx, g * (1.0 - (x - FIELD_X[0]) / m))
+    if FIELD_X[1] - x < m:
+        fx = min(fx, -g * (1.0 - (FIELD_X[1] - x) / m))
+    if FIELD_Y[1] - y < m:
+        fy = min(fy, -g * (1.0 - (FIELD_Y[1] - y) / m))
+    # South wall hard escape only when not in the gate column and not penned.
+    if (not penned) and (y - FIELD_Y[0] < m) and not (GATE_X[0] <= x <= GATE_X[1]):
+        fy = max(fy, g * (1.0 - (y - FIELD_Y[0]) / m))
+
+    heading = math.atan2(fy, fx)
+    mag = math.hypot(fx, fy)
+    speed = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))
+    return heading, speed, wander_angle
@@ -0,0 +1,99 @@
+"""World geometry and robot specs.
+
+All coordinates are in meters. (0, 0) is the centre of the field, +x is
+east, +y is north. Z is up but unused here. These constants must match
+``worlds/field.wbt`` and the proto files; if the world changes, change
+this file and only this file.
+
+Pen layout (post-refactor)
+--------------------------
+The pen is *external* to the field, accessed through a 3 m gate cut into
+the south stone wall at y = -15. Sheep entering through the gate end up
+in a fenced rectangle south of the field; the dog stays in the field
+(soft-limited above DOG_SOUTH_LIMIT during training and inference).
+
+    field        +y north
+    +-----------+
+    |           |
+    |           |
+    |  ......   |
+    +---||||----+   y = -15  (south wall, gate at x ∈ [10, 13])
+        ||||
+        |pen|       y ∈ [-22, -15]
+        +---+
+"""
+
+import math
+
+# --- Field (square, stone-walled) ---
+FIELD_X = (-15.0, 15.0)
+FIELD_Y = (-15.0, 15.0)
+
+# Conservative inside bounds — sheep/dog should not graze the wall.
+FIELD_INSIDE_MARGIN = 0.5
+
+# --- Pen (external, south of the field) ---
+PEN_X = (10.0, 13.0)
+PEN_Y = (-22.0, -15.0)
+PEN_CENTER = (0.5 * (PEN_X[0] + PEN_X[1]), 0.5 * (PEN_Y[0] + PEN_Y[1]))
+# The point the dog drives the flock toward: the gate centre on the field side.
+PEN_ENTRY = (0.5 * (PEN_X[0] + PEN_X[1]), -15.0)
+
+# --- Gate (the hole in the south stone wall) ---
+GATE_X = PEN_X
+GATE_Y = -15.0
+
+# --- Robot specs (must match proto files) ---
+# Dog (controllers/shepherd_dog/, protos/ShepherdDog.proto)
+DOG_WHEEL_RADIUS = 0.038         # m
+DOG_WHEEL_BASE = 0.28            # m, axle-to-axle
+DOG_MAX_WHEEL_OMEGA = 70.0       # rad/s
+DOG_MAX_LINEAR = DOG_WHEEL_RADIUS * DOG_MAX_WHEEL_OMEGA  # ~2.66 m/s
+
+# Sheep (controllers/sheep/, protos/Sheep.proto)
+SHEEP_WHEEL_RADIUS = 0.031       # m
+SHEEP_WHEEL_BASE = 0.20          # m
+SHEEP_MAX_WHEEL_OMEGA = 25.0     # rad/s
+SHEEP_MAX_LINEAR = SHEEP_WHEEL_RADIUS * SHEEP_MAX_WHEEL_OMEGA  # ~0.78 m/s
+
+# --- Webots step ---
+WEBOTS_DT = 0.016  # seconds, matches WorldInfo.basicTimeStep = 16 in field.wbt
+
+# --- Dog "virtual south wall" (training keeps dog out of the pen) ---
+# At inference the controller also clips to this so a slightly miscalibrated
+# policy doesn't accidentally drive into the pen and trap the sheep.
+DOG_SOUTH_LIMIT = -14.5
+
+# --- Maximum supported flock size ---
+MAX_SHEEP = 10
+
+
+def in_pen(x: float, y: float) -> bool:
+    """True if (x, y) lies inside the external pen rectangle."""
+    return PEN_X[0] < x < PEN_X[1] and PEN_Y[0] < y < PEN_Y[1]
+
+
+def in_field(x: float, y: float, margin: float = 0.0) -> bool:
+    return (FIELD_X[0] + margin <= x <= FIELD_X[1] - margin
+            and FIELD_Y[0] + margin <= y <= FIELD_Y[1] - margin)
+
+
+def in_gate_corridor(x: float, y: float, margin: float = 0.0) -> bool:
+    """True if (x, y) lies in the column of the gate (between field and pen)."""
+    return (PEN_X[0] - margin <= x <= PEN_X[1] + margin
+            and PEN_Y[0] - margin <= y <= GATE_Y + margin)
+
+
+def is_penned_position(x: float, y: float, latch_margin: float = 0.2) -> bool:
+    """A sheep latches to "penned" once it crosses the gate plane south.
+
+    True iff x is inside the gate column (with a small margin) AND
+    y has dipped below the gate line. Once latched, the sheep is held by
+    in-pen forces and will not exit on its own.
+    """
+    return (PEN_X[0] - latch_margin <= x <= PEN_X[1] + latch_margin
+            and y <= GATE_Y)
+
+
+def distance_to_pen_entry(x: float, y: float) -> float:
+    return math.hypot(x - PEN_ENTRY[0], y - PEN_ENTRY[1])
@@ -0,0 +1,137 @@
+"""Observation builder for the shepherd dog policy.
+
+Order-invariant 32-D feature vector — the policy generalises across
+flock sizes 1..MAX_SHEEP because individual sheep coordinates never
+appear in the observation by index, only summary statistics, a polar
+histogram, and two "named" sheep (closest-to-pen and furthest-from-pen).
+
+The two named sheep matter for the sequential-driving teacher: it
+targets the closest-to-pen sheep specifically, so the policy needs
+that channel to mimic the teacher.
+
+Layout (all components normalised so values stay roughly in [-1, 1]):
+
+    idx   field
+    -----  ----------------------------------------------------------
+     0..3  dog pose: x/15, y/15, cos(heading), sin(heading)
+     4..5  active-sheep CoM x/15, y/15
+     6..8  flock dispersion: max-radius/15, std_x/15, std_y/15
+     9..11 vector dog→CoM: dx/30, dy/30, dist/30
+    12..14 vector dog→pen-entry: dx/30, dy/30, dist/30
+    15..16 vector furthest-sheep→CoM: dx/15, dy/15
+    17..18 min sheep-to-wall, min dog-to-wall (both /15)
+       19  active-sheep count / MAX_SHEEP
+    20..27 8-bin polar histogram of active sheep around the dog,
+           rotation-aware (binned in dog-relative frame), normalised
+           so the bins sum to 1.
+    28..29 vector dog→closest-to-pen sheep: dx/15, dy/15
+    30..31 vector dog→rearmost (furthest-from-pen) sheep: dx/15, dy/15
+"""
+
+import math
+import numpy as np
+
+from herding.geometry import (
+    FIELD_X, FIELD_Y, PEN_ENTRY, MAX_SHEEP,
+)
+
+OBS_DIM = 32
+
+
+def build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list,
+              n_max: int = MAX_SHEEP) -> np.ndarray:
+    """Assemble the dog policy's observation vector.
+
+    Parameters
+    ----------
+    dog_xy : tuple (x, y) of the dog's GPS position (m)
+    dog_heading : dog heading in rad
+    sheep_xy_list : iterable of (x, y) for ALL known sheep
+    sheep_penned_list : parallel iterable of bool — True if sheep is penned
+    n_max : maximum supported flock size used for the count normaliser
+    """
+    dog_x, dog_y = dog_xy
+    obs = np.zeros(OBS_DIM, dtype=np.float32)
+
+    obs[0] = dog_x / 15.0
+    obs[1] = dog_y / 15.0
+    obs[2] = math.cos(dog_heading)
+    obs[3] = math.sin(dog_heading)
+
+    active = [(x, y) for (x, y), p
+              in zip(sheep_xy_list, sheep_penned_list) if not p]
+    n = len(active)
+
+    pdx0, pdy0 = PEN_ENTRY[0] - dog_x, PEN_ENTRY[1] - dog_y
+    obs[12] = pdx0 / 30.0
+    obs[13] = pdy0 / 30.0
+    obs[14] = math.hypot(pdx0, pdy0) / 30.0
+
+    if n == 0:
+        # All sheep penned — terminal observation.
+        obs[19] = 0.0
+        return obs
+
+    arr = np.asarray(active, dtype=np.float32)
+    com_x = float(arr[:, 0].mean())
+    com_y = float(arr[:, 1].mean())
+    rel = arr - np.array([com_x, com_y], dtype=np.float32)
+    dists = np.hypot(rel[:, 0], rel[:, 1])
+    radius = float(dists.max())
+    std_x = float(arr[:, 0].std())
+    std_y = float(arr[:, 1].std())
+
+    obs[4] = com_x / 15.0
+    obs[5] = com_y / 15.0
+    obs[6] = radius / 15.0
+    obs[7] = std_x / 15.0
+    obs[8] = std_y / 15.0
+
+    cdx, cdy = com_x - dog_x, com_y - dog_y
+    obs[9]  = cdx / 30.0
+    obs[10] = cdy / 30.0
+    obs[11] = math.hypot(cdx, cdy) / 30.0
+
+    far_idx = int(np.argmax(dists))
+    obs[15] = float(rel[far_idx, 0]) / 15.0
+    obs[16] = float(rel[far_idx, 1]) / 15.0
+
+    min_sheep_wall = min(
+        float(np.min(arr[:, 0] - FIELD_X[0])),
+        float(np.min(FIELD_X[1] - arr[:, 0])),
+        float(np.min(arr[:, 1] - FIELD_Y[0])),
+        float(np.min(FIELD_Y[1] - arr[:, 1])),
+    )
+    min_dog_wall = min(
+        dog_x - FIELD_X[0], FIELD_X[1] - dog_x,
+        dog_y - FIELD_Y[0], FIELD_Y[1] - dog_y,
+    )
+    obs[17] = min_sheep_wall / 15.0
+    obs[18] = float(min_dog_wall) / 15.0
+    obs[19] = n / n_max
+
+    # 8-bin polar histogram in the dog's body frame.
+    rel_dx = arr[:, 0] - dog_x
+    rel_dy = arr[:, 1] - dog_y
+    angles = np.arctan2(rel_dy, rel_dx) - dog_heading
+    angles = np.arctan2(np.sin(angles), np.cos(angles))
+    bins = np.floor((angles + math.pi) / (2 * math.pi) * 8).astype(int)
+    bins = np.clip(bins, 0, 7)
+    hist = np.bincount(bins, minlength=8).astype(np.float32)
+    hist /= max(1, n)
+    obs[20:28] = hist
+
+    # Closest-to-pen sheep (the sequential teacher's target) and rearmost
+    # (furthest-from-pen, the natural "next target" once the closest is
+    # penned). Both expressed as offset from dog. These two channels make
+    # BC tractable — without them the obs doesn't uniquely identify which
+    # sheep the teacher is steering toward.
+    pen_dists = np.hypot(arr[:, 0] - PEN_ENTRY[0], arr[:, 1] - PEN_ENTRY[1])
+    closest_idx = int(np.argmin(pen_dists))
+    rearmost_idx = int(np.argmax(pen_dists))
+    obs[28] = (float(arr[closest_idx, 0]) - dog_x) / 15.0
+    obs[29] = (float(arr[closest_idx, 1]) - dog_y) / 15.0
+    obs[30] = (float(arr[rearmost_idx, 0]) - dog_x) / 15.0
+    obs[31] = (float(arr[rearmost_idx, 1]) - dog_y) / 15.0
+
+    return obs
@@ -0,0 +1,98 @@
+"""Sequential single-target shepherd dog algorithm.
+
+Strömbom drives the flock's centre of mass; with N sheep and a narrow
+3 m gate, this fails because the flock is wider than the gate and CoM
+driving abandons stragglers. Real sheepdogs solve this differently:
+they pick *one* sheep at a time, drive it through, return for the next.
+
+This module implements that "pin-and-push" approach.
+
+Algorithm (one step):
+1. Active sheep = those still in the field (not yet penned).
+2. Target = the active sheep currently closest to the pen entry.
+3. Drive position = ``target + Δ · unit(target − pen_entry)`` —
+   directly behind the target relative to the goal.
+4. Output unit vector pointing the dog at the drive position.
+
+Once the target crosses the gate it latches as penned and is removed
+from the active set; the next-closest unpenned sheep becomes the
+target. The algorithm naturally "queues" sheep through the gate.
+
+Empirically (with our flocking dynamics) this scales linearly with
+flock size and works up to at least n=10 within a 15 000-step budget.
+"""
+
+import math
+
+from herding.geometry import GATE_Y, PEN_ENTRY, in_pen
+
+
+# Distance the drive point sits beyond the target sheep, measured along the
+# pen-target -> sheep direction (same units as the position inputs;
+# presumably metres — confirm against herding.geometry).
+DELTA_DRIVE = 1.5     # standoff behind the target sheep
+# Multiplier applied to the unit-length steering vector that
+# compute_action / compute_action_debug return.
+APPROACH_GAIN = 1.0   # action magnitude scale (1 = full speed)
+
+
+def _unit(x, y):
+    d = math.hypot(x, y)
+    if d < 1e-6:
+        return 0.0, 0.0
+    return x / d, y / d
+
+
+def _is_active(x, y) -> bool:
+    return (not in_pen(x, y)) and y > GATE_Y
+
+
+def compute_action(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
+    """Return ``(vx, vy, mode)`` where mode encodes the current target.
+
+    Compatible with the Strömbom call signature so it can be drop-in
+    swapped in the dog controller and the env's imitation reward.
+    """
+    active = [(name, x, y) for name, (x, y) in sheep_positions.items()
+              if _is_active(x, y)]
+    if not active:
+        return 0.0, 0.0, "idle"
+
+    # Pick target = sheep closest to pen entry. Stable choice: as one
+    # sheep approaches and crosses the gate it stays the target until
+    # latched; then the next-closest takes over.
+    name, sx, sy = min(
+        active,
+        key=lambda s: math.hypot(s[1] - pen_target[0], s[2] - pen_target[1]),
+    )
+
+    # Drive position behind the target along the (target → pen) line.
+    ux, uy = _unit(sx - pen_target[0], sy - pen_target[1])
+    tx = sx + DELTA_DRIVE * ux
+    ty = sy + DELTA_DRIVE * uy
+
+    ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1])
+    return APPROACH_GAIN * ax, APPROACH_GAIN * ay, f"drive:{name}"
+
+
+def compute_action_debug(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
+    """Debug variant returning ``(vx, vy, mode, debug_dict)``."""
+    active = [(name, x, y) for name, (x, y) in sheep_positions.items()
+              if _is_active(x, y)]
+    if not active:
+        return 0.0, 0.0, "idle", {
+            "n_active": 0, "target_name": "",
+            "target_x": 0.0, "target_y": 0.0,
+            "drive_x": dog_xy[0], "drive_y": dog_xy[1],
+        }
+
+    name, sx, sy = min(
+        active,
+        key=lambda s: math.hypot(s[1] - pen_target[0], s[2] - pen_target[1]),
+    )
+
+    ux, uy = _unit(sx - pen_target[0], sy - pen_target[1])
+    tx = sx + DELTA_DRIVE * ux
+    ty = sy + DELTA_DRIVE * uy
+    ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1])
+
+    return APPROACH_GAIN * ax, APPROACH_GAIN * ay, f"drive:{name}", {
+        "n_active": len(active), "target_name": name,
+        "target_x": sx, "target_y": sy,
+        "drive_x": tx, "drive_y": ty,
+    }
@@ -0,0 +1,114 @@
+"""Strömbom collect/drive heuristic for the shepherd dog.
+
+Adapted from the original ``controllers/shepherd_dog/strombom.py`` and
+updated for the external pen layout. Used as a baseline controller and
+as the fallback when the RL policy isn't available.
+
+Reference: Strömbom et al. 2014, "Solving the shepherding problem".
+"""
+
+import math
+
+from herding.geometry import PEN_ENTRY, GATE_Y, in_pen
+
+# Algorithm parameters. DELTA_DRIVE / DELTA_COLLECT were tightened from
+# the original (4.0 / 2.5) because the new external pen sits ~26 m from
+# typical sheep spawn locations — at the old 4 m standoff, the flee force
+# (quadratic ramp, 3.7 at 4 m vs ~10 at 2 m) couldn't move sheep through
+# the path inside the 3000-step episode budget.
+#
+# F_FACTOR was 2.0 in the original Strömbom paper; raised to 4.0 here so
+# the dog stays in *drive* mode much longer. With our tighter cohesion
+# (flocking_sim.py), partially-collected flocks consolidate naturally
+# during a drive, and we don't waste 80% of the time budget on a slow
+# "collect" pre-phase.
+F_FACTOR = 4.0       # collect only when flock radius > F_FACTOR * sqrt(n_active)
+DELTA_COLLECT = 1.5  # collect standoff: beyond the furthest sheep, away from the CoM
+DELTA_DRIVE = 2.0    # drive standoff: beyond the CoM, away from the pen target
+
+
+def _unit(x, y):
+    d = math.hypot(x, y)
+    if d < 1e-6:
+        return 0.0, 0.0
+    return x / d, y / d
+
+
+def _is_active(x, y) -> bool:
+    """A sheep is "active" if it's still in the field — not in or below
+    the gate plane (we treat anything south of the gate as committed to
+    the pen and stop trying to herd it)."""
+    return (not in_pen(x, y)) and y > GATE_Y
+
+
+def compute_action(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
+    """Return ``(vx, vy, mode)`` — mode in {idle, collect, drive}.
+
+    ``sheep_positions`` is a ``{name: (x, y)}`` mapping (matches the
+    Webots controller's representation).
+    """
+    active = [(x, y) for (x, y) in sheep_positions.values() if _is_active(x, y)]
+    if not active:
+        return 0.0, 0.0, "idle"
+
+    n = len(active)
+    com_x = sum(p[0] for p in active) / n
+    com_y = sum(p[1] for p in active) / n
+    dists = [math.hypot(p[0] - com_x, p[1] - com_y) for p in active]
+    radius = max(dists)
+
+    if radius > F_FACTOR * math.sqrt(n):
+        # Collect: aim at a point behind the furthest sheep, opposite the CoM.
+        idx = max(range(n), key=lambda i: dists[i])
+        sx, sy = active[idx]
+        ux, uy = _unit(sx - com_x, sy - com_y)
+        tx, ty = sx + DELTA_COLLECT * ux, sy + DELTA_COLLECT * uy
+        mode = "collect"
+    else:
+        # Drive: aim at a point behind the flock CoM relative to the goal.
+        ux, uy = _unit(com_x - pen_target[0], com_y - pen_target[1])
+        tx, ty = com_x + DELTA_DRIVE * ux, com_y + DELTA_DRIVE * uy
+        mode = "drive"
+
+    ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1])
+    return ax, ay, mode
+
+
+def compute_action_debug(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
+    """Variant of compute_action that also returns a small debug dict.
+
+    Kept for parity with the legacy controller's CSV logger.
+    """
+    active = [(x, y) for (x, y) in sheep_positions.values() if _is_active(x, y)]
+    if not active:
+        return 0.0, 0.0, "idle", {
+            "n_active": 0, "radius": 0.0, "threshold": 0.0,
+            "com_x": 0.0, "com_y": 0.0,
+            "target_x": dog_xy[0], "target_y": dog_xy[1],
+        }
+
+    n = len(active)
+    com_x = sum(p[0] for p in active) / n
+    com_y = sum(p[1] for p in active) / n
+    dists = [math.hypot(p[0] - com_x, p[1] - com_y) for p in active]
+    radius = max(dists)
+    threshold = F_FACTOR * math.sqrt(n)
+
+    if radius > threshold:
+        idx = max(range(n), key=lambda i: dists[i])
+        sx, sy = active[idx]
+        ux, uy = _unit(sx - com_x, sy - com_y)
+        tx, ty = sx + DELTA_COLLECT * ux, sy + DELTA_COLLECT * uy
+        mode = "collect"
+    else:
+        ux, uy = _unit(com_x - pen_target[0], com_y - pen_target[1])
+        tx, ty = com_x + DELTA_DRIVE * ux, com_y + DELTA_DRIVE * uy
+        mode = "drive"
+
+    ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1])
+    dbg = {
+        "n_active": n, "radius": radius, "threshold": threshold,
+        "com_x": com_x, "com_y": com_y,
+        "target_x": tx, "target_y": ty,
+    }
+    return ax, ay, mode, dbg
@@ -0,0 +1,458 @@
+# RL-Driven Shepherd Herding — Implementation Plan
+
+This plan turns the existing Strömbom-only Webots project into a dual-mode
+shepherd controller (RL primary, Strömbom fallback), with a fast Gymnasium
+training environment that mirrors the Webots dynamics tightly enough for
+sim-to-sim transfer. Stable-Baselines3 PPO is the learner.
+
+---
+
+## 1. Current state (audit)
+
+### World geometry — `worlds/field.wbt`
+- Field bounded by stone walls at **x,y ∈ [−15, +15]**. Inside-usable area is
+  ~[−14.5, 14.5] (`X_MIN/MAX` in `flocking.py`).
+- **Pen is *inside* the field**: x ∈ [10, 13], y ∈ [−15, −8], with the
+  opening on its **north** side at y = −8 (post-and-rail fence W/E; open N).
+- South stone wall has a **gate at x ∈ [10, 13], y = −15** (split wall +
+  gate posts at x=10 and x=13). So sheep that get penned end up between the
+  fence (N side at y=−8) and the south stone wall (with the wooden gate at
+  y=−15 currently slightly ajar). The pen is effectively an L-shape inside
+  the field, not external.
+- Spawns: dog at origin (0, 0), 3 sheep around (3, ±2) and (4, 0). Two more
+  sheep are commented out.
+
+### Robots — protos
+- **Sheep** (`protos/Sheep.proto`): differential drive, wheel radius 0.031 m,
+  axle half-width 0.10 m → wheel base 0.20 m. `maxVelocity = 25 rad/s` →
+  max linear ≈ **0.78 m/s**. Sensors: GPS, Compass, Emitter+Receiver on
+  channel 1. `supervisor = TRUE` (used to repaint wool pink on pen entry).
+- **ShepherdDog** (`protos/ShepherdDog.proto`): differential drive, wheel
+  radius 0.038 m, axle half-width 0.14 m → wheel base 0.28 m.
+  `maxVelocity = 70 rad/s` → max linear ≈ **2.66 m/s**. Sensors: GPS,
+  Compass, Gyro, Accelerometer, **Lidar** (front-only, FOV 2.44 rad ≈ 140°,
+  180 rays, range 0.10–12 m, noise 0.005), Emitter+Receiver on channel 1,
+  cosmetic ear/tail motors.
+
+### Sheep controller — `controllers/sheep/{sheep.py,flocking.py}`
+- Reynolds-style boid stack: flee (quadratic ramp inside FLEE_DIST=7 m),
+  cohesion (within 8 m), separation (within 2.5 m), wall soft repulsion
+  (margin 5 m), wall hard escape (margin 1 m, gain 50), wander.
+- Pen-aware: sheep below the gate line but outside the gate corridor get a
+  northward "deadzone" assist; on first entry into the pen rectangle,
+  sheep latches `penned=True`, repaints pink, and switches to in-pen
+  containment + jitter.
+- Driver: heading-error PD on diff-drive (k=4), forward velocity scaled by
+  `cos(err)`, MAX_SPEED=22 (motor units, capped by proto's 25 rad/s).
+- Stuck detector: if displacement < 0.05 m for 20 steps, drives toward
+  field origin to escape wall-pin (a known differential-drive failure mode).
+
+### Dog controller — `controllers/shepherd_dog/{shepherd_dog.py,strombom.py}`
+- Strömbom collect/drive heuristic. CoM-radius gating
+  `radius > F·√n` with F=2 selects collect (push furthest sheep inward) vs
+  drive (push CoM toward the pen entry point at (11.5, −8.0)).
+- Deadzone rescue: when a sheep is below the gate line and outside the
+  pen's x-corridor, the dog repositions to a "behind the sheep, opposite
+  the pen" stand-off so the sheep's flee vector points back through the
+  gate. Variants 0/1 alternate lateral offset to break corner cycles.
+- Stuck-rescue, EMA action smoothing, target-deadband, RESCUE_SPEED_CAP,
+  cooldown — all empirical fixes for diff-drive oscillation.
+- Logs full per-step debug to `dog_behavior_log.csv` (currently 7 MB —
+  add to `.gitignore`).
+
+### Deleted training scaffolding (per `git status`)
+- `controllers/shepherd_dog_rl/{shepherd_dog_rl.py, final_model.zip, vecnorm.pkl, plot_debug.py}`
+- `training/{config.json, herding_env.py, parity_test.py, requirements.txt, train.py, train_at.py, viz.py, runs/.gitkeep}`
+
+A previous attempt existed; we'll redesign rather than resurrect, keeping
+only the lessons (parity-tested env, VecNormalize wrapper, eval cadence).
+
+---
+
+## 2. Design decisions
+
+### 2.1 Pen location — keep inside-field with N gate
+The user offered moving the pen *external* (through a wall hole). Tradeoffs:
+
+| Option | Pros | Cons |
+|---|---|---|
+| **(A) Keep inside-field** (current) | World already built; Strömbom logic already tuned; gate corridor is short | Dog must navigate around three pen walls; adds geometric clutter |
+| (B) External pen via wall hole | Cleaner field — dog only sees sheep + outer walls; pen as goal region beyond a 3 m hole at y=−15 | Requires editing `field.wbt` (split south wall, add external pen walls beyond y<−15); existing rescue/deadzone logic must be retuned; outside-field flocking constants don't currently apply |
+
+**Recommendation: keep (A)** for parity with the working Strömbom controller,
+but add a **simplification**: widen the pen entrance from 3 m (x ∈ [10, 13])
+to 4 m (x ∈ [9.5, 13.5]) and raise the entrance line from y=−8 to y=−7.5
+to give the dog more turning room. Optional later: option (B), the
+external pen, as a post-baseline extension (Section 7).
+
+### 2.2 Where to train
+
+PPO on Webots directly is too slow (real-time stepping, single env, slow
+reset). The previous training scaffolding used a Python 2D sim — that is
+the right approach. Constraints for sim-to-sim transfer:
+
+1. **Use the exact same flocking math**: import `controllers/sheep/flocking.py`
+   from the env, do not reimplement.
+2. **Use the same world constants**: import `controllers/shepherd_dog/strombom.py`
+   for pen geometry and Strömbom baseline.
+3. **Model differential drive faithfully**: match wheel-radius, base, and
+   max wheel-velocity from the proto files. Heading update from
+   `(ω_R − ω_L)·r / b`, position from `(ω_R + ω_L)·r / 2`.
+4. **Match Webots step**: `basicTimeStep = 16 ms`. The sheep controller runs
+   at every basic step; the env will use the same `dt = 0.016 s`.
+5. **Lidar deferred**: the dog policy will use a *symbolic* observation
+   (positions of dog + sheep, plus pen geometry) — not raw lidar — for the
+   first iteration. Learning directly from raw lidar scans is a much
+   harder problem and isn't required for the herding task. (See Section 7
+   for an optional later upgrade.)
+
+### 2.3 Action space for the dog
+
+Two viable choices:
+
+- **(a) High-level velocity vector** `(vx, vy) ∈ [−1, 1]²`. The same
+  representation Strömbom emits today; the existing
+  `drive_action(vx, vy, ...)` function in `shepherd_dog.py` converts this
+  to wheel speeds. Decouples the policy from low-level diff-drive
+  oscillations and enables direct A/B against Strömbom.
+- (b) Direct wheel speeds `(ω_L, ω_R) ∈ [−1, 1]²`. More expressive but the
+  policy must learn diff-drive control from scratch — which is exactly
+  the source of the wall-stuck and oscillation pain we're trying to
+  avoid.
+
+**Recommendation: (a)** — high-level `(vx, vy)`. Reuses the well-tuned
+`drive_action` controller, which already handles `cos(err)` clamping and
+turn gain. RL focuses on *strategy*, not actuation.
+
+### 2.4 Observation space for the dog
+
+Symbolic, fixed-size, normalized to [−1, 1]:
+
+| Field | Dim | Notes |
+|---|---|---|
+| Dog (x, y, cos h, sin h) | 4 | Position normalized by 15 |
+| Sheep CoM (x, y) | 2 | Of *active* (not-penned) sheep |
+| Sheep dispersion (radius, std-x, std-y) | 3 | Strömbom collect-vs-drive features |
+| Vector dog→CoM (dx, dy, dist) | 3 | Helps the value function |
+| Vector dog→pen-entry (dx, dy, dist) | 3 | |
+| Vector furthest-sheep→CoM (dx, dy) | 2 | Strömbom collect target hint |
+| Min sheep-to-wall distance + min dog-to-wall | 2 | Safety signal |
+| Active sheep count / N_max | 1 | |
+| 8-bin polar histogram of sheep around dog | 8 | Order-invariant flock shape |
+
+Total: **28 features**. Order-invariant by construction (histogram + summary
+stats), so the policy generalizes across flock sizes 1..N_max.
+
+### 2.5 Reward
+
+Sparse-only is too hard at flock scale; we shape conservatively.
+
+```
+r_t = w_pen     · ΔN_penned                       # +1 per newly penned sheep
+    + w_progress· (d_CoM_pen[t-1] − d_CoM_pen[t]) # closer-to-pen progress
+    + w_compact· (R[t-1] − R[t])                  # tighter flock progress
+    − w_time   · 1                                 # constant time penalty
+    − w_wall   · I(min_wall_dist < 1.0 m)         # dog too close to wall
+    − w_collide· I(dog within 0.3 m of any sheep) # avoid contact
+    + w_done   · I(all sheep penned)              # terminal bonus
+```
+
+Initial weights: `w_pen=2.0, w_progress=0.5, w_compact=0.2, w_time=0.005,
+w_wall=0.01, w_collide=0.05, w_done=10.0`. Tune via 1-sheep curriculum
+first — if the dog learns 1-sheep cleanly, the weights are sane.
+
+### 2.6 Episode
+
+- Max steps: 3000 (≈ 48 s at dt=16 ms — generous).
+- Termination: all sheep penned (success), dog/sheep stuck > 600 steps with
+  no progress (failure), step limit (timeout).
+- Reset: domain-randomized — sheep count ∈ {1..N_max}, sheep positions
+  uniform in field minus pen+gate corridor, dog at origin ± U(−2, 2).
+
+### 2.7 Curriculum
+
+| Stage | N_sheep | Duration (steps) | Pass criterion |
+|---|---|---|---|
+| 0 | 1 | 0.5 M | success ≥ 90 % |
+| 1 | 2 | 1.0 M | success ≥ 80 % |
+| 2 | 3 | 1.5 M | success ≥ 70 % |
+| 3 | 1..3 mixed | 2.0 M | mean reward stable |
+| 4 (optional) | 5 | 2.0 M | success ≥ 60 % |
+
+Implemented by changing only `n_sheep` in the env reset.
+
+---
+
+## 3. Repository layout (new)
+
+```
+project/
+├── controllers/
+│   ├── sheep/                      # unchanged
+│   ├── shepherd_dog/               # Strömbom controller (renamed entry)
+│   │   ├── shepherd_dog.py         # mode-switch wrapper: RL | strombom
+│   │   ├── strombom.py             # unchanged (canonical Strömbom)
+│   │   └── policy_loader.py        # NEW: loads SB3 zip + VecNormalize
+│   └── ...
+├── herding/                        # NEW: Python package, importable from env + controller
+│   ├── __init__.py
+│   ├── geometry.py                 # field/pen constants, in_pen(), wall helpers (single source of truth)
+│   ├── flocking_sim.py             # vectorised numpy port of flocking.py for fast batched sheep
+│   ├── diffdrive.py                # diff-drive integrator matching the proto specs
+│   └── obs.py                      # observation builder shared by env and Webots controller
+├── training/                       # NEW
+│   ├── herding_env.py              # gymnasium.Env, single-agent (the dog)
+│   ├── parity_test.py              # asserts env trajectory ≈ Webots trajectory for fixed seeds
+│   ├── train_ppo.py                # SB3 PPO entry point
+│   ├── eval.py                     # rollout + metrics (success rate, time-to-pen)
+│   ├── configs/
+│   │   ├── ppo_default.yaml
+│   │   └── curriculum.yaml
+│   ├── runs/                       # tensorboard + checkpoints (.gitignored)
+│   └── requirements.txt
+├── docs/
+│   └── project.md                  # unchanged
+├── plan.md                         # this file
+└── ...
+```
+
+`herding/` becomes the **single source of truth** for geometry and dynamics.
+The Webots controllers and the training env both import from it, so when a
+constant changes in one place it changes everywhere — eliminating the
+sim/Webots-drift class of bugs.
+
+This means the existing `controllers/sheep/flocking.py` and
+`controllers/shepherd_dog/strombom.py` become thin shims that re-export
+from `herding/`. Webots controllers can import `herding/` because Webots
+adds the project root to `sys.path` at controller startup; we'll verify.
+
+---
+
+## 4. The Gymnasium environment — `training/herding_env.py`
+
+```python
+class HerdingEnv(gymnasium.Env):
+    metadata = {"render_modes": ["rgb_array", "human"]}
+
+    def __init__(self, n_sheep=3, max_steps=3000, dt=0.016, seed=None):
+        self.action_space      = Box(low=-1, high=1, shape=(2,), dtype=np.float32)
+        self.observation_space = Box(low=-1, high=1, shape=(28,), dtype=np.float32)
+        ...
+
+    def reset(self, *, seed=None, options=None):
+        # Random sheep positions in field \ pen corridor, dog near origin.
+        # Optional curriculum: options["n_sheep"] overrides.
+        ...
+
+    def step(self, action):
+        vx, vy = action  # high-level velocity intent
+        # Convert to wheel speeds via the same drive_action inverse used in Webots
+        wL, wR = self._diffdrive_inverse(vx, vy, self.dog_state)
+        self.dog_state = self._integrate_diffdrive(self.dog_state, wL, wR, self.dt)
+        # Step every sheep one boid step (vectorized in flocking_sim.py)
+        self.sheep_state = self._step_sheep(self.sheep_state, self.dog_state)
+        # Update penned set, compute reward, observation, done flags
+        ...
+```
+
+Key points:
+- **Vectorised sheep update**: re-implements `flocking.py` in numpy so 100
+  parallel envs with 5 sheep each take ms, not seconds. Numerical parity
+  with the scalar version is asserted in `parity_test.py`.
+- **Same diff-drive integrator** for the dog as Webots will see at
+  inference. Wall + pen-fence collisions clamp position (a Webots-realistic
+  no-pass-through approximation).
+- **Domain randomization** in reset: sheep count, spawn positions, sheep
+  flock-parameter jitter (±10 % on FLEE_DIST, COHESION_DIST, etc.) for
+  robustness.
+
+---
+
+## 5. Training pipeline — `training/train_ppo.py`
+
+- **Algorithm**: SB3 `PPO` with `MlpPolicy`, `n_steps=2048`, `batch_size=256`,
+  `n_epochs=10`, `gamma=0.995`, `gae_lambda=0.95`, `clip_range=0.2`,
+  `ent_coef=0.005`, `vf_coef=0.5`, `learning_rate=3e-4`.
+- **Vec envs**: `SubprocVecEnv` × 16 parallel envs (the env is pure numpy
+  so subprocs are CPU-cheap).
+- **Normalization**: `VecNormalize(norm_obs=True, norm_reward=True,
+  clip_obs=10.0)`. Pickled alongside the policy zip — both required at
+  inference.
+- **Callbacks**:
+  - `CheckpointCallback` every 100 k steps.
+  - `EvalCallback` on a separate eval env (no normalization-update) every
+    50 k steps; logs success rate and time-to-pen to TensorBoard.
+  - Custom `CurriculumCallback`: bumps `n_sheep` when eval success rate
+    crosses the stage threshold for 3 consecutive evals.
+- **Determinism for debugging**: seed-pinned eval env so regressions are
+  catchable.
+
+---
+
+## 6. Webots integration — RL inference path
+
+`controllers/shepherd_dog/shepherd_dog.py` becomes a thin wrapper:
+
+```python
+MODE = os.environ.get("HERDING_MODE", "rl")  # "rl" | "strombom"
+
+if MODE == "rl":
+    policy = policy_loader.load("training/runs/best/policy.zip",
+                                "training/runs/best/vecnormalize.pkl")
+    obs_fn = build_obs   # from herding/obs.py
+else:
+    obs_fn = None        # strombom path uses sheep_positions directly
+
+while robot.step(timestep) != -1:
+    receive_messages()
+    if MODE == "rl":
+        obs = obs_fn(dog_xy, dog_heading, sheep_positions, ...)
+        action, _ = policy.predict(obs, deterministic=True)
+        vx, vy = action.tolist()
+    else:
+        vx, vy, mode, dbg = compute_action_debug(dog_xy, sheep_positions, PEN_ENTRY)
+        # plus existing rescue/cooldown/EMA layer
+    drive_action(vx, vy, ...)
+```
+
+A **safety supervisor** wraps the RL output: if `obs` indicates the dog is
+< 0.6 m from a wall, override with the existing wall-escape behavior
+(reverse + turn). This is a hard guarantee diff-drive needs because PPO
+may not discover wall-escape reliably from on-policy data.
+
+`policy_loader.py` handles the SB3 import lazily so the controller still
+works with `MODE=strombom` even if SB3 is not installed in the Webots
+Python environment.
+
+---
+
+## 7. Optional extensions (post-baseline)
+
+- **External pen** (Section 2.1 option B): edit `field.wbt` to extend the
+  south wall hole into an external L-shaped pen with its own walls; update
+  `herding/geometry.py`; retrain stage 3 only.
+- **Lidar observation**: replace symbolic obs with 36-bin downsampled
+  lidar + ego state; train end-to-end. Useful as the "extra merit"
+  dimension in the project doc.
+- **Two-dog mode**: make env multi-agent, train with `MAPPO`-style shared
+  critic or independent PPO. The proto already supports multiple dog
+  instances; world only needs a second `ShepherdDog` node.
+- **Mecanum comparison**: swap the dog proto for a mecanum variant; same
+  policy, different `_integrate_diffdrive` (becomes holonomic).
+- **Sheep flock size scaling**: 5, 10, 20 — the obs is order-invariant so
+  the same policy generalises; just curriculum further.
+
+---
+
+## 8. Risks & mitigations
+
+| Risk | Mitigation |
+|---|---|
+| Sim-to-Webots gap (sheep dynamics, wall friction) | `parity_test.py` asserts trajectory match within tolerance for fixed seeds; if it fails, fix the env, not the policy |
+| Dog learns to wall-pin sheep against fence | Add `w_collide` penalty + min-sheep-to-wall term in obs; curriculum from 1 sheep first |
+| PPO oscillation collapses into spinning | Action smoothing in env step (EMA on `(vx, vy)`, mirroring `ACTION_SMOOTH=0.35` from Strömbom controller); add a small `‖a_t − a_{t-1}‖` penalty to the reward |
+| Pen approach failures (sheep refuse gate) | Reuse the existing `deadzone_rescue` as a *scripted fallback* triggered when a sheep has been deadzoned > 200 steps — RL handles the common case, scripted handles the corner |
+| Gym version mismatch (gymnasium vs gym) | Lock to `gymnasium>=0.29`, `stable-baselines3>=2.3` in requirements |
+
+---
+
+## 9. Milestones (suggested order of implementation)
+
+1. **M0 — Refactor** (no behavior change): create `herding/` package, move
+   constants out of `flocking.py`/`strombom.py`, leave shims; verify
+   Webots still runs Strömbom unchanged. Add `dog_behavior_log.csv` to
+   `.gitignore`.
+2. **M1 — Env & parity**: `herding_env.py`, `parity_test.py`. Asserts
+   sheep + dog trajectories match Webots within tolerance for 5 fixed
+   seeds. *Done when parity test green.*
+3. **M2 — PPO baseline**: train Stage 0 (1 sheep) for 0.5 M steps; eval
+   in env at ≥ 90 % success.
+4. **M3 — Webots inference**: load Stage 0 policy in `shepherd_dog.py`
+   with `HERDING_MODE=rl`; verify the dog herds 1 sheep into the pen in
+   the actual Webots world. *This is the sim-to-sim transfer gate.*
+5. **M4 — Curriculum**: stages 1–3, ~5 M steps total, with checkpoints
+   and eval logs.
+6. **M5 — Strömbom comparison**: run both controllers on a fixed eval
+   suite (same seeds, 1/2/3 sheep), log success rate and time-to-pen.
+   This is a deliverable for the project's "quantitative evaluation"
+   goal.
+7. **M6 — Documentation**: a short README in `training/` showing how to
+   train, evaluate, and switch modes in Webots.
+
+Each milestone is independently demoable. M0–M3 is the critical path to
+"RL works in Webots"; M4–M6 polishes it for the project deliverable.
+
+---
+
+## 10. Decisions (locked in by implementation)
+
+- **Pen layout**: option B (external pen). The pen sits south of the
+  field at x ∈ [10, 13], y ∈ [-22, -15] and is reached through the
+  existing 3 m gap in the south stone wall. The old in-field
+  quarantine fence is gone and the wooden gate is modeled as
+  swung-open and parked on the west gate post so the corridor is
+  unobstructed. This kills the deadzone class entirely.
+- **Flock size**: 1..10 sheep, sampled uniformly each reset. The order-
+  invariant observation (CoM, dispersion, polar histogram) lets a
+  single policy generalise across the whole range. A curriculum widens
+  ``max_n_sheep`` from 1 to 10 over training to keep early exploration
+  tractable.
+- **Single-sheep mode**: handled by the same policy (n_sheep=1 is the
+  first stage of the curriculum and stays in the training distribution
+  throughout). No separate model.
+- **Hardware**: GPU for training. SubprocVecEnv × 16 on CPU feeds an
+  MlpPolicy on GPU; ~2–3 h for the full curriculum.
+
+## 11. What was built
+
+```
+herding/                    # single source of truth, importable from both
+  geometry.py               # field/pen constants, latch helpers, robot specs
+  flocking_sim.py           # Reynolds boid step (matches Webots controller)
+  diffdrive.py              # diff-drive kinematics + velocity↔wheels
+  obs.py                    # observation builder: 28 order-invariant features
+                            # + closest/rearmost-sheep offsets from the dog
+  strombom.py               # collect/drive heuristic (baseline + fallback)
+
+worlds/field.wbt            # external pen south of field, 10 sheep slots,
+                            # gate parked open, in-field fence removed
+
+controllers/sheep/sheep.py            # imports from herding/, latches on
+                                      # is_penned_position
+controllers/shepherd_dog/
+  shepherd_dog.py           # mode switch (HERDING_MODE=rl|strombom),
+                            # safety supervisor for DOG_SOUTH_LIMIT
+  policy_loader.py          # lazy SB3 zip + VecNormalize loader
+  strombom.py               # shim re-exporting herding.strombom
+
+training/
+  herding_env.py            # gymnasium.Env, action smoothing, reward shaping
+  train_ppo.py              # SB3 PPO with VecNormalize, eval, checkpoints,
+                            # curriculum callback
+  eval.py                   # success-rate / time-to-pen across n_sheep
+  parity_test.py            # shape, determinism, baseline-rollout smoke test
+  configs/ppo_default.yaml
+  requirements.txt
+  README.md                 # how to train, evaluate, switch modes in Webots
+```
+
+## 12. To run
+
+```bash
+# 1. Install deps (CUDA-enabled torch wheel for GPU)
+pip install -r training/requirements.txt
+
+# 2. Smoke test
+python -m training.parity_test
+
+# 3. Train (5 M steps, ~2–3 h on a single GPU)
+python -m training.train_ppo --out-dir training/runs/baseline
+
+# 4. Evaluate vs Strömbom
+python -m training.eval --policy training/runs/baseline/best
+python -m training.eval --policy strombom
+
+# 5. Run in Webots
+export HERDING_MODE=rl
+export HERDING_POLICY_DIR=$PWD/training/runs/baseline/best
+webots worlds/field.wbt
+```
@@ -0,0 +1,117 @@
+"""Collect (obs, action) demonstrations from the sequential teacher.
+
+Runs the sequential algorithm across a grid of (n_sheep, seed) combos
+at full difficulty, logs the (observation, action) pair every Nth step,
+and saves successful trajectories to a numpy ``.npz`` for behavior
+cloning. Failed trajectories are dropped by default — we only want to
+teach the policy from good examples.
+
+Usage::
+
+    python -m tools.collect_demos --out training/demos.npz
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+import time
+from pathlib import Path
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
+if _PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, _PROJECT_ROOT)
+
+import numpy as np
+
+from herding.geometry import PEN_ENTRY
+from herding.sequential import compute_action
+from training.herding_env import HerdingEnv
+
+
+def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int):
+    env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
+                    difficulty=1.0, seed=seed)
+    obs, _ = env.reset(seed=seed)
+    obs_list, action_list = [], []
+    for step in range(max_steps):
+        positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
+                     for i in range(env.n_sheep) if not env.sheep_penned[i]}
+        if not positions:
+            break
+        vx, vy, _mode = compute_action(
+            (env.dog_x, env.dog_y), positions, PEN_ENTRY,
+        )
+        action = np.array([vx, vy], dtype=np.float32)
+        if step % subsample == 0:
+            obs_list.append(obs.copy())
+            action_list.append(action.copy())
+        obs, _r, term, trunc, _info = env.step(action)
+        if term or trunc:
+            break
+    success = bool(env.sheep_penned.all())
+    return (
+        np.asarray(obs_list, dtype=np.float32),
+        np.asarray(action_list, dtype=np.float32),
+        success,
+        env.steps,
+    )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--out", default="training/demos.npz")
+    parser.add_argument("--n-sheep-list", default="1,2,3,5,8,10")
+    parser.add_argument("--seeds-per-n", type=int, default=15)
+    parser.add_argument("--max-steps", type=int, default=30000)
+    parser.add_argument("--subsample", type=int, default=5,
+                        help="Keep every Nth (obs, action) pair.")
+    parser.add_argument("--keep-failures", action="store_true",
+                        help="Include partial-success trajectories. Default off.")
+    args = parser.parse_args()
+
+    n_sheep_list = [int(x) for x in args.n_sheep_list.split(",")]
+    print(f"[demos] grid: n_sheep={n_sheep_list}, seeds={args.seeds_per_n}, "
+          f"max_steps={args.max_steps}, subsample={args.subsample}")
+
+    all_obs, all_actions, all_meta = [], [], []
+    t_start = time.time()
+    n_success = 0; n_total = 0
+
+    for n in n_sheep_list:
+        for seed in range(args.seeds_per_n):
+            obs, actions, success, total_steps = collect_one(
+                n, seed, args.max_steps, args.subsample,
+            )
+            n_total += 1
+            if success:
+                n_success += 1
+            keep = success or args.keep_failures
+            if keep and len(obs) > 0:
+                all_obs.append(obs)
+                all_actions.append(actions)
+                all_meta.append((n, seed, len(obs), int(success), total_steps))
+            tag = "✓" if success else "✗"
+            print(f"  [{tag}] n={n:>2d} seed={seed:>2d}  steps={total_steps:>6d}  "
+                  f"logged={len(obs):>5d}")
+
+    if not all_obs:
+        raise RuntimeError("No trajectories kept — try --keep-failures.")
+
+    obs = np.concatenate(all_obs, axis=0)
+    actions = np.concatenate(all_actions, axis=0)
+    meta = np.array(all_meta, dtype=np.int32)
+
+    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
+    np.savez(args.out, obs=obs, actions=actions, meta=meta)
+
+    elapsed = time.time() - t_start
+    print(f"\n=== {n_success}/{n_total} trajectories successful ({100*n_success/n_total:.0f}%) ===")
+    print(f"=== {len(obs)} transitions saved to {args.out} ===")
+    print(f"=== obs={obs.shape}, actions={actions.shape}, elapsed={elapsed:.0f}s ===")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Launch Webots with N sheep enabled and the chosen controller mode.
+# Generates a temporary world file in worlds/field_test.wbt with sheep
+# beyond N commented out, sets the env vars the dog controller reads,
+# then execs Webots on it.
+#
+# Usage:
+#   tools/run_webots.sh [N] [MODE]
+#     N    : number of active sheep (1..10), default 10
+#     MODE : "rl" | "strombom" | "sequential", default "rl"
+#
+# Examples:
+#   tools/run_webots.sh 10 rl         # BC-trained RL policy, 10 sheep
+#   tools/run_webots.sh 5 sequential  # the analytic teacher, 5 sheep
+#   tools/run_webots.sh 3 strombom    # canonical baseline, 3 sheep
+#
+# Notes:
+# * The RL mode loads training/runs/bc_pretrained/policy.zip by default.
+#   Override via HERDING_POLICY_DIR=/path/to/run env var.
+# * Conda env "tir" must be active (provides stable-baselines3 + torch).
+
+set -e
+N=${1:-10}
+MODE=${2:-rl}
+
+# Validate N as a plain decimal integer before it reaches arithmetic
+# evaluation (which would otherwise expand arbitrary expressions), and
+# force base 10 so inputs like "08" are not parsed as octal.
+if ! [[ "$N" =~ ^[0-9]+$ ]]; then
+    echo "N must be 1..10, got $N" >&2; exit 1
+fi
+N=$((10#$N))
+if (( N < 1 || N > 10 )); then
+    echo "N must be 1..10, got $N" >&2; exit 1
+fi
+case "$MODE" in
+    rl|strombom|sequential) ;;
+    *) echo "MODE must be rl|strombom|sequential, got '$MODE'" >&2; exit 1 ;;
+esac
+
+ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
+SRC="$ROOT/worlds/field.wbt"
+DST="$ROOT/worlds/field_test.wbt"
+
+cp "$SRC" "$DST"
+# Comment out sheep N+1..10 by prefixing the matching Sheep { ... } line.
+# A C-style loop is used instead of `seq`: BSD/macOS `seq 11 10` counts
+# DOWN (11, 10) and would wrongly disable sheep when N=10.
+# `sed -i.bak` (suffix attached) is the form accepted by both GNU and BSD sed.
+for (( i = N + 1; i <= 10; i++ )); do
+    sed -i.bak "s|^Sheep .* \"sheep${i}\".*|# &|" "$DST"
+done
+rm -f "$DST.bak"
+
+# grep -c exits non-zero on zero matches; don't let `set -e` kill the
+# script before the summary is printed.
+active=$(grep -c '^Sheep' "$DST" || true)
+echo "------------------------------------------------------------"
+echo "World      : $DST"
+echo "Mode       : $MODE"
+echo "Sheep      : $active active"
+echo "Policy dir : ${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_pretrained}"
+echo "------------------------------------------------------------"
+
+# Webots strips HERDING_* env vars from controller subprocesses in some
+# setups, so we also write a runtime config file the controller reads.
+RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_pretrained}"
+cat > "$ROOT/herding_runtime.cfg" <<EOF
+HERDING_MODE=$MODE
+HERDING_POLICY_DIR=$RESOLVED_POLICY_DIR
+EOF
+
+export HERDING_MODE="$MODE"
+export HERDING_POLICY_DIR="$RESOLVED_POLICY_DIR"
+
+exec webots "$DST"
@@ -0,0 +1,115 @@
+# Shepherd Herding — Training & Inference
+
+This directory holds the Gymnasium environment, PPO training script, and
+evaluation harness for the RL shepherd-dog policy. The Webots controller
+in `controllers/shepherd_dog/` loads the resulting policy at inference
+time when launched with `HERDING_MODE=rl`.
+
+## Layout
+
+```
+training/
+├── herding_env.py        # gymnasium.Env — the dog is the agent
+├── train_ppo.py          # SB3 PPO entry point (vec envs, eval, curriculum)
+├── eval.py               # rollout success-rate / time-to-pen across flock sizes
+├── parity_test.py        # smoke test: shapes, determinism, baseline rollout
+├── configs/ppo_default.yaml
+├── runs/                 # tensorboard + checkpoints (gitignored)
+└── requirements.txt
+```
+
+## Setup
+
+```bash
+python -m venv .venv && source .venv/bin/activate
+pip install -r training/requirements.txt
+```
+
+CPU is the default and also the recommended device — SB3's PPO with an
+MLP policy of this size runs faster on CPU than on GPU because the
+bottleneck is rollout collection, not gradient compute. The 16 SubprocVecEnv
+workers saturate ~16 CPU cores. To force CUDA anyway, pass `--device cuda`.
+
+## Train
+
+```bash
+# Full curriculum (1 → 10 sheep). The step budget is `total_timesteps` in the
+# config (~5M steps took ~2–3h in earlier runs). Runs on CPU unless --device is given.
+python -m training.train_ppo \
+    --config training/configs/ppo_default.yaml \
+    --out-dir training/runs/baseline
+```
+
+Outputs:
+- `training/runs/baseline/best/best_model.zip` — best eval checkpoint
+- `training/runs/baseline/best/vecnormalize.pkl` — observation stats
+- `training/runs/baseline/checkpoints/ppo_*.zip` — periodic checkpoints
+- `training/runs/baseline/tb/` — TensorBoard logs (`tensorboard --logdir`)
+
+To resume:
+
+```bash
+python -m training.train_ppo --resume training/runs/baseline/checkpoints/ppo_500000_steps.zip
+```
+
+## Evaluate
+
+```bash
+# RL policy
+python -m training.eval --policy training/runs/baseline/best
+
+# Strömbom baseline
+python -m training.eval --policy strombom
+
+# Sequential analytic teacher
+python -m training.eval --policy sequential
+```
+
+Prints success rate, mean steps, and mean penned-count per flock size.
+Use the same `--n-seeds` for both to get a fair RL-vs-Strömbom A/B.
+
+## Parity / smoke test
+
+```bash
+python -m training.parity_test
+```
+
+Checks observation/action shapes, deterministic seeding, the curriculum
+sampler, and a 400-step Strömbom rollout. Run this before every long
+training job — catches the boring class of bugs in seconds.
+
+## Run the policy in Webots
+
+1. Train (above) — produces `training/runs/<name>/best/`.
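+2. In the shell that launches Webots, set the dog controller's environment
+   variables (or use `tools/run_webots.sh`, which sets them and also writes
+   `herding_runtime.cfg` for setups where Webots strips `HERDING_*` variables):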
+
+   ```bash
+   export HERDING_MODE=rl
+   export HERDING_POLICY_DIR=$(pwd)/training/runs/baseline/best
+   webots worlds/field.wbt
+   ```
+
+   Or set them via Webots' controller args / a `.wbproj` if you prefer.
+
+3. To force the Strömbom baseline (same world, same controller):
+
+   ```bash
+   export HERDING_MODE=strombom
+   webots worlds/field.wbt
+   ```
+
+If `HERDING_MODE=rl` but the policy can't be loaded (SB3 not installed,
+zip missing, etc.), the controller logs the error and falls back to
+Strömbom automatically.
+
+## Curriculum knobs
+
+The default schedule in `configs/ppo_default.yaml` widens
+`max_n_sheep` over training. Each reset samples `n_sheep ~ U[1,
+max_n_sheep]`, so the final policy has seen every flock size from 1 to
+10 in proportion. To pin a specific size, instantiate the env with
+`HerdingEnv(n_sheep=N)` (see `eval.py`).
+
+## Reward shaping
+
+Weights live in class attributes on `HerdingEnv`. Tune from the 1-sheep
+curriculum first — if the dog can't herd a single sheep cleanly, raising
+`W_PROGRESS` or lowering `W_TIME` is usually the fix. For multi-sheep
+collapse modes (dog spins between sheep), increase `W_COMPACT` so
+tightening the flock pays.
@@ -0,0 +1,218 @@
+"""Behavior cloning of the sequential teacher into an SB3-compatible policy.
+
+Trains the policy network (mean-action head) of an SB3 ``MlpPolicy`` to
+mimic the demonstrations collected by ``tools.collect_demos``. The
+saved zip is loadable via ``PPO.load(...)`` and can be passed to
+``train_ppo.py --resume`` for fine-tuning.
+
+Why this works: the teacher (sequential single-target driving) solves
+n=10 at 80%+ in our env. BC gives the RL a competent starting policy,
+so PPO doesn't have to discover behavior from scratch — it only has to
+*refine* the teacher's strategy via the sparse pen reward.
+
+Usage::
+
+    python -m training.bc_pretrain \\
+        --demos training/demos.npz \\
+        --out training/runs/bc_pretrained
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+import time
+from pathlib import Path
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
+if _PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, _PROJECT_ROOT)
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+
+from stable_baselines3 import PPO
+from stable_baselines3.common.vec_env import DummyVecEnv
+
+from training.herding_env import HerdingEnv
+
+
+def build_model(net_arch_pi, net_arch_vf, log_std_init: float):
+    """Build a fresh SB3 PPO with the same architecture as train_ppo.
+
+    We only need the policy to load weights into; PPO's training-loop
+    plumbing isn't used during BC.
+    """
+    env = DummyVecEnv([lambda: HerdingEnv()])
+    model = PPO(
+        "MlpPolicy", env,
+        policy_kwargs=dict(
+            net_arch=dict(pi=net_arch_pi, vf=net_arch_vf),
+            log_std_init=log_std_init,
+        ),
+        verbose=0,
+    )
+    return model, env
+
+
+def policy_forward_mean(policy, obs_batch):
+    """Return the policy's deterministic mean action for a batch.
+
+    SB3's ActorCriticPolicy doesn't expose this directly — it goes
+    through a Distribution wrapper. We replicate the forward path:
+    extract_features → mlp_extractor → action_net.
+    """
+    features = policy.extract_features(obs_batch)
+    if isinstance(features, tuple):
+        # SB3 ≥ 2.0 sometimes returns (pi_features, vf_features)
+        pi_features = features[0]
+    else:
+        pi_features = features
+    latent_pi, _latent_vf = policy.mlp_extractor(pi_features)
+    return policy.action_net(latent_pi)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--demos", default="training/demos.npz")
+    parser.add_argument("--out", default="training/runs/bc_pretrained")
+    parser.add_argument("--epochs", type=int, default=60)
+    parser.add_argument("--batch-size", type=int, default=256)
+    parser.add_argument("--lr", type=float, default=1e-3)
+    parser.add_argument("--val-split", type=float, default=0.1)
+    parser.add_argument("--net-arch", default="256,256",
+                        help="Comma-separated hidden layer widths.")
+    parser.add_argument("--log-std-init", type=float, default=0.5)
+    parser.add_argument("--cos-weight", type=float, default=1.0,
+                        help="Weight on (1 - cosine similarity) loss term. "
+                             "MSE alone shrinks policy output toward zero "
+                             "(zero-magnitude action minimises mean squared "
+                             "error against ±1 targets); cos loss keeps "
+                             "the action pointed correctly even at small "
+                             "magnitudes.")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--device", default="cpu")
+    args = parser.parse_args()
+
+    torch.manual_seed(args.seed)
+    np.random.seed(args.seed)
+
+    # --- Load demos ---
+    print(f"[bc] loading demos from {args.demos}")
+    data = np.load(args.demos)
+    obs = data["obs"].astype(np.float32)
+    actions = data["actions"].astype(np.float32)
+    meta = data["meta"]
+    print(f"[bc] obs={obs.shape}  actions={actions.shape}  trajectories={len(meta)}")
+    if obs.size == 0:
+        raise RuntimeError("Empty demo file.")
+
+    # Action sanity check — sequential outputs unit vectors.
+    a_norms = np.linalg.norm(actions, axis=1)
+    print(f"[bc] action L2 norm: mean={a_norms.mean():.3f}  "
+          f"min={a_norms.min():.3f}  max={a_norms.max():.3f}")
+
+    # --- Train/val split ---
+    n = len(obs)
+    perm = np.random.permutation(n)
+    n_val = int(n * args.val_split)
+    val_idx, train_idx = perm[:n_val], perm[n_val:]
+    print(f"[bc] train={len(train_idx)}  val={len(val_idx)}")
+
+    obs_t = torch.from_numpy(obs)
+    act_t = torch.from_numpy(actions)
+    train_loader = DataLoader(
+        TensorDataset(obs_t[train_idx], act_t[train_idx]),
+        batch_size=args.batch_size, shuffle=True,
+    )
+    val_loader = DataLoader(
+        TensorDataset(obs_t[val_idx], act_t[val_idx]),
+        batch_size=args.batch_size, shuffle=False,
+    )
+
+    # --- Build model ---
+    net_arch_pi = [int(x) for x in args.net_arch.split(",")]
+    net_arch_vf = net_arch_pi[:]
+    model, _env = build_model(net_arch_pi, net_arch_vf, args.log_std_init)
+    policy = model.policy.to(args.device)
+    optimizer = optim.Adam(policy.parameters(), lr=args.lr)
+
+    # --- Train ---
+    print(f"[bc] training: epochs={args.epochs}  batch={args.batch_size}  "
+          f"lr={args.lr}  device={args.device}")
+    t_start = time.time()
+    best_val = float("inf")
+
+    def combined_loss(pred, target):
+        mse = nn.functional.mse_loss(pred, target)
+        p_norm = pred.norm(dim=1).clamp_min(1e-6)
+        t_norm = target.norm(dim=1).clamp_min(1e-6)
+        cos_sim = (pred * target).sum(dim=1) / (p_norm * t_norm)
+        cos_loss = (1.0 - cos_sim).mean()
+        return mse + args.cos_weight * cos_loss, mse.item(), cos_sim.mean().item()
+
+    for epoch in range(args.epochs):
+        policy.train()
+        train_loss_total, train_mse_total, train_cos_total, train_count = 0.0, 0.0, 0.0, 0
+        for ob_batch, act_batch in train_loader:
+            ob_batch = ob_batch.to(args.device)
+            act_batch = act_batch.to(args.device)
+            optimizer.zero_grad()
+            mean_action = policy_forward_mean(policy, ob_batch)
+            loss, mse_val, cos_val = combined_loss(mean_action, act_batch)
+            loss.backward()
+            optimizer.step()
+            bs = ob_batch.size(0)
+            train_loss_total += loss.item() * bs
+            train_mse_total += mse_val * bs
+            train_cos_total += cos_val * bs
+            train_count += bs
+        train_mse = train_mse_total / max(1, train_count)
+        train_cos = train_cos_total / max(1, train_count)
+
+        policy.eval()
+        val_total, val_count = 0.0, 0
+        cos_sim_total = 0.0
+        with torch.no_grad():
+            for ob_batch, act_batch in val_loader:
+                ob_batch = ob_batch.to(args.device)
+                act_batch = act_batch.to(args.device)
+                mean_action = policy_forward_mean(policy, ob_batch)
+                bs = ob_batch.size(0)
+                val_total += nn.functional.mse_loss(
+                    mean_action, act_batch, reduction="sum",
+                ).item()
+                # Cosine similarity in action space — useful sanity for
+                # "is the policy pointing the same way as the teacher?".
+                m_norm = mean_action.norm(dim=1).clamp_min(1e-6)
+                a_norm = act_batch.norm(dim=1).clamp_min(1e-6)
+                cos = (mean_action * act_batch).sum(dim=1) / (m_norm * a_norm)
+                cos_sim_total += cos.sum().item()
+                val_count += bs
+        val_mse = val_total / max(1, val_count) / actions.shape[1]
+        cos_sim = cos_sim_total / max(1, val_count)
+        print(f"  epoch {epoch+1:>2d}/{args.epochs}  "
+              f"train_mse={train_mse:.4f}  train_cos={train_cos:+.3f}  "
+              f"val_mse={val_mse:.4f}  val_cos={cos_sim:+.3f}")
+        if val_mse < best_val:
+            best_val = val_mse
+
+    elapsed = time.time() - t_start
+    print(f"[bc] done in {elapsed:.0f}s  best_val_mse={best_val:.4f}")
+
+    # --- Save ---
+    out_dir = Path(args.out)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    model.save(out_dir / "policy.zip")
+    print(f"[bc] saved policy to {out_dir / 'policy.zip'}")
+    print(f"\n[bc] verify with:  "
+          f"python -m training.eval --policy {out_dir}")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,14 +0,0 @@
-{
-    "W_PER_SHEEP": 2.0,
-    "W_ALIGN": 0.05,
-    "W_PEN_BONUS": 10.0,
-    "W_COMPLETE": 100.0,
-    "W_STEP_COST": 0.02,
-    "W_COMPACT": 0.0,
-    "W_WALL_TOUCH": 0.0,
-    "WALL_TOUCH_BUFFER": 0.4,
-    "ALIGN_SHAPE": "standoff",
-    "ALIGN_GATED": true,
-    "ENTRY_AWARE": true,
-    "ent_coef": 0.02
-}
@@ -0,0 +1,52 @@
+# PPO hyperparameters for the herding env. Tuned for a 28-D obs / 2-D
+# continuous action space with 16 parallel envs (see n_envs below). These
+# start from SB3 defaults, with gamma raised to 0.995 for longer credit
+# assignment and a higher entropy bonus (ent_coef) to keep exploration
+# alive while the curriculum expands the flock size.
+
+# --- PPO ---
+learning_rate: 3.0e-4
+n_steps: 2048              # rollout length per env before each update
+batch_size: 256
+n_epochs: 10
+gamma: 0.995
+gae_lambda: 0.95
+clip_range: 0.2
+ent_coef: 0.05             # was 0.01 — earlier runs collapsed to ~0 actions
+vf_coef: 0.5
+max_grad_norm: 0.5
+target_kl: null            # disable early-stop on KL
+
+# --- Network ---
+policy: MlpPolicy
+net_arch_pi: [128, 128]
+net_arch_vf: [128, 128]
+log_std_init: 0.5          # std≈1.6 instead of default 1.0 — more exploration
+
+# --- Training schedule ---
+total_timesteps: 10_000_000
+n_envs: 16
+checkpoint_freq: 500_000   # in env steps
+eval_freq: 100_000         # in env steps
+n_eval_episodes: 20
+
+# --- Curriculum (max-n_sheep schedule, in env steps) ---
+# Each entry: at step s, raise the env's max_n_sheep to k. The env samples
+# uniformly from [1, max_n_sheep] each reset, so this widens the
+# distribution gradually rather than swapping fixed sizes.
+#
+# State-space curriculum: difficulty controls sheep spawn area
+# (0 = sheep spawn just north of gate, 1 = sheep spawn anywhere in field).
+# Plus the existing flock-size curriculum.
+#
+# The two together let the policy first learn "what penning looks like"
+# in a regime where random exploration reliably triggers it, then
+# gradually generalise to the deployment distribution.
+curriculum:
+  - { step: 0,          max_n_sheep: 1, difficulty: 0.0 }
+  - { step: 1_000_000,  max_n_sheep: 1, difficulty: 0.3 }
+  - { step: 2_000_000,  max_n_sheep: 2, difficulty: 0.5 }
+  - { step: 4_000_000,  max_n_sheep: 3, difficulty: 0.8 }
+  - { step: 6_000_000,  max_n_sheep: 5, difficulty: 1.0 }
+  - { step: 8_000_000,  max_n_sheep: 8, difficulty: 1.0 }
+  - { step: 9_000_000,  max_n_sheep: 10, difficulty: 1.0 }
@@ -0,0 +1,136 @@
+"""Evaluate a trained PPO policy (or the Strömbom / sequential analytic baselines) on the env.
+
+Reports success rate and time-to-pen across a fixed seed grid for each
+flock size 1..MAX_SHEEP. Used to produce the M5 quantitative comparison
+table mentioned in plan.md.
+
+Usage::
+
+    python -m training.eval --policy training/runs/latest/best
+    python -m training.eval --policy strombom
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from pathlib import Path
+from statistics import mean, stdev
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
+if _PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, _PROJECT_ROOT)
+
+import numpy as np
+
+from herding.geometry import MAX_SHEEP, PEN_ENTRY
+from herding.strombom import compute_action as strombom_action
+from herding.sequential import compute_action as sequential_action
+from training.herding_env import HerdingEnv
+
+
+def rollout(env: HerdingEnv, predict_fn, max_steps: int) -> dict:
+    obs, _ = env.reset()
+    success = False
+    for t in range(max_steps):
+        action = predict_fn(env, obs)
+        obs, _r, terminated, truncated, info = env.step(action)
+        if terminated or truncated:
+            success = bool(info.get("is_success", False))
+            return {"success": success, "steps": info.get("steps", t + 1),
+                    "n_penned": info.get("n_penned", 0)}
+    return {"success": False, "steps": max_steps, "n_penned": int(env.sheep_penned.sum())}
+
+
+def make_analytic_predictor(action_fn):
+    def _predict(env, _obs):
+        positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
+                     for i in range(env.n_sheep)
+                     if not env.sheep_penned[i]}
+        vx, vy, _mode = action_fn((env.dog_x, env.dog_y), positions, PEN_ENTRY)
+        return np.array([vx, vy], dtype=np.float32)
+    return _predict
+
+
+# Backwards-compat alias.
+def make_strombom_predictor():
+    return make_analytic_predictor(strombom_action)
+
+
+def make_policy_predictor(model, vecnorm):
+    def _predict(_env, obs):
+        if vecnorm is not None:
+            obs_b = vecnorm.normalize_obs(np.asarray(obs, dtype=np.float32).reshape(1, -1))
+        else:
+            obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1)
+        action, _ = model.predict(obs_b, deterministic=True)
+        return action[0]
+    return _predict
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--policy", required=True,
+                        help="Either 'strombom' or path to an SB3 run directory.")
+    parser.add_argument("--n-seeds", type=int, default=10)
+    parser.add_argument("--max-steps", type=int, default=5000)
+    parser.add_argument("--max-flock", type=int, default=MAX_SHEEP)
+    # 1.0 = deployment distribution (sheep anywhere in field).
+    # Lower values use the training-curriculum spawn band (sheep near gate).
+    parser.add_argument("--difficulty", type=float, default=1.0)
+    args = parser.parse_args()
+
+    if args.policy == "strombom":
+        predict = make_analytic_predictor(strombom_action)
+    elif args.policy == "sequential":
+        predict = make_analytic_predictor(sequential_action)
+    else:
+        from stable_baselines3 import PPO
+        run = Path(args.policy)
+        # Resolve to a zip: directory of checkpoints, or a direct zip path.
+        if run.is_file():
+            zip_path = run
+        else:
+            for name in ("best_model.zip", "policy.zip", "final.zip"):
+                if (run / name).exists():
+                    zip_path = run / name
+                    break
+            else:
+                raise FileNotFoundError(
+                    f"No checkpoint found in {run} (tried best_model.zip, "
+                    f"policy.zip, final.zip)"
+                )
+        model = PPO.load(str(zip_path), device="auto")
+        vecnorm = None
+        vn_path = run / "vecnormalize.pkl"
+        if not vn_path.exists() and run.parent.name != "best":
+            vn_path = run.parent / "vecnormalize.pkl"
+        if vn_path.exists():
+            import pickle
+            with open(vn_path, "rb") as f:
+                vecnorm = pickle.load(f)
+            vecnorm.training = False
+            vecnorm.norm_reward = False
+        predict = make_policy_predictor(model, vecnorm)
+
+    print(f"{'n_sheep':>8} {'success%':>10} {'mean_steps':>12} {'mean_penned':>12}")
+    print("-" * 46)
+    for n in range(1, args.max_flock + 1):
+        successes, steps, penned = [], [], []
+        for seed in range(args.n_seeds):
+            env = HerdingEnv(n_sheep=n, max_steps=args.max_steps,
+                             difficulty=args.difficulty, seed=seed)
+            r = rollout(env, predict, args.max_steps)
+            successes.append(int(r["success"]))
+            steps.append(r["steps"])
+            penned.append(r["n_penned"])
+        sr = 100.0 * mean(successes)
+        ms = mean(steps)
+        mp = mean(penned)
+        print(f"{n:>8d} {sr:>9.1f}% {ms:>12.0f} {mp:>12.2f}")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,318 +1,96 @@
-"""
-Parity test: verify 2D training env matches Webots controller implementations.
+"""Parity smoke-test for the herding env.

-Tests:
-1. Observation building: HerdingEnv._obs() vs shepherd_dog_rl.build_obs()
-2. Dog drive: HerdingEnv._step_dog_substep() vs shepherd_dog_rl.drive() math
-3. Sheep drive: HerdingEnv._sheep_drive() vs sheep.py drive() math
+Verifies (a) all imports resolve, (b) the env's reset/step contract is
+correct, (c) resetting with the same seed gives the same initial
+observation, and (d) the Strömbom baseline can drive the env without
+crashing.
+
+Run::
+
+    python -m training.parity_test
 """

-import sys
+from __future__ import annotations
+
 import os
-import math
+import sys
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
+if _PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, _PROJECT_ROOT)
+
 import numpy as np

-# Make imports work from project root
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "controllers", "shepherd_dog_rl"))
-
-from herding_env import HerdingEnv
-
-# Re-implement the Webots functions standalone (no Webots dependency)
-FIELD = 15.0
-PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
-PEN_ENTRY  = np.array([11.5,  -8.0], dtype=np.float32)
-PEN_X = (10.0, 13.0)
-PEN_Y = (-15.0, -8.0)
-ENTRY_AWARE = True
+from herding.geometry import MAX_SHEEP, PEN_ENTRY
+from herding.obs import OBS_DIM
+from herding.strombom import compute_action
+from training.herding_env import HerdingEnv


-def webots_build_obs(dog_pos, sheep_positions, n_sheep, dog_heading):
-    """Standalone version of shepherd_dog_rl.py build_obs()."""
-    D = 2 * FIELD
-    active_pos = np.array(
-        [p for p in sheep_positions
-         if not (PEN_X[0] < p[0] < PEN_X[1] and PEN_Y[0] < p[1] < PEN_Y[1])],
-        dtype=np.float32
-    )
-    n_active = len(active_pos)
-    if n_active > 0:
-        com = active_pos.mean(axis=0)
-        d_from_com = np.linalg.norm(active_pos - com, axis=1)
-        sorted_idx = np.argsort(d_from_com)[::-1]
-        radius = float(d_from_com[sorted_idx[0]])
-        def nth(n):
-            return active_pos[sorted_idx[n]] if len(sorted_idx) > n else com
-        far1, far2, far3 = nth(0), nth(1), nth(2)
-    else:
-        com = PEN_CENTER.copy()
-        radius = 0.0
-        far1 = far2 = far3 = PEN_CENTER.copy()
-    frac_active = n_active / max(n_sheep, 1)
-    pen_ref = PEN_ENTRY if ENTRY_AWARE else PEN_CENTER
-    return np.array([
-        dog_pos[0] / FIELD, dog_pos[1] / FIELD,
-        (com[0] - dog_pos[0]) / D, (com[1] - dog_pos[1]) / D,
-        (far1[0] - com[0]) / D, (far1[1] - com[1]) / D,
-        (far2[0] - com[0]) / D, (far2[1] - com[1]) / D,
-        (far3[0] - com[0]) / D, (far3[1] - com[1]) / D,
-        (pen_ref[0] - com[0]) / D, (pen_ref[1] - com[1]) / D,
-        (pen_ref[0] - far1[0]) / D, (pen_ref[1] - far1[1]) / D,
-        radius / D,
-        frac_active,
-        math.cos(dog_heading), math.sin(dog_heading),
-    ], dtype=np.float32)
+def test_obs_action_shapes():
+    env = HerdingEnv(n_sheep=3, seed=0)
+    obs, info = env.reset()
+    assert obs.shape == (OBS_DIM,), obs.shape
+    assert obs.dtype == np.float32
+    obs2, r, term, trunc, info = env.step(np.array([0.5, 0.0], dtype=np.float32))
+    assert obs2.shape == (OBS_DIM,)
+    assert isinstance(r, float)
+    assert isinstance(term, bool) and isinstance(trunc, bool)
+    print("[ok] shapes")


-def webots_dog_drive(heading, speed_ms, wheel_r=0.038, k_turn=4.0,
-                     motor_max=70.0, axle_track=0.28):
-    """Standalone version of shepherd_dog_rl.py drive() kinematics.
+def test_reset_determinism():
+    """Reset with the same seed should give the same initial observation.

-    Returns (v_linear, omega, left_w, right_w).
+    We don't require step-determinism — PPO doesn't need it, and chasing
+    bit-exactness through the flocking jitter isn't worth the complexity.
    """
-    err = math.atan2(math.sin(heading), math.cos(heading))
-    fwd_ms = speed_ms * max(0.0, math.cos(err))
-    fwd_rad = fwd_ms / wheel_r
-    turn = k_turn * err
-    l = max(-motor_max, min(motor_max, fwd_rad - turn))
-    r = max(-motor_max, min(motor_max, fwd_rad + turn))
-    v = wheel_r * 0.5 * (r + l)
-    w = (wheel_r / axle_track) * (r - l)
-    return v, w, l, r
+    env_a = HerdingEnv(n_sheep=3, seed=42)
+    env_b = HerdingEnv(n_sheep=3, seed=42)
+    obs_a, _ = env_a.reset(seed=42)
+    obs_b, _ = env_b.reset(seed=42)
+    assert np.allclose(obs_a, obs_b), "Reset is non-deterministic for same seed"
+    print("[ok] reset determinism")


-def webots_sheep_drive(heading, speed_rad, wheel_r=0.031, k_turn=4.0,
-                       motor_max=22.0, axle_track=0.20):
-    """Standalone version of sheep.py drive() kinematics."""
-    err = math.atan2(math.sin(heading), math.cos(heading))
-    fwd = speed_rad * max(0.0, math.cos(err))
-    k = 4.0
-    l = max(-motor_max, min(motor_max, fwd - k * err))
-    r = max(-motor_max, min(motor_max, fwd + k * err))
-    v = wheel_r * 0.5 * (r + l)
-    w = (wheel_r / axle_track) * (r - l)
-    return v, w, l, r
+def test_curriculum_n_sheep_varies():
+    env = HerdingEnv(seed=0)
+    sizes = set()
+    for _ in range(40):
+        _, info = env.reset()
+        sizes.add(info["n_sheep"])
+    assert 1 in sizes
+    assert max(sizes) <= MAX_SHEEP
+    print(f"[ok] curriculum sampling — saw n_sheep in {sorted(sizes)}")


-def test_obs_parity():
-    """Test that build_obs matches between 2D env and Webots controller."""
-    print("=== Test 1: Observation Parity ===")
-    env = HerdingEnv(n_sheep=3)
-    # Set ENTRY_AWARE to match our webots constant
-    env.ENTRY_AWARE = ENTRY_AWARE
-    env.reset(seed=42)
-
-    # Manually set positions for a controlled test
-    env.dog_pos = np.array([5.0, 3.0], dtype=np.float32)
-    env.dog_heading = 1.2
-    env.sheep_pos[0] = np.array([0.0, 0.0], dtype=np.float32)
-    env.sheep_pos[1] = np.array([2.0, -1.0], dtype=np.float32)
-    env.sheep_pos[2] = np.array([11.5, -11.5], dtype=np.float32)  # penned
-    env.penned[0] = False
-    env.penned[1] = False
-    env.penned[2] = True
-
-    obs_2d = env._obs()
-
-    # Build equivalent Webots observation
-    sheep_positions = [
-        env.sheep_pos[0].tolist(),
-        env.sheep_pos[1].tolist(),
-        env.sheep_pos[2].tolist(),
-    ]
-    obs_webots = webots_build_obs(
-        env.dog_pos, sheep_positions, 3, env.dog_heading
-    )
-
-    max_diff = float(np.max(np.abs(obs_2d - obs_webots)))
-    print(f"  Max element-wise diff: {max_diff:.2e}")
-    if max_diff < 1e-6:
-        print("  PASS: Observations match")
-    else:
-        print("  FAIL: Observations differ!")
-        for i in range(18):
-            if abs(obs_2d[i] - obs_webots[i]) > 1e-6:
-                print(f"    dim {i}: 2d={obs_2d[i]:.6f}  webots={obs_webots[i]:.6f}")
-    return max_diff < 1e-6
+def test_strombom_drives_env():
+    """Quick functional check that the analytic baseline can play the env
+    without exploding. Not a success-rate test — just no errors / NaNs."""
+    env = HerdingEnv(n_sheep=2, max_steps=400, seed=1)
+    obs, _ = env.reset()
+    for t in range(400):
+        positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
+                     for i in range(env.n_sheep)
+                     if not env.sheep_penned[i]}
+        if not positions:
+            break
+        vx, vy, _mode = compute_action((env.dog_x, env.dog_y), positions, PEN_ENTRY)
+        obs, r, term, trunc, info = env.step(np.array([vx, vy], dtype=np.float32))
+        assert np.isfinite(obs).all(), f"NaN/Inf in obs at step {t}"
+        assert np.isfinite(r), f"NaN reward at step {t}"
+        if term or trunc:
+            break
+    print(f"[ok] strombom rollout — final n_penned={int(env.sheep_penned.sum())}/{env.n_sheep} after {env.steps} steps")


-def test_dog_drive_parity():
-    """Check that the 2D env's dog diff-drive matches the Webots controller.
-
-    For each (heading_error, speed) case the dog heading is reset to 0, one
-    0.016 s sub-step is run through ``env._step_dog_substep`` and the reported
-    ``v`` / ``w`` / ``left_w`` / ``right_w`` values are compared against
-    ``webots_dog_drive`` with an absolute tolerance of 1e-6.
-
-    Returns:
-        True when every case is within tolerance, False otherwise.
-    """
-    print("\n=== Test 2: Dog Drive Parity ===")
-    env = HerdingEnv(n_sheep=1)
-    env.reset(seed=42)
-
-    tol = 1e-6
-    all_pass = True
-    test_cases = [
-        # (heading_error, speed_ms) — target_heading relative to current heading
-        (0.0, 2.5),      # aligned, full speed
-        (0.5, 2.5),      # 30deg error
-        (1.5, 2.5),      # ~86deg error
-        (3.14, 2.5),     # ~180deg error — should spin in place
-        (0.0, 0.5),      # aligned, slow
-        (0.3, 1.0),      # small error, medium speed
-    ]
-
-    for heading_err, speed_ms in test_cases:
-        env.dog_heading = 0.0
-        target_heading = heading_err
-        action = np.array([
-            math.cos(target_heading), math.sin(target_heading)
-        ], dtype=np.float32) * (speed_ms / env.DOG_SPEED)
-
-        # 2D env step
-        dbg = env._step_dog_substep(action, 0.016)
-
-        # Webots equivalent
-        v_w, w_w, l_w, r_w = webots_dog_drive(heading_err, speed_ms)
-
-        # (2D value, Webots value) per quantity. The values are looked up
-        # explicitly: the previous eval(k + '_2d') lookup raised NameError for
-        # the "left" / "right" keys because the locals were named l_2d / r_2d.
-        pairs = {
-            "v": (dbg["v"], v_w),
-            "w": (dbg["w"], w_w),
-            "left": (dbg["left_w"], l_w),
-            "right": (dbg["right_w"], r_w),
-        }
-        diffs = {k: abs(a - b) for k, (a, b) in pairs.items()}
-        max_diff = max(diffs.values())
-        ok = max_diff < tol
-        status = "PASS" if ok else "FAIL"
-        print(f"  err={heading_err:.2f} spd={speed_ms:.1f}: {status} (max_diff={max_diff:.2e})")
-        if not ok:
-            for k, d in diffs.items():
-                if d > tol:
-                    val_2d, val_w = pairs[k]
-                    print(f"    {k}: 2d={val_2d:.6f} webots={val_w:.6f}")
-            all_pass = False
-
-    return all_pass
-
-
-def test_sheep_drive_parity():
-    """Check the sheep diff-drive wheel commands against the Webots controller.
-
-    For each (heading_error, wheel_speed) case sheep 0 is reset to the origin
-    with heading 0, one 0.016 s drive step is executed on the env, and the
-    expected left/right wheel speeds are compared against
-    ``webots_sheep_drive`` with an absolute tolerance of 1e-6.
-
-    Returns:
-        True when every case is within tolerance, False otherwise.
-    """
-    print("\n=== Test 3: Sheep Drive Parity ===")
-    env = HerdingEnv(n_sheep=1)
-    env.reset(seed=42)
-
-    tol = 1e-6
-    all_pass = True
-    test_cases = [
-        # (heading_error, speed_rad)
-        (0.0, 20.0),     # aligned, flee speed
-        (0.0, 3.0),      # aligned, wander speed
-        (0.5, 15.0),     # moderate error
-        (1.57, 10.0),    # 90deg — should spin in place
-        (3.14, 20.0),    # 180deg — should spin in place fast
-        (0.2, 8.0),      # small error, medium speed
-    ]
-
-    for heading_err, speed_rad in test_cases:
-        env.sheep_heading[0] = 0.0
-        env.sheep_pos[0] = np.array([0.0, 0.0], dtype=np.float32)
-        target_heading = heading_err
-
-        # 2D env: executed so the drive step itself is exercised; its return
-        # value is not needed for the wheel-speed comparison below.
-        env._sheep_drive(0, target_heading, speed_rad, 0.016)
-
-        # Webots equivalent
-        _v_w, _w_w, l_w, r_w = webots_sheep_drive(heading_err, speed_rad)
-
-        # NOTE(review): the "2D" wheel speeds are re-derived here from the
-        # drive formula, not read back from the env, so this only checks that
-        # the formula below agrees with webots_sheep_drive.
-        err_2d = (target_heading - 0.0 + np.pi) % (2 * np.pi) - np.pi
-        fwd_2d = speed_rad * max(0.0, math.cos(err_2d))
-        turn_2d = 4.0 * err_2d
-        l_2d = max(-22.0, min(22.0, fwd_2d - turn_2d))
-        r_2d = max(-22.0, min(22.0, fwd_2d + turn_2d))
-
-        pairs = {
-            "left": (l_2d, l_w),
-            "right": (r_2d, r_w),
-        }
-        diffs = {k: abs(a - b) for k, (a, b) in pairs.items()}
-        max_diff = max(diffs.values())
-        ok = max_diff < tol
-        status = "PASS" if ok else "FAIL"
-        print(f"  err={heading_err:.2f} spd={speed_rad:.1f}: {status} (max_diff={max_diff:.2e})")
-        if not ok:
-            for k, d in diffs.items():
-                if d > tol:
-                    val_2d, val_w = pairs[k]
-                    print(f"    {k}: 2d={val_2d:.6f} webots={val_w:.6f}")
-            all_pass = False
-
-    return all_pass
-
-
-def test_full_trajectory_parity():
-    """Compare 50 dog sub-steps of the 2D env with hand-rolled Webots kinematics.
-
-    The same fixed action is applied on both sides; after each sub-step the
-    heading and position differences are recorded. Only the position
-    difference decides the result.
-
-    Returns:
-        True when the maximum position difference stays below 1e-4 m.
-    """
-    print("\n=== Test 4: Full Trajectory Parity (dog only) ===")
-    env = HerdingEnv(n_sheep=1)
-    env.reset(seed=42)
-    env.dog_pos = np.array([0.0, 0.0], dtype=np.float32)
-    env.dog_heading = 0.0
-    env.ENTRY_AWARE = ENTRY_AWARE
-
-    action = np.array([0.8, -0.6], dtype=np.float32)  # magnitude 1.0
-    dt = 0.016667  # sub_dt
-
-    # Webots-side command for the same action; constant across the rollout.
-    speed_ms = 1.0 * 2.5
-    target_heading = math.atan2(-0.6, 0.8)
-
-    # Webots-side tracking
-    wb_heading = 0.0
-    wb_x, wb_y = 0.0, 0.0
-
-    max_heading_diff = 0.0
-    max_pos_diff = 0.0
-
-    for _ in range(50):
-        # 2D env sub-step
-        env._step_dog_substep(action, dt)
-
-        # Webots-side computation.
-        # presumably 0.038 is the wheel radius and 0.28 the wheel separation
-        # (both in metres) — TODO confirm against the robot model.
-        err = math.atan2(math.sin(target_heading - wb_heading),
-                         math.cos(target_heading - wb_heading))
-        fwd_ms = speed_ms * max(0.0, math.cos(err))
-        fwd_rad = fwd_ms / 0.038
-        turn = 4.0 * err
-        left = max(-70.0, min(70.0, fwd_rad - turn))
-        right = max(-70.0, min(70.0, fwd_rad + turn))
-        v = 0.038 * 0.5 * (right + left)
-        w = (0.038 / 0.28) * (right - left)
-        wb_heading = math.atan2(math.sin(wb_heading + w * dt),
-                                math.cos(wb_heading + w * dt))
-        wb_x += math.cos(wb_heading) * v * dt
-        wb_y += math.sin(wb_heading) * v * dt
-
-        # Wrap the heading difference to [-pi, pi] so two headings on either
-        # side of the +/-pi branch cut are not reported as ~2*pi apart.
-        raw_diff = env.dog_heading - wb_heading
-        heading_diff = abs(math.atan2(math.sin(raw_diff), math.cos(raw_diff)))
-        pos_diff = math.hypot(env.dog_pos[0] - wb_x, env.dog_pos[1] - wb_y)
-        max_heading_diff = max(max_heading_diff, heading_diff)
-        max_pos_diff = max(max_pos_diff, pos_diff)
-
-    print(f"  Max heading diff over 50 steps: {max_heading_diff:.2e} rad")
-    print(f"  Max position diff over 50 steps: {max_pos_diff:.2e} m")
-    ok = max_pos_diff < 1e-4
-    print(f"  {'PASS' if ok else 'FAIL'}: Trajectories match")
-    return ok
+def main():
+    """Run every parity check in order, then print the success banner.
+
+    Each check is expected to raise (e.g. via a failed assertion) on failure,
+    so reaching the final print means all of them passed.
+    """
+    checks = (
+        test_obs_action_shapes,
+        test_reset_determinism,
+        test_curriculum_n_sheep_varies,
+        test_strombom_drives_env,
+    )
+    for check in checks:
+        check()
+    print("\nAll parity checks passed.")


 if __name__ == "__main__":
-    results = []
-    results.append(("Obs parity", test_obs_parity()))
-    results.append(("Dog drive parity", test_dog_drive_parity()))
-    results.append(("Sheep drive parity", test_sheep_drive_parity()))
-    results.append(("Trajectory parity", test_full_trajectory_parity()))
-
-    print("\n" + "=" * 50)
-    print("RESULTS")
-    print("=" * 50)
-    all_pass = True
-    for name, passed in results:
-        print(f"  {name}: {'PASS' if passed else 'FAIL'}")
-        if not passed:
-            all_pass = False
-    print(f"\nOverall: {'ALL PASS' if all_pass else 'SOME FAILURES'}")
-    env.close()
+    main()
@@ -1,6 +1,8 @@
-gymnasium>=0.29
-stable-baselines3>=2.3
-torch>=2.2
-numpy>=1.26
-matplotlib>=3.8
-tensorboard>=2.16
+# Upper-bound gymnasium and SB3 by major version (other packages are lower-bounded only); SB3 2.x requires gymnasium and torch >= 1.13.
+gymnasium>=0.29,<2.0
+stable-baselines3[extra]>=2.3,<3.0
+torch>=2.1
+numpy>=1.24
+pyyaml>=6.0
+tensorboard>=2.14
+tqdm>=4.66
@@ -1 +0,0 @@
-
@@ -1,392 +0,0 @@
-"""
-PPO training for the herding task with curriculum learning.
-
-Trains from scratch through a 1→max_sheep curriculum, evaluates after each
-stage, and auto-generates trajectory/timeseries plots plus a summary chart.
-
-Usage
-----
-    python train.py                                       # defaults from config.json
-    python train.py --config my_config.json --max-sheep 5
-    python train.py --max-sheep 3 --steps-per-stage 1000000
-
-Outputs (in runs/<timestamp>/):
-    config.json          resolved config
-    final_model.zip      trained PPO model
-    vecnorm.pkl          VecNormalize statistics
-    stage_results.json   per-stage evaluation metrics
-    success_rate.png     summary bar chart
-    eval/                trajectory & timeseries plots per sheep count
-"""
-
-import argparse
-import json
-import os
-import time
-from copy import deepcopy
-
-import numpy as np
-from stable_baselines3 import PPO
-from stable_baselines3.common.callbacks import BaseCallback
-from stable_baselines3.common.vec_env import (
-    DummyVecEnv,
-    SubprocVecEnv,
-    VecNormalize,
-)
-
-from herding_env import HerdingEnv
-from viz import (
-    run_and_record,
-    plot_trajectory,
-    plot_timeseries,
-    plot_success_rate,
-    save_episode_gif,
-)
-
-
-# ── Callbacks ────────────────────────────────────────────────────────────────
-
-class ProgressCallback(BaseCallback):
-    """Print a one-line progress summary every `freq` env steps.
-
-    Keeps a running return per vectorised env, a sliding window over the most
-    recent finished episodes (return and success flag) and cumulative success
-    counts. An episode counts as a success when its final ``info`` reports
-    ``n_penned == n_sheep``.
-    """
-
-    # Number of most recent episodes kept for the windowed statistics.
-    _WINDOW = 50
-
-    def __init__(self, stage_label: str, freq: int = 100_000):
-        """Store the label shown in each summary line and the print interval."""
-        # Function-scope import keeps this class self-contained.
-        from collections import deque
-
-        super().__init__()
-        self.stage_label = stage_label
-        self.freq = freq
-        self._last = 0
-        # Bounded deques drop the oldest entry automatically, replacing the
-        # manual list.pop(0) trimming; both windows stay aligned because they
-        # are always appended to together.
-        self._ep_returns = deque(maxlen=self._WINDOW)
-        self._ep_success = deque(maxlen=self._WINDOW)
-        self._total_eps = 0
-        self._total_success = 0
-        self._cur_ret = None
-
-    def _on_step(self) -> bool:
-        """Accumulate rewards, record finished episodes, and maybe print.
-
-        Returns:
-            Always True, so training is never stopped by this callback.
-        """
-        rewards = self.locals.get("rewards")
-        dones = self.locals.get("dones")
-        infos = self.locals.get("infos", [])
-        if rewards is None or dones is None:
-            return True
-        # (Re)allocate the per-env accumulator if the env count changed.
-        if self._cur_ret is None or len(self._cur_ret) != len(rewards):
-            self._cur_ret = np.zeros(len(rewards), dtype=np.float64)
-        self._cur_ret += np.asarray(rewards, dtype=np.float64)
-        for i, d in enumerate(dones):
-            if not d:
-                continue
-            self._ep_returns.append(float(self._cur_ret[i]))
-            info = infos[i] if i < len(infos) else {}
-            # Mismatched defaults (0 vs -1) make a missing key count as failure.
-            success = int(info.get("n_penned", 0) == info.get("n_sheep", -1))
-            self._ep_success.append(success)
-            self._total_eps += 1
-            self._total_success += success
-            self._cur_ret[i] = 0.0
-        if self.num_timesteps - self._last >= self.freq:
-            self._last = self.num_timesteps
-            n = len(self._ep_returns)
-            mean_r = float(np.mean(self._ep_returns)) if n else float("nan")
-            win_sr = float(np.mean(self._ep_success)) if n else float("nan")
-            cum_sr = (self._total_success / self._total_eps
-                      if self._total_eps else float("nan"))
-            print(f"           ... [{self.stage_label} | "
-                  f"{self.num_timesteps:>7,} steps | "
-                  f"ret(last {n})={mean_r:+.2f}  "
-                  f"win_sr={win_sr*100:.0f}%  cum_sr={cum_sr*100:.0f}%]",
-                  flush=True)
-        return True
-
-
-# ── Environment factory ──────────────────────────────────────────────────────
-
-def make_env(n_sheep, seed, max_steps, reward_cfg=None):
-    """Return a zero-argument factory that builds and seeds one HerdingEnv.
-
-    The factory form is what the vectorised env wrappers expect: each call
-    constructs a fresh env and resets it once with `seed`.
-    """
-    def _init():
-        herding_env = HerdingEnv(
-            n_sheep=n_sheep,
-            max_steps=max_steps,
-            reward_cfg=reward_cfg,
-        )
-        herding_env.reset(seed=seed)
-        return herding_env
-
-    return _init
-
-
-# ── Failure-mode classification ──────────────────────────────────────────────
-
-COMPACT_RADIUS = 5.0
-
-
-def _classify(ep_radii, ep_com_dists, n_penned, n_sheep):
-    """Label one finished episode with its outcome / failure mode.
-
-    Args:
-        ep_radii: per-step flock radius over the episode.
-        ep_com_dists: per-step distance from the flock centre to the pen.
-        n_penned: number of sheep penned at episode end.
-        n_sheep: number of sheep in the episode.
-
-    Returns:
-        "SUCCESS", "NEVER_COMPACT", "COMPACT_CANT_DRIVE", "DROVE_NO_SHEEP"
-        or "PARTIAL_<k>of<n>".
-    """
-    if n_penned == n_sheep:
-        return "SUCCESS"
-
-    tightest = min(ep_radii)
-    if tightest > COMPACT_RADIUS:
-        return "NEVER_COMPACT"
-
-    # Index of the first step at which the flock was compact.
-    first_compact = next(
-        idx for idx, radius in enumerate(ep_radii) if radius <= COMPACT_RADIUS
-    )
-    closest_after_compact = min(ep_com_dists[first_compact:])
-    if closest_after_compact > 3.0:
-        return "COMPACT_CANT_DRIVE"
-
-    if n_penned == 0:
-        return "DROVE_NO_SHEEP"
-    return f"PARTIAL_{n_penned}of{n_sheep}"
-
-
-# ── Evaluation ───────────────────────────────────────────────────────────────
-
-def evaluate(model, vn_template, n_sheep, n_episodes, max_steps,
-             reward_cfg=None):
-    """Evaluate `model` deterministically at a fixed sheep count.
-
-    A single-env ``DummyVecEnv`` (seed 9999) is wrapped in a frozen
-    ``VecNormalize`` whose statistics are deep-copied from `vn_template`.
-
-    Args:
-        model: policy exposing ``predict(obs, deterministic=True)``.
-        vn_template: VecNormalize whose ``obs_rms`` / ``ret_rms`` are copied.
-        n_sheep: sheep count for the evaluation env.
-        n_episodes: number of episodes to run.
-        max_steps: per-episode step limit passed to the env.
-        reward_cfg: optional reward configuration forwarded to the env.
-
-    Returns:
-        Dict with ``sr``, ``mean_len``, ``mean_min_pen``, ``mean_act`` and
-        ``failure_modes``; ``reward_per_step`` is added when any step's info
-        carried an ``rcomps`` entry.
-    """
-    raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, reward_cfg)])
-    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
-    vn.obs_rms = deepcopy(vn_template.obs_rms)
-    vn.ret_rms = deepcopy(vn_template.ret_rms)
-
-    successes = 0
-    ep_lens = []
-    min_pen_list = []
-    action_mags = []
-    failure_counts = {}
-    rc_sums = {}
-    rc_n = 0
-
-    # try/finally so the env is closed even if a rollout step raises.
-    try:
-        for _ in range(n_episodes):
-            obs = vn.reset()
-            done = False
-            steps = 0
-            min_pen = float("inf")
-            mags = []
-            ep_radii = []
-            ep_com_dists = []
-            while not done:
-                action, _ = model.predict(obs, deterministic=True)
-                obs, _, dones, infos = vn.step(action)
-                done = dones[0]
-                # NOTE(review): vectorised envs auto-reset on done, so on the
-                # terminal step these stats are presumably read from the
-                # freshly reset episode — confirm whether that is intended.
-                inner = vn.envs[0]
-                com, radius, _ = inner._flock_stats()
-                # Distance from flock centre to pen; computed once, used twice.
-                pen_dist = float(np.linalg.norm(com - inner.PEN_CENTER))
-                min_pen = min(min_pen, pen_dist)
-                mags.append(float(np.linalg.norm(action[0])))
-                ep_radii.append(radius)
-                ep_com_dists.append(pen_dist)
-                steps += 1
-                rc = infos[0].get("rcomps")
-                if rc:
-                    for k, v in rc.items():
-                        rc_sums[k] = rc_sums.get(k, 0.0) + v
-                    rc_n += 1
-            n_penned = infos[0].get("n_penned", 0)
-            success = n_penned == n_sheep
-            successes += int(success)
-            ep_lens.append(steps)
-            min_pen_list.append(min_pen)
-            action_mags.extend(mags)
-            mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep)
-            failure_counts[mode] = failure_counts.get(mode, 0) + 1
-    finally:
-        vn.close()
-
-    result = {
-        "sr": successes / n_episodes,
-        "mean_len": float(np.mean(ep_lens)),
-        "mean_min_pen": float(np.mean(min_pen_list)),
-        "mean_act": float(np.mean(action_mags)) if action_mags else 0.0,
-        "failure_modes": failure_counts,
-    }
-    if rc_n > 0:
-        result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()}
-    return result
-
-
-
-# ── CLI ──────────────────────────────────────────────────────────────────────
-
-DEFAULT_CONFIG = {
-    "W_PER_SHEEP": 2.0,
-    "W_ALIGN": 0.05,
-    "W_PEN_BONUS": 10.0,
-    "W_COMPLETE": 100.0,
-    "W_STEP_COST": 0.02,
-    "W_SOUTH": 0.01,
-    "W_COMPACT": 0.0,
-    "W_WALL_TOUCH": 0.04,
-    "WALL_TOUCH_BUFFER": 0.3,
-    "ALIGN_SHAPE": "standoff",
-    "ALIGN_GATED": True,
-    "ENTRY_AWARE": True,
-    "ent_coef": 0.02,
-}
-
-
-def parse_args():
-    """Build the training CLI and parse the process arguments.
-
-    Returns:
-        The ``argparse.Namespace`` with curriculum, evaluation and GIF options.
-    """
-    parser = argparse.ArgumentParser(
-        description="PPO training for herding task with curriculum learning")
-    add = parser.add_argument
-
-    add("--config", type=str, default=None,
-        help="JSON config file (reward weights + ent_coef)")
-
-    # Curriculum / rollout sizing.
-    add("--max-sheep", type=int, default=10)
-    add("--steps-per-stage", type=int, default=1_500_000)
-    add("--n-envs", type=int, default=8)
-    add("--max-steps", type=int, default=2500)
-    add("--eval-episodes", type=int, default=30)
-    add("--run-dir", type=str, default=None)
-
-    # GIF rendering controls.
-    add("--no-gif", action="store_true",
-        help="Skip per-stage GIF rendering (PNGs still produced).")
-    add("--gif-fps", type=int, default=20)
-    add("--gif-skip", type=int, default=3,
-        help="Keep every Nth frame (smaller GIF; default 3).")
-
-    return parser.parse_args()
-
-
-# ── Main ─────────────────────────────────────────────────────────────────────
-
-def main():
-    """Run the full curriculum: train, evaluate and plot for 1..max_sheep sheep.
-
-    Resolves the config (defaults, then an explicit or auto-detected JSON
-    file), creates the run directory, trains a PPO model stage by stage,
-    evaluates and visualises after each stage, saves the model, VecNormalize
-    statistics and per-stage results, and finally prints a summary and writes
-    the success-rate chart.
-    """
-    args = parse_args()
-
-    # Load config: --config overrides, else auto-load config.json if present
-    cfg = dict(DEFAULT_CONFIG)
-    config_path = args.config
-    if config_path is None and os.path.exists("config.json"):
-        config_path = "config.json"
-    if config_path:
-        with open(config_path) as f:
-            cfg.update(json.load(f))
-        print(f"Config loaded from {config_path}")
-
-    # Only keys that exist as HerdingEnv attributes are forwarded to the env;
-    # the rest (e.g. ent_coef) are consumed here.
-    rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}
-
-    # Run directory
-    run_dir = args.run_dir or os.path.join(
-        "runs", time.strftime("%Y%m%d_%H%M%S"))
-    eval_dir = os.path.join(run_dir, "eval")
-    os.makedirs(eval_dir, exist_ok=True)
-    with open(os.path.join(run_dir, "config.json"), "w") as f:
-        json.dump(cfg, f, indent=2)
-
-    print(f"Config: {cfg}")
-    print(f"Run dir: {run_dir}")
-    print(f"Curriculum: 1 → {args.max_sheep} sheep, "
-          f"{args.steps_per_stage:,} steps/stage\n")
-
-    # Training envs
-    train_env = SubprocVecEnv([
-        make_env(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg)
-        for i in range(args.n_envs)
-    ])
-    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True,
-                      clip_obs=10.0)
-
-    # Model — force CPU (PPO with MLP runs faster on CPU than GPU; SB3 warns
-    # about this otherwise).
-    model = PPO(
-        "MlpPolicy", vn,
-        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
-        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
-        ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5,
-        policy_kwargs=dict(net_arch=[256, 256]),
-        device="cpu",
-        verbose=0,
-    )
-
-    # Curriculum training
-    stage_results = []
-    t0 = time.time()
-
-    try:
-        for n in range(1, args.max_sheep + 1):
-            if n == 1:
-                print(f"\n[Stage n_sheep=1] training {args.steps_per_stage:,} steps")
-                model.learn(
-                    total_timesteps=args.steps_per_stage,
-                    reset_num_timesteps=True,
-                    callback=ProgressCallback("1 sheep", freq=100_000),
-                )
-            else:
-                # Mixed transition: half envs stay at n-1, half advance to n,
-                # for the first half of the stage budget. This prevents the
-                # n+1 task's noisy early gradients from destroying the n policy
-                # (catastrophic forgetting) before it has a chance to adapt.
-                # NOTE(review): with --n-envs 1, half == 1 and the second
-                # range below is empty, so no env runs at n during the mix.
-                half = max(1, args.n_envs // 2)
-                for i in range(half):
-                    vn.env_method("set_n_sheep", n - 1, indices=[i])
-                for i in range(half, args.n_envs):
-                    vn.env_method("set_n_sheep", n, indices=[i])
-                mix_steps  = args.steps_per_stage // 2
-                full_steps = args.steps_per_stage - mix_steps
-                print(f"\n[Stage n_sheep={n}] mixed ({n-1}/{n} sheep) "
-                      f"{mix_steps:,} steps")
-                model.learn(
-                    total_timesteps=mix_steps,
-                    reset_num_timesteps=False,
-                    callback=ProgressCallback(f"{n-1}→{n} mix", freq=100_000),
-                )
-                # Second half of the stage: every env runs at n sheep.
-                vn.env_method("set_n_sheep", n)
-                print(f"[Stage n_sheep={n}] full ({n} sheep) {full_steps:,} steps")
-                model.learn(
-                    total_timesteps=full_steps,
-                    reset_num_timesteps=False,
-                    callback=ProgressCallback(f"{n} sheep", freq=100_000),
-                )
-
-            # Evaluate
-            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
-            r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
-            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}%  "
-                  f"mean_len={r['mean_len']:.0f}  "
-                  f"mean_min_pen={r['mean_min_pen']:.1f}m  "
-                  f"mean_act={r['mean_act']:.2f}")
-
-            # Failure-mode breakdown (most frequent first)
-            if r["failure_modes"]:
-                modes = "  ".join(
-                    f"{k}={v}" for k, v in sorted(
-                        r["failure_modes"].items(), key=lambda x: -x[1]))
-                print(f"  failure modes: {modes}")
-
-            # Reward breakdown
-            if "reward_per_step" in r:
-                rps = r["reward_per_step"]
-                print(f"  reward/step: " + "  ".join(
-                    f"{k}={v:+.4f}" for k, v in rps.items()))
-
-            # Episode visualisation: trajectory + timeseries + animated GIF
-            hist = run_and_record(model, vn, n, args.max_steps, rcfg,
-                                  seed=1000 + n)
-            tag = "success" if hist["success"] else "fail"
-            plot_trajectory(
-                hist,
-                os.path.join(eval_dir, f"traj_{n}s_{tag}.png"))
-            plot_timeseries(
-                hist,
-                os.path.join(eval_dir, f"ts_{n}s_{tag}.png"))
-            if not args.no_gif:
-                save_episode_gif(
-                    hist,
-                    os.path.join(eval_dir, f"ep_{n}s_{tag}.gif"),
-                    fps=args.gif_fps, skip=args.gif_skip)
-
-            r["n_sheep"] = n
-            stage_results.append(r)
-
-        # Save artefacts
-        model.save(os.path.join(run_dir, "final_model"))
-        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
-        with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
-            json.dump(stage_results, f, indent=2)
-
-    finally:
-        # Best-effort shutdown of the env workers; errors during close are
-        # deliberately ignored so they cannot mask a training exception.
-        try:
-            vn.close()
-        except Exception:
-            pass
-
-    # Summary
-    elapsed = (time.time() - t0) / 60
-    print("\n" + "=" * 70)
-    print("  TRAINING SUMMARY")
-    print("=" * 70)
-    for r in stage_results:
-        print(f"  n_sheep={r['n_sheep']}  sr={r['sr']*100:>3.0f}%  "
-              f"len={r['mean_len']:>5.0f}  min_pen={r['mean_min_pen']:>5.1f}m  "
-              f"act={r['mean_act']:.2f}")
-    print(f"\n  Total time: {elapsed:.1f} min")
-    print(f"  Artefacts:  {run_dir}/")
-
-    plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png"))
-    print(f"  Plots:      {run_dir}/success_rate.png, {eval_dir}/")
-
-
-if __name__ == "__main__":
-    main()
@@ -1,412 +0,0 @@
-"""
-PPO training with attention-based policy (train_at.py).
-
-Key difference from train.py
-----------------------------
- Observation exposes ALL sheep as individual per-sheep tokens rather than
-  only the top-3 farthest. The policy therefore has complete flock visibility
-  at any sheep count — no hidden sheep even at n=10.
- A TransformerFeaturesExtractor processes the sheep tokens with multi-head
-  self-attention (permutation-invariant), then mean-pools over valid tokens
-  and concatenates the result with global dog/pen features.
- Curriculum transition uses the same mixed-env approach as train.py: half
-  the envs stay at n-1 for the first half of each new stage to suppress
-  catastrophic forgetting.
-
-Observation layout  (7 + MAX_SHEEP*6 = 67 dims, fixed)
-------------------------------------------------------
-  Global (7):
-    dog_x / FIELD,  dog_y / FIELD,
-    cos(heading),   sin(heading),
-    (pen_x - dog_x) / D,  (pen_y - dog_y) / D,
-    n_active / n_sheep
-
-  Per sheep i  (6):
-    (sheep_x - dog_x) / D,  (sheep_y - dog_y) / D,   ← pos rel to dog
-    (pen_x   - sheep_x) / D, (pen_y  - sheep_y) / D,  ← sheep-to-pen
-    is_active   1.0 if not penned, else 0.0
-    is_valid    1.0 if i < n_sheep, else 0.0 (padding sentinel)
-
-  After VecNormalize, is_valid for real sheep normalises > 0 and for
-  padding tokens < 0 (because mean ∈ (0,1)), so a threshold of 0 cleanly
-  separates real from padded without any extra bookkeeping.
-
-Usage
-----
-    python train_at.py                                 # defaults from config.json
-    python train_at.py --max-sheep 10 --steps-per-stage 2000000
-    python train_at.py --embed-dim 128 --n-heads 4 --n-layers 3
-"""
-
-import argparse
-import json
-import os
-import time
-from copy import deepcopy
-
-import numpy as np
-import torch
-import torch.nn as nn
-from gymnasium import spaces
-from stable_baselines3 import PPO
-from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
-from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize
-
-from herding_env import HerdingEnv
-from train import ProgressCallback, _classify, COMPACT_RADIUS, DEFAULT_CONFIG
-from viz import (
-    run_and_record, plot_trajectory, plot_timeseries,
-    plot_success_rate, save_episode_gif,
-)
-
-
-# ── Per-sheep token observation environment ───────────────────────────────────
-
-class HerdingEnvAt(HerdingEnv):
-    """
-    HerdingEnv with a per-sheep token observation for the attention policy.
-    Everything else (dynamics, reward, curriculum interface) is inherited.
-    """
-
-    # Number of global (dog / pen / progress) features at the start of the obs.
-    OBS_GLOBAL = 7
-    # Number of features in each per-sheep token.
-    OBS_SHEEP  = 6
-
-    def __init__(self, *args, **kwargs):
-        """Forward all arguments to HerdingEnv, then widen the observation space.
-
-        The space becomes an unbounded float32 Box of length
-        ``OBS_GLOBAL + MAX_SHEEP * OBS_SHEEP``.
-        """
-        super().__init__(*args, **kwargs)
-        obs_dim = self.OBS_GLOBAL + self.MAX_SHEEP * self.OBS_SHEEP
-        self.observation_space = spaces.Box(
-            low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32
-        )
-
-    def _obs(self) -> np.ndarray:
-        """Build the flat observation: global features, then MAX_SHEEP tokens.
-
-        Token slots at index >= n_sheep are left all-zero, so their trailing
-        is_valid entry is 0.
-        """
-        S = self.FIELD          # scale for absolute dog position
-        D = 2.0 * self.FIELD    # scale for relative offsets
-        # Reference point for the pen: its entry when ENTRY_AWARE, else centre.
-        pen_ref     = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER
-        active_mask = ~self.penned[:self.n_sheep]
-        n_active    = int(active_mask.sum())
-
-        global_feats = np.array([
-            self.dog_pos[0] / S,
-            self.dog_pos[1] / S,
-            float(np.cos(self.dog_heading)),
-            float(np.sin(self.dog_heading)),
-            (pen_ref[0] - self.dog_pos[0]) / D,
-            (pen_ref[1] - self.dog_pos[1]) / D,
-            n_active / max(self.n_sheep, 1),   # guard against n_sheep == 0
-        ], dtype=np.float32)
-
-        sheep_feats = np.zeros((self.MAX_SHEEP, self.OBS_SHEEP), dtype=np.float32)
-        for i in range(self.n_sheep):
-            pos = self.sheep_pos[i]
-            sheep_feats[i] = [
-                (pos[0] - self.dog_pos[0]) / D,   # sheep position relative to dog
-                (pos[1] - self.dog_pos[1]) / D,
-                (pen_ref[0] - pos[0]) / D,        # sheep-to-pen offset
-                (pen_ref[1] - pos[1]) / D,
-                float(not self.penned[i]),        # is_active
-                1.0,   # is_valid: this sheep exists
-            ]
-        # i >= n_sheep: all zeros, is_valid=0 → masked out in attention
-
-        return np.concatenate([global_feats, sheep_feats.ravel()])
-
-
-# ── Attention features extractor ──────────────────────────────────────────────
-
-class ShepherdAttentionExtractor(BaseFeaturesExtractor):
-    """
-    Multi-head self-attention over per-sheep tokens, mean-pooled over valid
-    (non-padding) tokens and concatenated with global dog/pen features.
-
-    After VecNormalize:
-      real sheep  → is_valid_norm > 0   (normalised from 1.0)
-      padding     → is_valid_norm ≤ 0   (normalised from 0.0)
-    so threshold at 0 is always correct regardless of curriculum stage.
-    """
-    # NOTE(review): a slot whose is_valid is 1.0 in every sample seen so far
-    # (e.g. sheep 0) presumably normalises to a value very close to 0, so the
-    # "> 0" test relies on the running mean staying strictly below 1 — TODO
-    # confirm. A row with every token masked would also leave attention with
-    # no keys to attend to; verify this cannot occur in practice.
-
-    GLOBAL_DIM = HerdingEnvAt.OBS_GLOBAL   # 7
-    SHEEP_DIM  = HerdingEnvAt.OBS_SHEEP    # 6
-    MAX_SHEEP  = HerdingEnv.MAX_SHEEP      # 10
-    VALID_IDX  = 5                          # index of is_valid within each token
-
-    def __init__(self, observation_space, embed_dim: int = 64,
-                 n_heads: int = 4, n_layers: int = 2, ff_dim: int = 128):
-        """Create the token embedding and the transformer encoder stack.
-
-        Args:
-            observation_space: flat observation space passed to the base class.
-            embed_dim: token embedding width; output is GLOBAL_DIM + embed_dim.
-            n_heads: attention heads per encoder layer.
-            n_layers: number of encoder layers.
-            ff_dim: feed-forward width inside each encoder layer.
-        """
-        super().__init__(observation_space,
-                         features_dim=self.GLOBAL_DIM + embed_dim)
-        self.sheep_embed = nn.Linear(self.SHEEP_DIM, embed_dim)
-        encoder_layer = nn.TransformerEncoderLayer(
-            d_model=embed_dim, nhead=n_heads, dim_feedforward=ff_dim,
-            dropout=0.0, batch_first=True,
-        )
-        self.transformer = nn.TransformerEncoder(encoder_layer,
-                                                 num_layers=n_layers,
-                                                 enable_nested_tensor=False)
-
-    def forward(self, obs: torch.Tensor) -> torch.Tensor:
-        """Encode the sheep tokens and return [global features, pooled tokens].
-
-        Tokens whose (normalised) is_valid entry is <= 0 are excluded both from
-        attention (as masked keys) and from the mean pool.
-        """
-        B = obs.shape[0]
-        global_feats = obs[:, :self.GLOBAL_DIM]                       # (B, 7)
-        tokens = obs[:, self.GLOBAL_DIM:].view(
-            B, self.MAX_SHEEP, self.SHEEP_DIM)                        # (B, 10, 6)
-
-        # is_valid after VecNorm: real > 0, padding ≤ 0
-        is_valid_norm    = tokens[:, :, self.VALID_IDX]               # (B, 10)
-        key_padding_mask = is_valid_norm <= 0.0                       # True → ignore
-
-        x = self.sheep_embed(tokens)                                  # (B, 10, E)
-        x = self.transformer(x, src_key_padding_mask=key_padding_mask)
-
-        # Masked mean over valid tokens; the clamp avoids dividing by zero
-        # when a row has no valid token.
-        valid_w = (is_valid_norm > 0.0).float().unsqueeze(-1)        # (B, 10, 1)
-        pooled  = (x * valid_w).sum(1) / valid_w.sum(1).clamp(min=1.0)
-
-        return torch.cat([global_feats, pooled], dim=1)               # (B, 7+E)
-
-
-# ── Environment factory ───────────────────────────────────────────────────────
-
-def make_env_at(n_sheep, seed, max_steps, reward_cfg=None):
-    """Return a zero-argument factory that builds and seeds one HerdingEnvAt.
-
-    Each call of the factory constructs a fresh env and resets it once with
-    `seed`.
-    """
-    def _init():
-        token_env = HerdingEnvAt(
-            n_sheep=n_sheep,
-            max_steps=max_steps,
-            reward_cfg=reward_cfg,
-        )
-        token_env.reset(seed=seed)
-        return token_env
-
-    return _init
-
-
-# ── Evaluation ────────────────────────────────────────────────────────────────
-
-def evaluate_at(model, vn_template, n_sheep, n_episodes, max_steps,
-                reward_cfg=None):
-    """Evaluate the attention policy deterministically at a fixed sheep count.
-
-    A single-env ``DummyVecEnv`` of ``HerdingEnvAt`` (seed 9999) is wrapped in
-    a frozen ``VecNormalize`` whose statistics are deep-copied from
-    `vn_template`.
-
-    Args:
-        model: policy exposing ``predict(obs, deterministic=True)``.
-        vn_template: VecNormalize whose ``obs_rms`` / ``ret_rms`` are copied.
-        n_sheep: sheep count for the evaluation env.
-        n_episodes: number of episodes to run.
-        max_steps: per-episode step limit passed to the env.
-        reward_cfg: optional reward configuration forwarded to the env.
-
-    Returns:
-        Dict with ``sr``, ``mean_len``, ``mean_min_pen``, ``mean_act`` and
-        ``failure_modes``; ``reward_per_step`` is added when any step's info
-        carried an ``rcomps`` entry.
-    """
-    raw = DummyVecEnv([make_env_at(n_sheep, 9999, max_steps, reward_cfg)])
-    vn  = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
-    vn.obs_rms = deepcopy(vn_template.obs_rms)
-    vn.ret_rms = deepcopy(vn_template.ret_rms)
-
-    successes = 0
-    ep_lens, min_pen_list, action_mags = [], [], []
-    failure_counts, rc_sums = {}, {}
-    rc_n = 0
-
-    # try/finally so the env is closed even if a rollout step raises.
-    try:
-        for _ in range(n_episodes):
-            obs  = vn.reset()
-            done = False
-            steps, min_pen = 0, float("inf")
-            mags, ep_radii, ep_com_dists = [], [], []
-            while not done:
-                action, _ = model.predict(obs, deterministic=True)
-                obs, _, dones, infos = vn.step(action)
-                done  = dones[0]
-                # NOTE(review): vectorised envs auto-reset on done, so on the
-                # terminal step these stats are presumably read from the
-                # freshly reset episode — confirm whether that is intended.
-                inner = vn.envs[0]
-                com, radius, _ = inner._flock_stats()
-                # Distance from flock centre to pen; computed once, used twice.
-                pen_dist = float(np.linalg.norm(com - inner.PEN_CENTER))
-                min_pen = min(min_pen, pen_dist)
-                mags.append(float(np.linalg.norm(action[0])))
-                ep_radii.append(radius)
-                ep_com_dists.append(pen_dist)
-                steps += 1
-                rc = infos[0].get("rcomps")
-                if rc:
-                    for k, v in rc.items():
-                        rc_sums[k] = rc_sums.get(k, 0.0) + v
-                    rc_n += 1
-            n_penned = infos[0].get("n_penned", 0)
-            successes += int(n_penned == n_sheep)
-            ep_lens.append(steps)
-            min_pen_list.append(min_pen)
-            action_mags.extend(mags)
-            mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep)
-            failure_counts[mode] = failure_counts.get(mode, 0) + 1
-    finally:
-        vn.close()
-
-    result = {
-        "sr":          successes / n_episodes,
-        "mean_len":    float(np.mean(ep_lens)),
-        "mean_min_pen": float(np.mean(min_pen_list)),
-        "mean_act":    float(np.mean(action_mags)) if action_mags else 0.0,
-        "failure_modes": failure_counts,
-    }
-    if rc_n > 0:
-        result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()}
-    return result
-
-
-# ── CLI ───────────────────────────────────────────────────────────────────────
-
-def parse_args():
-    """Build the attention-training CLI and parse the process arguments.
-
-    Returns:
-        The ``argparse.Namespace`` with curriculum, evaluation, GIF and
-        transformer-architecture options.
-    """
-    parser = argparse.ArgumentParser(
-        description="PPO + attention training for herding task")
-    add = parser.add_argument
-
-    # Curriculum, evaluation and output options.
-    add("--config", type=str, default=None)
-    add("--max-sheep", type=int, default=10)
-    add("--steps-per-stage", type=int, default=1_500_000)
-    add("--n-envs", type=int, default=8)
-    add("--max-steps", type=int, default=2500)
-    add("--eval-episodes", type=int, default=30)
-    add("--run-dir", type=str, default=None)
-    add("--no-gif", action="store_true")
-    add("--gif-fps", type=int, default=20)
-    add("--gif-skip", type=int, default=3)
-
-    # Attention architecture
-    add("--embed-dim", type=int, default=64,
-        help="Transformer embedding dimension (default 64)")
-    add("--n-heads", type=int, default=4,
-        help="Number of attention heads (default 4)")
-    add("--n-layers", type=int, default=2,
-        help="Number of transformer encoder layers (default 2)")
-    add("--ff-dim", type=int, default=128,
-        help="Transformer feed-forward dim (default 128)")
-
-    return parser.parse_args()
-
-
-# ── Main ──────────────────────────────────────────────────────────────────────
-
-def main():
-    args = parse_args()
-
-    cfg = dict(DEFAULT_CONFIG)
-    config_path = args.config
-    if config_path is None and os.path.exists("config.json"):
-        config_path = "config.json"
-    if config_path:
-        with open(config_path) as f:
-            cfg.update(json.load(f))
-        print(f"Config loaded from {config_path}")
-
-    rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}
-
-    run_dir  = args.run_dir or os.path.join(
-        "runs", "at_" + time.strftime("%Y%m%d_%H%M%S"))
-    eval_dir = os.path.join(run_dir, "eval")
-    os.makedirs(eval_dir, exist_ok=True)
-    with open(os.path.join(run_dir, "config.json"), "w") as f:
-        json.dump(cfg, f, indent=2)
-
-    print(f"Config:      {cfg}")
-    print(f"Run dir:     {run_dir}")
-    print(f"Curriculum:  1 → {args.max_sheep} sheep, "
-          f"{args.steps_per_stage:,} steps/stage")
-    print(f"Transformer: embed={args.embed_dim}  heads={args.n_heads}  "
-          f"layers={args.n_layers}  ff={args.ff_dim}\n")
-
-    train_env = SubprocVecEnv([
-        make_env_at(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg)
-        for i in range(args.n_envs)
-    ])
-    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
-
-    model = PPO(
-        "MlpPolicy", vn,
-        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
-        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
-        ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5,
-        policy_kwargs=dict(
-            features_extractor_class=ShepherdAttentionExtractor,
-            features_extractor_kwargs=dict(
-                embed_dim=args.embed_dim,
-                n_heads=args.n_heads,
-                n_layers=args.n_layers,
-                ff_dim=args.ff_dim,
-            ),
-            net_arch=[256, 256],
-        ),
-        device="cpu",
-        verbose=0,
-    )
-
-    stage_results = []
-    t0 = time.time()
-
-    try:
-        for n in range(1, args.max_sheep + 1):
-            if n == 1:
-                print(f"\n[Stage n_sheep=1] training {args.steps_per_stage:,} steps")
-                model.learn(
-                    total_timesteps=args.steps_per_stage,
-                    reset_num_timesteps=True,
-                    callback=ProgressCallback("1 sheep", freq=100_000),
-                )
-            else:
-                half       = max(1, args.n_envs // 2)
-                mix_steps  = args.steps_per_stage // 2
-                full_steps = args.steps_per_stage - mix_steps
-
-                for i in range(half):
-                    vn.env_method("set_n_sheep", n - 1, indices=[i])
-                for i in range(half, args.n_envs):
-                    vn.env_method("set_n_sheep", n, indices=[i])
-
-                print(f"\n[Stage n_sheep={n}] mixed ({n-1}/{n} sheep) "
-                      f"{mix_steps:,} steps")
-                model.learn(
-                    total_timesteps=mix_steps,
-                    reset_num_timesteps=False,
-                    callback=ProgressCallback(f"{n-1}→{n} mix", freq=100_000),
-                )
-
-                vn.env_method("set_n_sheep", n)
-                print(f"[Stage n_sheep={n}] full ({n} sheep) {full_steps:,} steps")
-                model.learn(
-                    total_timesteps=full_steps,
-                    reset_num_timesteps=False,
-                    callback=ProgressCallback(f"{n} sheep", freq=100_000),
-                )
-
-            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
-            r = evaluate_at(model, vn, n, args.eval_episodes,
-                            args.max_steps, rcfg)
-            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}%  "
-                  f"mean_len={r['mean_len']:.0f}  "
-                  f"mean_min_pen={r['mean_min_pen']:.1f}m  "
-                  f"mean_act={r['mean_act']:.2f}")
-            if r["failure_modes"]:
-                modes = "  ".join(
-                    f"{k}={v}" for k, v in sorted(
-                        r["failure_modes"].items(), key=lambda x: -x[1]))
-                print(f"  failure modes: {modes}")
-            if "reward_per_step" in r:
-                rps = r["reward_per_step"]
-                print("  reward/step: " + "  ".join(
-                    f"{k}={v:+.4f}" for k, v in rps.items()))
-
-            hist = run_and_record(
-                model, vn, n, args.max_steps, rcfg,
-                seed=1000 + n, make_env_fn=make_env_at,
-            )
-            tag = "success" if hist["success"] else "fail"
-            plot_trajectory(hist, os.path.join(eval_dir, f"traj_{n}s_{tag}.png"))
-            plot_timeseries(hist, os.path.join(eval_dir, f"ts_{n}s_{tag}.png"))
-            if not args.no_gif:
-                save_episode_gif(
-                    hist,
-                    os.path.join(eval_dir, f"ep_{n}s_{tag}.gif"),
-                    fps=args.gif_fps, skip=args.gif_skip)
-
-            r["n_sheep"] = n
-            stage_results.append(r)
-
-        model.save(os.path.join(run_dir, "final_model"))
-        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
-        with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
-            json.dump(stage_results, f, indent=2)
-
-    finally:
-        try:
-            vn.close()
-        except Exception:
-            pass
-
-    elapsed = (time.time() - t0) / 60
-    print("\n" + "=" * 70)
-    print("  TRAINING SUMMARY  (attention policy)")
-    print("=" * 70)
-    for r in stage_results:
-        print(f"  n_sheep={r['n_sheep']}  sr={r['sr']*100:>3.0f}%  "
-              f"len={r['mean_len']:>5.0f}  "
-              f"min_pen={r['mean_min_pen']:>5.1f}m  "
-              f"act={r['mean_act']:.2f}")
-    print(f"\n  Total time: {elapsed:.1f} min")
-    print(f"  Artefacts:  {run_dir}/")
-    plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png"))
-    print(f"  Plots:      {run_dir}/success_rate.png, {eval_dir}/")
-
-
-if __name__ == "__main__":
-    main()
@@ -0,0 +1,267 @@
+"""Train a PPO shepherd-dog policy on ``HerdingEnv`` with curriculum.
+
+Defaults to 16 parallel ``SubprocVecEnv`` workers feeding a CPU policy (pass ``--device cuda`` for GPU).
+Saves checkpoints, the best-eval model, and the VecNormalize stats —
+all three are needed at inference time by the Webots controller.
+
+Usage::
+
+    python -m training.train_ppo \
+        --config training/configs/ppo_default.yaml \
+        --out-dir training/runs/baseline
+
+To resume from a checkpoint::
+
+    python -m training.train_ppo --resume training/runs/baseline/checkpoints/ppo_500000_steps.zip
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+import yaml
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
+if _PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, _PROJECT_ROOT)
+
+import numpy as np
+import torch as th
+from stable_baselines3 import PPO
+from stable_baselines3.common.callbacks import (
+    BaseCallback, CheckpointCallback, EvalCallback,
+)
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import (
+    DummyVecEnv, SubprocVecEnv, VecNormalize,
+)
+
+from training.herding_env import HerdingEnv
+
+
+# --------------------------------------------------------------------------
+# Env factories
+# --------------------------------------------------------------------------
+
+def _make_env(rank: int, seed: int = 0):
+    """Build a zero-argument factory for one monitored ``HerdingEnv``.
+
+    The environment is seeded with ``seed + rank`` and wrapped in ``Monitor``,
+    which is asked to record the ``is_success``, ``n_sheep`` and ``n_penned``
+    info keys. Nothing is constructed until the returned callable is invoked.
+    """
+    def _build():
+        return Monitor(
+            HerdingEnv(seed=seed + rank),
+            info_keywords=("is_success", "n_sheep", "n_penned"),
+        )
+
+    return _build
+
+
+# --------------------------------------------------------------------------
+# Curriculum callback
+# --------------------------------------------------------------------------
+
+class CurriculumCallback(BaseCallback):
+    """Drive the env's flock-size + state-space difficulty curriculum.
+
+    Schedule entries are dicts ``{step, max_n_sheep, difficulty}``;
+    ``difficulty`` is optional and defaults to 1.0. The entry with the
+    largest ``step`` that is ``<= num_timesteps`` wins, and until the first
+    entry's step is reached the first entry is used. Values are pushed to
+    every tracked vec env via ``set_max_n_sheep`` / ``set_difficulty``, and
+    only when they differ from the last value applied.
+    """
+
+    def __init__(self, schedule, vec_envs, verbose: int = 0):
+        """Store the schedule (sorted by ``step``) and the target vec envs.
+
+        ``vec_envs`` may be a single vec env or a list/tuple of them.
+        """
+        super().__init__(verbose)
+        self.schedule = sorted(schedule, key=lambda d: d["step"])
+        # Accept a list of envs so the eval env tracks training difficulty.
+        self.vec_envs = vec_envs if isinstance(vec_envs, (list, tuple)) else [vec_envs]
+        self._last_n = None
+        self._last_d = None
+
+    def _call(self, method, value):
+        """Invoke ``method(value)`` through ``env_method`` on each vec env."""
+        for v in self.vec_envs:
+            try:
+                v.env_method(method, value)
+            except AttributeError:
+                # Fallback: retry on the wrapped vec env (``.venv``).
+                v.venv.env_method(method, value)
+
+    def _on_step(self) -> bool:
+        """Apply the schedule entry active at the current timestep.
+
+        Always returns ``True`` so training is never stopped from here.
+        """
+        if not self.schedule:
+            # Nothing to schedule; previously this raised IndexError.
+            return True
+
+        t = self.num_timesteps
+        active = self.schedule[0]
+        for entry in self.schedule:
+            if t >= entry["step"]:
+                active = entry
+        n = active["max_n_sheep"]
+        d = active.get("difficulty", 1.0)
+
+        changed = False
+        if n != self._last_n:
+            self._call("set_max_n_sheep", n)
+            self._last_n = n
+            changed = True
+        if d != self._last_d:
+            self._call("set_difficulty", d)
+            self._last_d = d
+            changed = True
+        # Log on *any* change: the original only logged when the difficulty
+        # moved, so flock-size-only steps were applied silently.
+        if changed and self.verbose:
+            print(f"[curriculum] t={t} → max_n_sheep={n} difficulty={d}")
+        return True
+
+
+# --------------------------------------------------------------------------
+# Main
+# --------------------------------------------------------------------------
+
+def main():
+    """Parse CLI flags, build the envs and PPO model, train, and save.
+
+    Reads hyper-parameters from the YAML file given by ``--config`` and
+    writes everything under ``--out-dir``: ``checkpoints/``, ``best/``,
+    ``evals/``, ``tb/``, ``final.zip`` and, when VecNormalize is enabled,
+    ``vecnormalize.pkl`` (also copied into ``best/``).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", default=os.path.join(_HERE, "configs", "ppo_default.yaml"))
+    parser.add_argument("--out-dir", default=os.path.join(_HERE, "runs", "latest"))
+    parser.add_argument("--n-envs", type=int, default=None,
+                        help="Override config n_envs.")
+    parser.add_argument("--total-timesteps", type=int, default=None,
+                        help="Override config total_timesteps.")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--resume", type=str, default=None,
+                        help="Path to a SB3 zip to resume from.")
+    # SB3 recommends CPU for MlpPolicy — GPU helps CNN policies, not MLPs
+    # of this size. Override with --device cuda if you really want it.
+    parser.add_argument("--device", default="cpu")
+    parser.add_argument("--no-vecnorm", action="store_true",
+                        help="Disable VecNormalize wrapper. Required when "
+                             "resuming from a BC-pretrained policy that "
+                             "wasn't trained under it.")
+    parser.add_argument("--no-curriculum", action="store_true",
+                        help="Skip curriculum callback (resumed policy is "
+                             "already competent across the distribution).")
+    parser.add_argument("--imitate-weight", type=float, default=None,
+                        help="Override env W_IMITATE. Set to 0 to disable "
+                             "Strömbom imitation reward.")
+    parser.add_argument("--difficulty", type=float, default=None,
+                        help="Override env difficulty (0=easy, 1=hard). "
+                             "Used in BC fine-tune to skip easy curriculum.")
+    parser.add_argument("--log-std", type=float, default=None,
+                        help="Override the policy's log_std after load. "
+                             "BC trained with std≈1.6 (log_std=0.5) which "
+                             "is too noisy for fine-tune. Use -1.5 (std≈0.22) "
+                             "to keep PPO close to the BC mean while still "
+                             "exploring locally.")
+    parser.add_argument("--learning-rate", type=float, default=None,
+                        help="Override config learning rate. For BC "
+                             "fine-tune, 5e-5 is much safer than the 3e-4 "
+                             "default.")
+    args = parser.parse_args()
+
+    with open(args.config, encoding="utf-8") as f:
+        cfg = yaml.safe_load(f)
+
+    n_envs = args.n_envs or cfg["n_envs"]
+    total_timesteps = args.total_timesteps or cfg["total_timesteps"]
+
+    out = Path(args.out_dir)
+    out.mkdir(parents=True, exist_ok=True)
+    (out / "checkpoints").mkdir(exist_ok=True)
+    (out / "best").mkdir(exist_ok=True)
+    (out / "evals").mkdir(exist_ok=True)
+
+    print(f"[train] out={out}  n_envs={n_envs}  total={total_timesteps}  device={args.device}")
+
+    # --- Train env (vectorised, optionally normalised) ---
+    env_fns = [_make_env(i, seed=args.seed) for i in range(n_envs)]
+    venv = SubprocVecEnv(env_fns) if n_envs > 1 else DummyVecEnv(env_fns)
+    eval_venv = DummyVecEnv([_make_env(99, seed=args.seed + 999)])
+    if not args.no_vecnorm:
+        venv = VecNormalize(venv, norm_obs=True, norm_reward=False, clip_obs=10.0)
+        eval_venv = VecNormalize(eval_venv, norm_obs=True, norm_reward=False,
+                                 clip_obs=10.0, training=False)
+        # Share the running observation statistics object with the eval env.
+        eval_venv.obs_rms = venv.obs_rms
+    else:
+        print("[train] VecNormalize disabled (resumed policy was trained without it).")
+
+    # Apply env-level overrides (used by BC fine-tune to disable Strömbom
+    # imitation and start at full deployment difficulty).
+    def _env_call(method, value):
+        for v in (venv, eval_venv):
+            try:
+                v.env_method(method, value)
+            except AttributeError:
+                v.venv.env_method(method, value)
+
+    if args.imitate_weight is not None:
+        _env_call("set_imitate_weight", args.imitate_weight)
+        print(f"[train] W_IMITATE overridden to {args.imitate_weight}")
+    if args.difficulty is not None:
+        _env_call("set_difficulty", args.difficulty)
+        print(f"[train] difficulty pinned to {args.difficulty}")
+
+    # --- Model ---
+    policy_kwargs = dict(
+        net_arch=dict(pi=cfg["net_arch_pi"], vf=cfg["net_arch_vf"]),
+        log_std_init=cfg.get("log_std_init", 0.0),
+    )
+
+    if args.resume:
+        print(f"[train] resuming from {args.resume}")
+        # NOTE(review): a fresh VecNormalize is built above even when
+        # resuming; stats saved next to the checkpoint are not reloaded
+        # here — confirm this is intended.
+        custom_objects = {}
+        if args.learning_rate is not None:
+            custom_objects["learning_rate"] = args.learning_rate
+        model = PPO.load(args.resume, env=venv, device=args.device,
+                         tensorboard_log=str(out / "tb"),
+                         custom_objects=custom_objects or None)
+        if args.log_std is not None:
+            with th.no_grad():
+                model.policy.log_std.fill_(args.log_std)
+            print(f"[train] log_std overridden to {args.log_std} "
+                  f"(std≈{float(np.exp(args.log_std)):.2f})")
+    else:
+        # Honour --learning-rate for fresh runs too; the original only
+        # applied it on the --resume path despite the flag's help text.
+        learning_rate = (args.learning_rate if args.learning_rate is not None
+                         else cfg["learning_rate"])
+        model = PPO(
+            cfg["policy"], venv,
+            learning_rate=learning_rate,
+            n_steps=cfg["n_steps"],
+            batch_size=cfg["batch_size"],
+            n_epochs=cfg["n_epochs"],
+            gamma=cfg["gamma"],
+            gae_lambda=cfg["gae_lambda"],
+            clip_range=cfg["clip_range"],
+            ent_coef=cfg["ent_coef"],
+            vf_coef=cfg["vf_coef"],
+            max_grad_norm=cfg["max_grad_norm"],
+            target_kl=cfg.get("target_kl"),
+            policy_kwargs=policy_kwargs,
+            tensorboard_log=str(out / "tb"),
+            seed=args.seed,
+            device=args.device,
+            verbose=1,
+        )
+    if args.learning_rate is not None:
+        print(f"[train] learning_rate overridden to {args.learning_rate}")
+
+    # --- Callbacks ---
+    # Frequencies in the config are in total timesteps; the callbacks count
+    # vec-env steps, hence the division by n_envs.
+    ckpt_cb = CheckpointCallback(
+        save_freq=max(1, cfg["checkpoint_freq"] // n_envs),
+        save_path=str(out / "checkpoints"), name_prefix="ppo",
+        save_vecnormalize=True,
+    )
+    eval_cb = EvalCallback(
+        eval_venv,
+        best_model_save_path=str(out / "best"),
+        log_path=str(out / "evals"),
+        eval_freq=max(1, cfg["eval_freq"] // n_envs),
+        n_eval_episodes=cfg["n_eval_episodes"],
+        deterministic=True,
+    )
+    callbacks = [ckpt_cb, eval_cb]
+    curriculum = cfg.get("curriculum")
+    if args.no_curriculum:
+        print("[train] curriculum disabled — env knobs left at their current values.")
+    elif curriculum:
+        if args.difficulty is not None:
+            # Keep the pinned difficulty: otherwise the callback's first
+            # step would overwrite it with the schedule's value. The
+            # schedule still drives max_n_sheep.
+            curriculum = [{**entry, "difficulty": args.difficulty}
+                          for entry in curriculum]
+        callbacks.append(CurriculumCallback(
+            curriculum, [venv, eval_venv], verbose=1,
+        ))
+
+    # --- Train ---
+    model.learn(total_timesteps=total_timesteps, callback=callbacks,
+                progress_bar=True)
+
+    # --- Save final model + VecNormalize stats ---
+    model.save(out / "final.zip")
+    if isinstance(venv, VecNormalize):
+        venv.save(str(out / "vecnormalize.pkl"))
+        # The EvalCallback already wrote best_model.zip into out/best/ — drop
+        # the VecNormalize stats next to it for the controller to pick up.
+        venv.save(str(out / "best" / "vecnormalize.pkl"))
+    else:
+        # With --no-vecnorm the plain vec env has no save(); the original
+        # crashed here with AttributeError after training had finished.
+        print("[train] VecNormalize disabled — no normalisation stats saved.")
+    print(f"[train] done. saved to {out}")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,342 +0,0 @@
-"""
-All visualization for the herding policy: trajectory plots, timeseries plots,
-success-rate bar chart, and animated GIFs.
-
-Used both by train.py (auto-rendered after each curriculum stage) and as a CLI
-to render a fresh episode against a saved model.
-
-CLI usage:
-    python viz.py --run-dir runs/v1 --n-sheep 5
-    python viz.py --run-dir runs/v1 --n-sheep 10 --no-gif
-    python viz.py --model runs/v1/final_model.zip --vecnorm runs/v1/vecnorm.pkl \\
-        --n-sheep 3 --out-dir vis_v1_3sheep
-"""
-import argparse
-import os
-import json
-from copy import deepcopy
-
-import matplotlib
-matplotlib.use("Agg")
-import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
-import matplotlib.animation as animation
-from matplotlib.collections import LineCollection
-import numpy as np
-from stable_baselines3 import PPO
-from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
-
-from herding_env import HerdingEnv
-
-
-# ── Palette ──────────────────────────────────────────────────────────────────
-
-SHEEP_COLORS = [
-    "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00",
-    "#a65628", "#f781bf", "#999999", "#66c2a5", "#fc8d62",
-]
-DOG_COLOR = "#4e342e"
-
-
-# ── Common drawing primitives ────────────────────────────────────────────────
-
-def draw_field(ax):
-    ax.set_xlim(-16, 16)
-    ax.set_ylim(-16, 16)
-    ax.set_aspect("equal")
-    ax.set_facecolor("#dcedc8")
-    ax.add_patch(mpatches.Rectangle(
-        (-15, -15), 30, 30, fill=False, edgecolor="#795548", lw=2))
-    ax.add_patch(mpatches.Rectangle(
-        (10, -15), 3, 7, facecolor="#ffe082", edgecolor="#795548", lw=2))
-    ax.text(11.5, -11.5, "pen", ha="center", va="center",
-            fontsize=8, color="#795548")
-
-
-def faded_path(ax, xs, ys, color, lw=1.5, label=None):
-    n = len(xs)
-    if n < 2:
-        return
-    points = np.array([xs, ys]).T.reshape(-1, 1, 2)
-    segs = np.concatenate([points[:-1], points[1:]], axis=1)
-    alphas = np.linspace(0.15, 1.0, len(segs))
-    colors = [(*matplotlib.colors.to_rgb(color), a) for a in alphas]
-    ax.add_collection(LineCollection(segs, colors=colors, linewidth=lw))
-    if label:
-        ax.plot([], [], color=color, lw=lw, label=label)
-
-
-# ── Episode rollout ──────────────────────────────────────────────────────────
-
-def make_eval_env(n_sheep, seed, max_steps, reward_cfg=None):
-    def _init():
-        env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
-                         reward_cfg=reward_cfg)
-        env.reset(seed=seed)
-        return env
-    return _init
-
-
-def run_and_record(model, vn_template, n_sheep, max_steps,
-                   reward_cfg=None, seed=42, make_env_fn=None):
-    """Run one deterministic episode and return full trajectory history."""
-    _factory = make_env_fn or make_eval_env
-    raw = DummyVecEnv([_factory(n_sheep, seed, max_steps, reward_cfg)])
-    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
-    vn.obs_rms = deepcopy(vn_template.obs_rms)
-    vn.ret_rms = deepcopy(vn_template.ret_rms)
-
-    obs = vn.reset()
-    inner = vn.envs[0]
-    done = False
-
-    dog_xs, dog_ys = [], []
-    sheep_xs = [[] for _ in range(n_sheep)]
-    sheep_ys = [[] for _ in range(n_sheep)]
-    sheep_penned = [[] for _ in range(n_sheep)]
-    radii = []
-    pen_dists = [[] for _ in range(n_sheep)]
-    action_mags = []
-    rewards = []
-    penned_at = [None] * n_sheep
-    step = 0
-
-    while not done:
-        action, _ = model.predict(obs, deterministic=True)
-        obs, reward, dones, infos = vn.step(action)
-        done = dones[0]
-        step += 1
-
-        dog_xs.append(float(inner.dog_pos[0]))
-        dog_ys.append(float(inner.dog_pos[1]))
-        com, radius, _ = inner._flock_stats()
-        radii.append(radius)
-        rewards.append(float(reward[0]))
-        action_mags.append(float(np.linalg.norm(action[0])))
-        for i in range(n_sheep):
-            sheep_xs[i].append(float(inner.sheep_pos[i][0]))
-            sheep_ys[i].append(float(inner.sheep_pos[i][1]))
-            sheep_penned[i].append(bool(inner.penned[i]))
-            pen_dists[i].append(
-                float(np.linalg.norm(inner.sheep_pos[i] - inner.PEN_CENTER)))
-            if inner.penned[i] and penned_at[i] is None:
-                penned_at[i] = step
-
-    n_penned = infos[0].get("n_penned", 0)
-    vn.close()
-
-    return dict(
-        dog_xs=dog_xs, dog_ys=dog_ys,
-        sheep_xs=sheep_xs, sheep_ys=sheep_ys,
-        sheep_penned=sheep_penned,
-        radii=radii, pen_dists=pen_dists,
-        action_mags=action_mags, rewards=rewards,
-        penned_at=penned_at,
-        n_penned=n_penned, n_sheep=n_sheep,
-        success=n_penned == n_sheep, steps=step,
-    )
-
-
-# ── Static plots ─────────────────────────────────────────────────────────────
-
-def plot_trajectory(hist, out_path):
-    fig, ax = plt.subplots(figsize=(7, 7))
-    draw_field(ax)
-    for i in range(hist["n_sheep"]):
-        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
-        xs, ys = hist["sheep_xs"][i], hist["sheep_ys"][i]
-        faded_path(ax, xs, ys, c, lw=1.2, label=f"sheep {i+1}")
-        ax.plot(xs[0], ys[0], "o", color=c, ms=7, zorder=4)
-        end = hist["penned_at"][i] if hist["penned_at"][i] is not None else -1
-        ax.plot(xs[end], ys[end], "*", color=c, ms=11, zorder=5)
-    faded_path(ax, hist["dog_xs"], hist["dog_ys"], DOG_COLOR, lw=2.0,
-               label="dog")
-    ax.plot(hist["dog_xs"][0], hist["dog_ys"][0], "s", color=DOG_COLOR,
-            ms=10, zorder=5)
-    ax.plot(hist["dog_xs"][-1], hist["dog_ys"][-1], "D", color=DOG_COLOR,
-            ms=10, zorder=5)
-    result = ("SUCCESS" if hist["success"]
-              else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})")
-    ax.set_title(f"n={hist['n_sheep']}  {result}  {hist['steps']} steps",
-                 fontsize=12)
-    ax.legend(loc="upper left", fontsize=8)
-    plt.tight_layout()
-    fig.savefig(out_path, dpi=120)
-    plt.close(fig)
-
-
-def plot_timeseries(hist, out_path):
-    t = np.arange(hist["steps"])
-    fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
-
-    axes[0].plot(t, hist["radii"], color="steelblue")
-    axes[0].axhline(5.0, color="orange", ls="--", lw=1, label="compact (5m)")
-    axes[0].set_ylabel("flock radius (m)")
-    axes[0].legend(fontsize=8)
-    axes[0].set_title("Flock radius")
-
-    for i in range(hist["n_sheep"]):
-        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
-        axes[1].plot(t, hist["pen_dists"][i], color=c, lw=1,
-                     label=f"sheep {i+1}")
-        if hist["penned_at"][i] is not None:
-            axes[1].axvline(hist["penned_at"][i], color=c, ls=":", lw=1)
-    axes[1].set_ylabel("dist to pen (m)")
-    axes[1].legend(fontsize=7, ncol=min(hist["n_sheep"], 5))
-    axes[1].set_title("Per-sheep distance to pen")
-
-    axes[2].plot(t, hist["action_mags"], color="tomato", lw=1)
-    axes[2].axhline(1.0, color="gray", ls="--", lw=1, label="max")
-    axes[2].set_ylabel("action ||(vx,vy)||")
-    axes[2].set_ylim(0, 1.5)
-    axes[2].set_title("Dog action magnitude")
-    axes[2].legend(fontsize=8)
-
-    axes[3].plot(t, hist["rewards"], color="purple", lw=1, alpha=0.7)
-    axes[3].axhline(0, color="black", lw=0.5)
-    axes[3].set_ylabel("reward")
-    axes[3].set_xlabel("step")
-    axes[3].set_title("Reward per step")
-
-    result = ("SUCCESS" if hist["success"]
-              else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})")
-    fig.suptitle(f"n_sheep={hist['n_sheep']}  {result}  {hist['steps']} steps",
-                 fontsize=13)
-    plt.tight_layout()
-    fig.savefig(out_path, dpi=120)
-    plt.close(fig)
-
-
-def plot_success_rate(stage_results, out_path):
-    fig, ax = plt.subplots(figsize=(8, 4))
-    ns = [r["n_sheep"] for r in stage_results]
-    srs = [r["sr"] * 100 for r in stage_results]
-    bars = ax.bar(ns, srs, color="steelblue", edgecolor="white")
-    ax.set_xlabel("Sheep count")
-    ax.set_ylabel("Success rate (%)")
-    ax.set_ylim(0, 105)
-    ax.axhline(90, color="orange", ls="--", lw=1, label="90% target")
-    for bar, sr in zip(bars, srs):
-        ax.text(bar.get_x() + bar.get_width() / 2,
-                bar.get_height() + 1, f"{sr:.0f}%",
-                ha="center", fontsize=9)
-    ax.legend()
-    ax.set_title("Evaluation success rate per sheep count")
-    plt.tight_layout()
-    fig.savefig(out_path, dpi=120)
-    plt.close(fig)
-
-
-# ── Animated GIF ─────────────────────────────────────────────────────────────
-
-def save_episode_gif(hist, out_path, fps=20, skip=3):
-    """Render hist as an animated GIF. `skip` keeps every Nth frame (smaller file)."""
-    n_sheep = hist["n_sheep"]
-    frames = list(range(0, hist["steps"], max(1, skip)))
-    if frames[-1] != hist["steps"] - 1:
-        frames.append(hist["steps"] - 1)
-
-    fig, ax = plt.subplots(figsize=(6, 6))
-    draw_field(ax)
-    title = ax.text(0, 16.5, "", ha="center", fontsize=11)
-    dog_marker, = ax.plot([], [], "s", color=DOG_COLOR, ms=12,
-                          markeredgecolor="black", markeredgewidth=1.5,
-                          zorder=5)
-    sheep_markers = []
-    for i in range(n_sheep):
-        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
-        m, = ax.plot([], [], "o", color=c, ms=10,
-                     markeredgecolor="#333", markeredgewidth=1, zorder=4)
-        sheep_markers.append(m)
-    dog_trail, = ax.plot([], [], color=DOG_COLOR, lw=1.0, alpha=0.5)
-
-    def update(k):
-        title.set_text(
-            f"n={n_sheep}  step {k+1}/{hist['steps']}  "
-            f"penned {sum(hist['sheep_penned'][i][k] for i in range(n_sheep))}/{n_sheep}")
-        dog_marker.set_data([hist["dog_xs"][k]], [hist["dog_ys"][k]])
-        dog_trail.set_data(hist["dog_xs"][:k+1], hist["dog_ys"][:k+1])
-        for i, m in enumerate(sheep_markers):
-            m.set_data([hist["sheep_xs"][i][k]], [hist["sheep_ys"][i][k]])
-            penned = hist["sheep_penned"][i][k]
-            m.set_color("deeppink" if penned else SHEEP_COLORS[i % len(SHEEP_COLORS)])
-        return [title, dog_marker, dog_trail, *sheep_markers]
-
-    anim = animation.FuncAnimation(
-        fig, update, frames=frames, interval=1000 / fps, blit=False)
-    anim.save(out_path, writer=animation.PillowWriter(fps=fps), dpi=80)
-    plt.close(fig)
-
-
-# ── CLI ──────────────────────────────────────────────────────────────────────
-
-def _resolve_paths(args):
-    if args.run_dir:
-        model_path  = os.path.join(args.run_dir, "final_model.zip")
-        vn_path     = os.path.join(args.run_dir, "vecnorm.pkl")
-        cfg_path    = os.path.join(args.run_dir, "config.json")
-    else:
-        model_path  = args.model
-        vn_path     = args.vecnorm
-        cfg_path    = args.config
-    return model_path, vn_path, cfg_path
-
-
-def main():
-    p = argparse.ArgumentParser(
-        description="Render trajectory + timeseries + GIF for a saved policy.")
-    p.add_argument("--run-dir", type=str, default=None,
-                   help="Run directory containing final_model.zip + vecnorm.pkl + config.json")
-    p.add_argument("--model",   type=str, default=None)
-    p.add_argument("--vecnorm", type=str, default=None)
-    p.add_argument("--config",  type=str, default=None)
-    p.add_argument("--n-sheep", type=int, default=3)
-    p.add_argument("--seed",    type=int, default=42)
-    p.add_argument("--max-steps", type=int, default=2500)
-    p.add_argument("--out-dir", type=str, default=None)
-    p.add_argument("--no-gif",  action="store_true",
-                   help="Skip the animated GIF (PNG-only is faster).")
-    p.add_argument("--gif-fps", type=int, default=20)
-    p.add_argument("--gif-skip", type=int, default=3)
-    args = p.parse_args()
-
-    model_path, vn_path, cfg_path = _resolve_paths(args)
-    if not (model_path and vn_path):
-        p.error("either --run-dir or both --model and --vecnorm are required")
-
-    rcfg = None
-    if cfg_path and os.path.exists(cfg_path):
-        with open(cfg_path) as f:
-            cfg = json.load(f)
-        rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}
-
-    out_dir = args.out_dir or os.path.join(
-        os.path.dirname(os.path.abspath(model_path)),
-        f"vis_{args.n_sheep}s")
-    os.makedirs(out_dir, exist_ok=True)
-
-    print(f"Loading model:   {model_path}")
-    print(f"Loading vecnorm: {vn_path}")
-    model = PPO.load(model_path, device="cpu")
-
-    raw = DummyVecEnv([make_eval_env(args.n_sheep, args.seed, args.max_steps, rcfg)])
-    vn = VecNormalize.load(vn_path, raw)
-
-    print(f"Rolling out n_sheep={args.n_sheep} (seed={args.seed})...")
-    hist = run_and_record(model, vn, args.n_sheep, args.max_steps,
-                          reward_cfg=rcfg, seed=args.seed)
-    result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})"
-    print(f"  {result} in {hist['steps']} steps")
-
-    plot_trajectory(hist, os.path.join(out_dir, "trajectory.png"))
-    plot_timeseries(hist, os.path.join(out_dir, "timeseries.png"))
-    print(f"  saved trajectory.png + timeseries.png to {out_dir}/")
-    if not args.no_gif:
-        gif_path = os.path.join(out_dir, "episode.gif")
-        print(f"  rendering GIF (fps={args.gif_fps}, skip={args.gif_skip})...")
-        save_episode_gif(hist, gif_path, fps=args.gif_fps, skip=args.gif_skip)
-        print(f"  saved {gif_path}")
-
-
-if __name__ == "__main__":
-    main()
@@ -1,5 +1,5 @@
 Webots Project File version R2025a
-perspectives: 000000ff00000000fd00000002000000010000011c00000298fc0200000001fb0000001400540065007800740045006400690074006f00720100000000000002980000003f00ffffff000000030000084300000238fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000008430000006900ffffff000007250000029800000001000000020000000100000008fc00000000
+perspectives: 000000ff00000000fd00000002000000010000011c000001bcfc0200000001fb0000001400540065007800740045006400690074006f00720100000000000001bc0000003f00ffffff00000003000005c600000220fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000005c60000006900ffffff000004a8000001bc00000001000000020000000100000008fc00000000
 simulationViewPerspectives: 000000ff000000010000000200000100000006250100000002010000000100
 sceneTreePerspectives: 000000ff00000001000000030000001f000000c0000000000100000002010000000200
 maximizedDockId: -1
@@ -10,7 +10,7 @@ EXTERNPROTO "../protos/Sheep.proto"
 # World
 WorldInfo {
  info [
-    "RL-Based Autonomous Shepherd Robot"
+    "Autonomous Shepherd Robot (Strömbom)"
    "Group G25"
  ]
  title "Shepherd Herding"
@@ -106,19 +106,26 @@ Solid { translation -2.5 -15 0.84 children [ Shape { appearance USE CAP geometry
 Solid { translation 14 -15 0.40 children [ Shape { appearance USE STONE_A geometry Box { size 2.0 0.16 0.80 } } ] boundingObject Box { size 2.0 0.16 0.80 } }
 Solid { translation 14 -15 0.84 children [ Shape { appearance USE CAP geometry Box { size 2.1 0.26 0.07 } } ] boundingObject Box { size 2.1 0.26 0.07 } }
 # Gate posts
-Solid { translation 10 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
-Solid { translation 13 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
-# Outer gate (wooden, slightly ajar, Z-brace)
-Solid { translation 11.5 -15.08 0.55 rotation 0 0 1 0.25 children [
+Solid { translation 10 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
+Solid { translation 13 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
+# Outer gate — fully open, hinged on the west gate post. Modeled as a swung-back
+# wooden gate parallel to the south wall, on the west side, so the 3m corridor
+# between gate posts (x=10..13, y=-15) is unobstructed.
+Solid { translation 8.6 -15.05 0.55 rotation 0 0 1 0 children [
  Shape { appearance USE WOOD geometry Box { size 2.80 0.05 1.00 } }
-  Transform { translation 0 0.02 0 rotation 0 1 0 0.34 children [ Shape { appearance DEF FPOST PBRAppearance { baseColor 0.35 0.22 0.10 roughness 0.90 } geometry Box { size 2.97 0.04 0.06 } } ] }
+  # FPOST appearance DEF lives here so the external pen below can USE it.
+  Transform { translation 0 0.02 0 rotation 0 1 0 0.34 children [
+    Shape { appearance DEF FPOST PBRAppearance { baseColor 0.35 0.22 0.10 roughness 0.90 } geometry Box { size 2.97 0.04 0.06 } }
+  ] }
 ] boundingObject Box { size 2.80 0.08 1.00 } }

-# ==================== QUARANTINE PEN (wooden post-and-rail fence, inside field) ====================
-# Flow: main field → inner gate → quarantine area → outer gate → outside
+# ==================== EXTERNAL PEN (south of field, accessed through south-wall gate) ====================
+# Flow: main field → south-wall gate (x ∈ [10, 13], y = -15) → external pen
+# The pen is a wooden post-and-rail rectangle south of the field, x ∈ [10, 13],
+# y ∈ [-22, -15], open on the north side (the gate hole is the entrance).

-# West wall (x=10, ~7m along Y)
-Solid { translation 10 -11.46 0.55 children [
+# Pen west wall (x=10, y from -22 to -15, length 7m)
+Solid { translation 10 -18.5 0.55 children [
  Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
@@ -130,8 +137,8 @@ Solid { translation 10 -11.46 0.55 children [
  Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] }
 ] boundingObject Box { size 0.14 6.92 1.10 } }

-# East wall (x=13)
-Solid { translation 13 -11.46 0.55 children [
+# Pen east wall (x=13, y from -22 to -15, length 7m)
+Solid { translation 13 -18.5 0.55 children [
  Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
@@ -143,39 +150,50 @@ Solid { translation 13 -11.46 0.55 children [
  Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] }
 ] boundingObject Box { size 0.14 6.92 1.10 } }

-# North wall - open entrance (no wall, just corner posts)
-Solid { translation 10 -8 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] boundingObject Box { size 0.12 0.12 1.10 } }
-Solid { translation 13 -8 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] boundingObject Box { size 0.12 0.12 1.10 } }
+# Pen south wall (y=-22, x from 10 to 13, length 3m, closes the back of the pen)
+Solid { translation 11.5 -22 0.55 children [
+  Transform { translation -1.5 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
+  Transform { translation  0   0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
+  Transform { translation  1.5 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
+  Transform { translation 0 0 -0.38 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] }
+  Transform { translation 0 0 -0.05 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] }
+  Transform { translation 0 0 0.30 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] }
+  Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 2.92 0.14 0.04 } } ] }
+] boundingObject Box { size 2.92 0.14 1.10 } }
+
+# Pen north corner posts at the gate opening, co-located with the stone gate posts (no wall — sheep enter here from the field; no boundingObject, so visual only)
+Solid { translation 10 -15.0 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
+Solid { translation 13 -15.0 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }

 # Corner pillars
-Solid { translation  15  15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
-Solid { translation  15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
-Solid { translation -15  15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
-Solid { translation -15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
+Solid { translation  15  15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
+Solid { translation  15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
+Solid { translation -15  15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
+Solid { translation -15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }

 # Mid-pillars every 5 m — East
-Solid { translation  15  10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation  15   5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation  15   0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation  15  -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation  15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  15  10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation  15   5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation  15   0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation  15  -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation  15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
 # West
-Solid { translation -15  10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation -15   5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation -15   0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation -15  -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation -15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation -15  10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation -15   5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation -15   0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation -15  -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation -15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
 # North
-Solid { translation  10  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation   5  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation   0  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation  -5  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation -10  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  10  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation   5  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation   0  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation  -5  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation -10  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
 # South
-Solid { translation   5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation   0 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation  -5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
-Solid { translation -10 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation   5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation   0 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation  -5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
+Solid { translation -10 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }

 # ==================== BARN 1 — Gambrel/Dutch style (NE, outside fence) ====================
 # Body 10×7×4, weathered gray-brown wood, gambrel roof, large double doors
@@ -503,28 +521,16 @@ ShepherdDog {
 }

 # ==================== SHEEP ====================
-Sheep {
-  translation 3 2 0.5
-  name "sheep1"
-  controller "sheep"
-}
-Sheep {
-  translation 3 -2 0.5
-  name "sheep2"
-  controller "sheep"
-}
-Sheep {
-  translation 4 0 0.5
-  name "sheep3"
-  controller "sheep"
-}
-# Sheep {
-#   translation 3.5 1 0.5
-#   name "sheep4"
-#   controller "sheep"
-# }
-# Sheep {
-#   translation 3.5 -1 0.5
-#   name "sheep5"
-#   controller "sheep"
-# }
+# Up to 10 sheep, scattered around the centre of the field (|x|, |y| <= 8 m).
+# Comment out trailing slots to test smaller flock sizes; the dog policy is
+# trained to handle 1..10 sheep so any prefix works.
+Sheep { translation  3.0  2.0 0.5 name "sheep1"  controller "sheep" }
+Sheep { translation  3.0 -2.0 0.5 name "sheep2"  controller "sheep" }
+Sheep { translation  4.0  0.0 0.5 name "sheep3"  controller "sheep" }
+Sheep { translation -3.0  4.0 0.5 name "sheep4"  controller "sheep" }
+Sheep { translation -5.0 -2.0 0.5 name "sheep5"  controller "sheep" }
+Sheep { translation  6.0  5.0 0.5 name "sheep6"  controller "sheep" }
+Sheep { translation -6.0  6.0 0.5 name "sheep7"  controller "sheep" }
+Sheep { translation  0.0  8.0 0.5 name "sheep8"  controller "sheep" }
+Sheep { translation -8.0  0.0 0.5 name "sheep9"  controller "sheep" }
+Sheep { translation  7.0 -4.0 0.5 name "sheep10" controller "sheep" }