Checkpoint 2

2026-05-07 22:00:10 +01:00
parent 90aa3bbcb4
commit 1bb9415414
37 changed files with 3068 additions and 2912 deletions
@@ -4,18 +4,22 @@
 # Python
 __pycache__/
-
+*.pyc
-# Training
+.venv/
 training/**/events.out.tfevents.*
 training/**/checkpoints/
 training/runs/**
 !training/runs/.gitkeep
 # Controller runtime artefacts
 controllers/shepherd_dog_rl/debug*.csv
 controllers/shepherd_dog_rl/debug_out*/
 controllers/shepherd_dog_rl/final_model*.zip
 controllers/shepherd_dog_rl/vecnorm*.pkl
 # Optional env parity debug
 dog_debug.csv
 # Webots controller scratch
 controllers/shepherd_dog/dog_behavior_log.csv
 # Training artefacts
 training/runs/*
 !training/runs/.gitkeep
 *.zip
 *.pkl
 # TensorBoard
 events.out.tfevents.*
 worlds/field_test.wbt
 herding_runtime.cfg
@@ -1,45 +1,36 @@
-"""
+"""Sheep flocking controller (Webots).
 Sheep flocking controller (Webots, Reynolds boids variant).
 Each sheep broadcasts its GPS position every 3 steps on channel 1 and
-listens for the dog and peer sheep positions.  Peers are keyed by robot
+listens for the dog and peer sheep positions. The behavioural step is
-name so each neighbour has exactly one current entry in the dict.
+delegated to ``herding.flocking_sim.compute_heading_speed`` so the
 training environment and Webots run identical sheep dynamics.
-Force stack each step (summed then converted to a heading + speed):
+Pen behaviour: a sheep latches to ``penned`` the first time it crosses
-    flee       — away from dog, quadratic ramp, dominant when close
+the south-wall gate plane into the gate corridor. Once latched it turns
-    cohesion   — toward flock centre, halved while fleeing
+pink (via the exposed ``woolColor`` PROTO field) and the force stack
-    separation — inverse-distance push, prevents physical overlap
+switches to in-pen containment.
    walls      — linear repulsion from field boundary
    wander     — small persistent drift for natural idle motion
 Pen behaviour: on first entry into the quarantine pen the sheep latches
 permanently — it turns pink (via the exposed woolColor PROTO field) and
 the normal force stack is replaced by pen-confinement forces only.
 """
 import random
 import math
 import os
 import random
 import sys
 # --- Make the shared herding/ package importable from this controller dir ---
 _HERE = os.path.dirname(os.path.abspath(__file__))
 _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
 from controller import Supervisor
-# ---------------------------------------------------------------------------
+from herding.diffdrive import heading_speed_to_wheels
-# Tuning constants
+from herding.flocking_sim import MAX_SPEED, compute_heading_speed
-# ---------------------------------------------------------------------------
+from herding.geometry import (
    SHEEP_MAX_WHEEL_OMEGA,
    is_penned_position,
 )
 MAX_SPEED    = 22.0   # rad/s hard clamp on both motors
 FLEE_SPEED   = 20.0   # rad/s upper bound while panicking
 WANDER_SPEED =  3.0   # rad/s lower bound during calm wandering
 X_MIN, X_MAX = -14.5, 14.5   # stone wall inner edges (metres)
 Y_MIN, Y_MAX = -14.5, 14.5
 WALL_MARGIN  =  3.5           # avoidance starts this far from the wall
 FLEE_DIST       = 7.0   # dog within this radius triggers flee (metres)
 SEPARATION_DIST = 2.5   # inverse-distance push active inside this radius
 COHESION_DIST   = 8.0   # pull toward flock centre active inside this radius
 PEN_X_MIN, PEN_X_MAX = 10.0, 13.0   # quarantine pen extents (metres)
 PEN_Y_MIN, PEN_Y_MAX = -15.0, -8.0  # open entrance at y=-8, gate at y=-15
 PEN_MARGIN = 0.8                     # confinement force starts this far from pen wall
 # ---------------------------------------------------------------------------
 # Device setup
@@ -56,178 +47,102 @@ left_motor.setPosition(float("inf"))
 right_motor.setPosition(float("inf"))
 left_motor.setVelocity(0.0)
 right_motor.setVelocity(0.0)
 MOTOR_MAX = min(left_motor.getMaxVelocity(), SHEEP_MAX_WHEEL_OMEGA)
 gps = robot.getDevice("gps");           gps.enable(timestep)
 compass = robot.getDevice("compass");   compass.enable(timestep)
 receiver = robot.getDevice("receiver"); receiver.enable(timestep)
 emitter = robot.getDevice("emitter")
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def norm_angle(a):
    """Wrap angle ``a`` (radians) into the principal range [-pi, pi]."""
    sin_a = math.sin(a)
    cos_a = math.cos(a)
    # atan2 of the unit-circle components recovers the equivalent wrapped angle.
    return math.atan2(sin_a, cos_a)
 def bearing():
    # Compass returns north direction in sensor frame; for this Z-up world
    # with north = +Y, atan2(n[0], n[1]) gives the standard math angle
-    # (0 = east, π/2 = north) matching atan2(fy, fx) used for heading.
+    # (0 = east, π/2 = north) matching atan2(fy, fx) used for headings.
    n = compass.getValues()
    return math.atan2(n[0], n[1])
-def drive(heading, speed):
+def drive(heading, speed_motor):
-    err = norm_angle(heading - bearing())
+    left_w, right_w = heading_speed_to_wheels(
-    # Scale forward component by cos(err): at 90° error fwd→0 so the robot
+        heading, min(speed_motor, MAX_SPEED), bearing(), MOTOR_MAX, k_turn=4.0
-    # spins in place to realign rather than driving sideways at full speed.
+    )
-    fwd = speed * max(0.0, math.cos(err))
+    left_motor.setVelocity(left_w)
-    k = 4.0
+    right_motor.setVelocity(right_w)
    left_motor.setVelocity( max(-MAX_SPEED, min(MAX_SPEED, fwd - k * err)))
    right_motor.setVelocity(max(-MAX_SPEED, min(MAX_SPEED, fwd + k * err)))
 def paint_pink():
    # woolColor is declared as a PROTO field with IS binding to the DEF WOOL
-    # PBRAppearance baseColor.  Changing it here propagates to every USE WOOL
+    # PBRAppearance baseColor; setting it propagates to every USE WOOL shape.
    # shape on the body.  Direct field access avoids PROTO-internal opacity.
    self_node.getField("woolColor").setSFColor([1.0, 0.55, 0.72])
 # ---------------------------------------------------------------------------
 # State
 # ---------------------------------------------------------------------------
 wander_angle = random.uniform(-math.pi, math.pi)
-step   = 0
+step_count = 0
-dog_x  = None
+dog_x, dog_y = None, None
 dog_y  = None
 peers = {}        # name → (x, y), one entry per neighbour, cleared every 30 steps
 penned = False
 # Stuck detection: differential-drive sheep can pin against a wall and need
 # a forced reverse-and-rotate to escape. If displacement < STUCK_DIST for
 # STUCK_STEPS consecutive steps, drive toward field centre.
 _prev_x, _prev_y = None, None
 _stuck_count = 0
 STUCK_STEPS = 20
 STUCK_DIST = 0.05
 # ---------------------------------------------------------------------------
 # Main loop
 # ---------------------------------------------------------------------------
 while robot.step(timestep) != -1:
-    step += 1
+    step_count += 1
    pos = gps.getValues()
    x, y = pos[0], pos[1]
-    # Pen entry: one-way latch, never unset
+    # Pen entry: one-way latch. Penned sheep get pink wool and switch behaviour.
-    if not penned and PEN_X_MIN < x < PEN_X_MAX and PEN_Y_MIN < y < PEN_Y_MAX:
+    if not penned and is_penned_position(x, y):
        penned = True
        paint_pink()
-    # Refresh peer table (clear before receiving so fresh data is never lost)
+    # Refresh peer table — clear before receiving so fresh data is never lost.
-    if step % 30 == 0:
+    if step_count % 30 == 0:
        peers.clear()
    while receiver.getQueueLength() > 0:
        msg = receiver.getString()
        receiver.nextPacket()
-        p = msg.split(":")
+        parts = msg.split(":")
-        if p[0] == "dog" and len(p) >= 3:
+        if parts[0] == "dog" and len(parts) >= 3:
-            dog_x, dog_y = float(p[1]), float(p[2])
+            dog_x, dog_y = float(parts[1]), float(parts[2])
-        elif p[0] == "sheep" and len(p) >= 4 and p[1] != name:
+        elif parts[0] == "sheep" and len(parts) >= 4 and parts[1] != name:
-            peers[p[1]] = (float(p[2]), float(p[3]))
+            peers[parts[1]] = (float(parts[2]), float(parts[3]))
-    fx, fy = 0.0, 0.0
+    dog_xy = (dog_x, dog_y) if dog_x is not None and dog_y is not None else None
    heading, speed, wander_angle = compute_heading_speed(
        x=x, y=y, penned=penned, dog_xy=dog_xy, peers=peers,
        wander_angle=wander_angle,
    )
-    # Repel unpenned sheep from the exterior of the pen's side walls so they
+    # Stuck detection — safety net for differential-drive wall pinning.
-    # don't get pinned by flee forces. Only fires when strictly outside the pen
+    if _prev_x is not None:
-    # (x < PEN_X_MIN or x > PEN_X_MAX) at pen height (y in pen y-range).
+        moved = math.hypot(x - _prev_x, y - _prev_y)
-    # Entrance is open on the north (y > PEN_Y_MAX) — no force there.
+        _stuck_count = _stuck_count + 1 if moved < STUCK_DIST else 0
-    PEN_EXT_MARGIN = 0.8
+    if _stuck_count >= STUCK_STEPS:
-    if not penned and PEN_Y_MIN < y < PEN_Y_MAX:
+        heading = math.atan2(-y, -x)   # always points away from the boundary
-        if PEN_X_MIN - PEN_EXT_MARGIN < x < PEN_X_MIN:
+        speed = MAX_SPEED
-            fx -= ((x - (PEN_X_MIN - PEN_EXT_MARGIN)) / PEN_EXT_MARGIN) * 6.0
+        _stuck_count = 0
-        if PEN_X_MAX < x < PEN_X_MAX + PEN_EXT_MARGIN:
+    _prev_x, _prev_y = x, y
            fx += ((PEN_X_MAX + PEN_EXT_MARGIN - x) / PEN_EXT_MARGIN) * 6.0
    if penned:
        # Inside pen: wander freely, strong boundary forces prevent exit,
        # separation still active to avoid collisions with other penned sheep.
        pm = PEN_MARGIN
        if x < PEN_X_MIN + pm: fx += ((PEN_X_MIN + pm - x) / pm) * 15.0
        if x > PEN_X_MAX - pm: fx -= ((x - (PEN_X_MAX - pm)) / pm) * 15.0
        if y < PEN_Y_MIN + pm: fy += ((PEN_Y_MIN + pm - y) / pm) * 15.0
        if y > PEN_Y_MAX - pm: fy -= ((y - (PEN_Y_MAX - pm)) / pm) * 15.0
        for px, py in peers.values():
            dx, dy = px - x, py - y
            d = math.hypot(dx, dy)
            if 0.05 < d < SEPARATION_DIST:
                push = (SEPARATION_DIST - d) / d
                fx -= (dx / d) * push * 2.5
                fy -= (dy / d) * push * 2.5
        if random.random() < 0.02:
            wander_angle += random.uniform(-0.6, 0.6)
        fx += math.cos(wander_angle) * 0.5
        fy += math.sin(wander_angle) * 0.5
    else:
        fleeing = False
        # Flee — quadratic ramp so force grows rapidly as the dog closes in
        if dog_x is not None:
            dx   = dog_x - x
            dy   = dog_y - y
            dist = math.hypot(dx, dy)
            if 0.01 < dist < FLEE_DIST:
                fleeing = True
                t = 1.0 - dist / FLEE_DIST
                s = t * t * 20.0
                fx -= (dx / dist) * s
                fy -= (dy / dist) * s
        # Cohesion — halved while fleeing to reduce mid-panic collisions
        cx, cy, cn = 0.0, 0.0, 0
        for px, py in peers.values():
            d = math.hypot(px - x, py - y)
            if 0.3 < d < COHESION_DIST:
                cx += px; cy += py; cn += 1
        if cn > 0:
            w = 0.08 if fleeing else 0.15
            fx += (cx / cn - x) * w
            fy += (cy / cn - y) * w
        # Separation — inverse-distance: huge when nearly overlapping, fades quickly
        for px, py in peers.values():
            dx, dy = px - x, py - y
            d = math.hypot(dx, dy)
            if 0.05 < d < SEPARATION_DIST:
                push = (SEPARATION_DIST - d) / d
                fx -= (dx / d) * push * 2.5
                fy -= (dy / d) * push * 2.5
        # Walls
        if x < X_MIN + WALL_MARGIN: fx += ((X_MIN + WALL_MARGIN - x) / WALL_MARGIN) * 6.0
        if x > X_MAX - WALL_MARGIN: fx -= ((x - (X_MAX - WALL_MARGIN)) / WALL_MARGIN) * 6.0
        if y < Y_MIN + WALL_MARGIN: fy += ((Y_MIN + WALL_MARGIN - y) / WALL_MARGIN) * 6.0
        if y > Y_MAX - WALL_MARGIN: fy -= ((y - (Y_MAX - WALL_MARGIN)) / WALL_MARGIN) * 6.0
        # Wander — suppressed while fleeing so drift cannot deflect the flee heading
        if not fleeing:
            if random.random() < 0.02:
                wander_angle += random.uniform(-0.6, 0.6)
            fx += math.cos(wander_angle) * 0.5
            fy += math.sin(wander_angle) * 0.5
    # Hard-stop clamp: within 0.5 m of a wall, zero any force component that
    # would push further into it.  Prevents the flee force from pinning a sheep
    # against the boundary when the dog approaches from outside.
    HS = 0.5
    if x < X_MIN + HS and fx < 0: fx = 0.0
    if x > X_MAX - HS and fx > 0: fx = 0.0
    if y < Y_MIN + HS and fy < 0: fy = 0.0
    if y > Y_MAX - HS and fy > 0: fy = 0.0
    heading = math.atan2(fy, fx)
    mag     = math.hypot(fx, fy)
    speed   = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))
    drive(heading, speed)
-    if step % 3 == 0:
+    if step_count % 3 == 0:
        emitter.send(f"sheep:{name}:{x:.4f}:{y:.4f}")
@@ -0,0 +1,78 @@
 """Lazy loader for the SB3 PPO policy used by the dog controller.
 Importing stable-baselines3 inside the Webots Python interpreter is only
 needed when ``HERDING_MODE=rl``; the Strömbom mode runs without it. This
 loader keeps SB3 out of the import path until you actually ask for the RL
 policy, so users without SB3 installed can still run the Strömbom
 baseline.
 The policy + VecNormalize statistics are saved together by
 ``training/train_ppo.py``:
    runs/<name>/best/best_model.zip     # SB3 PPO checkpoint
    runs/<name>/best/vecnormalize.pkl   # observation-normaliser stats
 Pass either the directory or the explicit zip path.
 """
 import os
 from pathlib import Path
 class PolicyHandle:
    """Pair a loaded PPO model with optional observation-normaliser stats.
    The controller passes one raw observation to ``predict`` and receives
    one action back; batching and normalisation are handled in here."""
    def __init__(self, model, vecnorm):
        # ``vecnorm`` may be None, in which case observations are fed raw.
        self.vecnorm = vecnorm
        self.model = model
    def predict(self, obs):
        """Return the deterministic action for a single observation."""
        import numpy as np
        # Model and normaliser both receive a float32 batch of shape (1, obs_dim).
        batch = np.asarray(obs, dtype=np.float32).reshape(1, -1)
        normaliser = self.vecnorm
        if normaliser is not None:
            batch = normaliser.normalize_obs(batch)
        action, _state = self.model.predict(batch, deterministic=True)
        # Strip the batch axis again before handing the action back.
        return action[0]
 def load(model_path: str, vecnorm_path: str | None = None) -> PolicyHandle:
    """Load a PPO model (and optional VecNormalize stats) from disk.
    ``model_path`` may be the .zip checkpoint itself or a directory holding
    ``best_model.zip``, ``final.zip`` or ``policy.zip`` (checked in that
    order). For a directory, a ``vecnormalize.pkl`` inside it is picked up
    automatically unless ``vecnorm_path`` is given explicitly.
    Raises ``FileNotFoundError`` when a directory contains none of the
    expected checkpoint names. A ``vecnorm_path`` that does not exist is
    reported with a ``RuntimeWarning`` and the handle is returned without
    a normaliser.
    """
    root = Path(model_path)
    if root.is_dir():
        zip_candidates = [root / "best_model.zip", root / "final.zip", root / "policy.zip"]
        zip_path = next((z for z in zip_candidates if z.exists()), None)
        if zip_path is None:
            raise FileNotFoundError(
                f"No PPO zip found in {root} (looked for best_model.zip, final.zip, policy.zip)"
            )
        if vecnorm_path is None:
            sibling_stats = root / "vecnormalize.pkl"
            if sibling_stats.exists():
                vecnorm_path = str(sibling_stats)
    else:
        zip_path = root
    # Imports deferred so the Strömbom path doesn't require SB3.
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import VecNormalize  # noqa: F401  (kept from the original import set)
    model = PPO.load(str(zip_path), device="auto")
    vecnorm = None
    if vecnorm_path:
        if os.path.exists(vecnorm_path):
            # VecNormalize.load needs a venv to attach to; only the stats are
            # needed at inference, so the pickled wrapper is read directly.
            # NOTE(security): pickle.load can execute arbitrary code — only
            # point this at stats files produced by your own training runs.
            import pickle
            with open(vecnorm_path, "rb") as f:
                vecnorm = pickle.load(f)
            vecnorm.training = False
            vecnorm.norm_reward = False
        else:
            # Previously this case was dropped silently, leaving the policy
            # to run on un-normalised observations with no hint as to why.
            import warnings
            warnings.warn(
                f"VecNormalize stats not found at {vecnorm_path!r}; "
                "observations will NOT be normalised",
                RuntimeWarning,
                stacklevel=2,
            )
    return PolicyHandle(model=model, vecnorm=vecnorm)
@@ -1,14 +1,182 @@
-"""
+"""Shepherd Dog controller (Webots).
 Shepherd Dog controller (Webots, manual keyboard control).
-WASD / arrow keys drive the robot.  +/- adjust speed in 10 % increments.
+Runs in one of two modes selected by the ``HERDING_MODE`` environment
-GPS position is broadcast every step on channel 1 so sheep controllers
+variable:
-can compute flee forces.  Ears wag continuously via sinusoidal position
+
-targets — purely cosmetic.
+    HERDING_MODE=rl        → load an SB3 PPO policy from
                             HERDING_POLICY_DIR (default
                             training/runs/latest/best) and use its
                             (vx, vy) action each step.
    HERDING_MODE=strombom  → use the analytic Strömbom collect/drive
                             heuristic. This is the fallback if the RL
                             policy can't be loaded (e.g. SB3 not
                             installed in the Webots Python env, or no
                             checkpoint yet).
 Both modes share the same low-level differential-drive controller
 (``herding.diffdrive.velocity_to_wheels`` + clamped forward speed), so
 switching modes does not retune the actuation layer.
 A safety supervisor enforces the "dog stays out of the pen" invariant:
 if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is
 overridden with a north-driving correction. This is a hard guarantee
 the policy cannot escape.
 """
 import math
-from controller import Robot, Keyboard
+import os
 import sys
 # --- Make the shared herding/ package importable from this controller dir ---
 _HERE = os.path.dirname(os.path.abspath(__file__))
 _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
 from controller import Robot
 from herding.diffdrive import velocity_to_wheels
 from herding.geometry import (
    DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA,
    DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS,
    PEN_ENTRY,
 )
 from herding.obs import build_obs
 from herding.sequential import compute_action_debug as sequential_action_debug
 from herding.strombom import compute_action_debug as strombom_action_debug
 # ---------------------------------------------------------------------------
 # Mode selection
 # ---------------------------------------------------------------------------
 def _load_runtime_config():
    """Parse ``herding_runtime.cfg`` from the project root into a dict.
    Webots strips HERDING_* env vars in some configurations, so the
    launcher writes a small key=value file that this function reads as a
    fallback source of settings. Blank lines, ``#`` comments and lines
    without ``=`` are skipped; keys are upper-cased and both sides are
    stripped of surrounding whitespace.
    Returns an empty dict when the file is missing or cannot be read.
    """
    cfg_path = os.path.join(_PROJECT_ROOT, "herding_runtime.cfg")
    settings = {}
    try:
        with open(cfg_path) as fh:
            for raw in fh:
                entry = raw.strip()
                if entry and not entry.startswith("#") and "=" in entry:
                    # Split on the first '=' only so values may contain '='.
                    key, value = entry.split("=", 1)
                    settings[key.strip().upper()] = value.strip()
    except OSError:
        # Covers a missing file as well as read errors; partial results
        # gathered before the error are discarded.
        return {}
    return settings
 # Settings read once at import time from the optional runtime config file.
 _runtime_cfg = _load_runtime_config()
 # Mode precedence: HERDING_MODE env var, then the config file, then "rl".
 # The value is lower-cased so later comparisons are case-insensitive.
 MODE = (os.environ.get("HERDING_MODE")
        or _runtime_cfg.get("HERDING_MODE")
        or "rl").lower()
 def _resolve_policy_dir() -> str:
    """Pick the directory the trained policy should be loaded from.
    Search order:
      1. ``HERDING_POLICY_DIR`` from the environment, else from the runtime
         config file — used only when it names an existing directory
      2. training/runs/bc_pretrained/  (BC-only checkpoint)
      3. training/runs/bc_ppo/best/    (PPO fine-tuned best)
      4. training/runs/latest/best/    (legacy default)
    If none of these exist, the configured override (when set) or the
    first default is returned anyway so the caller's error message names
    a concrete path.
    """
    override = (os.environ.get("HERDING_POLICY_DIR")
                or _runtime_cfg.get("HERDING_POLICY_DIR"))
    if override and os.path.isdir(override):
        return override
    runs_root = os.path.join(_PROJECT_ROOT, "training", "runs")
    fallbacks = [
        os.path.join(runs_root, "bc_pretrained"),
        os.path.join(runs_root, "bc_ppo", "best"),
        os.path.join(runs_root, "latest", "best"),
    ]
    existing = next((d for d in fallbacks if os.path.isdir(d)), None)
    if existing is not None:
        return existing
    # Nothing found on disk — hand back something informative regardless.
    return override or fallbacks[0]
 # Resolve the policy directory once at import time; the handle stays None
 # unless RL mode is requested and the policy loads successfully.
 POLICY_DIR = _resolve_policy_dir()
 policy_handle = None
 if MODE == "rl":
    # Diagnostics first, so a failed load can be debugged from the console.
    # Note this line reports only the environment variable, not a value
    # that may have come from the runtime config file.
    print(f"[dog] HERDING_MODE={MODE}  HERDING_POLICY_DIR(env)="
          f"{os.environ.get('HERDING_POLICY_DIR', '<unset>')}")
    print(f"[dog] resolved POLICY_DIR={POLICY_DIR}  exists="
          f"{os.path.isdir(POLICY_DIR)}")
    if os.path.isdir(POLICY_DIR):
        try:
            entries = sorted(os.listdir(POLICY_DIR))
        except OSError:
            # Listing is informational only; an unreadable dir prints as empty.
            entries = []
        print(f"[dog] dir contents: {entries}")
    try:
        # Imported here so non-RL modes never touch the loader module.
        from policy_loader import load as _load_policy
        policy_handle = _load_policy(POLICY_DIR)
        print(f"[dog] RL policy loaded from {POLICY_DIR}")
    except Exception as exc:
        # Deliberately broad: any import or load failure downgrades the
        # controller to the Strömbom heuristic instead of aborting it.
        print(f"[dog] RL policy load failed ({exc!r}); falling back to Strömbom.")
        MODE = "strombom"
 # Validate the mode after the RL attempt so typos also end up on Strömbom.
 if MODE not in ("rl", "strombom", "sequential"):
    print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
    MODE = "strombom"
 print(f"[dog] running in mode={MODE}")
 # ---------------------------------------------------------------------------
 # Action smoothing + safety supervisor
 # ---------------------------------------------------------------------------
 ACTION_SMOOTH = 0.35
 prev_action = (0.0, 0.0)
 def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
    """Override southward actions near the south barrier.
    Past ``DOG_SOUTH_LIMIT`` any southward command (``vy < 0``) is replaced
    by a pure northward action ``(0.0, 1.0)``. Within 0.5 of the limit, a
    southward command stronger than 0.2 has ``vx`` halved and ``vy`` raised
    by 0.5 (floored at zero). Everything else passes through unchanged.
    ``dog_x`` is accepted for interface symmetry but not consulted.
    """
    heading_south = vy < 0.0
    if heading_south and dog_y < DOG_SOUTH_LIMIT:
        return (0.0, 1.0)
    in_buffer_zone = dog_y < DOG_SOUTH_LIMIT + 0.5
    if in_buffer_zone and vy < -0.2:
        softened_vy = max(0.0, vy + 0.5)
        return (vx * 0.5, softened_vy)
    return (vx, vy)
 # ---------------------------------------------------------------------------
 # Driving
 # ---------------------------------------------------------------------------
 def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float):
    """Turn a planar velocity command into wheel speeds and apply them.
    Commands with magnitude below 1e-3 stop both motors. Otherwise the
    current heading is read from the compass and the wheel speeds come
    from ``velocity_to_wheels``, capped by ``motor_max``.
    """
    stationary = math.hypot(vx, vy) < 1e-3
    if stationary:
        wheel_left, wheel_right = 0.0, 0.0
    else:
        north = compass.getValues()
        heading = math.atan2(north[0], north[1])
        wheel_left, wheel_right = velocity_to_wheels(
            vx, vy, heading,
            max_linear=DOG_MAX_LINEAR,
            wheel_radius=DOG_WHEEL_RADIUS,
            max_wheel_omega=motor_max,
            k_turn=4.0,
        )
    left_motor.setVelocity(wheel_left)
    right_motor.setVelocity(wheel_right)
 # ---------------------------------------------------------------------------
 # Webots devices
 # ---------------------------------------------------------------------------
 robot = Robot()
 timestep = int(robot.getBasicTimeStep())
@@ -19,70 +187,97 @@ left_motor.setPosition(float("inf"))
 right_motor.setPosition(float("inf"))
 left_motor.setVelocity(0.0)
 right_motor.setVelocity(0.0)
-
+MOTOR_MAX = min(left_motor.getMaxVelocity(), DOG_MAX_WHEEL_OMEGA)
 lidar = robot.getDevice("lidar")
 lidar.enable(timestep)
 lidar.enablePointCloud()
 gps = robot.getDevice("gps");           gps.enable(timestep)
 compass = robot.getDevice("compass");   compass.enable(timestep)
 emitter = robot.getDevice("emitter")
 receiver = robot.getDevice("receiver"); receiver.enable(timestep)
 emitter = robot.getDevice("emitter")
 # Cosmetic ear motors — ignored by control logic but keep them animated.
 left_ear = robot.getDevice("left ear motor")
 right_ear = robot.getDevice("right ear motor")
 left_ear.setPosition(float("inf"))
 right_ear.setPosition(float("inf"))
 left_ear.setVelocity(0.0)
 right_ear.setVelocity(0.0)
 keyboard = robot.getKeyboard()
 keyboard.enable(timestep)
 MOTOR_MAX   = left_motor.getMaxVelocity()
 speed_level = 0.5   # fraction of MOTOR_MAX; adjusted by +/-
 EAR_AMPLITUDE = 0.35   # rad, peak ear deflection
 EAR_RATE      = 8.0    # rad/s, how fast the ears are driven
 ear_phase = 0.0
 EAR_AMPLITUDE = 0.35
 EAR_RATE = 8.0
 # ---------------------------------------------------------------------------
 # Main loop
 # ---------------------------------------------------------------------------
 # {name: (x, y)} — kept across all sheep ever heard from. Sheep that drift
 # into the pen are tracked by ``penned_set`` so observations and Strömbom
 # agree on which ones still need herding.
 sheep_positions: dict = {}
 penned_set: set = set()
 step_count = 0
 from herding.geometry import is_penned_position
 while robot.step(timestep) != -1:
-    speed = MOTOR_MAX * speed_level
+    step_count += 1
    turn  = speed * 0.6   # differential turn radius
-    left_vel  = 0.0
+    while receiver.getQueueLength() > 0:
-    right_vel = 0.0
+        msg = receiver.getString()
-    key = keyboard.getKey()
+        receiver.nextPacket()
-    while key > 0:
+        parts = msg.split(":")
-        if   key in (ord('W'), Keyboard.UP):
+        if len(parts) == 4 and parts[0] == "sheep":
-            left_vel  = speed
+            try:
-            right_vel = speed
+                x, y = float(parts[2]), float(parts[3])
-        elif key in (ord('S'), Keyboard.DOWN):
+            except ValueError:
-            left_vel  = -speed
+                continue
-            right_vel = -speed
+            sheep_positions[parts[1]] = (x, y)
-        elif key in (ord('A'), Keyboard.LEFT):
+            if parts[1] not in penned_set and is_penned_position(x, y):
-            left_vel  = -turn
+                penned_set.add(parts[1])
            right_vel =  turn
        elif key in (ord('D'), Keyboard.RIGHT):
            left_vel  =  turn
            right_vel = -turn
        elif key in (ord('+'), ord('=')):
            speed_level = min(1.0, speed_level + 0.1)
            print(f"Speed: {speed_level:.0%} ({MOTOR_MAX * speed_level:.1f} rad/s)")
        elif key in (ord('-'), ord('_')):
            speed_level = max(0.1, speed_level - 0.1)
            print(f"Speed: {speed_level:.0%} ({MOTOR_MAX * speed_level:.1f} rad/s)")
        key = keyboard.getKey()
    left_motor.setVelocity(left_vel)
    right_motor.setVelocity(right_vel)
    pos = gps.getValues()
-    emitter.send(f"dog:{pos[0]}:{pos[1]}")
+    dog_xy = (pos[0], pos[1])
    n = compass.getValues()
    dog_heading = math.atan2(n[0], n[1])
    # ---- Action selection ----
    if MODE == "rl" and policy_handle is not None:
        sheep_xy_list = list(sheep_positions.values())
        sheep_names = list(sheep_positions.keys())
        sheep_penned_list = [s in penned_set for s in sheep_names]
        obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list)
        action = policy_handle.predict(obs)
        vx, vy = float(action[0]), float(action[1])
    elif MODE == "sequential":
        vx, vy, _mode_str, _dbg = sequential_action_debug(
            dog_xy, sheep_positions, PEN_ENTRY,
        )
    else:
        # Strömbom (canonical baseline).
        vx, vy, _mode_str, _dbg = strombom_action_debug(
            dog_xy, sheep_positions, PEN_ENTRY,
        )
    # EMA smoothing — reduces oscillation from policy or Strömbom flips.
    vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx
    vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy
    # Safety: dog must never enter the pen.
    vx, vy = safety_clamp(vx, vy, dog_xy[0], dog_xy[1])
    prev_action = (vx, vy)
    drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX)
    emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}")
    # Cosmetic ear wiggle — purely visual.
    ear_phase += 0.12
    ear_pos = EAR_AMPLITUDE * math.sin(ear_phase)
    left_ear.setVelocity(EAR_RATE)
    right_ear.setVelocity(EAR_RATE)
    left_ear.setPosition(ear_pos)
    right_ear.setPosition(-ear_pos)
    if step_count % 200 == 0:
        n_active = sum(1 for s in sheep_positions if s not in penned_set)
        print(f"[dog mode={MODE}] step={step_count} known={len(sheep_positions)} "
              f"penned={len(penned_set)} active={n_active} action=({vx:+.2f}, {vy:+.2f})")
@@ -1,153 +0,0 @@
 """
 Render Webots-side debug trajectory from debug.csv.
 The shepherd_dog_rl controller writes per-step state to debug.csv when
 DOG_DEBUG=1. This script reads it and produces:
  trajectory.png   — dog path + sheep paths overlaid on the field
  obs.png          — raw vs VecNormalize-normalised observation values over time
  actions.png      — vx, vy time series
 Run:
    python plot_debug.py                    # uses debug.csv next to this file
    python plot_debug.py --csv path/to.csv --out-dir somewhere/
 """
 import argparse
 import csv
 import os
 import sys
 import matplotlib
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 import matplotlib.patches as mpatches
 import numpy as np
 def load_csv(path):
    """Read the debug CSV at ``path`` and return its rows as a list of dicts.
    Each row maps the header names to string values. The process exits via
    ``sys.exit`` with an ``empty CSV`` message when the file has no data rows.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; without it, universal-newline translation
    # rewrites "\r\n" sequences that appear inside quoted fields.
    with open(path, newline="") as f:
        rows = list(csv.DictReader(f))
    if not rows:
        sys.exit(f"empty CSV: {path}")
    return rows
 def parse_floats(s):
    """Split a ';'-separated string into floats, skipping empty fields."""
    values = []
    for token in s.split(";"):
        if token:
            values.append(float(token))
    return values
 def plot_trajectory(rows, out_path):
    """Draw the dog path and per-sheep paths over the field and save a PNG.
    ``rows`` are the dicts from ``load_csv``; this function reads the
    ``dog_x``, ``dog_y``, ``sheep_xs``, ``sheep_ys`` and ``n_penned`` columns.
    The figure is written to ``out_path`` and then closed.
    """
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.set_xlim(-16, 16); ax.set_ylim(-16, 16); ax.set_aspect("equal")
    ax.set_facecolor("#dcedc8")
    # Field outline: 30 x 30 square with its corner at (-15, -15).
    ax.add_patch(mpatches.Rectangle((-15, -15), 30, 30,
                 fill=False, edgecolor="#795548", lw=2))
    # Pen: 3 x 7 rectangle with its corner at (10, -15), labelled at its centre.
    ax.add_patch(mpatches.Rectangle((10, -15), 3, 7,
                 facecolor="#ffe082", edgecolor="#795548", lw=2))
    ax.text(11.5, -11.5, "pen", ha="center", va="center", fontsize=8)
    dog_x = [float(r["dog_x"]) for r in rows]
    dog_y = [float(r["dog_y"]) for r in rows]
    # Dog track, with a square at the first sample and a diamond at the last.
    ax.plot(dog_x, dog_y, color="#4e342e", lw=1.5, alpha=0.7, label="dog")
    ax.plot(dog_x[0], dog_y[0], "s", color="#4e342e", ms=10)
    ax.plot(dog_x[-1], dog_y[-1], "D", color="#4e342e", ms=10)
    # Sheep — re-shape into per-sheep tracks
    sx_all = [parse_floats(r["sheep_xs"]) for r in rows]
    sy_all = [parse_floats(r["sheep_ys"]) for r in rows]
    if sx_all and sx_all[-1]:
        # The sheep count is taken from the final row only.
        n_sheep = len(sx_all[-1])
        palette = ["#e41a1c","#377eb8","#4daf4a","#984ea3","#ff7f00",
                   "#a65628","#f781bf","#999999","#66c2a5","#fc8d62"]
        for i in range(n_sheep):
            # Rows with fewer than i+1 entries contribute no sample for sheep i.
            xs = [r[i] if i < len(r) else None for r in sx_all]
            ys = [r[i] if i < len(r) else None for r in sy_all]
            # NOTE(review): xs and ys are filtered independently, so they only
            # stay aligned if sheep_xs and sheep_ys have equal lengths in
            # every row — confirm against the CSV writer.
            xs = [x for x in xs if x is not None]
            ys = [y for y in ys if y is not None]
            if xs:
                # Colours cycle once there are more sheep than palette entries.
                c = palette[i % len(palette)]
                ax.plot(xs, ys, color=c, lw=0.8, alpha=0.6, label=f"sheep {i+1}")
                ax.plot(xs[0], ys[0], "o", color=c, ms=6)
                ax.plot(xs[-1], ys[-1], "*", color=c, ms=10)
    # The title reports the penned count recorded in the last row.
    n_in_pen = int(rows[-1]["n_penned"])
    ax.set_title(f"Webots trajectory  {len(rows)} steps  penned={n_in_pen}",
                 fontsize=12)
    ax.legend(loc="upper left", fontsize=7, ncol=2)
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
 def plot_actions(rows, out_path):
    """Plot the ``vx``, ``vy`` and action-magnitude time series and save a PNG.
    ``rows`` are the dicts from ``load_csv``; the ``vx`` and ``vy`` columns
    are read. The x-axis is the row index. The figure is written to
    ``out_path`` and then closed.
    """
    t = np.arange(len(rows))
    vx = np.array([float(r["vx"]) for r in rows])
    vy = np.array([float(r["vy"]) for r in rows])
    # Euclidean norm of the (vx, vy) action at each step.
    mag = np.sqrt(vx ** 2 + vy ** 2)
    fig, axes = plt.subplots(3, 1, figsize=(12, 7), sharex=True)
    axes[0].plot(t, vx, color="tab:red", lw=0.8); axes[0].set_ylabel("vx")
    axes[0].axhline(0, color="black", lw=0.4); axes[0].set_ylim(-1.1, 1.1)
    axes[1].plot(t, vy, color="tab:blue", lw=0.8); axes[1].set_ylabel("vy")
    axes[1].axhline(0, color="black", lw=0.4); axes[1].set_ylim(-1.1, 1.1)
    axes[2].plot(t, mag, color="tab:purple", lw=0.8); axes[2].set_ylabel("||action||")
    # Reference lines at sqrt(2) (labelled as saturation) and at 1.0.
    axes[2].axhline(np.sqrt(2), color="orange", ls="--", lw=1, label="saturated √2")
    axes[2].axhline(1.0, color="gray", ls="--", lw=1)
    axes[2].set_xlabel("step"); axes[2].legend(fontsize=8)
    fig.suptitle("Webots action time series")
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
 def plot_obs(rows, out_path):
    """Plot each observation dimension over time, raw vs normalised.
    ``rows`` are the dicts from ``load_csv``; the ``norm_obs`` and
    ``raw_obs`` columns are parsed with ``parse_floats``. One subplot is
    drawn per dimension. Returns without writing a file when there is no
    normalised data; otherwise the figure is saved to ``out_path``.
    """
    norm = np.array([parse_floats(r["norm_obs"]) for r in rows])
    raw  = np.array([parse_floats(r["raw_obs"])  for r in rows])
    if norm.size == 0:
        return
    # NOTE(review): norm.shape[1] assumes every row has the same number of
    # values (a 2-D array), and raw is indexed as if it had norm's shape —
    # confirm against the CSV writer.
    n_dims = norm.shape[1]
    # NOTE(review): only 16 labels are defined; with more than 16 dimensions
    # labels[i] below would raise IndexError.
    labels = [
        "dog_x", "dog_y", "com-dog_x", "com-dog_y",
        "far1-com_x", "far1-com_y", "far2-com_x", "far2-com_y",
        "far3-com_x", "far3-com_y", "pen-com_x", "pen-com_y",
        "pen-far1_x", "pen-far1_y", "radius", "frac_active",
    ][:n_dims]
    t = np.arange(norm.shape[0])
    fig, axes = plt.subplots(n_dims, 1, figsize=(11, 1.0 * n_dims), sharex=True)
    # With a single subplot, wrap it in a list so the indexing below still works.
    if n_dims == 1: axes = [axes]
    for i in range(n_dims):
        axes[i].plot(t, raw[:, i], color="tab:gray", lw=0.6, alpha=0.6, label="raw")
        axes[i].plot(t, norm[:, i], color="tab:red", lw=0.8, label="normalised")
        axes[i].set_ylabel(labels[i], fontsize=8)
        axes[i].tick_params(labelsize=7)
        # One legend on the top subplot is enough for the whole figure.
        if i == 0:
            axes[i].legend(fontsize=7, loc="upper right")
    axes[-1].set_xlabel("step")
    fig.suptitle("Observation values over time (raw vs VecNormalize-normalised)")
    plt.tight_layout()
    fig.savefig(out_path, dpi=110)
    plt.close(fig)
 def main():
    """Command-line entry point: load the debug CSV and write three plots.
    ``--csv`` defaults to ``debug.csv`` and ``--out-dir`` to ``debug_out``,
    both next to this script. The output directory is created if needed and
    receives ``trajectory.png``, ``actions.png`` and ``obs.png``.
    """
    p = argparse.ArgumentParser()
    here = os.path.dirname(os.path.abspath(__file__))
    p.add_argument("--csv", default=os.path.join(here, "debug.csv"))
    p.add_argument("--out-dir", default=os.path.join(here, "debug_out"))
    args = p.parse_args()
    # load_csv exits the process itself when the file has no data rows.
    rows = load_csv(args.csv)
    os.makedirs(args.out_dir, exist_ok=True)
    print(f"loaded {len(rows)} rows from {args.csv}")
    plot_trajectory(rows, os.path.join(args.out_dir, "trajectory.png"))
    plot_actions(rows,    os.path.join(args.out_dir, "actions.png"))
    # plot_obs may return without writing if there is no normalised data.
    plot_obs(rows,        os.path.join(args.out_dir, "obs.png"))
    print(f"saved trajectory.png + actions.png + obs.png to {args.out_dir}/")
 if __name__ == "__main__":
    main()
@@ -1,285 +0,0 @@
 """
 Shepherd Dog RL controller — runs a trained SB3 PPO policy inside Webots.
 Setup
 -----
 1. Copy your trained files into this directory:
       controllers/shepherd_dog_rl/final_model.zip
       controllers/shepherd_dog_rl/vecnorm.pkl
 2. In field.wbt, set the ShepherdDog robot's controller field to
   "shepherd_dog_rl".  You can do this in the Webots GUI:
       click the robot → Controller → shepherd_dog_rl
 3. Optional: set controllerArgs to ["5"] (number of sheep) if it differs
   from the default of 3.
 The controller reads GPS (dog position) and Receiver (sheep broadcasts),
 builds the same 18-dim flock observation the training env used, normalises
 it with the saved VecNormalize stats, and converts the (vx, vy) policy
 output into differential wheel speeds.
 Debug logging
 -------------
 Logging is controlled by the DEBUG_ENABLED constant below (no env var is
 read): when it is True a per-step CSV (dog pos, sheep positions, raw obs,
 normalised obs, action) is written to debug.csv alongside this script. Use
 plot_debug.py to render trajectories from it.
 """
 import sys
 import os
 import math
 import struct
 import numpy as np
 # ── make training code importable ───────────────────────────────────────────
 _HERE = os.path.dirname(os.path.abspath(__file__))
 _TRAINING = os.path.join(_HERE, "..", "..", "training")
 sys.path.insert(0, _TRAINING)
 from controller import Robot
 from stable_baselines3 import PPO
 from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
 from herding_env import HerdingEnv
 # ── constants (must match herding_env.py) ───────────────────────────────────
 FIELD      = 15.0
 PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
 PEN_X      = (10.0, 13.0)
 PEN_Y      = (-15.0, -8.0)
 DOG_SPEED  = 2.5         # m/s
 WHEEL_R    = 0.038       # wheel radius (metres) — from ShepherdDog.proto
 K_TURN     = 4.0         # heading-error gain (rad/s per rad)
 EAR_AMPLITUDE = 0.35
 EAR_RATE      = 8.0
 # ── model paths ─────────────────────────────────────────────────────────────
 MODEL_PATH   = os.path.join(_HERE, "final_model.zip")
 VECNORM_PATH = os.path.join(_HERE, "vecnorm.pkl")
 DEBUG_CSV    = os.path.join(_HERE, "debug.csv")
 DEBUG_ENABLED = True   # set False to disable debug.csv logging
 # ── action smoothing ─────────────────────────────────────────────────────────
 # EMA on policy output to suppress the rapid oscillation (vx/vy flipping
 # between -1 and +1 every step) that stalls the physical dog.  0 = no
 # smoothing (raw policy), 1 = frozen.  0.3 keeps ~30% of previous action.
 ACTION_SMOOTH = 0.3
 prev_action   = np.zeros(2, dtype=np.float32)
 def norm_angle(a: float) -> float:
    """Wrap an angle in radians into the interval [-pi, pi]."""
    full_turn = 2 * math.pi
    # Step by whole turns toward zero until the magnitude fits.
    while abs(a) > math.pi:
        a -= math.copysign(full_turn, a)
    return a
 def in_pen(x: float, y: float) -> bool:
    """True when (x, y) lies strictly inside the PEN_X x PEN_Y rectangle."""
    x_lo, x_hi = PEN_X[0], PEN_X[1]
    y_lo, y_hi = PEN_Y[0], PEN_Y[1]
    if not (x_lo < x < x_hi):
        return False
    return y_lo < y < y_hi
 def build_obs(dog_pos: np.ndarray,
               sheep_dict: dict,
               n_sheep: int,
               dog_heading: float = 0.0) -> np.ndarray:
    """
    Assemble the 18-element float32 flock observation.
    Intended to mirror HerdingEnv._obs() (that method is not visible here).
    sheep_dict: {name: (x, y)} for ALL known sheep (penned or not).
    n_sheep: nominal flock size, used to normalise the active fraction.
    dog_heading: dog's current world-frame heading in radians.
    Sheep inside the pen rectangle are excluded from the flock statistics;
    with no active sheep, the centre of mass and the three "far" points all
    collapse to PEN_CENTER and the radius is 0.
    """
    span = 2 * FIELD
    def scaled_offset(tip, tail):
        # Vector tail -> tip with each component divided by the field span.
        return (tip[0] - tail[0]) / span, (tip[1] - tail[1]) / span
    free = np.array(
        [xy for xy in sheep_dict.values() if not in_pen(*xy)],
        dtype=np.float32
    )
    n_free = len(free)
    if n_free == 0:
        com = PEN_CENTER.copy()
        radius = 0.0
        far1 = far2 = far3 = PEN_CENTER.copy()
    else:
        com = free.mean(axis=0)
        spread = np.linalg.norm(free - com, axis=1)
        # Indices of active sheep, furthest from the centre of mass first.
        order = np.argsort(spread)[::-1]
        radius = float(spread[order[0]])
        # With fewer than three active sheep the missing slots use the CoM.
        far1, far2, far3 = (
            free[order[rank]] if len(order) > rank else com
            for rank in range(3)
        )
    features = [dog_pos[0] / FIELD, dog_pos[1] / FIELD]
    features.extend(scaled_offset(com, dog_pos))
    for far in (far1, far2, far3):
        features.extend(scaled_offset(far, com))
    features.extend(scaled_offset(PEN_CENTER, com))
    features.extend(scaled_offset(PEN_CENTER, far1))
    features.append(radius / span)
    features.append(n_free / max(n_sheep, 1))
    features.append(math.cos(dog_heading))
    features.append(math.sin(dog_heading))
    return np.array(features, dtype=np.float32)
 # ── Webots setup ─────────────────────────────────────────────────────────────
 # Module-level initialisation: acquires the robot devices, loads the policy
 # and normalisation statistics, and opens the optional debug CSV.
 robot    = Robot()
 timestep = int(robot.getBasicTimeStep())
 # Drive motors
 left_motor  = robot.getDevice("left wheel motor")
 right_motor = robot.getDevice("right wheel motor")
 # Infinite target position: the motors are then driven via setVelocity()
 # (Webots velocity-control convention).
 left_motor.setPosition(float("inf"))
 right_motor.setPosition(float("inf"))
 left_motor.setVelocity(0.0)
 right_motor.setVelocity(0.0)
 # Wheel-speed clamp used by drive().
 MOTOR_MAX = left_motor.getMaxVelocity()
 # Sensors
 gps      = robot.getDevice("gps");      gps.enable(timestep)
 compass  = robot.getDevice("compass");  compass.enable(timestep)
 receiver = robot.getDevice("receiver"); receiver.enable(timestep)
 emitter  = robot.getDevice("emitter")
 # Cosmetic
 left_ear  = robot.getDevice("left ear motor")
 right_ear = robot.getDevice("right ear motor")
 left_ear.setPosition(float("inf"));  right_ear.setPosition(float("inf"))
 left_ear.setVelocity(0.0);           right_ear.setVelocity(0.0)
 ear_phase = 0.0
 # Number of sheep: first controller argument, falling back to 3 when the
 # argument is missing or not an integer.
 try:
    n_sheep = int(sys.argv[1])
 except (IndexError, ValueError):
    n_sheep = 3
 # ── Load model ───────────────────────────────────────────────────────────────
 print(f"[RL dog] Loading model from {MODEL_PATH}")
 print(f"[RL dog] Loading vecnorm from {VECNORM_PATH}")
 # VecNormalize.load needs a vec env to wrap; the dummy env is only used for
 # that and is never stepped in this script.
 dummy_env = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep)])
 vecnorm   = VecNormalize.load(VECNORM_PATH, dummy_env)
 # Inference only: mark the wrapper as not training and disable reward
 # normalisation.
 vecnorm.training    = False
 vecnorm.norm_reward = False
 model = PPO.load(MODEL_PATH, device="cpu")
 print(f"[RL dog] Model loaded — running with n_sheep={n_sheep}")
 # ── Runtime state ─────────────────────────────────────────────────────────────
 sheep_positions: dict = {}   # {name: (x, y)} — updated every step from receiver
 step_count = 0
 # Debug CSV — one row per step, written when DEBUG_ENABLED is True
 # (a module constant; no environment variable is consulted here).
 debug_file = None
 if DEBUG_ENABLED:
    import csv
    # NOTE(review): the file is opened in "w" mode (overwriting any previous
    # log) and is never explicitly closed in the visible code.
    debug_file = open(DEBUG_CSV, "w", newline="")
    debug_writer = csv.writer(debug_file)
    debug_writer.writerow([
        "step", "dog_x", "dog_y", "heading",
        "sheep_xs", "sheep_ys", "n_active", "n_penned",
        "raw_obs", "norm_obs", "vx", "vy",
    ])
    print(f"[RL dog] DEBUG logging to {DEBUG_CSV}")
 def bearing() -> float:
    """Return the robot's current world-frame heading in radians.

    Derived from the first two components of the compass reading.
    """
    reading = compass.getValues()
    first, second = reading[0], reading[1]
    return math.atan2(first, second)
 def drive(action_vx: float, action_vy: float) -> None:
    """Turn a (vx, vy) policy action into left/right wheel velocities.

    The action magnitude scales DOG_SPEED; its direction is the target
    heading. Below 0.05 m/s the wheels are stopped. Otherwise forward speed
    is scaled by cos(heading error), floored at zero so the dog turns in
    place when facing away, and a proportional term on the heading error
    sets the wheel differential. Both wheels are clamped to MOTOR_MAX.
    """
    speed_ms = math.sqrt(action_vx ** 2 + action_vy ** 2) * DOG_SPEED
    if speed_ms < 0.05:
        left_cmd = right_cmd = 0.0
    else:
        wanted = math.atan2(action_vy, action_vx)
        err = norm_angle(wanted - bearing())
        # Linear speed (m/s) converted to wheel angular speed (rad/s).
        fwd_rad = speed_ms * max(0.0, math.cos(err)) / WHEEL_R
        turn = K_TURN * err    # rad/s correction
        left_cmd = max(-MOTOR_MAX, min(MOTOR_MAX, fwd_rad - turn))
        right_cmd = max(-MOTOR_MAX, min(MOTOR_MAX, fwd_rad + turn))
    left_motor.setVelocity(left_cmd)
    right_motor.setVelocity(right_cmd)
 # ── Main loop ─────────────────────────────────────────────────────────────────
 # One iteration per simulation step: read sensors, build the observation,
 # query the policy, drive the wheels, broadcast the dog position, and
 # optionally log. The loop ends when robot.step() returns -1.
 while robot.step(timestep) != -1:
    step_count += 1
    # 1. Drain receiver — update sheep position table
    while receiver.getQueueLength() > 0:
        try:
            msg = receiver.getString()
            # Expected packet format: "sheep:<name>:<x>:<y>"
            parts = msg.split(":")
            if parts[0] == "sheep" and len(parts) == 4:
                sheep_positions[parts[1]] = (float(parts[2]), float(parts[3]))
        except Exception:
            # Best-effort: a packet that fails to parse is dropped silently.
            pass
        receiver.nextPacket()
    # 2. Dog GPS
    gps_vals = gps.getValues()
    dog_pos  = np.array([gps_vals[0], gps_vals[1]], dtype=np.float32)
    # 3. Build and normalise observation (heading from compass)
    raw_obs  = build_obs(dog_pos, sheep_positions, n_sheep,
                         dog_heading=bearing())
    obs_norm = vecnorm.normalize_obs(raw_obs[np.newaxis])  # (1, 18) — batch axis added
    # 4. Policy inference + smoothing
    action, _ = model.predict(obs_norm, deterministic=True)
    raw_a = np.array([float(action[0][0]), float(action[0][1])], dtype=np.float32)
    if ACTION_SMOOTH > 0:
        # Exponential moving average; prev_action is updated in place so the
        # filter state carries over to the next step.
        smoothed = ACTION_SMOOTH * prev_action + (1.0 - ACTION_SMOOTH) * raw_a
        prev_action[:] = smoothed
        vx, vy = float(smoothed[0]), float(smoothed[1])
    else:
        vx, vy = float(raw_a[0]), float(raw_a[1])
    # 5. Drive
    drive(vx, vy)
    # 6. Broadcast dog position so sheep can compute flee forces
    emitter.send(f"dog:{dog_pos[0]:.4f}:{dog_pos[1]:.4f}")
    # 7. Ear animation — mirrored sinusoidal targets for the two ears
    ear_phase += 0.12
    ep = EAR_AMPLITUDE * math.sin(ear_phase)
    left_ear.setVelocity(EAR_RATE);  right_ear.setVelocity(EAR_RATE)
    left_ear.setPosition( ep);        right_ear.setPosition(-ep)
    # Periodic status (every 100 steps)
    if step_count % 100 == 0:
        n_in_pen = sum(1 for x, y in sheep_positions.values() if in_pen(x, y))
        print(f"[RL dog] step={step_count}  known_sheep={len(sheep_positions)}"
              f"  penned={n_in_pen}/{n_sheep}  dog=({dog_pos[0]:.2f},{dog_pos[1]:.2f})"
              f"  action=({vx:.2f}, {vy:.2f})")
    # Debug CSV row — list-valued columns are ";"-joined inside one cell
    if debug_file is not None:
        n_active = sum(1 for x, y in sheep_positions.values() if not in_pen(x, y))
        n_in_pen = len(sheep_positions) - n_active
        debug_writer.writerow([
            step_count, f"{dog_pos[0]:.4f}", f"{dog_pos[1]:.4f}",
            f"{bearing():.4f}",
            ";".join(f"{v[0]:.3f}" for v in sheep_positions.values()),
            ";".join(f"{v[1]:.3f}" for v in sheep_positions.values()),
            n_active, n_in_pen,
            ";".join(f"{x:.4f}" for x in raw_obs),
            ";".join(f"{x:.4f}" for x in obs_norm[0]),
            f"{vx:.4f}", f"{vy:.4f}",
        ])
        # Flush only every 200 steps, so the newest rows may still be
        # buffered if the controller is stopped between flushes.
        if step_count % 200 == 0:
            debug_file.flush()
@@ -6,28 +6,28 @@
 - Nelson Neto <up202108117@up.pt>
 ## (i) Title and General objectives
-**RL-Based Autonomous Shepherd Robot for Livestock Herding**
+**Autonomous Shepherd Robot for Livestock Herding (Strömbom)**
 - Implement effective herding behaviors through proximity and movement strategies
 - Build a 3D environment with realistic robot dynamics and LIDAR-based perception
- Develop a mobile robot capable of autonomously guiding a flock of sheep into a designated target area using Reinforcement Learning
+- Develop a mobile robot capable of autonomously guiding a flock of sheep into a designated target area using the Strömbom heuristic approach
 # Group G25 - (ii) Intermediate Goals
 ## Intermediate goals
 - Set up the Webots simulation environment with an open field and target zone
- Implement lightweight Gymnasium-based 2D herding environment
+- Implement lightweight 2D herding environment for algorithm evaluation
 - Design a Sheep and Dog robot
- Implement a sheep flocking model for fast RL iteration
+- Implement a sheep flocking model for fast Strömbom iteration
 - Validate LiDAR sensor feedback for sheep detection and distance estimation
 # Group G25 - Course Project (Final) Goals
 ## (iii) Main goals
- State-of-the-art survey on shepherding algorithms and multi-agent RL herding
+- State-of-the-art survey on shepherding algorithms with focus on Strömbom herding
- Train the robot using PPO to successfully herd a single sheep into the goal
+- Implement and tune Strömbom controller to successfully herd a single sheep into the goal
 - Achieve fully autonomous herding of multiple sheep and a full flock into the target area
 - Optimize robot trajectory to minimize the time required to group the flock
 - Ensure zero collisions between the robot and the sheep during the task
@@ -35,7 +35,7 @@
 - Article, demo video, and final presentation
 ## (iv) Extra Merit
- Curriculum Learning (scaling from 1 sheep to a flock)
+- Progressive evaluation (scaling from 1 sheep to a flock)
 - Comparison of performance between Differential Drive and Mecanum wheels
 - Robustness testing under sensor noise or varying sheep speeds, configurations and parameters
 - Multi-shepherd cooperative mode: 2 dogs learn role specialization (collector vs. driver)
@@ -46,11 +46,10 @@
 ## (v) Tools
 - Webots for 3D physics simulation with ROS2 integration via `webots_ros2` package
- Stable-Baselines3 for the PPO algorithm implementation
+- Gymnasium (OpenAI) for the simulation wrapper and evaluation tooling
 - Gymnasium (OpenAI) for the RL environment wrapper (lightweight 2D herding env for fast RL training)
 - Python as the primary programming language (sheep flocking model, reward shaping, evaluation)
 ## (vi) Limitations
- Computational Power: Training time might be high for complex flock behaviors
+- Computational Power: Large batch evaluation and parameter sweeps can still be time-consuming
 - Sim-to-Real Gap: No real-world validation of the herding controller; project is simulation-only (2D + Webots 3D)
 - Model Complexity: Simplified sheep behavior (scripted) may not account for all biological livestock nuances
@@ -0,0 +1,8 @@
 """Shared core for the shepherd herding project.
 This package is the single source of truth for world geometry, sheep
 flocking dynamics, differential-drive kinematics, observation building,
 and the Strömbom heuristic. It is imported both by the Webots
 controllers (for inference) and by the Gymnasium training environment
 (for fast PPO rollouts), so the two paths cannot drift apart.
 """
@@ -0,0 +1,70 @@
 """Differential-drive kinematics matching the Webots robot specs.
 The Webots controllers and the training env both use these helpers so the
 sim and the real (Webots) physics agree to first order. They do not model
 slip, wheel acceleration limits, or contact forces — Webots does that for
 us at inference time. The training env has to be close enough that a
 policy trained against this kinematic model still works when handed off
 to ODE physics.
 """
 import math
 def kinematics_step(x, y, h, w_left, w_right, wheel_radius, wheel_base, dt):
    """Advance a differential-drive pose by one explicit Euler step.

    Inputs
    ------
    x, y : robot position (m)
    h    : robot heading (rad), 0 = +x axis
    w_left, w_right : wheel angular velocities (rad/s)
    wheel_radius, wheel_base : robot dimensions (m)
    dt   : timestep (s)

    Returns (new_x, new_y, new_h). Translation uses the heading at the
    start of the step; the new heading is wrapped via atan2.
    """
    linear = (w_right + w_left) * wheel_radius * 0.5
    angular = (w_right - w_left) * wheel_radius / wheel_base
    turned = h + angular * dt
    return (
        x + linear * math.cos(h) * dt,
        y + linear * math.sin(h) * dt,
        math.atan2(math.sin(turned), math.cos(turned)),
    )
 def velocity_to_wheels(vx, vy, h, max_linear, wheel_radius, max_wheel_omega,
                        k_turn=4.0):
    """Map a desired (vx, vy) intent in [-1, 1]^2 to (left, right) wheel speeds.

    The intent's magnitude scales ``max_linear`` and its direction is the
    target heading. Forward speed is multiplied by the cosine of the
    heading error after clamping that error to ±90°, and a proportional
    term ``k_turn * err`` provides the wheel-rate differential. Both
    outputs are clamped to ±``max_wheel_omega``. Intents slower than
    1e-3 m/s return ``(0.0, 0.0)``.
    """
    def saturate(omega):
        # Clamp one wheel rate to the motor limit.
        return max(-max_wheel_omega, min(max_wheel_omega, omega))
    speed_ms = math.hypot(vx, vy) * max_linear
    if speed_ms < 1e-3:
        return 0.0, 0.0
    delta = math.atan2(vy, vx) - h
    # Wrap the heading error through atan2.
    err = math.atan2(math.sin(delta), math.cos(delta))
    quarter_turn = math.pi / 2
    bounded_err = max(-quarter_turn, min(quarter_turn, err))
    fwd_rad = speed_ms * math.cos(bounded_err) / wheel_radius
    turn = k_turn * err
    return saturate(fwd_rad - turn), saturate(fwd_rad + turn)
 def heading_speed_to_wheels(heading, speed_motor, h, max_wheel_omega,
                             k_turn=4.0):
    """Sheep variant: ``speed_motor`` is already a wheel rate in rad/s.

    Matches the existing sheep controller (``controllers/sheep/sheep.py``)
    where ``speed = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))`` and
    these constants are wheel angular velocities, not linear m/s.

    The forward term is ``speed_motor`` scaled by cos(heading error),
    floored at zero; ``k_turn * err`` is subtracted from the left wheel and
    added to the right. Both are clamped to ±``max_wheel_omega``.
    """
    delta = heading - h
    err = math.atan2(math.sin(delta), math.cos(delta))
    fwd = max(0.0, math.cos(err)) * speed_motor
    turn = k_turn * err
    wheels = []
    for raw in (fwd - turn, fwd + turn):
        wheels.append(max(-max_wheel_omega, min(max_wheel_omega, raw)))
    return wheels[0], wheels[1]
@@ -0,0 +1,178 @@
 """Reynolds-style sheep flocking dynamics.
 This is the per-sheep behavioural step used both by the Webots sheep
 controller (scalar, one sheep at a time) and by the training environment
 (loop over sheep). The numerics are adapted from the original
 ``controllers/sheep/flocking.py`` and retuned for the new external-pen
 layout: the south stone wall is intact except in the gate column, so
 sheep can only reach the pen by walking through that 3-m corridor.
 Force stack each step (summed → heading + speed):
    flee       — quadratic ramp away from dog within FLEE_DIST
    cohesion   — drift toward flock centre, strengthened while fleeing
    separation — inverse-distance push from peers
    walls      — soft repulsion + hard escape band against field walls,
                 except inside the gate column where the south wall is
                 absent
    wander     — small persistent drift for natural idle motion
 A sheep latches to ``penned`` the first time it crosses the gate plane
 into the gate column (handled by callers via ``geometry.is_penned_position``);
 once latched, ``penned=True`` is passed in here and the force stack
 switches to in-pen containment + jitter.
 """
 import math
 import random
 from herding.geometry import (
    FIELD_X, FIELD_Y,
    PEN_X, PEN_Y,
    GATE_X,
 )
 # --- Speed and force constants ---
 # All speeds here are in wheel rad/s (motor units), matching the existing
 # sheep controller. Conversion to m/s = speed * SHEEP_WHEEL_RADIUS.
 MAX_SPEED = 22.0
 FLEE_SPEED = 20.0
 WANDER_SPEED = 3.0
 WALL_MARGIN = 5.0
 WALL_HARD_MARGIN = 1.0
 WALL_HARD_GAIN = 50.0
 FLEE_DIST = 7.0
 SEPARATION_DIST = 2.5
 COHESION_DIST = 8.0
 PEN_MARGIN = 0.8
 def _peers_iter(peers):
    """Accept either a {name: (x, y)} dict or an iterable of (x, y) tuples."""
    if isinstance(peers, dict):
        return list(peers.values())
    return list(peers)
 def compute_heading_speed(x, y, penned, dog_xy, peers, wander_angle, rng=None):
    """Return ``(heading, speed, new_wander_angle)`` for one sheep step.

    Parameters
    ----------
    x, y : this sheep's position (m)
    penned : True once the sheep has latched as penned; selects the in-pen
        containment force stack instead of the free-roaming one
    dog_xy : ``(x, y)`` of the dog, or ``None`` if unknown (no flee force)
    peers : ``{name: (x, y)}`` dict or iterable of ``(x, y)`` peer positions
    wander_angle : current wander direction (rad), carried between calls
    rng : optional ``random.Random``-compatible object used for ALL
        wander jitter (penned and free-roaming). If ``None``, falls back
        to Python's global ``random`` module (matches Webots controller
        usage). Pass an env-owned RNG to make rollouts deterministic
        given a seed.

    ``speed`` is in wheel rad/s (motor units), bounded by ``[WANDER_SPEED,
    FLEE_SPEED]``. ``heading`` is the world-frame target heading the sheep
    should aim for (atan2 convention).
    """
    fx, fy = 0.0, 0.0
    peer_list = _peers_iter(peers)
    rnd = rng if rng is not None else random
    if penned:
        # --- Pen containment: bounce off the four pen walls ---
        pm = PEN_MARGIN
        if x < PEN_X[0] + pm:
            fx += ((PEN_X[0] + pm - x) / pm) * 15.0
        if x > PEN_X[1] - pm:
            fx -= ((x - (PEN_X[1] - pm)) / pm) * 15.0
        if y < PEN_Y[0] + pm:
            fy += ((PEN_Y[0] + pm - y) / pm) * 15.0
        if y > PEN_Y[1] - pm:
            fy -= ((y - (PEN_Y[1] - pm)) / pm) * 15.0
        # Mild peer separation — penned sheep crowd the corner otherwise.
        for px, py in peer_list:
            dx, dy = px - x, py - y
            d = math.hypot(dx, dy)
            if 0.05 < d < SEPARATION_DIST:
                push = (SEPARATION_DIST - d) / d
                fx -= (dx / d) * push * 2.5
                fy -= (dy / d) * push * 2.5
        if rnd.random() < 0.02:
            wander_angle += rnd.uniform(-0.6, 0.6)
        fx += math.cos(wander_angle) * 0.5
        fy += math.sin(wander_angle) * 0.5
    else:
        # --- Free-roaming sheep in the field ---
        fleeing = False
        if dog_xy is not None:
            ddx = dog_xy[0] - x
            ddy = dog_xy[1] - y
            dist = math.hypot(ddx, ddy)
            if 0.01 < dist < FLEE_DIST:
                fleeing = True
                # Quadratic ramp: 0 at FLEE_DIST, approaching 20 at contact.
                t = 1.0 - dist / FLEE_DIST
                s = t * t * 20.0
                fx -= (ddx / dist) * s
                fy -= (ddy / dist) * s
        # Cohesion — drift toward flock CoM (peers within COHESION_DIST).
        # Cohesion is *stronger* under flee than at rest (the
        # predator-confusion / safety-in-numbers effect — sheep huddle when
        # threatened). This is what makes shepherding work: the flock stays
        # as one unit through the narrow gate instead of fragmenting.
        cx, cy, cn = 0.0, 0.0, 0
        for px, py in peer_list:
            d = math.hypot(px - x, py - y)
            if 0.3 < d < COHESION_DIST:
                cx += px
                cy += py
                cn += 1
        if cn > 0:
            # Cohesion needs to be comparable to flee at close range to keep
            # the flock together through narrow obstacles like the 3m gate.
            # Flee at 2m has magnitude ~10; cohesion at peer-distance 5m
            # with w=1.5 contributes ~7.5 — same order, so the flock
            # translates as a unit instead of fragmenting under pressure.
            w = 1.5 if fleeing else 0.6
            fx += (cx / cn - x) * w
            fy += (cy / cn - y) * w
        # Separation — inverse-distance push from peers.
        for px, py in peer_list:
            ddx, ddy = px - x, py - y
            d = math.hypot(ddx, ddy)
            if 0.05 < d < SEPARATION_DIST:
                push = (SEPARATION_DIST - d) / d
                fx -= (ddx / d) * push * 2.5
                fy -= (ddy / d) * push * 2.5
        # Wall soft repulsion. The south wall is absent inside the gate
        # column so sheep can be driven through it by the dog.
        if x < FIELD_X[0] + WALL_MARGIN:
            fx += ((FIELD_X[0] + WALL_MARGIN - x) / WALL_MARGIN) * 6.0
        if x > FIELD_X[1] - WALL_MARGIN:
            fx -= ((x - (FIELD_X[1] - WALL_MARGIN)) / WALL_MARGIN) * 6.0
        if y > FIELD_Y[1] - WALL_MARGIN:
            fy -= ((y - (FIELD_Y[1] - WALL_MARGIN)) / WALL_MARGIN) * 6.0
        if y < FIELD_Y[0] + WALL_MARGIN and not (GATE_X[0] <= x <= GATE_X[1]):
            fy += ((FIELD_Y[0] + WALL_MARGIN - y) / WALL_MARGIN) * 6.0
        if not fleeing:
            # Draw the jitter from ``rnd`` (not the global module) so a
            # caller-supplied RNG makes this branch reproducible too.
            if rnd.random() < 0.02:
                wander_angle += rnd.uniform(-0.6, 0.6)
            fx += math.cos(wander_angle) * 0.5
            fy += math.sin(wander_angle) * 0.5
    # --- Hard escape band — overrides everything when very close to a wall ---
    m, g = WALL_HARD_MARGIN, WALL_HARD_GAIN
    if x - FIELD_X[0] < m:
        fx = max(fx, g * (1.0 - (x - FIELD_X[0]) / m))
    if FIELD_X[1] - x < m:
        fx = min(fx, -g * (1.0 - (FIELD_X[1] - x) / m))
    if FIELD_Y[1] - y < m:
        fy = min(fy, -g * (1.0 - (FIELD_Y[1] - y) / m))
    # South wall hard escape only when not in the gate column and not penned.
    if (not penned) and (y - FIELD_Y[0] < m) and not (GATE_X[0] <= x <= GATE_X[1]):
        fy = max(fy, g * (1.0 - (y - FIELD_Y[0]) / m))
    heading = math.atan2(fy, fx)
    mag = math.hypot(fx, fy)
    # Force magnitude -> wheel speed, clamped to [WANDER_SPEED, FLEE_SPEED].
    speed = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))
    return heading, speed, wander_angle
@@ -0,0 +1,99 @@
 """World geometry and robot specs.
 All coordinates are in meters. (0, 0) is the centre of the field, +x is
 east, +y is north. Z is up but unused here. These constants must match
 ``worlds/field.wbt`` and the proto files; if the world changes, change
 this file and only this file.
 Pen layout (post-refactor)
 --------------------------
 The pen is *external* to the field, accessed through a 3 m gate cut into
 the south stone wall at y = -15. Sheep entering through the gate end up
 in a fenced rectangle south of the field; the dog stays in the field
 (soft-limited above DOG_SOUTH_LIMIT during training and inference).
    field        +y north
    +-----------+
    |           |
    |           |
    |  ......   |
    +---||||----+   y = -15  (south wall, gate at x ∈ [10, 13])
        ||||
        |pen|       y ∈ [-22, -15]
        +---+
 """
 import math
 # --- Field (square, stone-walled) ---
 FIELD_X = (-15.0, 15.0)
 FIELD_Y = (-15.0, 15.0)
 # Conservative inside bounds — sheep/dog should not graze the wall.
 FIELD_INSIDE_MARGIN = 0.5
 # --- Pen (external, south of the field) ---
 PEN_X = (10.0, 13.0)
 PEN_Y = (-22.0, -15.0)
 PEN_CENTER = (0.5 * (PEN_X[0] + PEN_X[1]), 0.5 * (PEN_Y[0] + PEN_Y[1]))
 # The point the dog drives the flock toward: the gate centre on the field side.
 PEN_ENTRY = (0.5 * (PEN_X[0] + PEN_X[1]), -15.0)
 # --- Gate (the hole in the south stone wall) ---
 GATE_X = PEN_X
 GATE_Y = -15.0
 # --- Robot specs (must match proto files) ---
 # Dog (controllers/shepherd_dog/, protos/ShepherdDog.proto)
 DOG_WHEEL_RADIUS = 0.038         # m
 DOG_WHEEL_BASE = 0.28            # m, axle-to-axle
 DOG_MAX_WHEEL_OMEGA = 70.0       # rad/s
 DOG_MAX_LINEAR = DOG_WHEEL_RADIUS * DOG_MAX_WHEEL_OMEGA  # ~2.66 m/s
 # Sheep (controllers/sheep/, protos/Sheep.proto)
 SHEEP_WHEEL_RADIUS = 0.031       # m
 SHEEP_WHEEL_BASE = 0.20          # m
 SHEEP_MAX_WHEEL_OMEGA = 25.0     # rad/s
 SHEEP_MAX_LINEAR = SHEEP_WHEEL_RADIUS * SHEEP_MAX_WHEEL_OMEGA  # ~0.78 m/s
 # --- Webots step ---
 WEBOTS_DT = 0.016  # seconds, matches WorldInfo.basicTimeStep = 16 in field.wbt
 # --- Dog "virtual south wall" (training keeps dog out of the pen) ---
 # At inference the controller also clips to this so a slightly miscalibrated
 # policy doesn't accidentally drive into the pen and trap the sheep.
 DOG_SOUTH_LIMIT = -14.5
 # --- Maximum supported flock size ---
 MAX_SHEEP = 10
 def in_pen(x: float, y: float) -> bool:
    """True if (x, y) lies strictly inside the external pen rectangle."""
    (west, east), (south, north) = PEN_X, PEN_Y
    if not (west < x < east):
        return False
    return south < y < north
 def in_field(x: float, y: float, margin: float = 0.0) -> bool:
    """True if (x, y) is inside the field, shrunk by ``margin`` on every side.

    Bounds are inclusive.
    """
    if not (FIELD_X[0] + margin <= x <= FIELD_X[1] - margin):
        return False
    return FIELD_Y[0] + margin <= y <= FIELD_Y[1] - margin
 def in_gate_corridor(x: float, y: float, margin: float = 0.0) -> bool:
    """True if (x, y) lies in the gate column south of the gate line.

    The tested rectangle spans PEN_X in x and from PEN_Y[0] up to GATE_Y in
    y, grown by ``margin`` on every side; bounds are inclusive.
    """
    if not (PEN_X[0] - margin <= x <= PEN_X[1] + margin):
        return False
    return PEN_Y[0] - margin <= y <= GATE_Y + margin
 def is_penned_position(x: float, y: float, latch_margin: float = 0.2) -> bool:
    """Latch test: has the sheep crossed the gate plane heading south?

    True iff y is at or below the gate line AND x is inside the gate
    column widened by ``latch_margin`` on each side. Callers treat the
    first True as a one-way latch to "penned".
    """
    if not (y <= GATE_Y):
        return False
    return PEN_X[0] - latch_margin <= x <= PEN_X[1] + latch_margin
 def distance_to_pen_entry(x: float, y: float) -> float:
    """Euclidean distance (m) from (x, y) to PEN_ENTRY."""
    off_x = x - PEN_ENTRY[0]
    off_y = y - PEN_ENTRY[1]
    return math.hypot(off_x, off_y)
@@ -0,0 +1,137 @@
 """Observation builder for the shepherd dog policy.
 Order-invariant 32-D feature vector — the policy generalises across
 flock sizes 1..MAX_SHEEP because individual sheep coordinates never
 appear in the observation by index, only summary statistics, a polar
 histogram, and two "named" sheep (closest-to-pen and rearmost-from-pen).
 The two named sheep matter for the sequential-driving teacher: it
 targets the closest-to-pen sheep specifically, so the policy needs
 that channel to mimic the teacher.
 Layout (all components normalised so values stay roughly in [-1, 1]):
    idx   field
    -----  ----------------------------------------------------------
     0..3  dog pose: x/15, y/15, cos(heading), sin(heading)
     4..5  active-sheep CoM x/15, y/15
     6..8  flock dispersion: max-radius/15, std_x/15, std_y/15
     9..11 vector dog→CoM: dx/30, dy/30, dist/30
    12..14 vector dog→pen-entry: dx/30, dy/30, dist/30
    15..16 vector furthest-sheep→CoM: dx/15, dy/15
    17..18 min sheep-to-wall, min dog-to-wall (both /15)
       19  active-sheep count / MAX_SHEEP
    20..27 8-bin polar histogram of active sheep around the dog,
           rotation-aware (binned in dog-relative frame), normalised
           so the bins sum to 1.
    28..29 vector dog→closest-to-pen sheep: dx/15, dy/15
    30..31 vector dog→rearmost (furthest-from-pen) sheep: dx/15, dy/15
 """
 import math
 import numpy as np
 from herding.geometry import (
    FIELD_X, FIELD_Y, PEN_ENTRY, MAX_SHEEP,
 )
 OBS_DIM = 32
 def build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list,
               n_max: int = MAX_SHEEP) -> np.ndarray:
    """Assemble the dog policy's observation vector (float32, length OBS_DIM).

    Parameters
    ----------
    dog_xy : tuple (x, y) of the dog's GPS position (m)
    dog_heading : dog heading in rad
    sheep_xy_list : iterable of (x, y) for ALL known sheep
    sheep_penned_list : parallel iterable of bool — True if sheep is penned
    n_max : maximum supported flock size used for the count normaliser

    Only unpenned ("active") sheep feed the flock features. When none
    remain, only the dog pose (0..3) and dog→pen-entry (12..14) slots are
    filled; every other entry stays zero.
    """
    dog_x, dog_y = dog_xy
    obs = np.zeros(OBS_DIM, dtype=np.float32)
    # 0..3 — dog pose.
    obs[0:4] = (dog_x / 15.0, dog_y / 15.0,
                math.cos(dog_heading), math.sin(dog_heading))
    # 12..14 — dog -> pen entry (filled even when no sheep is active).
    entry_dx = PEN_ENTRY[0] - dog_x
    entry_dy = PEN_ENTRY[1] - dog_y
    obs[12:15] = (entry_dx / 30.0, entry_dy / 30.0,
                  math.hypot(entry_dx, entry_dy) / 30.0)
    free = [(sx, sy) for (sx, sy), is_penned
            in zip(sheep_xy_list, sheep_penned_list) if not is_penned]
    count = len(free)
    if count == 0:
        # All sheep penned — terminal observation.
        obs[19] = 0.0
        return obs
    pts = np.asarray(free, dtype=np.float32)
    xs, ys = pts[:, 0], pts[:, 1]
    com_x = float(xs.mean())
    com_y = float(ys.mean())
    offsets = pts - np.array([com_x, com_y], dtype=np.float32)
    spread = np.hypot(offsets[:, 0], offsets[:, 1])
    # 4..8 — centre of mass and dispersion.
    obs[4] = com_x / 15.0
    obs[5] = com_y / 15.0
    obs[6] = float(spread.max()) / 15.0
    obs[7] = float(xs.std()) / 15.0
    obs[8] = float(ys.std()) / 15.0
    # 9..11 — dog -> centre of mass.
    gap_x, gap_y = com_x - dog_x, com_y - dog_y
    obs[9:12] = (gap_x / 30.0, gap_y / 30.0, math.hypot(gap_x, gap_y) / 30.0)
    # 15..16 — offset of the sheep furthest from the CoM, relative to the CoM.
    outlier = int(np.argmax(spread))
    obs[15] = float(offsets[outlier, 0]) / 15.0
    obs[16] = float(offsets[outlier, 1]) / 15.0
    # 17..18 — smallest clearance to any field wall (sheep, then dog).
    sheep_gaps = (xs - FIELD_X[0], FIELD_X[1] - xs,
                  ys - FIELD_Y[0], FIELD_Y[1] - ys)
    sheep_clearance = min(float(np.min(gap)) for gap in sheep_gaps)
    dog_clearance = min(
        dog_x - FIELD_X[0], FIELD_X[1] - dog_x,
        dog_y - FIELD_Y[0], FIELD_Y[1] - dog_y,
    )
    obs[17] = sheep_clearance / 15.0
    obs[18] = float(dog_clearance) / 15.0
    obs[19] = count / n_max
    # 20..27 — 8-bin polar histogram in the dog's body frame.
    bearings = np.arctan2(ys - dog_y, xs - dog_x) - dog_heading
    bearings = np.arctan2(np.sin(bearings), np.cos(bearings))
    sector = np.floor((bearings + math.pi) / (2 * math.pi) * 8).astype(int)
    # A bearing of exactly +pi maps to bin 8; fold it into the last bin.
    sector = np.clip(sector, 0, 7)
    hist = np.bincount(sector, minlength=8).astype(np.float32)
    hist /= max(1, count)
    obs[20:28] = hist
    # 28..31 — the sheep closest to the pen entry (the sequential teacher's
    # target) and the one furthest from it (the natural "next target"),
    # both as offsets from the dog. These channels let the observation
    # identify which sheep the teacher is steering toward.
    pen_dists = np.hypot(xs - PEN_ENTRY[0], ys - PEN_ENTRY[1])
    named = ((28, int(np.argmin(pen_dists))), (30, int(np.argmax(pen_dists))))
    for slot, idx in named:
        obs[slot] = (float(pts[idx, 0]) - dog_x) / 15.0
        obs[slot + 1] = (float(pts[idx, 1]) - dog_y) / 15.0
    return obs
@@ -0,0 +1,98 @@
 """Sequential single-target shepherd dog algorithm.
 Strömbom drives the flock's centre of mass; with N sheep and a narrow
 3 m gate, this fails because the flock is wider than the gate and CoM
 driving abandons stragglers. Real sheepdogs solve this differently:
 they pick *one* sheep at a time, drive it through, return for the next.
 This module implements that "pin-and-push" approach.
 Algorithm (one step):
 1. Active sheep = those still in the field (not yet penned).
 2. Target = the active sheep currently closest to the pen entry.
 3. Drive position = ``target + Δ · unit(target − pen_entry)`` —
   directly behind the target relative to the goal.
 4. Output unit vector pointing the dog at the drive position.
 Once the target crosses the gate it latches as penned and is removed
 from the active set; the next-closest unpenned sheep becomes the
 target. The algorithm naturally "queues" sheep through the gate.
 Empirically (with our flocking dynamics) this scales linearly with
 flock size and works up to at least n=10 within a 15 000-step budget.
 """
 import math
 from herding.geometry import GATE_Y, PEN_ENTRY, in_pen
 DELTA_DRIVE = 1.5     # standoff behind the target sheep
 APPROACH_GAIN = 1.0   # action magnitude scale (1 = full speed)
 def _unit(x, y):
    d = math.hypot(x, y)
    if d < 1e-6:
        return 0.0, 0.0
    return x / d, y / d
 def _is_active(x, y) -> bool:
    """Tell whether a sheep at ``(x, y)`` still needs herding.

    A sheep counts as active when ``in_pen`` rejects its position and it
    is strictly north of the gate plane (``y > GATE_Y``).
    """
    if in_pen(x, y):
        return False
    return y > GATE_Y
 def compute_action(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
    """Steer the dog behind the active sheep nearest to ``pen_target``.

    Takes the same arguments as the Strömbom ``compute_action`` so either
    can be plugged into the dog controller or the env's imitation reward.

    Returns ``(vx, vy, mode)``: ``mode`` is ``"idle"`` (with a zero
    vector) when no sheep is active, otherwise ``"drive:<name>"`` naming
    the sheep currently being driven.
    """
    candidates = []
    for sheep_name, (px, py) in sheep_positions.items():
        if _is_active(px, py):
            candidates.append((sheep_name, px, py))
    if not candidates:
        return 0.0, 0.0, "idle"

    def _gap_to_goal(entry):
        # Straight-line distance from a candidate sheep to the pen target.
        return math.hypot(entry[1] - pen_target[0], entry[2] - pen_target[1])

    # Nearest-to-pen sheep is the target; it keeps that role while it
    # remains the closest active sheep, then the next-closest takes over.
    target_name, target_x, target_y = min(candidates, key=_gap_to_goal)

    # Stand-off point: DELTA_DRIVE further out along the pen → sheep ray,
    # i.e. on the far side of the sheep as seen from the goal.
    away_x, away_y = _unit(target_x - pen_target[0], target_y - pen_target[1])
    drive_x = target_x + DELTA_DRIVE * away_x
    drive_y = target_y + DELTA_DRIVE * away_y

    head_x, head_y = _unit(drive_x - dog_xy[0], drive_y - dog_xy[1])
    mode = f"drive:{target_name}"
    return APPROACH_GAIN * head_x, APPROACH_GAIN * head_y, mode
 def compute_action_debug(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
    """Pin-and-push action plus a diagnostics dict.

    Returns ``(vx, vy, mode, debug)``. ``debug`` always carries the keys
    ``n_active``, ``target_name``, ``target_x``, ``target_y``, ``drive_x``
    and ``drive_y``. When no sheep is active the drive point is reported
    as the dog's own position and the target fields are blank/zero.
    """
    # Keep only the sheep that still need herding.
    candidates = []
    for sheep_name, (px, py) in sheep_positions.items():
        if _is_active(px, py):
            candidates.append((sheep_name, px, py))

    if not candidates:
        idle_info = {
            "n_active": 0, "target_name": "",
            "target_x": 0.0, "target_y": 0.0,
            "drive_x": dog_xy[0], "drive_y": dog_xy[1],
        }
        return 0.0, 0.0, "idle", idle_info

    # Distance of every candidate to the pen target; the first minimum wins.
    gaps = [
        math.hypot(px - pen_target[0], py - pen_target[1])
        for _, px, py in candidates
    ]
    target_name, target_x, target_y = candidates[gaps.index(min(gaps))]

    # Drive point sits DELTA_DRIVE beyond the sheep on the pen → sheep ray.
    away_x, away_y = _unit(target_x - pen_target[0], target_y - pen_target[1])
    drive_x = target_x + DELTA_DRIVE * away_x
    drive_y = target_y + DELTA_DRIVE * away_y

    head_x, head_y = _unit(drive_x - dog_xy[0], drive_y - dog_xy[1])
    info = {
        "n_active": len(candidates), "target_name": target_name,
        "target_x": target_x, "target_y": target_y,
        "drive_x": drive_x, "drive_y": drive_y,
    }
    mode = f"drive:{target_name}"
    return APPROACH_GAIN * head_x, APPROACH_GAIN * head_y, mode, info
@@ -0,0 +1,114 @@
 """Strömbom collect/drive heuristic for the shepherd dog.
 Adapted from the original ``controllers/shepherd_dog/strombom.py`` and
 updated for the external pen layout. Used as a baseline controller and
 as the fallback when the RL policy isn't available.
 Reference: Strömbom et al. 2014, "Solving the shepherding problem".
 """
 import math
 from herding.geometry import PEN_ENTRY, GATE_Y, in_pen
 # Algorithm parameters. DELTA_DRIVE / DELTA_COLLECT were tightened from
 # the original (4.0 / 2.5) because the new external pen sits ~26 m from
 # typical sheep spawn locations — at the old 4 m standoff, the flee force
 # (quadratic ramp, 3.7 at 4 m vs ~10 at 2 m) couldn't move sheep through
 # the path inside the 3000-step episode budget.
 #
 # F_FACTOR was 2.0 in the original Strömbom paper; raised to 4.0 here so
 # the dog stays in *drive* mode much longer. With our tighter cohesion
 # (flocking_sim.py), partially-collected flocks consolidate naturally
 # during a drive, and we don't waste 80% of the time budget on a slow
 # "collect" pre-phase.
 F_FACTOR = 4.0
 DELTA_COLLECT = 1.5
 DELTA_DRIVE = 2.0
 def _unit(x, y):
    d = math.hypot(x, y)
    if d < 1e-6:
        return 0.0, 0.0
    return x / d, y / d
 def _is_active(x, y) -> bool:
    """Report whether the sheep at ``(x, y)`` is still in the field.

    Anything inside the pen, or on/south of the gate plane, is treated as
    committed to the pen and is no longer herded.
    """
    inside = in_pen(x, y)
    return False if inside else y > GATE_Y
 def compute_action(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
    """Strömbom collect/drive step; returns ``(vx, vy, mode)``.

    ``sheep_positions`` is a ``{name: (x, y)}`` mapping (the Webots
    controller's representation). ``mode`` is one of ``"idle"``,
    ``"collect"`` or ``"drive"``; ``(vx, vy)`` is the unit vector from
    the dog to the chosen stand-off point (zero when idle).
    """
    herd = []
    for px, py in sheep_positions.values():
        if _is_active(px, py):
            herd.append((px, py))
    if not herd:
        return 0.0, 0.0, "idle"

    count = len(herd)
    com_x = sum(px for px, _ in herd) / count
    com_y = sum(py for _, py in herd) / count
    spread = [math.hypot(px - com_x, py - com_y) for px, py in herd]
    # First sheep at maximal distance from the CoM, and that distance.
    far_idx, radius = max(enumerate(spread), key=lambda pair: pair[1])

    if radius > F_FACTOR * math.sqrt(count):
        # Collect: stand beyond the outlier, on the side away from the CoM.
        anchor_x, anchor_y = herd[far_idx]
        push_x, push_y = _unit(anchor_x - com_x, anchor_y - com_y)
        standoff = DELTA_COLLECT
        mode = "collect"
    else:
        # Drive: stand beyond the CoM, on the side away from the goal.
        anchor_x, anchor_y = com_x, com_y
        push_x, push_y = _unit(com_x - pen_target[0], com_y - pen_target[1])
        standoff = DELTA_DRIVE
        mode = "drive"

    aim_x = anchor_x + standoff * push_x
    aim_y = anchor_y + standoff * push_y
    head_x, head_y = _unit(aim_x - dog_xy[0], aim_y - dog_xy[1])
    return head_x, head_y, mode
 def compute_action_debug(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
    """Strömbom collect/drive step that also reports its internals.

    Returns ``(vx, vy, mode, debug)``. ``debug`` holds ``n_active``,
    ``radius``, ``threshold``, ``com_x``, ``com_y``, ``target_x`` and
    ``target_y``; in the idle case the target is the dog's own position
    and every other field is zero. Kept for the legacy controller's CSV
    logger.
    """
    herd = []
    for px, py in sheep_positions.values():
        if _is_active(px, py):
            herd.append((px, py))

    if not herd:
        idle_info = {
            "n_active": 0, "radius": 0.0, "threshold": 0.0,
            "com_x": 0.0, "com_y": 0.0,
            "target_x": dog_xy[0], "target_y": dog_xy[1],
        }
        return 0.0, 0.0, "idle", idle_info

    count = len(herd)
    com_x = sum(px for px, _ in herd) / count
    com_y = sum(py for _, py in herd) / count
    spread = [math.hypot(px - com_x, py - com_y) for px, py in herd]
    radius = max(spread)
    threshold = F_FACTOR * math.sqrt(count)

    collecting = radius > threshold
    if collecting:
        # Stand-off beyond the first sheep at maximal distance from the CoM.
        far_x, far_y = herd[spread.index(radius)]
        out_x, out_y = _unit(far_x - com_x, far_y - com_y)
        aim_x = far_x + DELTA_COLLECT * out_x
        aim_y = far_y + DELTA_COLLECT * out_y
        mode = "collect"
    else:
        # Stand-off beyond the CoM, on the side facing away from the goal.
        out_x, out_y = _unit(com_x - pen_target[0], com_y - pen_target[1])
        aim_x = com_x + DELTA_DRIVE * out_x
        aim_y = com_y + DELTA_DRIVE * out_y
        mode = "drive"

    head_x, head_y = _unit(aim_x - dog_xy[0], aim_y - dog_xy[1])
    info = {
        "n_active": count, "radius": radius, "threshold": threshold,
        "com_x": com_x, "com_y": com_y,
        "target_x": aim_x, "target_y": aim_y,
    }
    return head_x, head_y, mode, info
@@ -0,0 +1,458 @@
 # RL-Driven Shepherd Herding — Implementation Plan
 This plan turns the existing Strömbom-only Webots project into a dual-mode
 shepherd controller (RL primary, Strömbom fallback), with a fast Gymnasium
 training environment that mirrors the Webots dynamics tightly enough for
 sim-to-sim transfer. Stable-Baselines3 PPO is the learner.
 ---
 ## 1. Current state (audit)
 ### World geometry — `worlds/field.wbt`
 - Field bounded by stone walls at **x,y ∈ [−15, +15]**. Inside-usable area is
  ~[−14.5, 14.5] (`X_MIN/MAX` in `flocking.py`).
 - **Pen is *inside* the field**: x ∈ [10, 13], y ∈ [−15, −8], with the
  opening on its **north** side at y = −8 (post-and-rail fence W/E; open N).
 - South stone wall has a **gate at x ∈ [10, 13], y = −15** (split wall +
  gate posts at x=10 and x=13). So sheep that get penned end up between the
  fence (N side at y=−8) and the south stone wall (with the wooden gate at
  y=−15 currently slightly ajar). The pen is effectively an L-shape inside
  the field, not external.
 - Spawns: dog at origin (0, 0), 3 sheep around (3, ±2) and (4, 0). Two more
  sheep are commented out.
 ### Robots — protos
 - **Sheep** (`protos/Sheep.proto`): differential drive, wheel radius 0.031 m,
  axle half-width 0.10 m → wheel base 0.20 m. `maxVelocity = 25 rad/s` →
  max linear ≈ **0.78 m/s**. Sensors: GPS, Compass, Emitter+Receiver on
  channel 1. `supervisor = TRUE` (used to repaint wool pink on pen entry).
 - **ShepherdDog** (`protos/ShepherdDog.proto`): differential drive, wheel
  radius 0.038 m, axle half-width 0.14 m → wheel base 0.28 m.
  `maxVelocity = 70 rad/s` → max linear ≈ **2.66 m/s**. Sensors: GPS,
  Compass, Gyro, Accelerometer, **Lidar** (front-only, FOV 2.44 rad ≈ 140°,
  180 rays, range 0.10–12 m, noise 0.005), Emitter+Receiver on channel 1,
  cosmetic ear/tail motors.
 ### Sheep controller — `controllers/sheep/{sheep.py,flocking.py}`
 - Reynolds-style boid stack: flee (quadratic ramp inside FLEE_DIST=7 m),
  cohesion (within 8 m), separation (within 2.5 m), wall soft repulsion
  (margin 5 m), wall hard escape (margin 1 m, gain 50), wander.
 - Pen-aware: sheep below the gate line but outside the gate corridor get a
  northward "deadzone" assist; on first entry into the pen rectangle,
  sheep latches `penned=True`, repaints pink, and switches to in-pen
  containment + jitter.
 - Driver: heading-error PD on diff-drive (k=4), forward velocity scaled by
  `cos(err)`, MAX_SPEED=22 (motor units, capped by proto's 25 rad/s).
 - Stuck detector: if displacement < 0.05 m for 20 steps, drives toward
  field origin to escape wall-pin (a known differential-drive failure mode).
 ### Dog controller — `controllers/shepherd_dog/{shepherd_dog.py,strombom.py}`
 - Strömbom collect/drive heuristic. CoM-radius gating
  `radius > F·√n` with F=2 selects collect (push furthest sheep inward) vs
  drive (push CoM toward the pen entry point at (11.5, −8.0)).
 - Deadzone rescue: when a sheep is below the gate line and outside the
  pen's x-corridor, the dog repositions to a "behind the sheep, opposite
  the pen" stand-off so the sheep's flee vector points back through the
  gate. Variants 0/1 alternate lateral offset to break corner cycles.
 - Stuck-rescue, EMA action smoothing, target-deadband, RESCUE_SPEED_CAP,
  cooldown — all empirical fixes for diff-drive oscillation.
 - Logs full per-step debug to `dog_behavior_log.csv` (currently 7 MB —
  add to `.gitignore`).
 ### Deleted training scaffolding (per `git status`)
 - `controllers/shepherd_dog_rl/{shepherd_dog_rl.py, final_model.zip, vecnorm.pkl, plot_debug.py}`
 - `training/{config.json, herding_env.py, parity_test.py, requirements.txt, train.py, train_at.py, viz.py, runs/.gitkeep}`
 A previous attempt existed; we'll redesign rather than resurrect, keeping
 only the lessons (parity-tested env, VecNormalize wrapper, eval cadence).
 ---
 ## 2. Design decisions
 ### 2.1 Pen location — keep inside-field with N gate
 The user offered moving the pen *external* (through a wall hole). Tradeoffs:
 | Option | Pros | Cons |
 |---|---|---|
 | **(A) Keep inside-field** (current) | World already built; Strömbom logic already tuned; gate corridor is short | Dog must navigate around three pen walls; adds geometric clutter |
 | (B) External pen via wall hole | Cleaner field — dog only sees sheep + outer walls; pen as goal region beyond a 3 m hole at y=−15 | Requires editing `field.wbt` (split south wall, add external pen walls beyond y<−15); existing rescue/deadzone logic must be retuned; outside-field flocking constants don't currently apply |
 **Recommendation: keep (A)** for parity with the working Strömbom controller,
 but add a **simplification**: widen the pen entrance from 3 m (x ∈ [10, 13])
 to 4 m (x ∈ [9.5, 13.5]) and raise the entrance line from y=−8 to y=−7.5
 to give the dog more turning room. Optional later: option (B) as a
 post-baseline extension (Section 7). *Note: the implementation ultimately
 adopted option (B) — see Section 10.*
 ### 2.2 Where to train
 PPO on Webots directly is too slow (real-time stepping, single env, slow
 reset). The previous training scaffolding used a Python 2D sim — that is
 the right approach. Constraints for sim-to-sim transfer:
 1. **Use the exact same flocking math**: import `controllers/sheep/flocking.py`
   from the env, do not reimplement.
 2. **Use the same world constants**: import `controllers/shepherd_dog/strombom.py`
   for pen geometry and Strömbom baseline.
 3. **Model differential drive faithfully**: match wheel-radius, base, and
   max wheel-velocity from the proto files. Heading update from
   `(ω_R − ω_L)·r / b`, position from `(ω_R + ω_L)·r / 2`.
 4. **Match Webots step**: `basicTimeStep = 16 ms`. The sheep controller runs
   at every basic step; the env will use the same `dt = 0.016 s`.
 5. **Lidar deferred**: dog policy will use a *symbolic* observation
   (positions of dog + sheep, plus pen geometry) — not raw lidar — for the
   first iteration. Learning from raw lidar scans is a much harder problem
   and isn't required for the herding task. (See Section 7 for an
   optional later upgrade.)
 ### 2.3 Action space for the dog
 Two viable choices:
 - **(a) High-level velocity vector** `(vx, vy) ∈ [−1, 1]²`. The same
  representation Strömbom emits today; the existing
  `drive_action(vx, vy, ...)` function in `shepherd_dog.py` converts this
  to wheel speeds. Decouples the policy from low-level diff-drive
  oscillations and enables direct A/B against Strömbom.
 - (b) Direct wheel speeds `(ω_L, ω_R) ∈ [−1, 1]²`. More expressive but the
  policy must learn diff-drive control from scratch — which is exactly
  the source of the wall-stuck and oscillation pain we're trying to
  avoid.
 **Recommendation: (a)** — high-level `(vx, vy)`. Reuses the well-tuned
 `drive_action` controller, which already handles `cos(err)` clamping and
 turn gain. RL focuses on *strategy*, not actuation.
 ### 2.4 Observation space for the dog
 Symbolic, fixed-size, normalized to [−1, 1]:
 | Field | Dim | Notes |
 |---|---|---|
 | Dog (x, y, cos h, sin h) | 4 | Position normalized by 15 |
 | Sheep CoM (x, y) | 2 | Of *active* (not-penned) sheep |
 | Sheep dispersion (radius, std-x, std-y) | 3 | Strömbom collect-vs-drive features |
 | Vector dog→CoM (dx, dy, dist) | 3 | Helps the value function |
 | Vector dog→pen-entry (dx, dy, dist) | 3 | |
 | Vector furthest-sheep→CoM (dx, dy) | 2 | Strömbom collect target hint |
 | Min sheep-to-wall distance + min dog-to-wall | 2 | Safety signal |
 | Active sheep count / N_max | 1 | |
 | 8-bin polar histogram of sheep around dog | 8 | Order-invariant flock shape |
 Total: **28 features**. Order-invariant by construction (histogram + summary
 stats), so the policy generalizes across flock sizes 1..N_max.
 ### 2.5 Reward
 Sparse-only is too hard at flock scale; we shape conservatively.
 ```
 r_t = w_pen     · ΔN_penned                       # +1 per newly penned sheep
    + w_progress· (d_CoM_pen[t-1] − d_CoM_pen[t]) # closer-to-pen progress
    + w_compact· (R[t-1] − R[t])                  # tighter flock progress
    − w_time   · 1                                 # constant time penalty
    − w_wall   · I(min_wall_dist < 1.0 m)         # dog too close to wall
    − w_collide· I(dog within 0.3 m of any sheep) # avoid contact
    + w_done   · I(all sheep penned)              # terminal bonus
 ```
 Initial weights: `w_pen=2.0, w_progress=0.5, w_compact=0.2, w_time=0.005,
 w_wall=0.01, w_collide=0.05, w_done=10.0`. Tune via 1-sheep curriculum
 first — if the dog learns 1-sheep cleanly, the weights are sane.
 ### 2.6 Episode
 - Max steps: 3000 (≈ 48 s at dt=16 ms — generous).
 - Termination: all sheep penned (success), dog/sheep stuck > 600 steps with
  no progress (failure), step limit (timeout).
 - Reset: domain-randomized — sheep count ∈ {1..N_max}, sheep positions
  uniform in field minus pen+gate corridor, dog at origin ± U(−2, 2).
 ### 2.7 Curriculum
 | Stage | N_sheep | Duration (steps) | Pass criterion |
 |---|---|---|---|
 | 0 | 1 | 0.5 M | success ≥ 90 % |
 | 1 | 2 | 1.0 M | success ≥ 80 % |
 | 2 | 3 | 1.5 M | success ≥ 70 % |
 | 3 | 1..3 mixed | 2.0 M | mean reward stable |
 | 4 (optional) | 5 | 2.0 M | success ≥ 60 % |
 Implemented by changing only `n_sheep` in the env reset.
 ---
 ## 3. Repository layout (new)
 ```
 project/
 ├── controllers/
 │   ├── sheep/                      # unchanged
 │   ├── shepherd_dog/               # Strömbom controller (renamed entry)
 │   │   ├── shepherd_dog.py         # mode-switch wrapper: RL | strombom
 │   │   ├── strombom.py             # unchanged (canonical Strömbom)
 │   │   └── policy_loader.py        # NEW: loads SB3 zip + VecNormalize
 │   └── ...
 ├── herding/                        # NEW: Python package, importable from env + controller
 │   ├── __init__.py
 │   ├── geometry.py                 # field/pen constants, in_pen(), wall helpers (single source of truth)
 │   ├── flocking_sim.py             # vectorised numpy port of flocking.py for fast batched sheep
 │   ├── diffdrive.py                # diff-drive integrator matching the proto specs
 │   └── obs.py                      # observation builder shared by env and Webots controller
 ├── training/                       # NEW
 │   ├── herding_env.py              # gymnasium.Env, single-agent (the dog)
 │   ├── parity_test.py              # asserts env trajectory ≈ Webots trajectory for fixed seeds
 │   ├── train_ppo.py                # SB3 PPO entry point
 │   ├── eval.py                     # rollout + metrics (success rate, time-to-pen)
 │   ├── configs/
 │   │   ├── ppo_default.yaml
 │   │   └── curriculum.yaml
 │   ├── runs/                       # tensorboard + checkpoints (.gitignored)
 │   └── requirements.txt
 ├── docs/
 │   └── project.md                  # unchanged
 ├── plan.md                         # this file
 └── ...
 ```
 `herding/` becomes the **single source of truth** for geometry and dynamics.
 The Webots controllers and the training env both import from it, so when a
 constant changes in one place it changes everywhere — eliminating the
 sim/Webots-drift class of bugs.
 This means the existing `controllers/sheep/flocking.py` and
 `controllers/shepherd_dog/strombom.py` become thin shims that re-export
 from `herding/`. Webots controllers can import `herding/` because Webots
 adds the project root to `sys.path` at controller startup; we'll verify.
 ---
 ## 4. The Gymnasium environment — `training/herding_env.py`
 ```python
 class HerdingEnv(gymnasium.Env):
    metadata = {"render_modes": ["rgb_array", "human"]}
    def __init__(self, n_sheep=3, max_steps=3000, dt=0.016, seed=None):
        self.action_space      = Box(low=-1, high=1, shape=(2,), dtype=np.float32)
        self.observation_space = Box(low=-1, high=1, shape=(28,), dtype=np.float32)
        ...
    def reset(self, *, seed=None, options=None):
        # Random sheep positions in field \ pen corridor, dog near origin.
        # Optional curriculum: options["n_sheep"] overrides.
        ...
    def step(self, action):
        vx, vy = action  # high-level velocity intent
        # Convert to wheel speeds via the same drive_action inverse used in Webots
        wL, wR = self._diffdrive_inverse(vx, vy, self.dog_state)
        self.dog_state = self._integrate_diffdrive(self.dog_state, wL, wR, self.dt)
        # Step every sheep one boid step (vectorized in flocking_sim.py)
        self.sheep_state = self._step_sheep(self.sheep_state, self.dog_state)
        # Update penned set, compute reward, observation, done flags
        ...
 ```
 Key points:
 - **Vectorised sheep update**: re-implements `flocking.py` in numpy so 100
  parallel envs with 5 sheep each take ms, not seconds. Numerical parity
  with the scalar version is asserted in `parity_test.py`.
 - **Same diff-drive integrator** for the dog as Webots will see at
  inference. Wall + pen-fence collisions clamp position (a Webots-realistic
  no-pass-through approximation).
 - **Domain randomization** in reset: sheep count, spawn positions, sheep
  flock-parameter jitter (±10 % on FLEE_DIST, COHESION_DIST, etc.) for
  robustness.
 ---
 ## 5. Training pipeline — `training/train_ppo.py`
 - **Algorithm**: SB3 `PPO` with `MlpPolicy`, `n_steps=2048`, `batch_size=256`,
  `n_epochs=10`, `gamma=0.995`, `gae_lambda=0.95`, `clip_range=0.2`,
  `ent_coef=0.005`, `vf_coef=0.5`, `learning_rate=3e-4`.
 - **Vec envs**: `SubprocVecEnv` × 16 parallel envs (the env is pure numpy
  so subprocs are CPU-cheap).
 - **Normalization**: `VecNormalize(norm_obs=True, norm_reward=True,
  clip_obs=10.0)`. Pickled alongside the policy zip — both required at
  inference.
 - **Callbacks**:
  - `CheckpointCallback` every 100 k steps.
  - `EvalCallback` on a separate eval env (no normalization-update) every
    50 k steps; logs success rate and time-to-pen to TensorBoard.
  - Custom `CurriculumCallback`: bumps `n_sheep` when eval success rate
    crosses the stage threshold for 3 consecutive evals.
 - **Determinism for debugging**: seed-pinned eval env so regressions are
  catchable.
 ---
 ## 6. Webots integration — RL inference path
 `controllers/shepherd_dog/shepherd_dog.py` becomes a thin wrapper:
 ```python
 MODE = os.environ.get("HERDING_MODE", "rl")  # "rl" | "strombom"
 if MODE == "rl":
    policy = policy_loader.load("training/runs/best/policy.zip",
                                "training/runs/best/vecnormalize.pkl")
    obs_fn = build_obs   # from herding/obs.py
 else:
    obs_fn = None        # strombom path uses sheep_positions directly
 while robot.step(timestep) != -1:
    receive_messages()
    if MODE == "rl":
        obs = obs_fn(dog_xy, dog_heading, sheep_positions, ...)
        action, _ = policy.predict(obs, deterministic=True)
        vx, vy = action.tolist()
    else:
        vx, vy, mode, dbg = compute_action_debug(dog_xy, sheep_positions, PEN_ENTRY)
        # plus existing rescue/cooldown/EMA layer
    drive_action(vx, vy, ...)
 ```
 A **safety supervisor** wraps the RL output: if `obs` indicates the dog is
 < 0.6 m from a wall, override with the existing wall-escape behavior
 (reverse + turn). This is a hard guarantee diff-drive needs because PPO
 may not discover wall-escape reliably from on-policy data.
 `policy_loader.py` handles the SB3 import lazily so the controller still
 works with `MODE=strombom` even if SB3 is not installed in the Webots
 Python environment.
 ---
 ## 7. Optional extensions (post-baseline)
 - **External pen** (Section 2.1 option B): edit `field.wbt` to extend the
  south wall hole into an external L-shaped pen with its own walls; update
  `herding/geometry.py`; retrain stage 3 only.
 - **Lidar observation**: replace symbolic obs with 36-bin downsampled
  lidar + ego state; train end-to-end. Useful as the "extra merit"
  dimension in the project doc.
 - **Two-dog mode**: make env multi-agent, train with `MAPPO`-style shared
  critic or independent PPO. The proto already supports multiple dog
  instances; world only needs a second `ShepherdDog` node.
 - **Mecanum comparison**: swap the dog proto for a mecanum variant; same
  policy, different `_integrate_diffdrive` (becomes holonomic).
 - **Sheep flock size scaling**: 5, 10, 20 — the obs is order-invariant so
  the same policy generalises; just curriculum further.
 ---
 ## 8. Risks & mitigations
 | Risk | Mitigation |
 |---|---|
 | Sim-to-Webots gap (sheep dynamics, wall friction) | `parity_test.py` asserts trajectory match within tolerance for fixed seeds; if it fails, fix the env, not the policy |
 | Dog learns to wall-pin sheep against fence | Add `w_collide` penalty + min-sheep-to-wall term in obs; curriculum from 1 sheep first |
 | PPO oscillation collapses into spinning | Action smoothing in env step (EMA on `(vx, vy)`, mirroring `ACTION_SMOOTH=0.35` from Strömbom controller); add a small `‖a_t − a_{t-1}‖` penalty to the reward |
 | Pen approach failures (sheep refuse gate) | Reuse the existing `deadzone_rescue` as a *scripted fallback* triggered when a sheep has been deadzoned > 200 steps — RL handles the common case, scripted handles the corner |
 | Gym version mismatch (gymnasium vs gym) | Lock to `gymnasium>=0.29`, `stable-baselines3>=2.3` in requirements |
 ---
 ## 9. Milestones (suggested order of implementation)
 1. **M0 — Refactor** (no behavior change): create `herding/` package, move
   constants out of `flocking.py`/`strombom.py`, leave shims; verify
   Webots still runs Strömbom unchanged. Add `dog_behavior_log.csv` to
   `.gitignore`.
 2. **M1 — Env & parity**: `herding_env.py`, `parity_test.py`. Asserts
   sheep + dog trajectories match Webots within tolerance for 5 fixed
   seeds. *Done when parity test green.*
 3. **M2 — PPO baseline**: train Stage 0 (1 sheep) for 0.5 M steps; eval
   in env at ≥ 90 % success.
 4. **M3 — Webots inference**: load Stage 0 policy in `shepherd_dog.py`
   with `HERDING_MODE=rl`; verify the dog herds 1 sheep into the pen in
   the actual Webots world. *This is the sim-to-sim transfer gate.*
 5. **M4 — Curriculum**: stages 1–3, ~5 M steps total, with checkpoints
   and eval logs.
 6. **M5 — Strömbom comparison**: run both controllers on a fixed eval
   suite (same seeds, 1/2/3 sheep), log success rate and time-to-pen.
   This is a deliverable for the project's "quantitative evaluation"
   goal.
 7. **M6 — Documentation**: a short README in `training/` showing how to
   train, evaluate, and switch modes in Webots.
 Each milestone is independently demoable. M0–M3 is the critical path to
 "RL works in Webots"; M4–M6 polishes it for the project deliverable.
 ---
 ## 10. Decisions (locked in by implementation)
 - **Pen layout**: option B (external pen). The pen sits south of the
  field at x ∈ [10, 13], y ∈ [-22, -15] and is reached through the
  existing 3 m gap in the south stone wall. The old in-field
  quarantine fence is gone and the wooden gate is modeled as
  swung-open and parked on the west gate post so the corridor is
  unobstructed. This kills the deadzone class entirely.
 - **Flock size**: 1..10 sheep, sampled uniformly each reset. The order-
  invariant observation (CoM, dispersion, polar histogram) lets a
  single policy generalise across the whole range. A curriculum widens
  ``max_n_sheep`` from 1 to 10 over training to keep early exploration
  tractable.
 - **Single-sheep mode**: handled by the same policy (n_sheep=1 is the
  first stage of the curriculum and stays in the training distribution
  throughout). No separate model.
 - **Hardware**: GPU for training. SubprocVecEnv × 16 on CPU feeds an
  MlpPolicy on GPU; ~2–3 h for the full curriculum.
 ## 11. What was built
 ```
 herding/                    # single source of truth, importable from both
  geometry.py               # field/pen constants, latch helpers, robot specs
  flocking_sim.py           # Reynolds boid step (matches Webots controller)
  diffdrive.py              # diff-drive kinematics + velocity↔wheels
  obs.py                    # observation builder: 28 order-invariant features + closest/rearmost-to-pen sheep offsets (obs[28:32])
  strombom.py               # collect/drive heuristic (baseline + fallback)
 worlds/field.wbt            # external pen south of field, 10 sheep slots,
                            # gate parked open, in-field fence removed
 controllers/sheep/sheep.py            # imports from herding/, latches on
                                      # is_penned_position
 controllers/shepherd_dog/
  shepherd_dog.py           # mode switch (HERDING_MODE=rl|strombom),
                            # safety supervisor for DOG_SOUTH_LIMIT
  policy_loader.py          # lazy SB3 zip + VecNormalize loader
  strombom.py               # shim re-exporting herding.strombom
 training/
  herding_env.py            # gymnasium.Env, action smoothing, reward shaping
  train_ppo.py              # SB3 PPO with VecNormalize, eval, checkpoints,
                            # curriculum callback
  eval.py                   # success-rate / time-to-pen across n_sheep
  parity_test.py            # shape, determinism, baseline-rollout smoke test
  configs/ppo_default.yaml
  requirements.txt
  README.md                 # how to train, evaluate, switch modes in Webots
 ```
 ## 12. To run
 ```bash
 # 1. Install deps (CUDA-enabled torch wheel for GPU)
 pip install -r training/requirements.txt
 # 2. Smoke test
 python -m training.parity_test
 # 3. Train (5 M steps, ~2–3 h on a single GPU)
 python -m training.train_ppo --out-dir training/runs/baseline
 # 4. Evaluate vs Strömbom
 python -m training.eval --policy training/runs/baseline/best
 python -m training.eval --policy strombom
 # 5. Run in Webots
 export HERDING_MODE=rl
 export HERDING_POLICY_DIR=$PWD/training/runs/baseline/best
 webots worlds/field.wbt
 ```
@@ -0,0 +1,117 @@
 """Collect (obs, action) demonstrations from the sequential teacher.
 Runs the sequential algorithm across a grid of (n_sheep, seed) combos
 at full difficulty, logs the (observation, action) pair every Nth step,
 and saves successful trajectories to a numpy ``.npz`` for behavior
 cloning. Failed trajectories are dropped by default — we only want to
 teach the policy from good examples.
 Usage::
    python -m tools.collect_demos --out training/demos.npz
 """
 from __future__ import annotations
 import argparse
 import os
 import sys
 import time
 from pathlib import Path
 _HERE = os.path.dirname(os.path.abspath(__file__))
 _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
 import numpy as np
 from herding.geometry import PEN_ENTRY
 from herding.sequential import compute_action
 from training.herding_env import HerdingEnv
 def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int):
    """Roll out the sequential teacher for one episode and record demos.

    Args:
        n_sheep: flock size passed to ``HerdingEnv``.
        seed: seed used both to construct and to reset the env.
        max_steps: upper bound on teacher steps (also the env's
            ``max_steps``).
        subsample: keep the ``(obs, action)`` pair of every
            ``subsample``-th step; must be at least 1.

    Returns:
        ``(obs, actions, success, steps)`` — float32 arrays of the logged
        observations and teacher actions, whether every sheep ended up
        penned, and ``env.steps`` at the end of the rollout.

    Raises:
        ValueError: if ``subsample`` is smaller than 1.
    """
    # Fail fast: ``step % 0`` would otherwise raise ZeroDivisionError in
    # the middle of a rollout, after the env has already been built.
    if subsample < 1:
        raise ValueError(f"subsample must be >= 1, got {subsample}")

    env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
                     difficulty=1.0, seed=seed)
    obs, _ = env.reset(seed=seed)
    obs_list, action_list = [], []
    for step in range(max_steps):
        # The teacher only sees sheep that the env has not latched as penned.
        positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
                     for i in range(env.n_sheep) if not env.sheep_penned[i]}
        if not positions:
            break
        vx, vy, _mode = compute_action(
            (env.dog_x, env.dog_y), positions, PEN_ENTRY,
        )
        action = np.array([vx, vy], dtype=np.float32)
        # Log the observation the action was computed from, before stepping.
        if step % subsample == 0:
            obs_list.append(obs.copy())
            action_list.append(action.copy())
        obs, _r, term, trunc, _info = env.step(action)
        if term or trunc:
            break
    success = bool(env.sheep_penned.all())
    return (
        np.asarray(obs_list, dtype=np.float32),
        np.asarray(action_list, dtype=np.float32),
        success,
        env.steps,
    )
 def main():
    """CLI entry point: run the teacher over an (n_sheep, seed) grid.

    Every grid cell is rolled out with ``collect_one``. Successful
    trajectories (or all of them with ``--keep-failures``) are
    concatenated and written to ``--out`` as an ``.npz`` with arrays
    ``obs``, ``actions`` and ``meta``. Each ``meta`` row is
    ``(n_sheep, seed, n_logged, success, total_steps)``.

    Raises:
        RuntimeError: if no trajectory was kept.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", default="training/demos.npz")
    parser.add_argument("--n-sheep-list", default="1,2,3,5,8,10")
    parser.add_argument("--seeds-per-n", type=int, default=15)
    parser.add_argument("--max-steps", type=int, default=30000)
    parser.add_argument("--subsample", type=int, default=5,
                        help="Keep every Nth (obs, action) pair.")
    parser.add_argument("--keep-failures", action="store_true",
                        help="Include partial-success trajectories. Default off.")
    args = parser.parse_args()

    # Reject values that would otherwise crash mid-run (modulo by zero) or
    # produce an empty grid with a misleading "try --keep-failures" error.
    if args.subsample < 1:
        parser.error("--subsample must be >= 1")
    if args.seeds_per_n < 1:
        parser.error("--seeds-per-n must be >= 1")
    # Skip empty tokens so a trailing comma ("1,2,") is tolerated.
    n_sheep_list = [int(tok) for tok in args.n_sheep_list.split(",")
                    if tok.strip()]
    if not n_sheep_list:
        parser.error("--n-sheep-list must contain at least one integer")

    print(f"[demos] grid: n_sheep={n_sheep_list}, seeds={args.seeds_per_n}, "
          f"max_steps={args.max_steps}, subsample={args.subsample}")
    all_obs, all_actions, all_meta = [], [], []
    t_start = time.time()
    n_success = 0
    n_total = 0
    for n in n_sheep_list:
        for seed in range(args.seeds_per_n):
            obs, actions, success, total_steps = collect_one(
                n, seed, args.max_steps, args.subsample,
            )
            n_total += 1
            if success:
                n_success += 1
            keep = success or args.keep_failures
            if keep and len(obs) > 0:
                all_obs.append(obs)
                all_actions.append(actions)
                all_meta.append((n, seed, len(obs), int(success), total_steps))
            tag = "✓" if success else "✗"
            print(f"  [{tag}] n={n:>2d} seed={seed:>2d}  steps={total_steps:>6d}  "
                  f"logged={len(obs):>5d}")
    if not all_obs:
        raise RuntimeError("No trajectories kept — try --keep-failures.")
    obs = np.concatenate(all_obs, axis=0)
    actions = np.concatenate(all_actions, axis=0)
    meta = np.array(all_meta, dtype=np.int32)
    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    np.savez(args.out, obs=obs, actions=actions, meta=meta)
    elapsed = time.time() - t_start
    print(f"\n=== {n_success}/{n_total} trajectories successful ({100*n_success/n_total:.0f}%) ===")
    print(f"=== {len(obs)} transitions saved to {args.out} ===")
    print(f"=== obs={obs.shape}, actions={actions.shape}, elapsed={elapsed:.0f}s ===")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,63 @@
 #!/bin/bash
 # Launch Webots with N sheep enabled and the chosen controller mode.
 # Generates a temporary world file in worlds/field_test.wbt with sheep
 # beyond N commented out, sets the env vars the dog controller reads,
 # then execs Webots on it.
 #
 # Usage:
 #   tools/run_webots.sh [N] [MODE]
 #     N    : number of active sheep (1..10), default 10
 #     MODE : "rl" | "strombom" | "sequential", default "rl"
 #
 # Examples:
 #   tools/run_webots.sh 10 rl         # BC-trained RL policy, 10 sheep
 #   tools/run_webots.sh 5 sequential  # the analytic teacher, 5 sheep
 #   tools/run_webots.sh 3 strombom    # canonical baseline, 3 sheep
 #
 # Notes:
 # * The RL mode loads training/runs/bc_pretrained/policy.zip by default.
 #   Override via HERDING_POLICY_DIR=/path/to/run env var.
 # * Conda env "tir" must be active (provides stable-baselines3 + torch).
 set -e
 N=${1:-10}
 MODE=${2:-rl}
 # Require a plain decimal integer before the range test: inside (( )) a
 # value such as "1.5" is an arithmetic syntax error, which makes the test
 # evaluate false instead of rejecting the argument.
 if ! [[ "$N" =~ ^[0-9]+$ ]] || (( 10#$N < 1 || 10#$N > 10 )); then
    echo "N must be 1..10, got $N" >&2; exit 1
 fi
 # Normalise to base 10 so inputs like "08" are not parsed as octal below.
 N=$((10#$N))
 case "$MODE" in
    rl|strombom|sequential) ;;
    *) echo "MODE must be rl|strombom|sequential, got '$MODE'" >&2; exit 1 ;;
 esac
 ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
 SRC="$ROOT/worlds/field.wbt"
 DST="$ROOT/worlds/field_test.wbt"
 # Resolve the policy directory once; it is echoed, written to the runtime
 # config and exported below.
 RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_pretrained}"
 cp "$SRC" "$DST"
 # Comment out sheep N+1..10 by prefixing the matching Sheep { ... } line.
 # A C-style loop is used instead of `seq` because BSD seq counts *down*
 # when first > last, which for N=10 would disable sheep10.
 for (( i = N + 1; i <= 10; i++ )); do
    sed -i "s|^Sheep .* \"sheep${i}\".*|# &|" "$DST"
 done
 # grep -c exits non-zero when the count is 0; keep going so the summary
 # below can show "0 active" instead of set -e killing the script silently.
 active=$(grep -c '^Sheep' "$DST" || true)
 echo "------------------------------------------------------------"
 echo "World      : $DST"
 echo "Mode       : $MODE"
 echo "Sheep      : $active active"
 echo "Policy dir : $RESOLVED_POLICY_DIR"
 echo "------------------------------------------------------------"
 # Webots strips HERDING_* env vars from controller subprocesses in some
 # setups, so we also write a runtime config file the controller reads.
 cat > "$ROOT/herding_runtime.cfg" <<EOF
 HERDING_MODE=$MODE
 HERDING_POLICY_DIR=$RESOLVED_POLICY_DIR
 EOF
 export HERDING_MODE="$MODE"
 export HERDING_POLICY_DIR="$RESOLVED_POLICY_DIR"
 exec webots "$DST"
@@ -0,0 +1,115 @@
 # Shepherd Herding — Training & Inference
 This directory holds the Gymnasium environment, PPO training script, and
 evaluation harness for the RL shepherd-dog policy. The Webots controller
 in `controllers/shepherd_dog/` loads the resulting policy at inference
 time when launched with `HERDING_MODE=rl`.
 ## Layout
 ```
 training/
 ├── herding_env.py        # gymnasium.Env — the dog is the agent
 ├── train_ppo.py          # SB3 PPO entry point (vec envs, eval, curriculum)
 ├── bc_pretrain.py        # behavior cloning of the sequential teacher (PPO warm start)
 ├── eval.py               # rollout success-rate / time-to-pen across flock sizes
 ├── parity_test.py        # smoke test: shapes, determinism, baseline rollout
 ├── configs/ppo_default.yaml
 ├── runs/                 # tensorboard + checkpoints (gitignored)
 └── requirements.txt
 ```
 ## Setup
 ```bash
 python -m venv .venv && source .venv/bin/activate
 pip install -r training/requirements.txt
 ```
 CPU is the default and also the recommended device — SB3's PPO with an
 MLP policy of this size runs faster on CPU than on GPU because the
 bottleneck is rollout collection, not gradient compute. The 16 SubprocVecEnv
 workers saturate ~16 CPU cores. To force CUDA anyway, pass `--device cuda`.
 ## Train
 ```bash
 # Full curriculum (1 → 10 sheep). ppo_default.yaml sets total_timesteps to 10M;
 # the ~2–3h single-GPU figure was estimated for ~5M steps.
 python -m training.train_ppo \
    --config training/configs/ppo_default.yaml \
    --out-dir training/runs/baseline
 ```
 Outputs:
 - `training/runs/baseline/best/best_model.zip` — best eval checkpoint
 - `training/runs/baseline/best/vecnormalize.pkl` — observation stats
 - `training/runs/baseline/checkpoints/ppo_*.zip` — periodic checkpoints
 - `training/runs/baseline/tb/` — TensorBoard logs (`tensorboard --logdir`)
 To resume:
 ```bash
 python -m training.train_ppo --resume training/runs/baseline/checkpoints/ppo_500000_steps.zip
 ```
 ## Evaluate
 ```bash
 # RL policy
 python -m training.eval --policy training/runs/baseline/best
 # Strömbom baseline
 python -m training.eval --policy strombom
 ```
 Prints success rate, mean steps, and mean penned-count per flock size.
 Use the same `--n-seeds` for both to get a fair RL-vs-Strömbom A/B.
 ## Parity / smoke test
 ```bash
 python -m training.parity_test
 ```
 Checks observation/action shapes, deterministic seeding, the curriculum
 sampler, and a 400-step Strömbom rollout. Run this before every long
 training job — catches the boring class of bugs in seconds.
 ## Run the policy in Webots
 1. Train (above) — produces `training/runs/<name>/best/`.
 2. In Webots, set the dog controller's environment variables:
   ```bash
   export HERDING_MODE=rl
   export HERDING_POLICY_DIR=$(pwd)/training/runs/baseline/best
   webots worlds/field.wbt
   ```
   Or set them via Webots' controller args / a `.wbproj` if you prefer.
 3. To force the Strömbom baseline (same world, same controller):
   ```bash
   export HERDING_MODE=strombom
   webots worlds/field.wbt
   ```
 If `HERDING_MODE=rl` but the policy can't be loaded (SB3 not installed,
 zip missing, etc.), the controller logs the error and falls back to
 Strömbom automatically.
 ## Curriculum knobs
 The default schedule in `configs/ppo_default.yaml` widens
 `max_n_sheep` over training. Each reset samples `n_sheep ~ U[1,
 max_n_sheep]`, so the final policy has seen every flock size from 1 to
 10 in proportion. To pin a specific size, instantiate the env with
 `HerdingEnv(n_sheep=N)` (see `eval.py`).
 ## Reward shaping
 Weights live in class attributes on `HerdingEnv`. Tune from the 1-sheep
 curriculum first — if the dog can't herd a single sheep cleanly, raising
 `W_PROGRESS` or lowering `W_TIME` is usually the fix. For multi-sheep
 collapse modes (dog spins between sheep), increase `W_COMPACT` so
 tightening the flock pays.
@@ -0,0 +1,218 @@
 """Behavior cloning of the sequential teacher into an SB3-compatible policy.
 Trains the policy network (mean-action head) of an SB3 ``MlpPolicy`` to
 mimic the demonstrations collected by ``tools.collect_demos``. The
 saved zip is loadable via ``PPO.load(...)`` and can be passed to
 ``train_ppo.py --resume`` for fine-tuning.
 Why this works: the teacher (sequential single-target driving) solves
 n=10 at 80%+ in our env. BC gives the RL a competent starting policy,
 so PPO doesn't have to discover behavior from scratch — it only has to
 *refine* the teacher's strategy via the sparse pen reward.
 Usage::
    python -m training.bc_pretrain \\
        --demos training/demos.npz \\
        --out training/runs/bc_pretrained
 """
 from __future__ import annotations
 import argparse
 import os
 import sys
 import time
 from pathlib import Path
 _HERE = os.path.dirname(os.path.abspath(__file__))
 _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.optim as optim
 from torch.utils.data import DataLoader, TensorDataset
 from stable_baselines3 import PPO
 from stable_baselines3.common.vec_env import DummyVecEnv
 from training.herding_env import HerdingEnv
 def build_model(net_arch_pi, net_arch_vf, log_std_init: float):
    """Construct an untrained SB3 PPO whose ``MlpPolicy`` has the given layout.

    The caller only needs the policy network as a container for BC-trained
    weights; PPO's own rollout/update loop is not exercised here.

    Args:
        net_arch_pi: Hidden layer widths for the actor head.
        net_arch_vf: Hidden layer widths for the critic head.
        log_std_init: Initial log standard deviation passed to the policy.

    Returns:
        ``(model, env)`` where ``env`` is the single-env ``DummyVecEnv``
        (wrapping a default ``HerdingEnv``) that the model was built on.
    """
    def _make_env():
        return HerdingEnv()

    vec_env = DummyVecEnv([_make_env])
    policy_kwargs = {
        "net_arch": {"pi": net_arch_pi, "vf": net_arch_vf},
        "log_std_init": log_std_init,
    }
    ppo = PPO("MlpPolicy", vec_env, policy_kwargs=policy_kwargs, verbose=0)
    return ppo, vec_env
 def policy_forward_mean(policy, obs_batch):
    """Compute the deterministic (mean) action of ``policy`` for a batch.

    The batch is run through ``extract_features``, ``mlp_extractor`` and
    ``action_net`` in that order, i.e. the actor path without the
    Distribution wrapper that normally sits on top of it.
    """
    extracted = policy.extract_features(obs_batch)
    # A tuple means the extractor produced separate (pi_features,
    # vf_features); only the actor half is needed here.
    actor_features = extracted[0] if isinstance(extracted, tuple) else extracted
    actor_latent, _critic_latent = policy.mlp_extractor(actor_features)
    return policy.action_net(actor_latent)
 def main():
    """Behavior-clone the demonstrations into an SB3 policy and save it.

    Steps: parse CLI arguments, load ``obs``/``actions``/``meta`` from the
    demo ``.npz``, split into train/validation sets, fit the policy's mean
    action with an MSE + weighted (1 - cosine similarity) loss, and write
    ``policy.zip`` into ``--out``.

    The weights saved are those of the epoch with the lowest validation
    MSE (the value reported as ``best_val_mse``). When there is no
    validation data the final-epoch weights are kept.

    Raises:
        RuntimeError: If the demo file contains no observations.
        SystemExit: Via ``parser.error`` when ``--val-split`` is outside [0, 1).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--demos", default="training/demos.npz")
    parser.add_argument("--out", default="training/runs/bc_pretrained")
    parser.add_argument("--epochs", type=int, default=60)
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--val-split", type=float, default=0.1)
    parser.add_argument("--net-arch", default="256,256",
                        help="Comma-separated hidden layer widths.")
    parser.add_argument("--log-std-init", type=float, default=0.5)
    parser.add_argument("--cos-weight", type=float, default=1.0,
                        help="Weight on (1 - cosine similarity) loss term. "
                             "MSE alone shrinks policy output toward zero "
                             "(zero-magnitude action minimises mean squared "
                             "error against ±1 targets); cos loss keeps "
                             "the action pointed correctly even at small "
                             "magnitudes.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--device", default="cpu")
    args = parser.parse_args()
    # A split of 1.0 (or more) would leave no training samples at all.
    if not 0.0 <= args.val_split < 1.0:
        parser.error("--val-split must be in [0, 1)")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # --- Load demos ---
    print(f"[bc] loading demos from {args.demos}")
    # Context manager closes the underlying archive once the arrays are read.
    with np.load(args.demos) as data:
        obs = data["obs"].astype(np.float32)
        actions = data["actions"].astype(np.float32)
        meta = data["meta"]
    print(f"[bc] obs={obs.shape}  actions={actions.shape}  trajectories={len(meta)}")
    if obs.size == 0:
        raise RuntimeError("Empty demo file.")
    # Action sanity check — sequential outputs unit vectors.
    a_norms = np.linalg.norm(actions, axis=1)
    print(f"[bc] action L2 norm: mean={a_norms.mean():.3f}  "
          f"min={a_norms.min():.3f}  max={a_norms.max():.3f}")
    # --- Train/val split ---
    n = len(obs)
    perm = np.random.permutation(n)
    n_val = int(n * args.val_split)
    val_idx, train_idx = perm[:n_val], perm[n_val:]
    print(f"[bc] train={len(train_idx)}  val={len(val_idx)}")
    obs_t = torch.from_numpy(obs)
    act_t = torch.from_numpy(actions)
    train_loader = DataLoader(
        TensorDataset(obs_t[train_idx], act_t[train_idx]),
        batch_size=args.batch_size, shuffle=True,
    )
    val_loader = DataLoader(
        TensorDataset(obs_t[val_idx], act_t[val_idx]),
        batch_size=args.batch_size, shuffle=False,
    )
    # --- Build model ---
    net_arch_pi = [int(x) for x in args.net_arch.split(",")]
    net_arch_vf = net_arch_pi[:]
    model, _env = build_model(net_arch_pi, net_arch_vf, args.log_std_init)
    policy = model.policy.to(args.device)
    optimizer = optim.Adam(policy.parameters(), lr=args.lr)
    # --- Train ---
    print(f"[bc] training: epochs={args.epochs}  batch={args.batch_size}  "
          f"lr={args.lr}  device={args.device}")
    t_start = time.time()
    best_val = float("inf")
    # Snapshot of the weights at the best validation epoch (None until one exists).
    best_state = None
    def combined_loss(pred, target):
        """Return (MSE + cos_weight * (1 - cos_sim), mse value, mean cos_sim)."""
        mse = nn.functional.mse_loss(pred, target)
        # Clamp norms so zero-length vectors don't divide by zero.
        p_norm = pred.norm(dim=1).clamp_min(1e-6)
        t_norm = target.norm(dim=1).clamp_min(1e-6)
        cos_sim = (pred * target).sum(dim=1) / (p_norm * t_norm)
        cos_loss = (1.0 - cos_sim).mean()
        return mse + args.cos_weight * cos_loss, mse.item(), cos_sim.mean().item()
    for epoch in range(args.epochs):
        policy.train()
        train_mse_total, train_cos_total, train_count = 0.0, 0.0, 0
        for ob_batch, act_batch in train_loader:
            ob_batch = ob_batch.to(args.device)
            act_batch = act_batch.to(args.device)
            optimizer.zero_grad()
            mean_action = policy_forward_mean(policy, ob_batch)
            loss, mse_val, cos_val = combined_loss(mean_action, act_batch)
            loss.backward()
            optimizer.step()
            # Weight per-batch means by batch size so the epoch average is exact.
            bs = ob_batch.size(0)
            train_mse_total += mse_val * bs
            train_cos_total += cos_val * bs
            train_count += bs
        train_mse = train_mse_total / max(1, train_count)
        train_cos = train_cos_total / max(1, train_count)
        policy.eval()
        val_total, val_count = 0.0, 0
        cos_sim_total = 0.0
        with torch.no_grad():
            for ob_batch, act_batch in val_loader:
                ob_batch = ob_batch.to(args.device)
                act_batch = act_batch.to(args.device)
                mean_action = policy_forward_mean(policy, ob_batch)
                bs = ob_batch.size(0)
                val_total += nn.functional.mse_loss(
                    mean_action, act_batch, reduction="sum",
                ).item()
                # Cosine similarity in action space — useful sanity for
                # "is the policy pointing the same way as the teacher?".
                m_norm = mean_action.norm(dim=1).clamp_min(1e-6)
                a_norm = act_batch.norm(dim=1).clamp_min(1e-6)
                cos = (mean_action * act_batch).sum(dim=1) / (m_norm * a_norm)
                cos_sim_total += cos.sum().item()
                val_count += bs
        # Summed squared error -> mean per action component.
        val_mse = val_total / max(1, val_count) / actions.shape[1]
        cos_sim = cos_sim_total / max(1, val_count)
        print(f"  epoch {epoch+1:>2d}/{args.epochs}  "
              f"train_mse={train_mse:.4f}  train_cos={train_cos:+.3f}  "
              f"val_mse={val_mse:.4f}  val_cos={cos_sim:+.3f}")
        # Only a real validation measurement may select a checkpoint; with
        # no validation samples val_mse is a meaningless 0.
        if val_count > 0 and val_mse < best_val:
            best_val = val_mse
            best_state = {
                key: tensor.detach().clone()
                for key, tensor in policy.state_dict().items()
            }
    elapsed = time.time() - t_start
    if best_state is not None:
        # Save the weights that produced the reported best_val_mse rather
        # than whatever the last epoch happened to leave behind.
        policy.load_state_dict(best_state)
        print(f"[bc] done in {elapsed:.0f}s  best_val_mse={best_val:.4f}")
    else:
        print(f"[bc] done in {elapsed:.0f}s  "
              f"(no validation result; keeping final-epoch weights)")
    # --- Save ---
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    model.save(out_dir / "policy.zip")
    print(f"[bc] saved policy to {out_dir / 'policy.zip'}")
    print(f"\n[bc] verify with:  "
          f"python -m training.eval --policy {out_dir}")
 if __name__ == "__main__":
    main()
@@ -1,14 +0,0 @@
 {
    "W_PER_SHEEP": 2.0,
    "W_ALIGN": 0.05,
    "W_PEN_BONUS": 10.0,
    "W_COMPLETE": 100.0,
    "W_STEP_COST": 0.02,
    "W_COMPACT": 0.0,
    "W_WALL_TOUCH": 0.0,
    "WALL_TOUCH_BUFFER": 0.4,
    "ALIGN_SHAPE": "standoff",
    "ALIGN_GATED": true,
    "ENTRY_AWARE": true,
    "ent_coef": 0.02
 }
@@ -0,0 +1,52 @@
 # PPO hyperparameters for the herding env. Tuned for a 28-D obs / 2-D
 # continuous action space with 16 parallel envs on GPU. These are SB3
 # defaults nudged toward longer credit assignment (gamma=0.995) and a
 # slightly higher entropy bonus to keep exploration alive while curriculum
 # expands the flock size.
 # --- PPO ---
 learning_rate: 3.0e-4
 n_steps: 2048              # rollout length per env before each update
 batch_size: 256
 n_epochs: 10
 gamma: 0.995
 gae_lambda: 0.95
 clip_range: 0.2
 ent_coef: 0.05             # was 0.01 — earlier runs collapsed to ~0 actions
 vf_coef: 0.5
 max_grad_norm: 0.5
 target_kl: null            # disable early-stop on KL
 # --- Network ---
 policy: MlpPolicy
 net_arch_pi: [128, 128]
 net_arch_vf: [128, 128]
 log_std_init: 0.5          # std≈1.6 instead of default 1.0 — more exploration
 # --- Training schedule ---
 total_timesteps: 10_000_000
 n_envs: 16
 checkpoint_freq: 500_000   # in env steps
 eval_freq: 100_000         # in env steps
 n_eval_episodes: 20
 # --- Curriculum (max-n_sheep schedule, in env steps) ---
 # Each entry: at step s, raise the env's max_n_sheep to k. The env samples
 # uniformly from [1, max_n_sheep] each reset, so this widens the
 # distribution gradually rather than swapping fixed sizes.
 #
 # State-space curriculum: difficulty controls sheep spawn area
 # (0 = sheep spawn just north of gate, 1 = sheep spawn anywhere in field).
 # Plus the existing flock-size curriculum.
 #
 # The two together let the policy first learn "what penning looks like"
 # in a regime where random exploration reliably triggers it, then
 # gradually generalise to the deployment distribution.
 curriculum:
  - { step: 0,          max_n_sheep: 1, difficulty: 0.0 }
  - { step: 1_000_000,  max_n_sheep: 1, difficulty: 0.3 }
  - { step: 2_000_000,  max_n_sheep: 2, difficulty: 0.5 }
  - { step: 4_000_000,  max_n_sheep: 3, difficulty: 0.8 }
  - { step: 6_000_000,  max_n_sheep: 5, difficulty: 1.0 }
  - { step: 8_000_000,  max_n_sheep: 8, difficulty: 1.0 }
  - { step: 9_000_000,  max_n_sheep: 10, difficulty: 1.0 }
@@ -0,0 +1,136 @@
 """Evaluate a trained PPO policy (or the Strömbom baseline) on the env.
 Reports success rate and time-to-pen across a fixed seed grid for each
 flock size 1..MAX_SHEEP. Used to produce the M5 quantitative comparison
 table mentioned in plan.md.
 Usage::
    python -m training.eval --policy training/runs/latest/best
    python -m training.eval --policy strombom
 """
 from __future__ import annotations
 import argparse
 import os
 import sys
 from pathlib import Path
 from statistics import mean, stdev
 _HERE = os.path.dirname(os.path.abspath(__file__))
 _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
 import numpy as np
 from herding.geometry import MAX_SHEEP, PEN_ENTRY
 from herding.strombom import compute_action as strombom_action
 from herding.sequential import compute_action as sequential_action
 from training.herding_env import HerdingEnv
 def rollout(env: HerdingEnv, predict_fn, max_steps: int) -> dict:
    """Play one episode of ``env`` with ``predict_fn`` and summarise it.

    ``predict_fn(env, obs)`` is called for each step to obtain the action.
    If the env terminates or truncates within ``max_steps`` steps, the
    summary is read from the final ``info`` dict; otherwise the episode is
    reported as a failure with ``steps == max_steps`` and the penned count
    taken from ``env.sheep_penned``.

    Returns:
        A dict with keys ``"success"``, ``"steps"`` and ``"n_penned"``.
    """
    obs, _ = env.reset()
    step_idx = 0
    while step_idx < max_steps:
        action = predict_fn(env, obs)
        obs, _reward, terminated, truncated, info = env.step(action)
        step_idx += 1
        if not (terminated or truncated):
            continue
        return {
            "success": bool(info.get("is_success", False)),
            "steps": info.get("steps", step_idx),
            "n_penned": info.get("n_penned", 0),
        }
    # Step budget exhausted without the env ending the episode.
    return {
        "success": False,
        "steps": max_steps,
        "n_penned": int(env.sheep_penned.sum()),
    }
 def make_analytic_predictor(action_fn):
    """Wrap an analytic controller as a ``predict_fn(env, obs)`` callable.

    The returned function ignores the observation and instead reads the
    env state directly: it passes the dog position, the positions of all
    not-yet-penned sheep (keyed ``"s<i>"``) and ``PEN_ENTRY`` to
    ``action_fn``, and returns the resulting ``(vx, vy)`` as a float32 array.
    """
    def _predict(env, _obs):
        active = {}
        for idx in range(env.n_sheep):
            if env.sheep_penned[idx]:
                continue
            active[f"s{idx}"] = (float(env.sheep_x[idx]), float(env.sheep_y[idx]))
        dog_xy = (env.dog_x, env.dog_y)
        vx, vy, _mode = action_fn(dog_xy, active, PEN_ENTRY)
        return np.array([vx, vy], dtype=np.float32)
    return _predict
 # Kept so callers of the old name keep working.
 def make_strombom_predictor():
    """Return the analytic predictor bound to the Strömbom action function."""
    predictor = make_analytic_predictor(strombom_action)
    return predictor
 def make_policy_predictor(model, vecnorm):
    """Wrap a trained model as a ``predict_fn(env, obs)`` callable.

    The observation is cast to float32 and reshaped to a single-row batch,
    normalised with ``vecnorm.normalize_obs`` when ``vecnorm`` is given,
    and fed to ``model.predict(..., deterministic=True)``. The first row
    of the predicted actions is returned.
    """
    def _predict(_env, obs):
        batch = np.asarray(obs, dtype=np.float32).reshape(1, -1)
        if vecnorm is not None:
            batch = vecnorm.normalize_obs(batch)
        actions, _state = model.predict(batch, deterministic=True)
        return actions[0]
    return _predict
 def _find_vecnormalize(run: Path) -> Path | None:
    """Return the existing ``vecnormalize.pkl`` for ``run``, or None.

    For a direct zip path the stats file is looked up next to the zip. For
    a run directory it is looked up inside the directory and then, unless
    the directory's parent is named ``best``, in the parent directory.
    """
    if run.is_file():
        candidates = [run.parent / "vecnormalize.pkl"]
    else:
        candidates = [run / "vecnormalize.pkl"]
        if run.parent.name != "best":
            candidates.append(run.parent / "vecnormalize.pkl")
    for candidate in candidates:
        if candidate.exists():
            return candidate
    return None
 def main():
    """Evaluate a policy over flock sizes 1..``--max-flock`` and print a table.

    ``--policy`` selects the Strömbom baseline (``strombom``), the
    sequential controller (``sequential``), or an SB3 checkpoint given as a
    zip file or a run directory. For each flock size, ``--n-seeds`` episodes
    are rolled out and the success rate, mean steps and mean penned count
    are printed.

    Raises:
        FileNotFoundError: If a run directory holds none of the known
            checkpoint names.
        SystemExit: Via ``parser.error`` when ``--n-seeds`` is below 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--policy", required=True,
                        help="Either 'strombom', 'sequential', or path to an "
                             "SB3 run directory / checkpoint zip.")
    parser.add_argument("--n-seeds", type=int, default=10)
    parser.add_argument("--max-steps", type=int, default=5000)
    parser.add_argument("--max-flock", type=int, default=MAX_SHEEP)
    # 1.0 = deployment distribution (sheep anywhere in field).
    # Lower values use the training-curriculum spawn band (sheep near gate).
    parser.add_argument("--difficulty", type=float, default=1.0)
    args = parser.parse_args()
    # statistics.mean raises on an empty sequence, so fail with a clear message.
    if args.n_seeds < 1:
        parser.error("--n-seeds must be at least 1")
    if args.policy == "strombom":
        predict = make_analytic_predictor(strombom_action)
    elif args.policy == "sequential":
        predict = make_analytic_predictor(sequential_action)
    else:
        from stable_baselines3 import PPO
        run = Path(args.policy)
        # Resolve to a zip: directory of checkpoints, or a direct zip path.
        if run.is_file():
            zip_path = run
        else:
            for name in ("best_model.zip", "policy.zip", "final.zip"):
                if (run / name).exists():
                    zip_path = run / name
                    break
            else:
                raise FileNotFoundError(
                    f"No checkpoint found in {run} (tried best_model.zip, "
                    f"policy.zip, final.zip)"
                )
        model = PPO.load(str(zip_path), device="auto")
        vecnorm = None
        # A zip passed directly (e.g. .../best/best_model.zip) must still
        # pick up the stats file sitting next to it; otherwise the policy
        # would silently be fed unnormalised observations.
        vn_path = _find_vecnormalize(run)
        if vn_path is not None:
            import pickle
            # NOTE(security): pickle.load executes arbitrary code from the
            # file. Only point --policy at run directories you trust.
            with open(vn_path, "rb") as f:
                vecnorm = pickle.load(f)
            # Freeze running statistics and leave rewards untouched for eval.
            vecnorm.training = False
            vecnorm.norm_reward = False
        predict = make_policy_predictor(model, vecnorm)
    print(f"{'n_sheep':>8} {'success%':>10} {'mean_steps':>12} {'mean_penned':>12}")
    print("-" * 46)
    for n in range(1, args.max_flock + 1):
        successes, steps, penned = [], [], []
        for seed in range(args.n_seeds):
            env = HerdingEnv(n_sheep=n, max_steps=args.max_steps,
                             difficulty=args.difficulty, seed=seed)
            r = rollout(env, predict, args.max_steps)
            successes.append(int(r["success"]))
            steps.append(r["steps"])
            penned.append(r["n_penned"])
        sr = 100.0 * mean(successes)
        ms = mean(steps)
        mp = mean(penned)
        print(f"{n:>8d} {sr:>9.1f}% {ms:>12.0f} {mp:>12.2f}")
 if __name__ == "__main__":
    main()
@@ -1,318 +1,96 @@
-"""
+"""Parity smoke-test for the herding env.
 Parity test: verify 2D training env matches Webots controller implementations.
-Tests:
+Verifies (a) all imports resolve, (b) the env's reset/step contract is
-1. Observation building: HerdingEnv._obs() vs shepherd_dog_rl.build_obs()
+correct, (c) deterministic seeds give deterministic trajectories, and
-2. Dog drive: HerdingEnv._step_dog_substep() vs shepherd_dog_rl.drive() math
+(d) the Strömbom baseline can drive the env without crashing.
-3. Sheep drive: HerdingEnv._sheep_drive() vs sheep.py drive() math
+
 Run::
    python -m training.parity_test
 """
-import sys
+from __future__ import annotations
 import os
-import math
+import sys
 _HERE = os.path.dirname(os.path.abspath(__file__))
 _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
 import numpy as np
-# Make imports work from project root
+from herding.geometry import MAX_SHEEP, PEN_ENTRY
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from herding.obs import OBS_DIM
-sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
+from herding.strombom import compute_action
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "controllers", "shepherd_dog_rl"))
+from training.herding_env import HerdingEnv
 from herding_env import HerdingEnv
 # Re-implement the Webots functions standalone (no Webots dependency)
 FIELD = 15.0
 PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
 PEN_ENTRY  = np.array([11.5,  -8.0], dtype=np.float32)
 PEN_X = (10.0, 13.0)
 PEN_Y = (-15.0, -8.0)
 ENTRY_AWARE = True
-def webots_build_obs(dog_pos, sheep_positions, n_sheep, dog_heading):
+def test_obs_action_shapes():
-    """Standalone version of shepherd_dog_rl.py build_obs()."""
+    env = HerdingEnv(n_sheep=3, seed=0)
-    D = 2 * FIELD
+    obs, info = env.reset()
-    active_pos = np.array(
+    assert obs.shape == (OBS_DIM,), obs.shape
-        [p for p in sheep_positions
+    assert obs.dtype == np.float32
-         if not (PEN_X[0] < p[0] < PEN_X[1] and PEN_Y[0] < p[1] < PEN_Y[1])],
+    obs2, r, term, trunc, info = env.step(np.array([0.5, 0.0], dtype=np.float32))
-        dtype=np.float32
+    assert obs2.shape == (OBS_DIM,)
-    )
+    assert isinstance(r, float)
-    n_active = len(active_pos)
+    assert isinstance(term, bool) and isinstance(trunc, bool)
-    if n_active > 0:
+    print("[ok] shapes")
        com = active_pos.mean(axis=0)
        d_from_com = np.linalg.norm(active_pos - com, axis=1)
        sorted_idx = np.argsort(d_from_com)[::-1]
        radius = float(d_from_com[sorted_idx[0]])
        def nth(n):
            return active_pos[sorted_idx[n]] if len(sorted_idx) > n else com
        far1, far2, far3 = nth(0), nth(1), nth(2)
    else:
        com = PEN_CENTER.copy()
        radius = 0.0
        far1 = far2 = far3 = PEN_CENTER.copy()
    frac_active = n_active / max(n_sheep, 1)
    pen_ref = PEN_ENTRY if ENTRY_AWARE else PEN_CENTER
    return np.array([
        dog_pos[0] / FIELD, dog_pos[1] / FIELD,
        (com[0] - dog_pos[0]) / D, (com[1] - dog_pos[1]) / D,
        (far1[0] - com[0]) / D, (far1[1] - com[1]) / D,
        (far2[0] - com[0]) / D, (far2[1] - com[1]) / D,
        (far3[0] - com[0]) / D, (far3[1] - com[1]) / D,
        (pen_ref[0] - com[0]) / D, (pen_ref[1] - com[1]) / D,
        (pen_ref[0] - far1[0]) / D, (pen_ref[1] - far1[1]) / D,
        radius / D,
        frac_active,
        math.cos(dog_heading), math.sin(dog_heading),
    ], dtype=np.float32)
-def webots_dog_drive(heading, speed_ms, wheel_r=0.038, k_turn=4.0,
+def test_reset_determinism():
-                     motor_max=70.0, axle_track=0.28):
+    """Reset with the same seed should give the same initial observation.
    """Standalone version of shepherd_dog_rl.py drive() kinematics.
-    Returns (v_linear, omega, left_w, right_w).
+    We don't require step-determinism — PPO doesn't need it, and chasing
    bit-exactness through the flocking jitter isn't worth the complexity.
    """
-    err = math.atan2(math.sin(heading), math.cos(heading))
+    env_a = HerdingEnv(n_sheep=3, seed=42)
-    fwd_ms = speed_ms * max(0.0, math.cos(err))
+    env_b = HerdingEnv(n_sheep=3, seed=42)
-    fwd_rad = fwd_ms / wheel_r
+    obs_a, _ = env_a.reset(seed=42)
-    turn = k_turn * err
+    obs_b, _ = env_b.reset(seed=42)
-    l = max(-motor_max, min(motor_max, fwd_rad - turn))
+    assert np.allclose(obs_a, obs_b), "Reset is non-deterministic for same seed"
-    r = max(-motor_max, min(motor_max, fwd_rad + turn))
+    print("[ok] reset determinism")
    v = wheel_r * 0.5 * (r + l)
    w = (wheel_r / axle_track) * (r - l)
    return v, w, l, r
-def webots_sheep_drive(heading, speed_rad, wheel_r=0.031, k_turn=4.0,
+def test_curriculum_n_sheep_varies():
-                       motor_max=22.0, axle_track=0.20):
+    env = HerdingEnv(seed=0)
-    """Standalone version of sheep.py drive() kinematics."""
+    sizes = set()
-    err = math.atan2(math.sin(heading), math.cos(heading))
+    for _ in range(40):
-    fwd = speed_rad * max(0.0, math.cos(err))
+        _, info = env.reset()
-    k = 4.0
+        sizes.add(info["n_sheep"])
-    l = max(-motor_max, min(motor_max, fwd - k * err))
+    assert 1 in sizes
-    r = max(-motor_max, min(motor_max, fwd + k * err))
+    assert max(sizes) <= MAX_SHEEP
-    v = wheel_r * 0.5 * (r + l)
+    print(f"[ok] curriculum sampling — saw n_sheep in {sorted(sizes)}")
    w = (wheel_r / axle_track) * (r - l)
    return v, w, l, r
-def test_obs_parity():
+def test_strombom_drives_env():
-    """Test that build_obs matches between 2D env and Webots controller."""
+    """Quick functional check that the analytic baseline can play the env
-    print("=== Test 1: Observation Parity ===")
+    without exploding. Not a success-rate test — just no errors / NaNs."""
-    env = HerdingEnv(n_sheep=3)
+    env = HerdingEnv(n_sheep=2, max_steps=400, seed=1)
-    # Set ENTRY_AWARE to match our webots constant
+    obs, _ = env.reset()
-    env.ENTRY_AWARE = ENTRY_AWARE
+    for t in range(400):
-    env.reset(seed=42)
+        positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
-
+                     for i in range(env.n_sheep)
-    # Manually set positions for a controlled test
+                     if not env.sheep_penned[i]}
-    env.dog_pos = np.array([5.0, 3.0], dtype=np.float32)
+        if not positions:
-    env.dog_heading = 1.2
+            break
-    env.sheep_pos[0] = np.array([0.0, 0.0], dtype=np.float32)
+        vx, vy, _mode = compute_action((env.dog_x, env.dog_y), positions, PEN_ENTRY)
-    env.sheep_pos[1] = np.array([2.0, -1.0], dtype=np.float32)
+        obs, r, term, trunc, info = env.step(np.array([vx, vy], dtype=np.float32))
-    env.sheep_pos[2] = np.array([11.5, -11.5], dtype=np.float32)  # penned
+        assert np.isfinite(obs).all(), f"NaN/Inf in obs at step {t}"
-    env.penned[0] = False
+        assert np.isfinite(r), f"NaN reward at step {t}"
-    env.penned[1] = False
+        if term or trunc:
-    env.penned[2] = True
+            break
-
+    print(f"[ok] strombom rollout — final n_penned={int(env.sheep_penned.sum())}/{env.n_sheep} after {env.steps} steps")
    obs_2d = env._obs()
    # Build equivalent Webots observation
    sheep_positions = [
        env.sheep_pos[0].tolist(),
        env.sheep_pos[1].tolist(),
        env.sheep_pos[2].tolist(),
    ]
    obs_webots = webots_build_obs(
        env.dog_pos, sheep_positions, 3, env.dog_heading
    )
    max_diff = float(np.max(np.abs(obs_2d - obs_webots)))
    print(f"  Max element-wise diff: {max_diff:.2e}")
    if max_diff < 1e-6:
        print("  PASS: Observations match")
    else:
        print("  FAIL: Observations differ!")
        for i in range(18):
            if abs(obs_2d[i] - obs_webots[i]) > 1e-6:
                print(f"    dim {i}: 2d={obs_2d[i]:.6f}  webots={obs_webots[i]:.6f}")
    return max_diff < 1e-6
-def test_dog_drive_parity():
+def main():
-    """Test that dog diff-drive matches Webots controller."""
+    test_obs_action_shapes()
-    print("\n=== Test 2: Dog Drive Parity ===")
+    test_reset_determinism()
-    env = HerdingEnv(n_sheep=1)
+    test_curriculum_n_sheep_varies()
-    env.reset(seed=42)
+    test_strombom_drives_env()
-
+    print("\nAll parity checks passed.")
    all_pass = True
    test_cases = [
        # (heading_error, speed_ms) — target_heading relative to current heading
        (0.0, 2.5),      # aligned, full speed
        (0.5, 2.5),      # 30deg error
        (1.5, 2.5),      # ~86deg error
        (3.14, 2.5),     # ~180deg error — should spin in place
        (0.0, 0.5),      # aligned, slow
        (0.3, 1.0),      # small error, medium speed
    ]
    for heading_err, speed_ms in test_cases:
        env.dog_heading = 0.0
        target_heading = heading_err
        action = np.array([
            math.cos(target_heading), math.sin(target_heading)
        ], dtype=np.float32) * (speed_ms / env.DOG_SPEED)
        # 2D env step
        dbg = env._step_dog_substep(action, 0.016)
        v_2d = dbg["v"]
        w_2d = dbg["w"]
        l_2d = dbg["left_w"]
        r_2d = dbg["right_w"]
        # Webots equivalent
        v_w, w_w, l_w, r_w = webots_dog_drive(heading_err, speed_ms)
        diffs = {
            "v": abs(v_2d - v_w),
            "w": abs(w_2d - w_w),
            "left": abs(l_2d - l_w),
            "right": abs(r_2d - r_w),
        }
        max_diff = max(diffs.values())
        ok = max_diff < 1e-6
        status = "PASS" if ok else "FAIL"
        print(f"  err={heading_err:.2f} spd={speed_ms:.1f}: {status} (max_diff={max_diff:.2e})")
        if not ok:
            for k, d in diffs.items():
                if d > 1e-6:
                    print(f"    {k}: 2d={eval(k+'_2d'):.6f} webots={eval(k+'_w'):.6f}")
            all_pass = False
    return all_pass
 def test_sheep_drive_parity():
    """Check the sheep wheel-speed formula against the Webots sheep controller.

    For each (heading error, wheel speed) case the sheep is placed at the
    origin with heading 0, ``env._sheep_drive`` is stepped once, and the
    left/right wheel commands re-derived in this test are compared with the
    values returned by ``webots_sheep_drive`` to a tolerance of 1e-6.

    Returns:
        True when every case matches within tolerance, False otherwise.
    """
    print("\n=== Test 3: Sheep Drive Parity ===")
    env = HerdingEnv(n_sheep=1)
    env.reset(seed=42)
    all_pass = True
    test_cases = [
        # (heading_error, speed_rad)
        (0.0, 20.0),     # aligned, flee speed
        (0.0, 3.0),      # aligned, wander speed
        (0.5, 15.0),     # moderate error
        (1.57, 10.0),    # 90deg — should spin in place
        (3.14, 20.0),    # 180deg — should spin in place fast
        (0.2, 8.0),      # small error, medium speed
    ]
    for heading_err, speed_rad in test_cases:
        env.sheep_heading[0] = 0.0
        env.sheep_pos[0] = np.array([0.0, 0.0], dtype=np.float32)
        target_heading = heading_err
        # Step the 2D env once so its drive path is exercised (and raises on
        # error). Its return value was never part of the comparison, so it is
        # no longer bound to unused locals.
        # NOTE(review): the wheel speeds compared below are re-derived in this
        # test rather than read back from the env — confirm that is intended.
        env._sheep_drive(0, target_heading, speed_rad, 0.016)
        # Webots equivalent; only the wheel commands are compared.
        _, _, l_w, r_w = webots_sheep_drive(heading_err, speed_rad)
        # Same intermediate values for the 2D side: wrap the error to
        # [-pi, pi), scale forward speed by cos(error), add a proportional turn.
        err_2d = (target_heading - 0.0 + np.pi) % (2 * np.pi) - np.pi
        fwd_2d = speed_rad * max(0.0, math.cos(err_2d))
        turn_2d = 4.0 * err_2d
        l_2d = max(-22.0, min(22.0, fwd_2d - turn_2d))
        r_2d = max(-22.0, min(22.0, fwd_2d + turn_2d))
        pairs = {
            "left": (l_2d, l_w),
            "right": (r_2d, r_w),
        }
        diffs = {k: abs(a - b) for k, (a, b) in pairs.items()}
        max_diff = max(diffs.values())
        ok = max_diff < 1e-6
        status = "PASS" if ok else "FAIL"
        print(f"  err={heading_err:.2f} spd={speed_rad:.1f}: {status} (max_diff={max_diff:.2e})")
        if not ok:
            for k, d in diffs.items():
                if d > 1e-6:
                    val_2d, val_w = pairs[k]
                    print(f"    {k}: 2d={val_2d:.6f} webots={val_w:.6f}")
            all_pass = False
    return all_pass
 def test_full_trajectory_parity():
    """Check that a fixed action yields matching dog trajectories.

    Runs 50 sub-steps of ``env._step_dog_substep`` with a constant action and,
    in parallel, integrates the same differential-drive equations by hand.
    The largest heading and position differences are printed; the test passes
    when the largest position difference stays below 1e-4 m.

    Returns:
        True when the position trajectories match within tolerance.
    """
    print("\n=== Test 4: Full Trajectory Parity (dog only) ===")
    # Run 50 steps with a fixed action, compare dog heading/position
    # at each step between 2D env kinematics and pure Webots kinematics.
    env = HerdingEnv(n_sheep=1)
    env.reset(seed=42)
    env.dog_pos = np.array([0.0, 0.0], dtype=np.float32)
    env.dog_heading = 0.0
    env.ENTRY_AWARE = ENTRY_AWARE
    action = np.array([0.8, -0.6], dtype=np.float32)  # magnitude 1.0
    dt = 0.016667  # sub_dt
    # Loop invariants of the hand-written integration (the action is fixed).
    speed_ms = 1.0 * 2.5
    target_heading = math.atan2(-0.6, 0.8)
    # Webots-side tracking
    wb_heading = 0.0
    wb_x, wb_y = 0.0, 0.0
    max_heading_diff = 0.0
    max_pos_diff = 0.0
    for _ in range(50):
        # 2D env sub-step
        env._step_dog_substep(action, dt)
        # Webots-side computation: wrapped heading error, forward speed scaled
        # by cos(error), proportional turn, clamped wheel speeds.
        err = math.atan2(math.sin(target_heading - wb_heading),
                         math.cos(target_heading - wb_heading))
        fwd_ms = speed_ms * max(0.0, math.cos(err))
        fwd_rad = fwd_ms / 0.038
        turn = 4.0 * err
        left = max(-70.0, min(70.0, fwd_rad - turn))
        right = max(-70.0, min(70.0, fwd_rad + turn))
        v = 0.038 * 0.5 * (right + left)
        w = (0.038 / 0.28) * (right - left)
        wb_heading = math.atan2(math.sin(wb_heading + w * dt),
                                math.cos(wb_heading + w * dt))
        wb_x += math.cos(wb_heading) * v * dt
        wb_y += math.sin(wb_heading) * v * dt
        # Compare headings on the circle: a raw subtraction would report ~2*pi
        # for two equal headings that sit on opposite sides of the +/-pi seam.
        raw_diff = env.dog_heading - wb_heading
        heading_diff = abs(math.atan2(math.sin(raw_diff), math.cos(raw_diff)))
        pos_diff = math.hypot(env.dog_pos[0] - wb_x, env.dog_pos[1] - wb_y)
        max_heading_diff = max(max_heading_diff, heading_diff)
        max_pos_diff = max(max_pos_diff, pos_diff)
    print(f"  Max heading diff over 50 steps: {max_heading_diff:.2e} rad")
    print(f"  Max position diff over 50 steps: {max_pos_diff:.2e} m")
    ok = max_pos_diff < 1e-4
    print(f"  {'PASS' if ok else 'FAIL'}: Trajectories match")
    return ok
 if __name__ == "__main__":
-    results = []
+    main()
    results.append(("Obs parity", test_obs_parity()))
    results.append(("Dog drive parity", test_dog_drive_parity()))
    results.append(("Sheep drive parity", test_sheep_drive_parity()))
    results.append(("Trajectory parity", test_full_trajectory_parity()))
    print("\n" + "=" * 50)
    print("RESULTS")
    print("=" * 50)
    all_pass = True
    for name, passed in results:
        print(f"  {name}: {'PASS' if passed else 'FAIL'}")
        if not passed:
            all_pass = False
    print(f"\nOverall: {'ALL PASS' if all_pass else 'SOME FAILURES'}")
    env.close()
@@ -1,6 +1,8 @@
-gymnasium>=0.29
+# Pin major versions; SB3 2.x requires gymnasium and torch >= 1.13.
-stable-baselines3>=2.3
+gymnasium>=0.29,<2.0
-torch>=2.2
+stable-baselines3[extra]>=2.3,<3.0
-numpy>=1.26
+torch>=2.1
-matplotlib>=3.8
+numpy>=1.24
-tensorboard>=2.16
+pyyaml>=6.0
 tensorboard>=2.14
 tqdm>=4.66
@@ -1 +0,0 @@
@@ -1,392 +0,0 @@
 """
 PPO training for the herding task with curriculum learning.
 Trains from scratch through a 1→max_sheep curriculum, evaluates after each
 stage, and auto-generates trajectory/timeseries plots plus a summary chart.
 Usage
 -----
    python train.py                                       # defaults from config.json
    python train.py --config my_config.json --max-sheep 5
    python train.py --max-sheep 3 --steps-per-stage 1000000
 Outputs (in runs/<timestamp>/):
    config.json          resolved config
    final_model.zip      trained PPO model
    vecnorm.pkl          VecNormalize statistics
    stage_results.json   per-stage evaluation metrics
    success_rate.png     summary bar chart
    eval/                trajectory & timeseries plots per sheep count
 """
 import argparse
 import json
 import os
 import time
 from copy import deepcopy
 import numpy as np
 from stable_baselines3 import PPO
 from stable_baselines3.common.callbacks import BaseCallback
 from stable_baselines3.common.vec_env import (
    DummyVecEnv,
    SubprocVecEnv,
    VecNormalize,
 )
 from herding_env import HerdingEnv
 from viz import (
    run_and_record,
    plot_trajectory,
    plot_timeseries,
    plot_success_rate,
    save_episode_gif,
 )
 # ── Callbacks ────────────────────────────────────────────────────────────────
 class ProgressCallback(BaseCallback):
    """Print a one-line training summary roughly every ``freq`` env steps.

    Per-env returns are accumulated from the ``rewards``/``dones``/``infos``
    entries of ``self.locals``. An episode counts as a success when its final
    info reports ``n_penned`` equal to ``n_sheep``. The summary shows the mean
    return and success rate over the most recent 50 episodes plus the
    cumulative success rate since the callback was created.
    """

    def __init__(self, stage_label: str, freq: int = 100_000):
        super().__init__()
        self.stage_label = stage_label
        self.freq = freq
        self._last = 0            # num_timesteps at the previous printout
        self._total_eps = 0
        self._total_success = 0
        self._ep_returns = []     # sliding window, at most 50 entries
        self._ep_success = []     # same window, 1 = success / 0 = failure
        self._cur_ret = None      # running return per env, allocated lazily

    def _record_episode(self, env_idx, step_infos):
        """Store the finished episode of env ``env_idx`` and reset its return."""
        self._ep_returns.append(float(self._cur_ret[env_idx]))
        info = step_infos[env_idx] if env_idx < len(step_infos) else {}
        success = int(info.get("n_penned", 0) == info.get("n_sheep", -1))
        self._ep_success.append(success)
        self._total_eps += 1
        self._total_success += success
        self._cur_ret[env_idx] = 0.0
        if len(self._ep_returns) > 50:
            del self._ep_returns[0]
            del self._ep_success[0]

    def _print_summary(self):
        """Emit the one-line progress report."""
        n = len(self._ep_returns)
        if n:
            mean_r = float(np.mean(self._ep_returns))
            win_sr = float(np.mean(self._ep_success))
        else:
            mean_r = float("nan")
            win_sr = float("nan")
        if self._total_eps:
            cum_sr = self._total_success / self._total_eps
        else:
            cum_sr = float("nan")
        print(f"           ... [{self.stage_label} | "
              f"{self.num_timesteps:>7,} steps | "
              f"ret(last {n})={mean_r:+.2f}  "
              f"win_sr={win_sr*100:.0f}%  cum_sr={cum_sr*100:.0f}%]",
              flush=True)

    def _on_step(self) -> bool:
        step_rewards = self.locals.get("rewards")
        step_dones = self.locals.get("dones")
        step_infos = self.locals.get("infos", [])
        if step_rewards is None or step_dones is None:
            return True
        n_envs = len(step_rewards)
        # (Re)allocate the accumulator when the number of envs changes.
        if self._cur_ret is None or len(self._cur_ret) != n_envs:
            self._cur_ret = np.zeros(n_envs, dtype=np.float64)
        self._cur_ret += np.asarray(step_rewards, dtype=np.float64)
        for env_idx, finished in enumerate(step_dones):
            if finished:
                self._record_episode(env_idx, step_infos)
        if self.num_timesteps - self._last >= self.freq:
            self._last = self.num_timesteps
            self._print_summary()
        return True
 # ── Environment factory ──────────────────────────────────────────────────────
 def make_env(n_sheep, seed, max_steps, reward_cfg=None):
    """Return a zero-argument factory that builds and seeds a ``HerdingEnv``.

    The environment is only constructed when the returned callable is
    invoked, which is the form the vectorised env wrappers expect.
    """
    def _build():
        herding = HerdingEnv(
            n_sheep=n_sheep,
            max_steps=max_steps,
            reward_cfg=reward_cfg,
        )
        herding.reset(seed=seed)
        return herding

    return _build
 # ── Failure-mode classification ──────────────────────────────────────────────
 # Flock radius at or below which the flock counts as "compact".
 COMPACT_RADIUS = 5.0


 def _classify(ep_radii, ep_com_dists, n_penned, n_sheep):
    """Label one evaluation episode with its outcome / failure mode.

    Args:
        ep_radii: flock radius recorded at every step of the episode.
        ep_com_dists: flock-centre-to-pen distance at every step.
        n_penned: number of sheep penned at the end of the episode.
        n_sheep: number of sheep in the episode.

    Returns:
        "SUCCESS", "NEVER_COMPACT", "COMPACT_CANT_DRIVE", "DROVE_NO_SHEEP"
        or "PARTIAL_<penned>of<total>".
    """
    if n_penned == n_sheep:
        return "SUCCESS"
    tightest = min(ep_radii)
    if tightest > COMPACT_RADIUS:
        return "NEVER_COMPACT"
    # Index of the first step at which the flock was compact.
    compact_from = next(
        idx for idx, radius in enumerate(ep_radii)
        if radius <= COMPACT_RADIUS
    )
    closest_after = min(ep_com_dists[compact_from:])
    if closest_after > 3.0:
        return "COMPACT_CANT_DRIVE"
    return "DROVE_NO_SHEEP" if n_penned == 0 else f"PARTIAL_{n_penned}of{n_sheep}"
 # ── Evaluation ───────────────────────────────────────────────────────────────
 def evaluate(model, vn_template, n_sheep, n_episodes, max_steps,
             reward_cfg=None):
    """Evaluate ``model`` deterministically at a fixed sheep count.

    A fresh single-env ``DummyVecEnv`` is wrapped in a non-training
    ``VecNormalize`` that receives deep copies of the observation and return
    statistics of ``vn_template``.

    Args:
        model: policy exposing ``predict(obs, deterministic=True)``.
        vn_template: ``VecNormalize`` whose ``obs_rms``/``ret_rms`` are copied.
        n_sheep: flock size used for every evaluation episode.
        n_episodes: number of episodes to run.
        max_steps: per-episode step limit passed to the env factory.
        reward_cfg: optional reward configuration passed to the env factory.

    Returns:
        dict with ``sr`` (success rate), ``mean_len``, ``mean_min_pen``,
        ``mean_act``, ``failure_modes`` (label -> count) and, when the env
        reports ``rcomps`` in its info dict, ``reward_per_step``.
    """
    raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, reward_cfg)])
    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
    successes = 0
    ep_lens = []
    min_pen_list = []
    action_mags = []
    failure_counts = {}
    rc_sums = {}
    rc_n = 0
    # Close the evaluation env even when a rollout raises, so a failed
    # evaluation does not leak it.
    try:
        vn.obs_rms = deepcopy(vn_template.obs_rms)
        vn.ret_rms = deepcopy(vn_template.ret_rms)
        for _ in range(n_episodes):
            obs = vn.reset()
            done = False
            steps = 0
            min_pen = float("inf")
            mags = []
            ep_radii = []
            ep_com_dists = []
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, _, dones, infos = vn.step(action)
                done = dones[0]
                inner = vn.envs[0]
                # NOTE(review): SB3 vec envs reset the wrapped env inside
                # step() when an episode ends, so on the terminal step these
                # stats presumably describe the freshly reset flock — confirm.
                com, radius, _ = inner._flock_stats()
                pen_dist = float(np.linalg.norm(com - inner.PEN_CENTER))
                min_pen = min(min_pen, pen_dist)
                mags.append(float(np.linalg.norm(action[0])))
                ep_radii.append(radius)
                ep_com_dists.append(pen_dist)
                steps += 1
                rc = infos[0].get("rcomps")
                if rc:
                    for k, v in rc.items():
                        rc_sums[k] = rc_sums.get(k, 0.0) + v
                    rc_n += 1
            n_penned = infos[0].get("n_penned", 0)
            success = n_penned == n_sheep
            successes += int(success)
            ep_lens.append(steps)
            min_pen_list.append(min_pen)
            action_mags.extend(mags)
            mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep)
            failure_counts[mode] = failure_counts.get(mode, 0) + 1
    finally:
        vn.close()
    result = {
        "sr": successes / n_episodes,
        "mean_len": float(np.mean(ep_lens)),
        "mean_min_pen": float(np.mean(min_pen_list)),
        "mean_act": float(np.mean(action_mags)) if action_mags else 0.0,
        "failure_modes": failure_counts,
    }
    if rc_n > 0:
        result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()}
    return result
 # ── CLI ──────────────────────────────────────────────────────────────────────
 # Built-in defaults for the training run. main() overlays an optional JSON
 # config on top of these; keys that match a HerdingEnv attribute are passed
 # to the env as its reward configuration, and "ent_coef" is passed to PPO.
 DEFAULT_CONFIG = {
    "W_PER_SHEEP": 2.0,
    "W_ALIGN": 0.05,
    "W_PEN_BONUS": 10.0,
    "W_COMPLETE": 100.0,
    "W_STEP_COST": 0.02,
    "W_SOUTH": 0.01,
    "W_COMPACT": 0.0,
    "W_WALL_TOUCH": 0.04,
    "WALL_TOUCH_BUFFER": 0.3,
    "ALIGN_SHAPE": "standoff",
    "ALIGN_GATED": True,
    "ENTRY_AWARE": True,
    "ent_coef": 0.02,
 }
 def parse_args():
    """Parse the command-line options of the curriculum training script."""
    parser = argparse.ArgumentParser(
        description="PPO training for herding task with curriculum learning")
    parser.add_argument(
        "--config", type=str, default=None,
        help="JSON config file (reward weights + ent_coef)")
    # Plain integer options without help text, registered in display order.
    plain_int_options = (
        ("--max-sheep", 10),
        ("--steps-per-stage", 1_500_000),
        ("--n-envs", 8),
        ("--max-steps", 2500),
        ("--eval-episodes", 30),
    )
    for flag, default in plain_int_options:
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument("--run-dir", type=str, default=None)
    parser.add_argument(
        "--no-gif", action="store_true",
        help="Skip per-stage GIF rendering (PNGs still produced).")
    parser.add_argument("--gif-fps", type=int, default=20)
    parser.add_argument(
        "--gif-skip", type=int, default=3,
        help="Keep every Nth frame (smaller GIF; default 3).")
    return parser.parse_args()
 # ── Main ─────────────────────────────────────────────────────────────────────
 def main():
    """Run the full curriculum: train, evaluate and plot for 1..max_sheep.

    Steps, in order: resolve the config (defaults, then optional JSON file),
    create the run directory and write the resolved config, build the
    normalised subprocess training envs and the PPO model, then for every
    sheep count train, evaluate, and record one episode for plots/GIF.
    Model, VecNormalize statistics and per-stage results are saved once all
    stages finish; the training envs are closed in any case.
    """
    args = parse_args()
    # Load config: --config overrides, else auto-load config.json if present
    cfg = dict(DEFAULT_CONFIG)
    config_path = args.config
    if config_path is None and os.path.exists("config.json"):
        config_path = "config.json"
    if config_path:
        with open(config_path) as f:
            cfg.update(json.load(f))
        print(f"Config loaded from {config_path}")
    # Only keys that exist as HerdingEnv attributes are forwarded to the env.
    rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}
    # Run directory
    run_dir = args.run_dir or os.path.join(
        "runs", time.strftime("%Y%m%d_%H%M%S"))
    eval_dir = os.path.join(run_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    with open(os.path.join(run_dir, "config.json"), "w") as f:
        json.dump(cfg, f, indent=2)
    print(f"Config: {cfg}")
    print(f"Run dir: {run_dir}")
    print(f"Curriculum: 1 → {args.max_sheep} sheep, "
          f"{args.steps_per_stage:,} steps/stage\n")
    # Training envs
    train_env = SubprocVecEnv([
        make_env(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg)
        for i in range(args.n_envs)
    ])
    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True,
                      clip_obs=10.0)
    # Model — force CPU (PPO with MLP runs faster on CPU than GPU; SB3 warns
    # about this otherwise).
    model = PPO(
        "MlpPolicy", vn,
        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
        ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5,
        policy_kwargs=dict(net_arch=[256, 256]),
        device="cpu",
        verbose=0,
    )
    # Curriculum training
    stage_results = []
    t0 = time.time()
    try:
        for n in range(1, args.max_sheep + 1):
            if n == 1:
                print(f"\n[Stage n_sheep=1] training {args.steps_per_stage:,} steps")
                model.learn(
                    total_timesteps=args.steps_per_stage,
                    reset_num_timesteps=True,
                    callback=ProgressCallback("1 sheep", freq=100_000),
                )
            else:
                # Mixed transition: half envs stay at n-1, half advance to n,
                # for the first half of the stage budget. This prevents the
                # n+1 task's noisy early gradients from destroying the n policy
                # (catastrophic forgetting) before it has a chance to adapt.
                half = max(1, args.n_envs // 2)
                for i in range(half):
                    vn.env_method("set_n_sheep", n - 1, indices=[i])
                for i in range(half, args.n_envs):
                    vn.env_method("set_n_sheep", n, indices=[i])
                mix_steps  = args.steps_per_stage // 2
                full_steps = args.steps_per_stage - mix_steps
                print(f"\n[Stage n_sheep={n}] mixed ({n-1}/{n} sheep) "
                      f"{mix_steps:,} steps")
                model.learn(
                    total_timesteps=mix_steps,
                    reset_num_timesteps=False,
                    callback=ProgressCallback(f"{n-1}→{n} mix", freq=100_000),
                )
                # Second half of the stage: every env runs with n sheep.
                vn.env_method("set_n_sheep", n)
                print(f"[Stage n_sheep={n}] full ({n} sheep) {full_steps:,} steps")
                model.learn(
                    total_timesteps=full_steps,
                    reset_num_timesteps=False,
                    callback=ProgressCallback(f"{n} sheep", freq=100_000),
                )
            # Evaluate
            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
            r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}%  "
                  f"mean_len={r['mean_len']:.0f}  "
                  f"mean_min_pen={r['mean_min_pen']:.1f}m  "
                  f"mean_act={r['mean_act']:.2f}")
            # Failure-mode breakdown
            if r["failure_modes"]:
                modes = "  ".join(
                    f"{k}={v}" for k, v in sorted(
                        r["failure_modes"].items(), key=lambda x: -x[1]))
                print(f"  failure modes: {modes}")
            # Reward breakdown
            if "reward_per_step" in r:
                rps = r["reward_per_step"]
                print(f"  reward/step: " + "  ".join(
                    f"{k}={v:+.4f}" for k, v in rps.items()))
            # Episode visualisation: trajectory + timeseries + animated GIF
            hist = run_and_record(model, vn, n, args.max_steps, rcfg,
                                  seed=1000 + n)
            tag = "success" if hist["success"] else "fail"
            plot_trajectory(
                hist,
                os.path.join(eval_dir, f"traj_{n}s_{tag}.png"))
            plot_timeseries(
                hist,
                os.path.join(eval_dir, f"ts_{n}s_{tag}.png"))
            if not args.no_gif:
                save_episode_gif(
                    hist,
                    os.path.join(eval_dir, f"ep_{n}s_{tag}.gif"),
                    fps=args.gif_fps, skip=args.gif_skip)
            r["n_sheep"] = n
            stage_results.append(r)
        # Save artefacts
        model.save(os.path.join(run_dir, "final_model"))
        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
        with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
            json.dump(stage_results, f, indent=2)
    finally:
        # Best-effort shutdown of the training envs; errors raised while
        # closing are deliberately ignored here.
        try:
            vn.close()
        except Exception:
            pass
    # Summary
    elapsed = (time.time() - t0) / 60
    print("\n" + "=" * 70)
    print("  TRAINING SUMMARY")
    print("=" * 70)
    for r in stage_results:
        print(f"  n_sheep={r['n_sheep']}  sr={r['sr']*100:>3.0f}%  "
              f"len={r['mean_len']:>5.0f}  min_pen={r['mean_min_pen']:>5.1f}m  "
              f"act={r['mean_act']:.2f}")
    print(f"\n  Total time: {elapsed:.1f} min")
    print(f"  Artefacts:  {run_dir}/")
    plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png"))
    print(f"  Plots:      {run_dir}/success_rate.png, {eval_dir}/")
 if __name__ == "__main__":
    main()
@@ -1,412 +0,0 @@
 """
 PPO training with attention-based policy (train_at.py).
 Key difference from train.py
 -----------------------------
 - Observation exposes ALL sheep as individual per-sheep tokens rather than
  only the top-3 farthest. The policy therefore has complete flock visibility
  at any sheep count — no hidden sheep even at n=10.
 - A TransformerFeaturesExtractor processes the sheep tokens with multi-head
  self-attention (permutation-invariant), then mean-pools over valid tokens
  and concatenates the result with global dog/pen features.
 - Curriculum transition uses the same mixed-env approach as train.py: half
  the envs stay at n-1 for the first half of each new stage to suppress
  catastrophic forgetting.
 Observation layout  (7 + MAX_SHEEP*6 = 67 dims, fixed)
 -------------------------------------------------------
  Global (7):
    dog_x / FIELD,  dog_y / FIELD,
    cos(heading),   sin(heading),
    (pen_x - dog_x) / D,  (pen_y - dog_y) / D,
    n_active / n_sheep
  Per sheep i  (6):
    (sheep_x - dog_x) / D,  (sheep_y - dog_y) / D,   ← pos rel to dog
    (pen_x   - sheep_x) / D, (pen_y  - sheep_y) / D,  ← sheep-to-pen
    is_active   1.0 if not penned, else 0.0
    is_valid    1.0 if i < n_sheep, else 0.0 (padding sentinel)
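  After VecNormalize, is_valid for real sheep normalises > 0 and for
  padding tokens ≤ 0 (the running mean of the flag lies in [0, 1]; a slot
  that has only ever been padding has mean 0 and normalises to exactly 0),
  so treating values ≤ 0 as padding and values > 0 as real sheep separates
  the two without any extra bookkeeping.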
 Usage
 -----
    python train_at.py                                 # defaults from config.json
    python train_at.py --max-sheep 10 --steps-per-stage 2000000
    python train_at.py --embed-dim 128 --n-heads 4 --n-layers 3
 """
 import argparse
 import json
 import os
 import time
 from copy import deepcopy
 import numpy as np
 import torch
 import torch.nn as nn
 from gymnasium import spaces
 from stable_baselines3 import PPO
 from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
 from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize
 from herding_env import HerdingEnv
 from train import ProgressCallback, _classify, COMPACT_RADIUS, DEFAULT_CONFIG
 from viz import (
    run_and_record, plot_trajectory, plot_timeseries,
    plot_success_rate, save_episode_gif,
 )
 # ── Per-sheep token observation environment ───────────────────────────────────
 class HerdingEnvAt(HerdingEnv):
    """``HerdingEnv`` variant that exposes one observation token per sheep.

    Only the observation space and ``_obs`` are overridden. The flat
    observation is ``OBS_GLOBAL`` dog/pen features followed by ``MAX_SHEEP``
    tokens of ``OBS_SHEEP`` values each; unused slots are zero-filled.
    """
    OBS_GLOBAL = 7
    OBS_SHEEP  = 6

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        flat_dim = self.OBS_GLOBAL + self.MAX_SHEEP * self.OBS_SHEEP
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(flat_dim,),
            dtype=np.float32,
        )

    def _obs(self) -> np.ndarray:
        """Build the flat (global features + per-sheep tokens) observation."""
        field = self.FIELD
        span = 2.0 * self.FIELD
        # Reference point used for all "to pen" vectors.
        target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER
        still_free = ~self.penned[:self.n_sheep]
        n_active = int(still_free.sum())
        header = np.array([
            self.dog_pos[0] / field,
            self.dog_pos[1] / field,
            float(np.cos(self.dog_heading)),
            float(np.sin(self.dog_heading)),
            (target[0] - self.dog_pos[0]) / span,
            (target[1] - self.dog_pos[1]) / span,
            n_active / max(self.n_sheep, 1),
        ], dtype=np.float32)
        # Slots at index >= n_sheep stay all-zero, so their is_valid flag is 0.
        tokens = np.zeros((self.MAX_SHEEP, self.OBS_SHEEP), dtype=np.float32)
        for i in range(self.n_sheep):
            sheep_xy = self.sheep_pos[i]
            rel_to_dog = (
                (sheep_xy[0] - self.dog_pos[0]) / span,
                (sheep_xy[1] - self.dog_pos[1]) / span,
            )
            to_pen = (
                (target[0] - sheep_xy[0]) / span,
                (target[1] - sheep_xy[1]) / span,
            )
            tokens[i] = [
                rel_to_dog[0],
                rel_to_dog[1],
                to_pen[0],
                to_pen[1],
                float(not self.penned[i]),   # is_active
                1.0,                         # is_valid: this sheep exists
            ]
        return np.concatenate([header, tokens.ravel()])
 # ── Attention features extractor ──────────────────────────────────────────────
 class ShepherdAttentionExtractor(BaseFeaturesExtractor):
    """Self-attention over per-sheep tokens, pooled and joined with globals.

    The flat observation is split into ``GLOBAL_DIM`` global features and
    ``MAX_SHEEP`` tokens of ``SHEEP_DIM`` values. Tokens are embedded, run
    through a transformer encoder with padding tokens masked out, mean-pooled
    over the valid tokens and concatenated with the untouched global features.

    A token is treated as valid when its ``is_valid`` entry is > 0 and as
    padding when it is <= 0.
    NOTE(review): this relies on the normalised is_valid flag of real sheep
    staying strictly positive — confirm against the VecNormalize statistics.
    """
    GLOBAL_DIM = HerdingEnvAt.OBS_GLOBAL   # 7
    SHEEP_DIM  = HerdingEnvAt.OBS_SHEEP    # 6
    MAX_SHEEP  = HerdingEnv.MAX_SHEEP      # 10
    VALID_IDX  = 5                          # index of is_valid within each token

    def __init__(self, observation_space, embed_dim: int = 64,
                 n_heads: int = 4, n_layers: int = 2, ff_dim: int = 128):
        out_dim = self.GLOBAL_DIM + embed_dim
        super().__init__(observation_space, features_dim=out_dim)
        self.sheep_embed = nn.Linear(self.SHEEP_DIM, embed_dim)
        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=n_heads,
            dim_feedforward=ff_dim,
            dropout=0.0,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(
            layer,
            num_layers=n_layers,
            enable_nested_tensor=False,
        )

    def forward(self, obs: torch.Tensor) -> torch.Tensor:
        batch = obs.shape[0]
        dog_pen = obs[:, :self.GLOBAL_DIM]                      # (B, GLOBAL_DIM)
        sheep = obs[:, self.GLOBAL_DIM:].view(
            batch, self.MAX_SHEEP, self.SHEEP_DIM)              # (B, MAX_SHEEP, SHEEP_DIM)
        validity = sheep[:, :, self.VALID_IDX]                  # (B, MAX_SHEEP)
        pad_mask = validity <= 0.0                              # True → ignored by attention
        keep = (validity > 0.0).float().unsqueeze(-1)           # (B, MAX_SHEEP, 1)
        encoded = self.transformer(
            self.sheep_embed(sheep), src_key_padding_mask=pad_mask)
        # Mean over valid tokens; the clamp avoids dividing by zero when a
        # row has no valid token.
        summed = (encoded * keep).sum(1)
        count = keep.sum(1).clamp(min=1.0)
        return torch.cat([dog_pen, summed / count], dim=1)      # (B, GLOBAL_DIM + E)
 # ── Environment factory ───────────────────────────────────────────────────────
 def make_env_at(n_sheep, seed, max_steps, reward_cfg=None):
    """Return a zero-argument factory that builds and seeds a ``HerdingEnvAt``."""
    def _build():
        token_env = HerdingEnvAt(
            n_sheep=n_sheep,
            max_steps=max_steps,
            reward_cfg=reward_cfg,
        )
        token_env.reset(seed=seed)
        return token_env

    return _build
 # ── Evaluation ────────────────────────────────────────────────────────────────
 def evaluate_at(model, vn_template, n_sheep, n_episodes, max_steps,
                reward_cfg=None):
    """Evaluate the attention policy deterministically at a fixed sheep count.

    A fresh single-env ``DummyVecEnv`` built by ``make_env_at`` is wrapped in
    a non-training ``VecNormalize`` that receives deep copies of the
    observation and return statistics of ``vn_template``.

    Args:
        model: policy exposing ``predict(obs, deterministic=True)``.
        vn_template: ``VecNormalize`` whose ``obs_rms``/``ret_rms`` are copied.
        n_sheep: flock size used for every evaluation episode.
        n_episodes: number of episodes to run.
        max_steps: per-episode step limit passed to the env factory.
        reward_cfg: optional reward configuration passed to the env factory.

    Returns:
        dict with ``sr`` (success rate), ``mean_len``, ``mean_min_pen``,
        ``mean_act``, ``failure_modes`` (label -> count) and, when the env
        reports ``rcomps`` in its info dict, ``reward_per_step``.
    """
    raw = DummyVecEnv([make_env_at(n_sheep, 9999, max_steps, reward_cfg)])
    vn  = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
    successes = 0
    ep_lens, min_pen_list, action_mags = [], [], []
    failure_counts, rc_sums = {}, {}
    rc_n = 0
    # Close the evaluation env even when a rollout raises, so a failed
    # evaluation does not leak it.
    try:
        vn.obs_rms = deepcopy(vn_template.obs_rms)
        vn.ret_rms = deepcopy(vn_template.ret_rms)
        for _ in range(n_episodes):
            obs  = vn.reset()
            done = False
            steps, min_pen = 0, float("inf")
            mags, ep_radii, ep_com_dists = [], [], []
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, _, dones, infos = vn.step(action)
                done  = dones[0]
                inner = vn.envs[0]
                # NOTE(review): SB3 vec envs reset the wrapped env inside
                # step() when an episode ends, so on the terminal step these
                # stats presumably describe the freshly reset flock — confirm.
                com, radius, _ = inner._flock_stats()
                pen_dist = float(np.linalg.norm(com - inner.PEN_CENTER))
                min_pen = min(min_pen, pen_dist)
                mags.append(float(np.linalg.norm(action[0])))
                ep_radii.append(radius)
                ep_com_dists.append(pen_dist)
                steps += 1
                rc = infos[0].get("rcomps")
                if rc:
                    for k, v in rc.items():
                        rc_sums[k] = rc_sums.get(k, 0.0) + v
                    rc_n += 1
            n_penned = infos[0].get("n_penned", 0)
            successes += int(n_penned == n_sheep)
            ep_lens.append(steps)
            min_pen_list.append(min_pen)
            action_mags.extend(mags)
            mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep)
            failure_counts[mode] = failure_counts.get(mode, 0) + 1
    finally:
        vn.close()
    result = {
        "sr":          successes / n_episodes,
        "mean_len":    float(np.mean(ep_lens)),
        "mean_min_pen": float(np.mean(min_pen_list)),
        "mean_act":    float(np.mean(action_mags)) if action_mags else 0.0,
        "failure_modes": failure_counts,
    }
    if rc_n > 0:
        result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()}
    return result
 # ── CLI ───────────────────────────────────────────────────────────────────────
 def parse_args():
    """Parse the command-line options of the attention-policy training script."""
    parser = argparse.ArgumentParser(
        description="PPO + attention training for herding task")
    parser.add_argument("--config", type=str, default=None)
    # Run-size options (integers, no help text), in display order.
    for flag, default in (
        ("--max-sheep", 10),
        ("--steps-per-stage", 1_500_000),
        ("--n-envs", 8),
        ("--max-steps", 2500),
        ("--eval-episodes", 30),
    ):
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument("--run-dir", type=str, default=None)
    parser.add_argument("--no-gif", action="store_true")
    parser.add_argument("--gif-fps", type=int, default=20)
    parser.add_argument("--gif-skip", type=int, default=3)
    # Attention architecture
    for flag, default, help_text in (
        ("--embed-dim", 64, "Transformer embedding dimension (default 64)"),
        ("--n-heads", 4, "Number of attention heads (default 4)"),
        ("--n-layers", 2,
         "Number of transformer encoder layers (default 2)"),
        ("--ff-dim", 128, "Transformer feed-forward dim (default 128)"),
    ):
        parser.add_argument(flag, type=int, default=default, help=help_text)
    return parser.parse_args()
 # ── Main ──────────────────────────────────────────────────────────────────────
 def main():
    """Run the curriculum with the attention policy for 1..max_sheep sheep.

    Steps, in order: resolve the config (defaults, then optional JSON file),
    create the run directory and write the resolved config, build the
    normalised subprocess training envs and a PPO model that uses
    ``ShepherdAttentionExtractor``, then for every sheep count train,
    evaluate, and record one episode for plots/GIF. Model, VecNormalize
    statistics and per-stage results are saved once all stages finish; the
    training envs are closed in any case.
    """
    args = parse_args()
    # Defaults first, then an explicit --config file or ./config.json if present.
    cfg = dict(DEFAULT_CONFIG)
    config_path = args.config
    if config_path is None and os.path.exists("config.json"):
        config_path = "config.json"
    if config_path:
        with open(config_path) as f:
            cfg.update(json.load(f))
        print(f"Config loaded from {config_path}")
    # Only keys that exist as HerdingEnv attributes are forwarded to the env.
    rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}
    run_dir  = args.run_dir or os.path.join(
        "runs", "at_" + time.strftime("%Y%m%d_%H%M%S"))
    eval_dir = os.path.join(run_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    with open(os.path.join(run_dir, "config.json"), "w") as f:
        json.dump(cfg, f, indent=2)
    print(f"Config:      {cfg}")
    print(f"Run dir:     {run_dir}")
    print(f"Curriculum:  1 → {args.max_sheep} sheep, "
          f"{args.steps_per_stage:,} steps/stage")
    print(f"Transformer: embed={args.embed_dim}  heads={args.n_heads}  "
          f"layers={args.n_layers}  ff={args.ff_dim}\n")
    # All training envs start at one sheep; the curriculum raises the count.
    train_env = SubprocVecEnv([
        make_env_at(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg)
        for i in range(args.n_envs)
    ])
    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
    model = PPO(
        "MlpPolicy", vn,
        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
        ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5,
        policy_kwargs=dict(
            features_extractor_class=ShepherdAttentionExtractor,
            features_extractor_kwargs=dict(
                embed_dim=args.embed_dim,
                n_heads=args.n_heads,
                n_layers=args.n_layers,
                ff_dim=args.ff_dim,
            ),
            net_arch=[256, 256],
        ),
        device="cpu",
        verbose=0,
    )
    stage_results = []
    t0 = time.time()
    try:
        for n in range(1, args.max_sheep + 1):
            if n == 1:
                print(f"\n[Stage n_sheep=1] training {args.steps_per_stage:,} steps")
                model.learn(
                    total_timesteps=args.steps_per_stage,
                    reset_num_timesteps=True,
                    callback=ProgressCallback("1 sheep", freq=100_000),
                )
            else:
                # Mixed transition: the first `half` envs stay at n-1 sheep and
                # the rest move to n for the first half of the stage budget.
                half       = max(1, args.n_envs // 2)
                mix_steps  = args.steps_per_stage // 2
                full_steps = args.steps_per_stage - mix_steps
                for i in range(half):
                    vn.env_method("set_n_sheep", n - 1, indices=[i])
                for i in range(half, args.n_envs):
                    vn.env_method("set_n_sheep", n, indices=[i])
                print(f"\n[Stage n_sheep={n}] mixed ({n-1}/{n} sheep) "
                      f"{mix_steps:,} steps")
                model.learn(
                    total_timesteps=mix_steps,
                    reset_num_timesteps=False,
                    callback=ProgressCallback(f"{n-1}→{n} mix", freq=100_000),
                )
                # Second half of the stage: every env runs with n sheep.
                vn.env_method("set_n_sheep", n)
                print(f"[Stage n_sheep={n}] full ({n} sheep) {full_steps:,} steps")
                model.learn(
                    total_timesteps=full_steps,
                    reset_num_timesteps=False,
                    callback=ProgressCallback(f"{n} sheep", freq=100_000),
                )
            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
            r = evaluate_at(model, vn, n, args.eval_episodes,
                            args.max_steps, rcfg)
            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}%  "
                  f"mean_len={r['mean_len']:.0f}  "
                  f"mean_min_pen={r['mean_min_pen']:.1f}m  "
                  f"mean_act={r['mean_act']:.2f}")
            # Failure modes, most frequent first.
            if r["failure_modes"]:
                modes = "  ".join(
                    f"{k}={v}" for k, v in sorted(
                        r["failure_modes"].items(), key=lambda x: -x[1]))
                print(f"  failure modes: {modes}")
            if "reward_per_step" in r:
                rps = r["reward_per_step"]
                print("  reward/step: " + "  ".join(
                    f"{k}={v:+.4f}" for k, v in rps.items()))
            # Record one episode for the trajectory/timeseries plots and GIF.
            hist = run_and_record(
                model, vn, n, args.max_steps, rcfg,
                seed=1000 + n, make_env_fn=make_env_at,
            )
            tag = "success" if hist["success"] else "fail"
            plot_trajectory(hist, os.path.join(eval_dir, f"traj_{n}s_{tag}.png"))
            plot_timeseries(hist, os.path.join(eval_dir, f"ts_{n}s_{tag}.png"))
            if not args.no_gif:
                save_episode_gif(
                    hist,
                    os.path.join(eval_dir, f"ep_{n}s_{tag}.gif"),
                    fps=args.gif_fps, skip=args.gif_skip)
            r["n_sheep"] = n
            stage_results.append(r)
        # Artefacts are written only after every stage has completed.
        model.save(os.path.join(run_dir, "final_model"))
        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
        with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
            json.dump(stage_results, f, indent=2)
    finally:
        # Best-effort shutdown of the training envs; errors raised while
        # closing are deliberately ignored here.
        try:
            vn.close()
        except Exception:
            pass
    elapsed = (time.time() - t0) / 60
    print("\n" + "=" * 70)
    print("  TRAINING SUMMARY  (attention policy)")
    print("=" * 70)
    for r in stage_results:
        print(f"  n_sheep={r['n_sheep']}  sr={r['sr']*100:>3.0f}%  "
              f"len={r['mean_len']:>5.0f}  "
              f"min_pen={r['mean_min_pen']:>5.1f}m  "
              f"act={r['mean_act']:.2f}")
    print(f"\n  Total time: {elapsed:.1f} min")
    print(f"  Artefacts:  {run_dir}/")
    plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png"))
    print(f"  Plots:      {run_dir}/success_rate.png, {eval_dir}/")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,267 @@
 """Train a PPO shepherd-dog policy on ``HerdingEnv`` with curriculum.
 Uses parallel ``SubprocVecEnv`` workers when ``n_envs`` > 1 (count taken from
 the config or ``--n-envs``); the policy trains on CPU unless ``--device cuda``.
 Saves checkpoints, the best-eval model, and the VecNormalize stats —
 all three are needed at inference time by the Webots controller.
 Usage::
    python -m training.train_ppo \
        --config training/configs/ppo_default.yaml \
        --out-dir training/runs/baseline
 To resume from a checkpoint::
    python -m training.train_ppo --resume training/runs/baseline/checkpoints/ppo_500000_steps.zip
 """
 from __future__ import annotations
 import argparse
 import os
 import sys
 from pathlib import Path
 import yaml
 _HERE = os.path.dirname(os.path.abspath(__file__))
 _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
 import numpy as np
 import torch as th
 from stable_baselines3 import PPO
 from stable_baselines3.common.callbacks import (
    BaseCallback, CheckpointCallback, EvalCallback,
 )
 from stable_baselines3.common.monitor import Monitor
 from stable_baselines3.common.vec_env import (
    DummyVecEnv, SubprocVecEnv, VecNormalize,
 )
 from training.herding_env import HerdingEnv
 # --------------------------------------------------------------------------
 # Env factories
 # --------------------------------------------------------------------------
 def _make_env(rank: int, seed: int = 0):
    """Return a zero-argument factory for one monitored ``HerdingEnv``.
    The env is only constructed when the returned callable is invoked; it is
    seeded with ``seed + rank`` and wrapped in ``Monitor`` so that the
    ``is_success``, ``n_sheep`` and ``n_penned`` info keys are logged.
    """
    logged_keys = ("is_success", "n_sheep", "n_penned")
    def _build():
        base_env = HerdingEnv(seed=seed + rank)
        return Monitor(base_env, info_keywords=logged_keys)
    return _build
 # --------------------------------------------------------------------------
 # Curriculum callback
 # --------------------------------------------------------------------------
 class CurriculumCallback(BaseCallback):
    """Drive the env's flock-size + state-space difficulty curriculum.
    Schedule entries: ``{step, max_n_sheep, difficulty}`` (``difficulty``
    falls back to 1.0 when omitted). The largest entry whose step is
    <= ``num_timesteps`` wins; before the first entry's step is reached the
    first entry applies. Each knob is pushed to the envs only when its value
    changes.
    Raises:
        ValueError: if ``schedule`` is empty.
    """
    def __init__(self, schedule, vec_envs, verbose: int = 0):
        super().__init__(verbose)
        self.schedule = sorted(schedule, key=lambda d: d["step"])
        if not self.schedule:
            # Fail at construction with a clear message instead of an
            # IndexError on the first training step.
            raise ValueError("CurriculumCallback requires a non-empty schedule")
        # Accept a list of envs so the eval env tracks training difficulty.
        self.vec_envs = vec_envs if isinstance(vec_envs, (list, tuple)) else [vec_envs]
        self._last_n = None
        self._last_d = None
    def _call(self, method, value):
        """Invoke ``method(value)`` on every sub-env of every tracked vec env."""
        for v in self.vec_envs:
            try:
                v.env_method(method, value)
            except AttributeError:
                # Fallback for wrappers that do not forward env_method.
                v.venv.env_method(method, value)
    def _on_step(self) -> bool:
        """Apply the schedule entry active at the current timestep.
        Always returns True so training is never interrupted.
        """
        t = self.num_timesteps
        active = self.schedule[0]
        for entry in self.schedule:
            if t < entry["step"]:
                break  # schedule is sorted, later entries are not due yet
            active = entry
        n = active["max_n_sheep"]
        d = active.get("difficulty", 1.0)
        changed = False
        if n != self._last_n:
            self._call("set_max_n_sheep", n)
            self._last_n = n
            changed = True
        if d != self._last_d:
            self._call("set_difficulty", d)
            self._last_d = d
            changed = True
        # Log on a change of either knob, not only when difficulty moves.
        if changed and self.verbose:
            print(f"[curriculum] t={t} → max_n_sheep={n} difficulty={d}")
        return True
 # --------------------------------------------------------------------------
 # Main
 # --------------------------------------------------------------------------
 def main():
    """Train (or resume) a PPO policy on ``HerdingEnv`` from the command line.
    Reads hyper-parameters from the YAML file given by ``--config``, builds
    the vectorised train env plus a single-env eval env (both wrapped in
    ``VecNormalize`` unless ``--no-vecnorm``), attaches checkpoint / eval /
    curriculum callbacks, runs ``model.learn`` and saves the final model under
    ``--out-dir``. VecNormalize statistics are saved next to the final and
    best models only when the wrapper is in use. Both envs are closed on
    exit, including when training raises.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default=os.path.join(_HERE, "configs", "ppo_default.yaml"))
    parser.add_argument("--out-dir", default=os.path.join(_HERE, "runs", "latest"))
    parser.add_argument("--n-envs", type=int, default=None,
                        help="Override config n_envs.")
    parser.add_argument("--total-timesteps", type=int, default=None,
                        help="Override config total_timesteps.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--resume", type=str, default=None,
                        help="Path to a SB3 zip to resume from.")
    # SB3 recommends CPU for MlpPolicy — GPU helps CNN policies, not MLPs
    # of this size. Override with --device cuda if you really want it.
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--no-vecnorm", action="store_true",
                        help="Disable VecNormalize wrapper. Required when "
                             "resuming from a BC-pretrained policy that "
                             "wasn't trained under it.")
    parser.add_argument("--no-curriculum", action="store_true",
                        help="Skip curriculum callback (resumed policy is "
                             "already competent across the distribution).")
    parser.add_argument("--imitate-weight", type=float, default=None,
                        help="Override env W_IMITATE. Set to 0 to disable "
                             "Strömbom imitation reward.")
    parser.add_argument("--difficulty", type=float, default=None,
                        help="Override env difficulty (0=easy, 1=hard). "
                             "Used in BC fine-tune to skip easy curriculum.")
    parser.add_argument("--log-std", type=float, default=None,
                        help="Override the policy's log_std after load. "
                             "BC trained with std≈1.6 (log_std=0.5) which "
                             "is too noisy for fine-tune. Use -1.5 (std≈0.22) "
                             "to keep PPO close to the BC mean while still "
                             "exploring locally.")
    parser.add_argument("--learning-rate", type=float, default=None,
                        help="Override config learning rate. For BC "
                             "fine-tune, 5e-5 is much safer than the 3e-4 "
                             "default.")
    args = parser.parse_args()
    with open(args.config, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    n_envs = args.n_envs or cfg["n_envs"]
    total_timesteps = args.total_timesteps or cfg["total_timesteps"]
    out = Path(args.out_dir)
    out.mkdir(parents=True, exist_ok=True)
    for sub in ("checkpoints", "best", "evals"):
        (out / sub).mkdir(exist_ok=True)
    print(f"[train] out={out}  n_envs={n_envs}  total={total_timesteps}  device={args.device}")
    # --- Train env (vectorised, optionally normalised) ---
    env_fns = [_make_env(i, seed=args.seed) for i in range(n_envs)]
    venv = SubprocVecEnv(env_fns) if n_envs > 1 else DummyVecEnv(env_fns)
    eval_venv = None
    try:
        eval_venv = DummyVecEnv([_make_env(99, seed=args.seed + 999)])
        use_vecnorm = not args.no_vecnorm
        if use_vecnorm:
            venv = VecNormalize(venv, norm_obs=True, norm_reward=False, clip_obs=10.0)
            eval_venv = VecNormalize(eval_venv, norm_obs=True, norm_reward=False,
                                     clip_obs=10.0, training=False)
            # Share the running stats object so eval sees the training stats.
            eval_venv.obs_rms = venv.obs_rms
        else:
            print("[train] VecNormalize disabled (resumed policy was trained without it).")
        # Apply env-level overrides (used by BC fine-tune to disable Strömbom
        # imitation and start at full deployment difficulty).
        def _env_call(method, value):
            for v in (venv, eval_venv):
                try:
                    v.env_method(method, value)
                except AttributeError:
                    v.venv.env_method(method, value)
        if args.imitate_weight is not None:
            _env_call("set_imitate_weight", args.imitate_weight)
            print(f"[train] W_IMITATE overridden to {args.imitate_weight}")
        if args.difficulty is not None:
            _env_call("set_difficulty", args.difficulty)
            print(f"[train] difficulty pinned to {args.difficulty}")
        # --- Model ---
        policy_kwargs = dict(
            net_arch=dict(pi=cfg["net_arch_pi"], vf=cfg["net_arch_vf"]),
            log_std_init=cfg.get("log_std_init", 0.0),
        )
        if args.resume:
            print(f"[train] resuming from {args.resume}")
            custom_objects = {}
            if args.learning_rate is not None:
                custom_objects["learning_rate"] = args.learning_rate
            model = PPO.load(args.resume, env=venv, device=args.device,
                             tensorboard_log=str(out / "tb"),
                             custom_objects=custom_objects or None)
            if args.log_std is not None:
                with th.no_grad():
                    model.policy.log_std.fill_(args.log_std)
                print(f"[train] log_std overridden to {args.log_std} "
                      f"(std≈{float(np.exp(args.log_std)):.2f})")
            if args.learning_rate is not None:
                print(f"[train] learning_rate overridden to {args.learning_rate}")
        else:
            model = PPO(
                cfg["policy"], venv,
                learning_rate=cfg["learning_rate"],
                n_steps=cfg["n_steps"],
                batch_size=cfg["batch_size"],
                n_epochs=cfg["n_epochs"],
                gamma=cfg["gamma"],
                gae_lambda=cfg["gae_lambda"],
                clip_range=cfg["clip_range"],
                ent_coef=cfg["ent_coef"],
                vf_coef=cfg["vf_coef"],
                max_grad_norm=cfg["max_grad_norm"],
                target_kl=cfg.get("target_kl"),
                policy_kwargs=policy_kwargs,
                tensorboard_log=str(out / "tb"),
                seed=args.seed,
                device=args.device,
                verbose=1,
            )
        # --- Callbacks ---
        # Callback frequencies count vec-env steps, hence the division by n_envs.
        ckpt_cb = CheckpointCallback(
            save_freq=max(1, cfg["checkpoint_freq"] // n_envs),
            save_path=str(out / "checkpoints"), name_prefix="ppo",
            save_vecnormalize=True,
        )
        eval_cb = EvalCallback(
            eval_venv,
            best_model_save_path=str(out / "best"),
            log_path=str(out / "evals"),
            eval_freq=max(1, cfg["eval_freq"] // n_envs),
            n_eval_episodes=cfg["n_eval_episodes"],
            deterministic=True,
        )
        callbacks = [ckpt_cb, eval_cb]
        if not args.no_curriculum and cfg.get("curriculum"):
            callbacks.append(CurriculumCallback(
                cfg["curriculum"], [venv, eval_venv], verbose=1,
            ))
            if args.difficulty is not None:
                # The callback pushes its own difficulty on the first step.
                print("[train] note: the curriculum will override --difficulty; "
                      "pass --no-curriculum to keep it pinned.")
        elif args.no_curriculum:
            print("[train] curriculum disabled — env knobs left at their current values.")
        # --- Train ---
        model.learn(total_timesteps=total_timesteps, callback=callbacks,
                    progress_bar=True)
        # --- Save final model + VecNormalize stats ---
        model.save(out / "final.zip")
        if use_vecnorm:
            venv.save(str(out / "vecnormalize.pkl"))
            # The EvalCallback already wrote best_model.zip into out/best/ — drop
            # the VecNormalize stats next to it for the controller to pick up.
            venv.save(str(out / "best" / "vecnormalize.pkl"))
        else:
            # A bare VecEnv has no save(); calling it would crash after training.
            print("[train] VecNormalize disabled — no normalisation stats saved.")
        print(f"[train] done. saved to {out}")
    finally:
        for v in (venv, eval_venv):
            if v is not None:
                v.close()
 if __name__ == "__main__":
    main()
@@ -1,342 +0,0 @@
 """
 All visualization for the herding policy: trajectory plots, timeseries plots,
 success-rate bar chart, and animated GIFs.
 Used both by train.py (auto-rendered after each curriculum stage) and as a CLI
 to render a fresh episode against a saved model.
 CLI usage:
    python viz.py --run-dir runs/v1 --n-sheep 5
    python viz.py --run-dir runs/v1 --n-sheep 10 --no-gif
    python viz.py --model runs/v1/final_model.zip --vecnorm runs/v1/vecnorm.pkl \\
        --n-sheep 3 --out-dir vis_v1_3sheep
 """
 import argparse
 import os
 import json
 from copy import deepcopy
 import matplotlib
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 import matplotlib.patches as mpatches
 import matplotlib.animation as animation
 from matplotlib.collections import LineCollection
 import numpy as np
 from stable_baselines3 import PPO
 from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
 from herding_env import HerdingEnv
 # ── Palette ──────────────────────────────────────────────────────────────────
 SHEEP_COLORS = [
    "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00",
    "#a65628", "#f781bf", "#999999", "#66c2a5", "#fc8d62",
 ]
 DOG_COLOR = "#4e342e"
 # ── Common drawing primitives ────────────────────────────────────────────────
 def draw_field(ax):
    """Draw the static background on ``ax``: field outline and pen rectangle.
    Sets equal-aspect axes spanning [-16, 16] on both axes, outlines the
    30x30 field anchored at (-15, -15) and fills the 3x7 pen anchored at
    (10, -15), labelled "pen".
    """
    fence = "#795548"
    ax.set_xlim(-16, 16)
    ax.set_ylim(-16, 16)
    ax.set_aspect("equal")
    ax.set_facecolor("#dcedc8")
    field_outline = mpatches.Rectangle(
        (-15, -15), 30, 30, fill=False, edgecolor=fence, lw=2)
    ax.add_patch(field_outline)
    pen_box = mpatches.Rectangle(
        (10, -15), 3, 7, facecolor="#ffe082", edgecolor=fence, lw=2)
    ax.add_patch(pen_box)
    ax.text(11.5, -11.5, "pen", ha="center", va="center",
            fontsize=8, color=fence)
 def faded_path(ax, xs, ys, color, lw=1.5, label=None):
    """Draw the polyline (xs, ys) on ``ax`` with alpha ramping 0.15 -> 1.0.
    Does nothing (not even the legend entry) when fewer than two points are
    given. When ``label`` is truthy an empty solid line of the same colour is
    added so the path appears in the legend.
    """
    if len(xs) < 2:
        return
    pts = np.array([xs, ys]).T.reshape(-1, 1, 2)
    # Pair each point with its successor to obtain one segment per step.
    segments = np.concatenate([pts[:-1], pts[1:]], axis=1)
    rgb = matplotlib.colors.to_rgb(color)
    rgba = [(*rgb, alpha) for alpha in np.linspace(0.15, 1.0, len(segments))]
    ax.add_collection(LineCollection(segments, colors=rgba, linewidth=lw))
    if label:
        ax.plot([], [], color=color, lw=lw, label=label)
 # ── Episode rollout ──────────────────────────────────────────────────────────
 def make_eval_env(n_sheep, seed, max_steps, reward_cfg=None):
    """Return a zero-argument factory building one seeded ``HerdingEnv``.
    The factory constructs the env with the given flock size, step limit and
    reward config, then calls ``reset(seed=seed)`` once before returning it.
    """
    env_kwargs = dict(n_sheep=n_sheep, max_steps=max_steps,
                      reward_cfg=reward_cfg)
    def _build():
        built = HerdingEnv(**env_kwargs)
        built.reset(seed=seed)
        return built
    return _build
 def run_and_record(model, vn_template, n_sheep, max_steps,
                   reward_cfg=None, seed=42, make_env_fn=None):
    """Run one deterministic episode and return full trajectory history.
    A fresh single-env ``VecNormalize`` is built for the rollout and given
    deep copies of ``vn_template``'s ``obs_rms`` / ``ret_rms`` so the template
    is not mutated.
    Args:
        model: policy exposing ``predict(obs, deterministic=True)``.
        vn_template: ``VecNormalize`` whose running stats are copied.
        n_sheep, max_steps, reward_cfg, seed: forwarded to the env factory.
        make_env_fn: optional replacement for ``make_eval_env`` with the same
            ``(n_sheep, seed, max_steps, reward_cfg)`` signature.
    Returns:
        dict with one entry per step in every per-step list (``dog_xs``,
        ``dog_ys``, ``radii``, ``action_mags``, ``rewards`` and the per-sheep
        ``sheep_xs`` / ``sheep_ys`` / ``sheep_penned`` / ``pen_dists``), plus
        ``penned_at`` (1-based step at which each sheep was first seen
        penned, or None), ``n_penned``, ``n_sheep``, ``success``, ``steps``.
    """
    _factory = make_env_fn or make_eval_env
    raw = DummyVecEnv([_factory(n_sheep, seed, max_steps, reward_cfg)])
    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
    try:
        vn.obs_rms = deepcopy(vn_template.obs_rms)
        vn.ret_rms = deepcopy(vn_template.ret_rms)
        obs = vn.reset()
        inner = vn.envs[0]
        done = False
        dog_xs, dog_ys = [], []
        sheep_xs = [[] for _ in range(n_sheep)]
        sheep_ys = [[] for _ in range(n_sheep)]
        sheep_penned = [[] for _ in range(n_sheep)]
        radii = []
        pen_dists = [[] for _ in range(n_sheep)]
        action_mags = []
        rewards = []
        penned_at = [None] * n_sheep
        step = 0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, dones, infos = vn.step(action)
            done = bool(dones[0])
            step += 1
            rewards.append(float(reward[0]))
            action_mags.append(float(np.linalg.norm(action[0])))
            if done and step > 1:
                # DummyVecEnv resets the env inside step() once the episode
                # ends, so `inner` already holds the NEXT episode's state.
                # Hold the last genuine frame instead of recording a jump to
                # the respawn positions (and un-penned flags).
                dog_xs.append(dog_xs[-1])
                dog_ys.append(dog_ys[-1])
                radii.append(radii[-1])
                for i in range(n_sheep):
                    sheep_xs[i].append(sheep_xs[i][-1])
                    sheep_ys[i].append(sheep_ys[i][-1])
                    sheep_penned[i].append(sheep_penned[i][-1])
                    pen_dists[i].append(pen_dists[i][-1])
                continue
            dog_xs.append(float(inner.dog_pos[0]))
            dog_ys.append(float(inner.dog_pos[1]))
            _, radius, _ = inner._flock_stats()
            radii.append(radius)
            for i in range(n_sheep):
                sheep_xs[i].append(float(inner.sheep_pos[i][0]))
                sheep_ys[i].append(float(inner.sheep_pos[i][1]))
                sheep_penned[i].append(bool(inner.penned[i]))
                pen_dists[i].append(
                    float(np.linalg.norm(inner.sheep_pos[i] - inner.PEN_CENTER)))
                if inner.penned[i] and penned_at[i] is None:
                    penned_at[i] = step
        # The terminal info dict is still the finished episode's, so the
        # final count is taken from it rather than from `inner`.
        n_penned = infos[0].get("n_penned", 0)
    finally:
        vn.close()
    return dict(
        dog_xs=dog_xs, dog_ys=dog_ys,
        sheep_xs=sheep_xs, sheep_ys=sheep_ys,
        sheep_penned=sheep_penned,
        radii=radii, pen_dists=pen_dists,
        action_mags=action_mags, rewards=rewards,
        penned_at=penned_at,
        n_penned=n_penned, n_sheep=n_sheep,
        success=n_penned == n_sheep, steps=step,
    )
 # ── Static plots ─────────────────────────────────────────────────────────────
 def plot_trajectory(hist, out_path):
    """Save a top-down trajectory plot of one recorded episode to ``out_path``.
    Each sheep path is drawn with a fading line, a circle at its start and a
    star at the frame where it was first seen penned (or at its last frame
    if it never was). The dog path gets a square at the start and a diamond
    at the end. ``hist`` is the dict returned by ``run_and_record``.
    """
    fig, ax = plt.subplots(figsize=(7, 7))
    draw_field(ax)
    for i in range(hist["n_sheep"]):
        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
        xs, ys = hist["sheep_xs"][i], hist["sheep_ys"][i]
        faded_path(ax, xs, ys, c, lw=1.2, label=f"sheep {i+1}")
        ax.plot(xs[0], ys[0], "o", color=c, ms=7, zorder=4)
        penned_step = hist["penned_at"][i]
        if penned_step is None:
            end = -1
        else:
            # penned_at is a 1-based step count while the frame lists are
            # 0-based; using it directly pointed one frame late and could
            # index past the end when penning happened on the last frame.
            end = min(penned_step, len(xs)) - 1
        ax.plot(xs[end], ys[end], "*", color=c, ms=11, zorder=5)
    faded_path(ax, hist["dog_xs"], hist["dog_ys"], DOG_COLOR, lw=2.0,
               label="dog")
    ax.plot(hist["dog_xs"][0], hist["dog_ys"][0], "s", color=DOG_COLOR,
            ms=10, zorder=5)
    ax.plot(hist["dog_xs"][-1], hist["dog_ys"][-1], "D", color=DOG_COLOR,
            ms=10, zorder=5)
    result = ("SUCCESS" if hist["success"]
              else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})")
    ax.set_title(f"n={hist['n_sheep']}  {result}  {hist['steps']} steps",
                 fontsize=12)
    ax.legend(loc="upper left", fontsize=8)
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
 def plot_timeseries(hist, out_path):
    """Save a four-panel per-step timeseries figure to ``out_path``.
    Panels, top to bottom: flock radius, per-sheep distance to the pen (with
    a dotted vertical line at the frame each sheep was first seen penned),
    dog action magnitude, and reward. ``hist`` is the dict returned by
    ``run_and_record``; the x axis is the 0-based frame index.
    """
    t = np.arange(hist["steps"])
    fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
    axes[0].plot(t, hist["radii"], color="steelblue")
    axes[0].axhline(5.0, color="orange", ls="--", lw=1, label="compact (5m)")
    axes[0].set_ylabel("flock radius (m)")
    axes[0].legend(fontsize=8)
    axes[0].set_title("Flock radius")
    for i in range(hist["n_sheep"]):
        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
        axes[1].plot(t, hist["pen_dists"][i], color=c, lw=1,
                     label=f"sheep {i+1}")
        penned_step = hist["penned_at"][i]
        if penned_step is not None:
            # penned_at is a 1-based step count; shift onto the 0-based
            # frame axis so the marker lines up with the sample it refers to.
            axes[1].axvline(penned_step - 1, color=c, ls=":", lw=1)
    axes[1].set_ylabel("dist to pen (m)")
    axes[1].legend(fontsize=7, ncol=min(hist["n_sheep"], 5))
    axes[1].set_title("Per-sheep distance to pen")
    axes[2].plot(t, hist["action_mags"], color="tomato", lw=1)
    axes[2].axhline(1.0, color="gray", ls="--", lw=1, label="max")
    axes[2].set_ylabel("action ||(vx,vy)||")
    axes[2].set_ylim(0, 1.5)
    axes[2].set_title("Dog action magnitude")
    axes[2].legend(fontsize=8)
    axes[3].plot(t, hist["rewards"], color="purple", lw=1, alpha=0.7)
    axes[3].axhline(0, color="black", lw=0.5)
    axes[3].set_ylabel("reward")
    axes[3].set_xlabel("step")
    axes[3].set_title("Reward per step")
    result = ("SUCCESS" if hist["success"]
              else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})")
    fig.suptitle(f"n_sheep={hist['n_sheep']}  {result}  {hist['steps']} steps",
                 fontsize=13)
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
 def plot_success_rate(stage_results, out_path):
    """Save a bar chart of success rate (%) per sheep count to ``out_path``.
    Each item of ``stage_results`` must provide ``n_sheep`` and ``sr`` (a
    fraction, scaled by 100 here). A dashed line marks the 90% target and
    every bar is annotated with its rounded percentage.
    """
    sheep_counts = []
    percentages = []
    for res in stage_results:
        sheep_counts.append(res["n_sheep"])
        percentages.append(res["sr"] * 100)
    fig, ax = plt.subplots(figsize=(8, 4))
    bars = ax.bar(sheep_counts, percentages, color="steelblue",
                  edgecolor="white")
    ax.set_xlabel("Sheep count")
    ax.set_ylabel("Success rate (%)")
    ax.set_ylim(0, 105)
    ax.axhline(90, color="orange", ls="--", lw=1, label="90% target")
    for bar, pct in zip(bars, percentages):
        x_mid = bar.get_x() + bar.get_width() / 2
        y_top = bar.get_height() + 1
        ax.text(x_mid, y_top, f"{pct:.0f}%", ha="center", fontsize=9)
    ax.legend()
    ax.set_title("Evaluation success rate per sheep count")
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
 # ── Animated GIF ─────────────────────────────────────────────────────────────
 def save_episode_gif(hist, out_path, fps=20, skip=3):
    """Animate a recorded episode and write it to ``out_path`` as a GIF.
    Every ``skip``-th frame is rendered (``skip`` below 1 is treated as 1)
    and the final frame is always included. Sheep markers turn deep pink
    while their ``sheep_penned`` flag is set; the dog leaves a faint trail.
    """
    n_sheep = hist["n_sheep"]
    total = hist["steps"]
    last = total - 1
    frames = list(range(0, total, max(1, skip)))
    if frames[-1] != last:
        frames.append(last)
    fig, ax = plt.subplots(figsize=(6, 6))
    draw_field(ax)
    title = ax.text(0, 16.5, "", ha="center", fontsize=11)
    dog_marker, = ax.plot([], [], "s", color=DOG_COLOR, ms=12,
                          markeredgecolor="black", markeredgewidth=1.5,
                          zorder=5)
    sheep_markers = [
        ax.plot([], [], "o", color=SHEEP_COLORS[i % len(SHEEP_COLORS)], ms=10,
                markeredgecolor="#333", markeredgewidth=1, zorder=4)[0]
        for i in range(n_sheep)
    ]
    dog_trail, = ax.plot([], [], color=DOG_COLOR, lw=1.0, alpha=0.5)
    def update(k):
        # Redraw every artist for frame index k.
        n_in = sum(hist['sheep_penned'][i][k] for i in range(n_sheep))
        title.set_text(
            f"n={n_sheep}  step {k+1}/{hist['steps']}  "
            f"penned {n_in}/{n_sheep}")
        dog_marker.set_data([hist["dog_xs"][k]], [hist["dog_ys"][k]])
        dog_trail.set_data(hist["dog_xs"][:k+1], hist["dog_ys"][:k+1])
        for i, marker in enumerate(sheep_markers):
            marker.set_data([hist["sheep_xs"][i][k]], [hist["sheep_ys"][i][k]])
            if hist["sheep_penned"][i][k]:
                marker.set_color("deeppink")
            else:
                marker.set_color(SHEEP_COLORS[i % len(SHEEP_COLORS)])
        return [title, dog_marker, dog_trail, *sheep_markers]
    anim = animation.FuncAnimation(
        fig, update, frames=frames, interval=1000 / fps, blit=False)
    anim.save(out_path, writer=animation.PillowWriter(fps=fps), dpi=80)
    plt.close(fig)
 # ── CLI ──────────────────────────────────────────────────────────────────────
 def _resolve_paths(args):
    """Return ``(model_path, vecnorm_path, config_path)`` for the CLI args.
    With ``--run-dir`` set, the three standard artefact names inside that
    directory are used; otherwise the explicit ``--model`` / ``--vecnorm`` /
    ``--config`` values are returned unchanged (any of them may be None).
    """
    if not args.run_dir:
        return args.model, args.vecnorm, args.config
    artefacts = ("final_model.zip", "vecnorm.pkl", "config.json")
    return tuple(os.path.join(args.run_dir, name) for name in artefacts)
 def main():
    """CLI entry point: roll out a saved policy once and render the plots.
    Loads the model and VecNormalize stats (from ``--run-dir`` or from
    ``--model`` + ``--vecnorm``), runs one deterministic episode and writes
    ``trajectory.png``, ``timeseries.png`` and, unless ``--no-gif``,
    ``episode.gif`` into the output directory (default: ``vis_<n>s`` next to
    the model file).
    """
    p = argparse.ArgumentParser(
        description="Render trajectory + timeseries + GIF for a saved policy.")
    p.add_argument("--run-dir", type=str, default=None,
                   help="Run directory containing final_model.zip + vecnorm.pkl + config.json")
    p.add_argument("--model",   type=str, default=None)
    p.add_argument("--vecnorm", type=str, default=None)
    p.add_argument("--config",  type=str, default=None)
    p.add_argument("--n-sheep", type=int, default=3)
    p.add_argument("--seed",    type=int, default=42)
    p.add_argument("--max-steps", type=int, default=2500)
    p.add_argument("--out-dir", type=str, default=None)
    p.add_argument("--no-gif",  action="store_true",
                   help="Skip the animated GIF (PNG-only is faster).")
    p.add_argument("--gif-fps", type=int, default=20)
    p.add_argument("--gif-skip", type=int, default=3)
    args = p.parse_args()
    model_path, vn_path, cfg_path = _resolve_paths(args)
    if not (model_path and vn_path):
        p.error("either --run-dir or both --model and --vecnorm are required")
    rcfg = None
    if cfg_path and os.path.exists(cfg_path):
        with open(cfg_path) as f:
            cfg = json.load(f)
        # Keep only config keys that exist as attributes on HerdingEnv.
        rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}
    out_dir = args.out_dir or os.path.join(
        os.path.dirname(os.path.abspath(model_path)),
        f"vis_{args.n_sheep}s")
    os.makedirs(out_dir, exist_ok=True)
    print(f"Loading model:   {model_path}")
    print(f"Loading vecnorm: {vn_path}")
    model = PPO.load(model_path, device="cpu")
    raw = DummyVecEnv([make_eval_env(args.n_sheep, args.seed, args.max_steps, rcfg)])
    # NOTE: VecNormalize.load unpickles the stats file — only point it at
    # run artefacts you trust.
    vn = VecNormalize.load(vn_path, raw)
    try:
        print(f"Rolling out n_sheep={args.n_sheep} (seed={args.seed})...")
        hist = run_and_record(model, vn, args.n_sheep, args.max_steps,
                              reward_cfg=rcfg, seed=args.seed)
    finally:
        # The loaded wrapper only serves as a stats template; release its env.
        vn.close()
    result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})"
    print(f"  {result} in {hist['steps']} steps")
    plot_trajectory(hist, os.path.join(out_dir, "trajectory.png"))
    plot_timeseries(hist, os.path.join(out_dir, "timeseries.png"))
    print(f"  saved trajectory.png + timeseries.png to {out_dir}/")
    if not args.no_gif:
        gif_path = os.path.join(out_dir, "episode.gif")
        print(f"  rendering GIF (fps={args.gif_fps}, skip={args.gif_skip})...")
        save_episode_gif(hist, gif_path, fps=args.gif_fps, skip=args.gif_skip)
        print(f"  saved {gif_path}")
 if __name__ == "__main__":
    main()
@@ -1,5 +1,5 @@
 Webots Project File version R2025a
-perspectives: 000000ff00000000fd00000002000000010000011c00000298fc0200000001fb0000001400540065007800740045006400690074006f00720100000000000002980000003f00ffffff000000030000084300000238fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000008430000006900ffffff000007250000029800000001000000020000000100000008fc00000000
+perspectives: 000000ff00000000fd00000002000000010000011c000001bcfc0200000001fb0000001400540065007800740045006400690074006f00720100000000000001bc0000003f00ffffff00000003000005c600000220fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000005c60000006900ffffff000004a8000001bc00000001000000020000000100000008fc00000000
 simulationViewPerspectives: 000000ff000000010000000200000100000006250100000002010000000100
 sceneTreePerspectives: 000000ff00000001000000030000001f000000c0000000000100000002010000000200
 maximizedDockId: -1
@@ -10,7 +10,7 @@ EXTERNPROTO "../protos/Sheep.proto"
 # World
 WorldInfo {
  info [
-    "RL-Based Autonomous Shepherd Robot"
+    "Autonomous Shepherd Robot (Strömbom)"
    "Group G25"
  ]
  title "Shepherd Herding"
@@ -106,19 +106,26 @@ Solid { translation -2.5 -15 0.84 children [ Shape { appearance USE CAP geometry
 Solid { translation 14 -15 0.40 children [ Shape { appearance USE STONE_A geometry Box { size 2.0 0.16 0.80 } } ] boundingObject Box { size 2.0 0.16 0.80 } }
 Solid { translation 14 -15 0.84 children [ Shape { appearance USE CAP geometry Box { size 2.1 0.26 0.07 } } ] boundingObject Box { size 2.1 0.26 0.07 } }
 # Gate posts
-Solid { translation 10 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
+Solid { translation 10 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
-Solid { translation 13 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
+Solid { translation 13 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
-# Outer gate (wooden, slightly ajar, Z-brace)
+# Outer gate — fully open, hinged on the west gate post. Modeled as a swung-back
-Solid { translation 11.5 -15.08 0.55 rotation 0 0 1 0.25 children [
+# wooden gate parallel to the south wall, on the west side, so the 3m corridor
 # between gate posts (x=10..13, y=-15) is unobstructed.
 Solid { translation 8.6 -15.05 0.55 rotation 0 0 1 0 children [
  Shape { appearance USE WOOD geometry Box { size 2.80 0.05 1.00 } }
-  Transform { translation 0 0.02 0 rotation 0 1 0 0.34 children [ Shape { appearance DEF FPOST PBRAppearance { baseColor 0.35 0.22 0.10 roughness 0.90 } geometry Box { size 2.97 0.04 0.06 } } ] }
+  # FPOST appearance DEF lives here so the external pen below can USE it.
  Transform { translation 0 0.02 0 rotation 0 1 0 0.34 children [
    Shape { appearance DEF FPOST PBRAppearance { baseColor 0.35 0.22 0.10 roughness 0.90 } geometry Box { size 2.97 0.04 0.06 } }
  ] }
 ] boundingObject Box { size 2.80 0.08 1.00 } }
-# ==================== QUARANTINE PEN (wooden post-and-rail fence, inside field) ====================
+# ==================== EXTERNAL PEN (south of field, accessed through south-wall gate) ====================
-# Flow: main field → inner gate → quarantine area → outer gate → outside
+# Flow: main field → south-wall gate (x ∈ [10, 13], y = -15) → external pen
 # The pen is a wooden post-and-rail rectangle south of the field, x ∈ [10, 13],
 # y ∈ [-22, -15], open on the north side (the gate hole is the entrance).
-# West wall (x=10, ~7m along Y)
+# Pen west wall (x=10, y from -22 to -15, length 7m)
-Solid { translation 10 -11.46 0.55 children [
+Solid { translation 10 -18.5 0.55 children [
  Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
@@ -130,8 +137,8 @@ Solid { translation 10 -11.46 0.55 children [
  Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] }
 ] boundingObject Box { size 0.14 6.92 1.10 } }
-# East wall (x=13)
+# Pen east wall (x=13, y from -22 to -15, length 7m)
-Solid { translation 13 -11.46 0.55 children [
+Solid { translation 13 -18.5 0.55 children [
  Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
@@ -143,39 +150,50 @@ Solid { translation 13 -11.46 0.55 children [
  Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] }
 ] boundingObject Box { size 0.14 6.92 1.10 } }
-# North wall - open entrance (no wall, just corner posts)
+# Pen south wall (y=-22, x from 10 to 13, length 3m, closes the back of the pen)
-Solid { translation 10 -8 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] boundingObject Box { size 0.12 0.12 1.10 } }
+Solid { translation 11.5 -22 0.55 children [
-Solid { translation 13 -8 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] boundingObject Box { size 0.12 0.12 1.10 } }
+  Transform { translation -1.5 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation  0   0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation  1.5 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
  Transform { translation 0 0 -0.38 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] }
  Transform { translation 0 0 -0.05 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] }
  Transform { translation 0 0 0.30 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] }
  Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 2.92 0.14 0.04 } } ] }
 ] boundingObject Box { size 2.92 0.14 1.10 } }
 # Pen north corner posts at the gate opening (no wall — sheep enter here from the field); visual only, no boundingObject
 Solid { translation 10 -15.0 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
 Solid { translation 13 -15.0 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
 # Corner pillars
-Solid { translation  15  15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
+Solid { translation  15  15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
-Solid { translation  15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
+Solid { translation  15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
-Solid { translation -15  15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
+Solid { translation -15  15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
-Solid { translation -15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } }
+Solid { translation -15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
 # Mid-pillars every 5 m — East
-Solid { translation  15  10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  15  10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation  15   5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  15   5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation  15   0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  15   0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation  15  -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  15  -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation  15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
 # West
-Solid { translation -15  10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation -15  10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation -15   5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation -15   5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation -15   0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation -15   0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation -15  -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation -15  -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation -15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation -15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
 # North
-Solid { translation  10  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  10  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation   5  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation   5  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation   0  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation   0  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation  -5  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  -5  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation -10  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation -10  15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
 # South (no mid-pillar at x=10 — the pen gate opening, x ∈ [10, 13], starts there)
-Solid { translation   5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation   5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation   0 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation   0 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation  -5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation  -5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
-Solid { translation -10 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } }
+Solid { translation -10 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
 # ==================== BARN 1 — Gambrel/Dutch style (NE, outside fence) ====================
 # Body 10×7×4, weathered gray-brown wood, gambrel roof, large double doors
@@ -503,28 +521,16 @@ ShepherdDog {
 }
 # ==================== SHEEP ====================
-Sheep {
+# Up to 10 sheep, scattered through the field's central/north zone. Comment
-  translation 3 2 0.5
+# out trailing slots to test smaller flock sizes; the dog policy is trained
-  name "sheep1"
+# to handle 1..10 sheep so any prefix works.
-  controller "sheep"
+Sheep { translation  3.0  2.0 0.5 name "sheep1"  controller "sheep" }
-}
+Sheep { translation  3.0 -2.0 0.5 name "sheep2"  controller "sheep" }
-Sheep {
+Sheep { translation  4.0  0.0 0.5 name "sheep3"  controller "sheep" }
-  translation 3 -2 0.5
+Sheep { translation -3.0  4.0 0.5 name "sheep4"  controller "sheep" }
-  name "sheep2"
+Sheep { translation -5.0 -2.0 0.5 name "sheep5"  controller "sheep" }
-  controller "sheep"
+Sheep { translation  6.0  5.0 0.5 name "sheep6"  controller "sheep" }
-}
+Sheep { translation -6.0  6.0 0.5 name "sheep7"  controller "sheep" }
-Sheep {
+Sheep { translation  0.0  8.0 0.5 name "sheep8"  controller "sheep" }
-  translation 4 0 0.5
+Sheep { translation -8.0  0.0 0.5 name "sheep9"  controller "sheep" }
-  name "sheep3"
+Sheep { translation  7.0 -4.0 0.5 name "sheep10" controller "sheep" }
  controller "sheep"
 }
 # Sheep {
 #   translation 3.5 1 0.5
 #   name "sheep4"
 #   controller "sheep"
 # }
 # Sheep {
 #   translation 3.5 -1 0.5
 #   name "sheep5"
 #   controller "sheep"
 # }