diff --git a/.gitignore b/.gitignore index 41ebae0..ed9ece1 100644 --- a/.gitignore +++ b/.gitignore @@ -4,18 +4,22 @@ # Python __pycache__/ - -# Training -training/**/events.out.tfevents.* -training/**/checkpoints/ -training/runs/** -!training/runs/.gitkeep - -# Controller runtime artefacts -controllers/shepherd_dog_rl/debug*.csv -controllers/shepherd_dog_rl/debug_out*/ -controllers/shepherd_dog_rl/final_model*.zip -controllers/shepherd_dog_rl/vecnorm*.pkl +*.pyc +.venv/ # Optional env parity debug dog_debug.csv + +# Webots controller scratch +controllers/shepherd_dog/dog_behavior_log.csv + +# Training artefacts +training/runs/* +!training/runs/.gitkeep +*.zip +*.pkl + +# TensorBoard +events.out.tfevents.* +worlds/field_test.wbt +herding_runtime.cfg diff --git a/controllers/sheep/sheep.py b/controllers/sheep/sheep.py index 7b2f1c8..9ce6b75 100644 --- a/controllers/sheep/sheep.py +++ b/controllers/sheep/sheep.py @@ -1,233 +1,148 @@ -""" -Sheep flocking controller (Webots, Reynolds boids variant). +"""Sheep flocking controller (Webots). Each sheep broadcasts its GPS position every 3 steps on channel 1 and -listens for the dog and peer sheep positions. Peers are keyed by robot -name so each neighbour has exactly one current entry in the dict. +listens for the dog and peer sheep positions. The behavioural step is +delegated to ``herding.flocking_sim.compute_heading_speed`` so the +training environment and Webots run identical sheep dynamics. 
-Force stack each step (summed then converted to a heading + speed): - flee — away from dog, quadratic ramp, dominant when close - cohesion — toward flock centre, halved while fleeing - separation — inverse-distance push, prevents physical overlap - walls — linear repulsion from field boundary - wander — small persistent drift for natural idle motion - -Pen behaviour: on first entry into the quarantine pen the sheep latches -permanently — it turns pink (via the exposed woolColor PROTO field) and -the normal force stack is replaced by pen-confinement forces only. +Pen behaviour: a sheep latches to ``penned`` the first time it crosses +the south-wall gate plane into the gate corridor. Once latched it turns +pink (via the exposed ``woolColor`` PROTO field) and the force stack +switches to in-pen containment. """ -import random import math +import os +import random +import sys + +# --- Make the shared herding/ package importable from this controller dir --- +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + from controller import Supervisor -# --------------------------------------------------------------------------- -# Tuning constants -# --------------------------------------------------------------------------- +from herding.diffdrive import heading_speed_to_wheels +from herding.flocking_sim import MAX_SPEED, compute_heading_speed +from herding.geometry import ( + SHEEP_MAX_WHEEL_OMEGA, + is_penned_position, +) -MAX_SPEED = 22.0 # rad/s hard clamp on both motors -FLEE_SPEED = 20.0 # rad/s upper bound while panicking -WANDER_SPEED = 3.0 # rad/s lower bound during calm wandering - -X_MIN, X_MAX = -14.5, 14.5 # stone wall inner edges (metres) -Y_MIN, Y_MAX = -14.5, 14.5 -WALL_MARGIN = 3.5 # avoidance starts this far from the wall - -FLEE_DIST = 7.0 # dog within this radius triggers flee (metres) -SEPARATION_DIST = 2.5 # 
inverse-distance push active inside this radius -COHESION_DIST = 8.0 # pull toward flock centre active inside this radius - -PEN_X_MIN, PEN_X_MAX = 10.0, 13.0 # quarantine pen extents (metres) -PEN_Y_MIN, PEN_Y_MAX = -15.0, -8.0 # open entrance at y=-8, gate at y=-15 -PEN_MARGIN = 0.8 # confinement force starts this far from pen wall # --------------------------------------------------------------------------- # Device setup # --------------------------------------------------------------------------- -robot = Supervisor() +robot = Supervisor() timestep = int(robot.getBasicTimeStep()) -name = robot.getName() +name = robot.getName() self_node = robot.getSelf() -left_motor = robot.getDevice("left wheel motor") +left_motor = robot.getDevice("left wheel motor") right_motor = robot.getDevice("right wheel motor") left_motor.setPosition(float("inf")) right_motor.setPosition(float("inf")) left_motor.setVelocity(0.0) right_motor.setVelocity(0.0) +MOTOR_MAX = min(left_motor.getMaxVelocity(), SHEEP_MAX_WHEEL_OMEGA) -gps = robot.getDevice("gps"); gps.enable(timestep) -compass = robot.getDevice("compass"); compass.enable(timestep) +gps = robot.getDevice("gps"); gps.enable(timestep) +compass = robot.getDevice("compass"); compass.enable(timestep) receiver = robot.getDevice("receiver"); receiver.enable(timestep) -emitter = robot.getDevice("emitter") +emitter = robot.getDevice("emitter") + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- -def norm_angle(a): - return math.atan2(math.sin(a), math.cos(a)) - - def bearing(): # Compass returns north direction in sensor frame; for this Z-up world # with north = +Y, atan2(n[0], n[1]) gives the standard math angle - # (0 = east, π/2 = north) matching atan2(fy, fx) used for heading. + # (0 = east, π/2 = north) matching atan2(fy, fx) used for headings. 
n = compass.getValues() return math.atan2(n[0], n[1]) -def drive(heading, speed): - err = norm_angle(heading - bearing()) - # Scale forward component by cos(err): at 90° error fwd→0 so the robot - # spins in place to realign rather than driving sideways at full speed. - fwd = speed * max(0.0, math.cos(err)) - k = 4.0 - left_motor.setVelocity( max(-MAX_SPEED, min(MAX_SPEED, fwd - k * err))) - right_motor.setVelocity(max(-MAX_SPEED, min(MAX_SPEED, fwd + k * err))) +def drive(heading, speed_motor): + left_w, right_w = heading_speed_to_wheels( + heading, min(speed_motor, MAX_SPEED), bearing(), MOTOR_MAX, k_turn=4.0 + ) + left_motor.setVelocity(left_w) + right_motor.setVelocity(right_w) def paint_pink(): # woolColor is declared as a PROTO field with IS binding to the DEF WOOL - # PBRAppearance baseColor. Changing it here propagates to every USE WOOL - # shape on the body. Direct field access avoids PROTO-internal opacity. + # PBRAppearance baseColor; setting it propagates to every USE WOOL shape. self_node.getField("woolColor").setSFColor([1.0, 0.55, 0.72]) + # --------------------------------------------------------------------------- # State # --------------------------------------------------------------------------- wander_angle = random.uniform(-math.pi, math.pi) -step = 0 -dog_x = None -dog_y = None -peers = {} # name → (x, y), one entry per neighbour, cleared every 30 steps +step_count = 0 +dog_x, dog_y = None, None +peers = {} # name → (x, y), one entry per neighbour, cleared every 30 steps penned = False +# Stuck detection: differential-drive sheep can pin against a wall and need +# a forced reverse-and-rotate to escape. If displacement < STUCK_DIST for +# STUCK_STEPS consecutive steps, drive toward field centre. 
+_prev_x, _prev_y = None, None +_stuck_count = 0 +STUCK_STEPS = 20 +STUCK_DIST = 0.05 + + # --------------------------------------------------------------------------- # Main loop # --------------------------------------------------------------------------- while robot.step(timestep) != -1: - step += 1 + step_count += 1 pos = gps.getValues() x, y = pos[0], pos[1] - # Pen entry: one-way latch, never unset - if not penned and PEN_X_MIN < x < PEN_X_MAX and PEN_Y_MIN < y < PEN_Y_MAX: + # Pen entry: one-way latch. Penned sheep get pink wool and switch behaviour. + if not penned and is_penned_position(x, y): penned = True paint_pink() - # Refresh peer table (clear before receiving so fresh data is never lost) - if step % 30 == 0: + # Refresh peer table — clear before receiving so fresh data is never lost. + if step_count % 30 == 0: peers.clear() while receiver.getQueueLength() > 0: msg = receiver.getString() receiver.nextPacket() - p = msg.split(":") - if p[0] == "dog" and len(p) >= 3: - dog_x, dog_y = float(p[1]), float(p[2]) - elif p[0] == "sheep" and len(p) >= 4 and p[1] != name: - peers[p[1]] = (float(p[2]), float(p[3])) + parts = msg.split(":") + if parts[0] == "dog" and len(parts) >= 3: + dog_x, dog_y = float(parts[1]), float(parts[2]) + elif parts[0] == "sheep" and len(parts) >= 4 and parts[1] != name: + peers[parts[1]] = (float(parts[2]), float(parts[3])) - fx, fy = 0.0, 0.0 + dog_xy = (dog_x, dog_y) if dog_x is not None and dog_y is not None else None + heading, speed, wander_angle = compute_heading_speed( + x=x, y=y, penned=penned, dog_xy=dog_xy, peers=peers, + wander_angle=wander_angle, + ) - # Repel unpenned sheep from the exterior of the pen's side walls so they - # don't get pinned by flee forces. Only fires when strictly outside the pen - # (x < PEN_X_MIN or x > PEN_X_MAX) at pen height (y in pen y-range). - # Entrance is open on the north (y > PEN_Y_MAX) — no force there. 
- PEN_EXT_MARGIN = 0.8 - if not penned and PEN_Y_MIN < y < PEN_Y_MAX: - if PEN_X_MIN - PEN_EXT_MARGIN < x < PEN_X_MIN: - fx -= ((x - (PEN_X_MIN - PEN_EXT_MARGIN)) / PEN_EXT_MARGIN) * 6.0 - if PEN_X_MAX < x < PEN_X_MAX + PEN_EXT_MARGIN: - fx += ((PEN_X_MAX + PEN_EXT_MARGIN - x) / PEN_EXT_MARGIN) * 6.0 + # Stuck detection — safety net for differential-drive wall pinning. + if _prev_x is not None: + moved = math.hypot(x - _prev_x, y - _prev_y) + _stuck_count = _stuck_count + 1 if moved < STUCK_DIST else 0 + if _stuck_count >= STUCK_STEPS: + heading = math.atan2(-y, -x) # always points away from the boundary + speed = MAX_SPEED + _stuck_count = 0 + _prev_x, _prev_y = x, y - if penned: - # Inside pen: wander freely, strong boundary forces prevent exit, - # separation still active to avoid collisions with other penned sheep. - - pm = PEN_MARGIN - if x < PEN_X_MIN + pm: fx += ((PEN_X_MIN + pm - x) / pm) * 15.0 - if x > PEN_X_MAX - pm: fx -= ((x - (PEN_X_MAX - pm)) / pm) * 15.0 - if y < PEN_Y_MIN + pm: fy += ((PEN_Y_MIN + pm - y) / pm) * 15.0 - if y > PEN_Y_MAX - pm: fy -= ((y - (PEN_Y_MAX - pm)) / pm) * 15.0 - - for px, py in peers.values(): - dx, dy = px - x, py - y - d = math.hypot(dx, dy) - if 0.05 < d < SEPARATION_DIST: - push = (SEPARATION_DIST - d) / d - fx -= (dx / d) * push * 2.5 - fy -= (dy / d) * push * 2.5 - - if random.random() < 0.02: - wander_angle += random.uniform(-0.6, 0.6) - fx += math.cos(wander_angle) * 0.5 - fy += math.sin(wander_angle) * 0.5 - - else: - fleeing = False - - # Flee — quadratic ramp so force grows rapidly as the dog closes in - if dog_x is not None: - dx = dog_x - x - dy = dog_y - y - dist = math.hypot(dx, dy) - if 0.01 < dist < FLEE_DIST: - fleeing = True - t = 1.0 - dist / FLEE_DIST - s = t * t * 20.0 - fx -= (dx / dist) * s - fy -= (dy / dist) * s - - # Cohesion — halved while fleeing to reduce mid-panic collisions - cx, cy, cn = 0.0, 0.0, 0 - for px, py in peers.values(): - d = math.hypot(px - x, py - y) - if 0.3 < d < 
COHESION_DIST: - cx += px; cy += py; cn += 1 - if cn > 0: - w = 0.08 if fleeing else 0.15 - fx += (cx / cn - x) * w - fy += (cy / cn - y) * w - - # Separation — inverse-distance: huge when nearly overlapping, fades quickly - for px, py in peers.values(): - dx, dy = px - x, py - y - d = math.hypot(dx, dy) - if 0.05 < d < SEPARATION_DIST: - push = (SEPARATION_DIST - d) / d - fx -= (dx / d) * push * 2.5 - fy -= (dy / d) * push * 2.5 - - # Walls - if x < X_MIN + WALL_MARGIN: fx += ((X_MIN + WALL_MARGIN - x) / WALL_MARGIN) * 6.0 - if x > X_MAX - WALL_MARGIN: fx -= ((x - (X_MAX - WALL_MARGIN)) / WALL_MARGIN) * 6.0 - if y < Y_MIN + WALL_MARGIN: fy += ((Y_MIN + WALL_MARGIN - y) / WALL_MARGIN) * 6.0 - if y > Y_MAX - WALL_MARGIN: fy -= ((y - (Y_MAX - WALL_MARGIN)) / WALL_MARGIN) * 6.0 - - # Wander — suppressed while fleeing so drift cannot deflect the flee heading - if not fleeing: - if random.random() < 0.02: - wander_angle += random.uniform(-0.6, 0.6) - fx += math.cos(wander_angle) * 0.5 - fy += math.sin(wander_angle) * 0.5 - - # Hard-stop clamp: within 0.5 m of a wall, zero any force component that - # would push further into it. Prevents the flee force from pinning a sheep - # against the boundary when the dog approaches from outside. - HS = 0.5 - if x < X_MIN + HS and fx < 0: fx = 0.0 - if x > X_MAX - HS and fx > 0: fx = 0.0 - if y < Y_MIN + HS and fy < 0: fy = 0.0 - if y > Y_MAX - HS and fy > 0: fy = 0.0 - - heading = math.atan2(fy, fx) - mag = math.hypot(fx, fy) - speed = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0)) drive(heading, speed) - if step % 3 == 0: + if step_count % 3 == 0: emitter.send(f"sheep:{name}:{x:.4f}:{y:.4f}") diff --git a/controllers/shepherd_dog/policy_loader.py b/controllers/shepherd_dog/policy_loader.py new file mode 100644 index 0000000..fd3728a --- /dev/null +++ b/controllers/shepherd_dog/policy_loader.py @@ -0,0 +1,78 @@ +"""Lazy loader for the SB3 PPO policy used by the dog controller. 
+ +Importing stable-baselines3 inside the Webots Python interpreter is only +needed when ``HERDING_MODE=rl``; the Strömbom mode runs without it. This +loader keeps SB3 out of the import path until you actually ask for the RL +policy, so users without SB3 installed can still run the Strömbom +baseline. + +The policy + VecNormalize statistics are saved together by +``training/train_ppo.py``: + + runs//best/best_model.zip # SB3 PPO checkpoint + runs//best/vecnormalize.pkl # observation-normaliser stats + +Pass either the directory or the explicit zip path. +""" + +import os +from pathlib import Path + + +class PolicyHandle: + """Wrap a loaded PPO policy + VecNormalize so the controller can call + ``predict(obs)`` without thinking about either.""" + + def __init__(self, model, vecnorm): + self.model = model + self.vecnorm = vecnorm + + def predict(self, obs): + # VecNormalize expects a batched obs of shape (n_envs, obs_dim). + if self.vecnorm is not None: + import numpy as np + obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1) + obs_b = self.vecnorm.normalize_obs(obs_b) + else: + import numpy as np + obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1) + action, _ = self.model.predict(obs_b, deterministic=True) + return action[0] + + +def load(model_path: str, vecnorm_path: str | None = None) -> PolicyHandle: + """Load a PPO model (and optional VecNormalize) from disk. + + ``model_path`` may be the .zip checkpoint or a directory containing + ``best_model.zip`` (and optionally ``vecnormalize.pkl``). 
+ """ + p = Path(model_path) + if p.is_dir(): + zip_candidates = [p / "best_model.zip", p / "final.zip", p / "policy.zip"] + zip_path = next((z for z in zip_candidates if z.exists()), None) + if zip_path is None: + raise FileNotFoundError( + f"No PPO zip found in {p} (looked for best_model.zip, final.zip, policy.zip)" + ) + if vecnorm_path is None: + vn = p / "vecnormalize.pkl" + if vn.exists(): + vecnorm_path = str(vn) + else: + zip_path = p + + # Imports deferred so the Strömbom path doesn't require SB3. + from stable_baselines3 import PPO + from stable_baselines3.common.vec_env import VecNormalize + + model = PPO.load(str(zip_path), device="auto") + vecnorm = None + if vecnorm_path and os.path.exists(vecnorm_path): + # VecNormalize.load needs a venv to attach to; we only need its stats + # at inference, so we reconstruct the wrapper manually. + import pickle + with open(vecnorm_path, "rb") as f: + vecnorm = pickle.load(f) + vecnorm.training = False + vecnorm.norm_reward = False + return PolicyHandle(model=model, vecnorm=vecnorm) diff --git a/controllers/shepherd_dog/shepherd_dog.py b/controllers/shepherd_dog/shepherd_dog.py index 54d87a1..0830776 100644 --- a/controllers/shepherd_dog/shepherd_dog.py +++ b/controllers/shepherd_dog/shepherd_dog.py @@ -1,88 +1,283 @@ -""" -Shepherd Dog controller (Webots, manual keyboard control). +"""Shepherd Dog controller (Webots). -WASD / arrow keys drive the robot. +/- adjust speed in 10 % increments. -GPS position is broadcast every step on channel 1 so sheep controllers -can compute flee forces. Ears wag continuously via sinusoidal position -targets — purely cosmetic. +Runs in one of two modes selected by the ``HERDING_MODE`` environment +variable: + + HERDING_MODE=rl → load an SB3 PPO policy from + HERDING_POLICY_DIR (default + training/runs/latest/best) and use its + (vx, vy) action each step. + HERDING_MODE=strombom → use the analytic Strömbom collect/drive + heuristic. 
This is the fallback if the RL + policy can't be loaded (e.g. SB3 not + installed in the Webots Python env, or no + checkpoint yet). + +Both modes share the same low-level differential-drive controller +(``herding.diffdrive.velocity_to_wheels`` + clamped forward speed), so +switching modes does not retune the actuation layer. + +A safety supervisor enforces the "dog stays out of the pen" invariant: +if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is +overridden with a north-driving correction. This is a hard guarantee +the policy cannot escape. """ import math -from controller import Robot, Keyboard +import os +import sys -robot = Robot() +# --- Make the shared herding/ package importable from this controller dir --- +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +from controller import Robot + +from herding.diffdrive import velocity_to_wheels +from herding.geometry import ( + DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, + DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS, + PEN_ENTRY, +) +from herding.obs import build_obs +from herding.sequential import compute_action_debug as sequential_action_debug +from herding.strombom import compute_action_debug as strombom_action_debug + + +# --------------------------------------------------------------------------- +# Mode selection +# --------------------------------------------------------------------------- + +def _load_runtime_config(): + """Read mode + policy_dir overrides from a runtime config file. + + Webots strips HERDING_* env vars in some configurations, so the + launcher writes a tiny ``herding_runtime.cfg`` (key=value lines) + in the project root and the controller reads it here. Env vars + win if both are present; the file is the fallback. 
+ """ + cfg_path = os.path.join(_PROJECT_ROOT, "herding_runtime.cfg") + if not os.path.exists(cfg_path): + return {} + out = {} + try: + with open(cfg_path) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, _, v = line.partition("=") + out[k.strip().upper()] = v.strip() + except OSError: + return {} + return out + + +_runtime_cfg = _load_runtime_config() +MODE = (os.environ.get("HERDING_MODE") + or _runtime_cfg.get("HERDING_MODE") + or "rl").lower() + + +def _resolve_policy_dir() -> str: + """Where to look for the trained policy. + + Priority: + 1. HERDING_POLICY_DIR env var (if set and points to a real dir) + 2. training/runs/bc_pretrained/ (BC-only checkpoint) + 3. training/runs/bc_ppo/best/ (PPO fine-tuned best) + 4. training/runs/latest/best/ (legacy default) + """ + env_dir = (os.environ.get("HERDING_POLICY_DIR") + or _runtime_cfg.get("HERDING_POLICY_DIR")) + if env_dir and os.path.isdir(env_dir): + return env_dir + candidates = [ + os.path.join(_PROJECT_ROOT, "training", "runs", "bc_pretrained"), + os.path.join(_PROJECT_ROOT, "training", "runs", "bc_ppo", "best"), + os.path.join(_PROJECT_ROOT, "training", "runs", "latest", "best"), + ] + for c in candidates: + if os.path.isdir(c): + return c + # Last resort — return env var anyway so error message is informative. 
+ return env_dir or candidates[0] + + +POLICY_DIR = _resolve_policy_dir() + +policy_handle = None +if MODE == "rl": + print(f"[dog] HERDING_MODE={MODE} HERDING_POLICY_DIR(env)=" + f"{os.environ.get('HERDING_POLICY_DIR', '')}") + print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists=" + f"{os.path.isdir(POLICY_DIR)}") + if os.path.isdir(POLICY_DIR): + try: + entries = sorted(os.listdir(POLICY_DIR)) + except OSError: + entries = [] + print(f"[dog] dir contents: {entries}") + try: + from policy_loader import load as _load_policy + policy_handle = _load_policy(POLICY_DIR) + print(f"[dog] RL policy loaded from {POLICY_DIR}") + except Exception as exc: + print(f"[dog] RL policy load failed ({exc!r}); falling back to Strömbom.") + MODE = "strombom" +if MODE not in ("rl", "strombom", "sequential"): + print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.") + MODE = "strombom" +print(f"[dog] running in mode={MODE}") + + +# --------------------------------------------------------------------------- +# Action smoothing + safety supervisor +# --------------------------------------------------------------------------- + +ACTION_SMOOTH = 0.35 +prev_action = (0.0, 0.0) + + +def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple: + """If the dog is near the south barrier and the action would push it + further south, override with a northward action. 
Hard invariant: the + dog never enters the pen.""" + if dog_y < DOG_SOUTH_LIMIT and vy < 0.0: + return (0.0, 1.0) + if dog_y < DOG_SOUTH_LIMIT + 0.5 and vy < -0.2: + return (vx * 0.5, max(0.0, vy + 0.5)) + return (vx, vy) + + +# --------------------------------------------------------------------------- +# Driving +# --------------------------------------------------------------------------- + +def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float): + if math.hypot(vx, vy) < 1e-3: + left_motor.setVelocity(0.0) + right_motor.setVelocity(0.0) + return + n = compass.getValues() + h = math.atan2(n[0], n[1]) + left, right = velocity_to_wheels( + vx, vy, h, + max_linear=DOG_MAX_LINEAR, + wheel_radius=DOG_WHEEL_RADIUS, + max_wheel_omega=motor_max, + k_turn=4.0, + ) + left_motor.setVelocity(left) + right_motor.setVelocity(right) + + +# --------------------------------------------------------------------------- +# Webots devices +# --------------------------------------------------------------------------- + +robot = Robot() timestep = int(robot.getBasicTimeStep()) -left_motor = robot.getDevice("left wheel motor") +left_motor = robot.getDevice("left wheel motor") right_motor = robot.getDevice("right wheel motor") left_motor.setPosition(float("inf")) right_motor.setPosition(float("inf")) left_motor.setVelocity(0.0) right_motor.setVelocity(0.0) +MOTOR_MAX = min(left_motor.getMaxVelocity(), DOG_MAX_WHEEL_OMEGA) -lidar = robot.getDevice("lidar") -lidar.enable(timestep) -lidar.enablePointCloud() - -gps = robot.getDevice("gps"); gps.enable(timestep) -compass = robot.getDevice("compass"); compass.enable(timestep) -emitter = robot.getDevice("emitter") +gps = robot.getDevice("gps"); gps.enable(timestep) +compass = robot.getDevice("compass"); compass.enable(timestep) receiver = robot.getDevice("receiver"); receiver.enable(timestep) +emitter = robot.getDevice("emitter") -left_ear = robot.getDevice("left ear motor") +# Cosmetic ear motors — ignored by 
control logic but keep them animated. +left_ear = robot.getDevice("left ear motor") right_ear = robot.getDevice("right ear motor") left_ear.setPosition(float("inf")) right_ear.setPosition(float("inf")) left_ear.setVelocity(0.0) right_ear.setVelocity(0.0) +ear_phase = 0.0 +EAR_AMPLITUDE = 0.35 +EAR_RATE = 8.0 -keyboard = robot.getKeyboard() -keyboard.enable(timestep) -MOTOR_MAX = left_motor.getMaxVelocity() -speed_level = 0.5 # fraction of MOTOR_MAX; adjusted by +/- +# --------------------------------------------------------------------------- +# Main loop +# --------------------------------------------------------------------------- -EAR_AMPLITUDE = 0.35 # rad, peak ear deflection -EAR_RATE = 8.0 # rad/s, how fast the ears are driven -ear_phase = 0.0 +# {name: (x, y)} — kept across all sheep ever heard from. Sheep that drift +# into the pen are tracked by ``penned`` so observations and Strömbom +# agree on which ones still need herding. +sheep_positions: dict = {} +penned_set: set = set() +step_count = 0 + +from herding.geometry import is_penned_position while robot.step(timestep) != -1: - speed = MOTOR_MAX * speed_level - turn = speed * 0.6 # differential turn radius + step_count += 1 - left_vel = 0.0 - right_vel = 0.0 - key = keyboard.getKey() - while key > 0: - if key in (ord('W'), Keyboard.UP): - left_vel = speed - right_vel = speed - elif key in (ord('S'), Keyboard.DOWN): - left_vel = -speed - right_vel = -speed - elif key in (ord('A'), Keyboard.LEFT): - left_vel = -turn - right_vel = turn - elif key in (ord('D'), Keyboard.RIGHT): - left_vel = turn - right_vel = -turn - elif key in (ord('+'), ord('=')): - speed_level = min(1.0, speed_level + 0.1) - print(f"Speed: {speed_level:.0%} ({MOTOR_MAX * speed_level:.1f} rad/s)") - elif key in (ord('-'), ord('_')): - speed_level = max(0.1, speed_level - 0.1) - print(f"Speed: {speed_level:.0%} ({MOTOR_MAX * speed_level:.1f} rad/s)") - key = keyboard.getKey() - - left_motor.setVelocity(left_vel) - 
right_motor.setVelocity(right_vel) + while receiver.getQueueLength() > 0: + msg = receiver.getString() + receiver.nextPacket() + parts = msg.split(":") + if len(parts) == 4 and parts[0] == "sheep": + try: + x, y = float(parts[2]), float(parts[3]) + except ValueError: + continue + sheep_positions[parts[1]] = (x, y) + if parts[1] not in penned_set and is_penned_position(x, y): + penned_set.add(parts[1]) pos = gps.getValues() - emitter.send(f"dog:{pos[0]}:{pos[1]}") + dog_xy = (pos[0], pos[1]) + n = compass.getValues() + dog_heading = math.atan2(n[0], n[1]) + # ---- Action selection ---- + if MODE == "rl" and policy_handle is not None: + sheep_xy_list = list(sheep_positions.values()) + sheep_names = list(sheep_positions.keys()) + sheep_penned_list = [s in penned_set for s in sheep_names] + obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list) + action = policy_handle.predict(obs) + vx, vy = float(action[0]), float(action[1]) + elif MODE == "sequential": + vx, vy, _mode_str, _dbg = sequential_action_debug( + dog_xy, sheep_positions, PEN_ENTRY, + ) + else: + # Strömbom (canonical baseline). + vx, vy, _mode_str, _dbg = strombom_action_debug( + dog_xy, sheep_positions, PEN_ENTRY, + ) + + # EMA smoothing — reduces oscillation from policy or Strömbom flips. + vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx + vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy + + # Safety: dog must never enter the pen. + vx, vy = safety_clamp(vx, vy, dog_xy[0], dog_xy[1]) + prev_action = (vx, vy) + + drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX) + emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}") + + # Cosmetic ear wiggle — purely visual. 
ear_phase += 0.12 ear_pos = EAR_AMPLITUDE * math.sin(ear_phase) left_ear.setVelocity(EAR_RATE) right_ear.setVelocity(EAR_RATE) - left_ear.setPosition( ear_pos) + left_ear.setPosition(ear_pos) right_ear.setPosition(-ear_pos) + + if step_count % 200 == 0: + n_active = sum(1 for s in sheep_positions if s not in penned_set) + print(f"[dog mode={MODE}] step={step_count} known={len(sheep_positions)} " + f"penned={len(penned_set)} active={n_active} action=({vx:+.2f}, {vy:+.2f})") diff --git a/controllers/shepherd_dog_rl/final_model.zip b/controllers/shepherd_dog_rl/final_model.zip deleted file mode 100644 index 139a531..0000000 Binary files a/controllers/shepherd_dog_rl/final_model.zip and /dev/null differ diff --git a/controllers/shepherd_dog_rl/plot_debug.py b/controllers/shepherd_dog_rl/plot_debug.py deleted file mode 100644 index 452ac3d..0000000 --- a/controllers/shepherd_dog_rl/plot_debug.py +++ /dev/null @@ -1,153 +0,0 @@ -""" -Render Webots-side debug trajectory from debug.csv. - -The shepherd_dog_rl controller writes per-step state to debug.csv when -DOG_DEBUG=1. 
This script reads it and produces: - - trajectory.png — dog path + sheep paths overlaid on the field - obs_drift.png — normalized observation distribution over time - actions.png — vx, vy time series - -Run: - python plot_debug.py # uses debug.csv next to this file - python plot_debug.py --csv path/to.csv --out-dir somewhere/ -""" -import argparse -import csv -import os -import sys - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import matplotlib.patches as mpatches -import numpy as np - - -def load_csv(path): - rows = [] - with open(path) as f: - rd = csv.DictReader(f) - for r in rd: - rows.append(r) - if not rows: - sys.exit(f"empty CSV: {path}") - return rows - - -def parse_floats(s): - return [float(x) for x in s.split(";") if x] - - -def plot_trajectory(rows, out_path): - fig, ax = plt.subplots(figsize=(7, 7)) - ax.set_xlim(-16, 16); ax.set_ylim(-16, 16); ax.set_aspect("equal") - ax.set_facecolor("#dcedc8") - ax.add_patch(mpatches.Rectangle((-15, -15), 30, 30, - fill=False, edgecolor="#795548", lw=2)) - ax.add_patch(mpatches.Rectangle((10, -15), 3, 7, - facecolor="#ffe082", edgecolor="#795548", lw=2)) - ax.text(11.5, -11.5, "pen", ha="center", va="center", fontsize=8) - - dog_x = [float(r["dog_x"]) for r in rows] - dog_y = [float(r["dog_y"]) for r in rows] - ax.plot(dog_x, dog_y, color="#4e342e", lw=1.5, alpha=0.7, label="dog") - ax.plot(dog_x[0], dog_y[0], "s", color="#4e342e", ms=10) - ax.plot(dog_x[-1], dog_y[-1], "D", color="#4e342e", ms=10) - - # Sheep — re-shape into per-sheep tracks - sx_all = [parse_floats(r["sheep_xs"]) for r in rows] - sy_all = [parse_floats(r["sheep_ys"]) for r in rows] - if sx_all and sx_all[-1]: - n_sheep = len(sx_all[-1]) - palette = ["#e41a1c","#377eb8","#4daf4a","#984ea3","#ff7f00", - "#a65628","#f781bf","#999999","#66c2a5","#fc8d62"] - for i in range(n_sheep): - xs = [r[i] if i < len(r) else None for r in sx_all] - ys = [r[i] if i < len(r) else None for r in sy_all] - xs = [x for x in xs if x is 
not None] - ys = [y for y in ys if y is not None] - if xs: - c = palette[i % len(palette)] - ax.plot(xs, ys, color=c, lw=0.8, alpha=0.6, label=f"sheep {i+1}") - ax.plot(xs[0], ys[0], "o", color=c, ms=6) - ax.plot(xs[-1], ys[-1], "*", color=c, ms=10) - - n_in_pen = int(rows[-1]["n_penned"]) - ax.set_title(f"Webots trajectory {len(rows)} steps penned={n_in_pen}", - fontsize=12) - ax.legend(loc="upper left", fontsize=7, ncol=2) - plt.tight_layout() - fig.savefig(out_path, dpi=120) - plt.close(fig) - - -def plot_actions(rows, out_path): - t = np.arange(len(rows)) - vx = np.array([float(r["vx"]) for r in rows]) - vy = np.array([float(r["vy"]) for r in rows]) - mag = np.sqrt(vx ** 2 + vy ** 2) - - fig, axes = plt.subplots(3, 1, figsize=(12, 7), sharex=True) - axes[0].plot(t, vx, color="tab:red", lw=0.8); axes[0].set_ylabel("vx") - axes[0].axhline(0, color="black", lw=0.4); axes[0].set_ylim(-1.1, 1.1) - axes[1].plot(t, vy, color="tab:blue", lw=0.8); axes[1].set_ylabel("vy") - axes[1].axhline(0, color="black", lw=0.4); axes[1].set_ylim(-1.1, 1.1) - axes[2].plot(t, mag, color="tab:purple", lw=0.8); axes[2].set_ylabel("||action||") - axes[2].axhline(np.sqrt(2), color="orange", ls="--", lw=1, label="saturated √2") - axes[2].axhline(1.0, color="gray", ls="--", lw=1) - axes[2].set_xlabel("step"); axes[2].legend(fontsize=8) - fig.suptitle("Webots action time series") - plt.tight_layout() - fig.savefig(out_path, dpi=120) - plt.close(fig) - - -def plot_obs(rows, out_path): - norm = np.array([parse_floats(r["norm_obs"]) for r in rows]) - raw = np.array([parse_floats(r["raw_obs"]) for r in rows]) - if norm.size == 0: - return - n_dims = norm.shape[1] - labels = [ - "dog_x", "dog_y", "com-dog_x", "com-dog_y", - "far1-com_x", "far1-com_y", "far2-com_x", "far2-com_y", - "far3-com_x", "far3-com_y", "pen-com_x", "pen-com_y", - "pen-far1_x", "pen-far1_y", "radius", "frac_active", - ][:n_dims] - - t = np.arange(norm.shape[0]) - fig, axes = plt.subplots(n_dims, 1, figsize=(11, 1.0 * 
n_dims), sharex=True) - if n_dims == 1: axes = [axes] - for i in range(n_dims): - axes[i].plot(t, raw[:, i], color="tab:gray", lw=0.6, alpha=0.6, label="raw") - axes[i].plot(t, norm[:, i], color="tab:red", lw=0.8, label="normalised") - axes[i].set_ylabel(labels[i], fontsize=8) - axes[i].tick_params(labelsize=7) - if i == 0: - axes[i].legend(fontsize=7, loc="upper right") - axes[-1].set_xlabel("step") - fig.suptitle("Observation values over time (raw vs VecNormalize-normalised)") - plt.tight_layout() - fig.savefig(out_path, dpi=110) - plt.close(fig) - - -def main(): - p = argparse.ArgumentParser() - here = os.path.dirname(os.path.abspath(__file__)) - p.add_argument("--csv", default=os.path.join(here, "debug.csv")) - p.add_argument("--out-dir", default=os.path.join(here, "debug_out")) - args = p.parse_args() - - rows = load_csv(args.csv) - os.makedirs(args.out_dir, exist_ok=True) - print(f"loaded {len(rows)} rows from {args.csv}") - plot_trajectory(rows, os.path.join(args.out_dir, "trajectory.png")) - plot_actions(rows, os.path.join(args.out_dir, "actions.png")) - plot_obs(rows, os.path.join(args.out_dir, "obs.png")) - print(f"saved trajectory.png + actions.png + obs.png to {args.out_dir}/") - - -if __name__ == "__main__": - main() diff --git a/controllers/shepherd_dog_rl/shepherd_dog_rl.py b/controllers/shepherd_dog_rl/shepherd_dog_rl.py deleted file mode 100644 index 11f8f05..0000000 --- a/controllers/shepherd_dog_rl/shepherd_dog_rl.py +++ /dev/null @@ -1,285 +0,0 @@ -""" -Shepherd Dog RL controller — runs a trained SB3 PPO policy inside Webots. - -Setup ------ -1. Copy your trained files into this directory: - controllers/shepherd_dog_rl/final_model.zip - controllers/shepherd_dog_rl/vecnorm.pkl - -2. In field.wbt, set the ShepherdDog robot's controller field to - "shepherd_dog_rl". You can do this in the Webots GUI: - click the robot → Controller → shepherd_dog_rl - -3. Optional: set controllerArgs to ["5"] (number of sheep) if it differs - from the default of 5. 
- -The controller reads GPS (dog position) and Receiver (sheep broadcasts), -builds the same 16-dim flock observation the training env used, normalises -it with the saved VecNormalize stats, and converts the (vx, vy) policy -output into differential wheel speeds. - -Debug logging -------------- -Set env var DOG_DEBUG=1 to write a per-step CSV (dog pos, sheep positions, -raw obs, normalised obs, action) to debug.csv alongside this script. Use -plot_debug.py to render trajectories from it. -""" - -import sys -import os -import math -import struct -import numpy as np - -# ── make training code importable ─────────────────────────────────────────── -_HERE = os.path.dirname(os.path.abspath(__file__)) -_TRAINING = os.path.join(_HERE, "..", "..", "training") -sys.path.insert(0, _TRAINING) - -from controller import Robot -from stable_baselines3 import PPO -from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize -from herding_env import HerdingEnv - -# ── constants (must match herding_env.py) ─────────────────────────────────── -FIELD = 15.0 -PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32) -PEN_X = (10.0, 13.0) -PEN_Y = (-15.0, -8.0) -DOG_SPEED = 2.5 # m/s -WHEEL_R = 0.038 # wheel radius (metres) — from ShepherdDog.proto -K_TURN = 4.0 # heading-error gain (rad/s per rad) -EAR_AMPLITUDE = 0.35 -EAR_RATE = 8.0 - -# ── model paths ───────────────────────────────────────────────────────────── -MODEL_PATH = os.path.join(_HERE, "final_model.zip") -VECNORM_PATH = os.path.join(_HERE, "vecnorm.pkl") -DEBUG_CSV = os.path.join(_HERE, "debug.csv") -DEBUG_ENABLED = True # set False to disable debug.csv logging - -# ── action smoothing ───────────────────────────────────────────────────────── -# EMA on policy output to suppress the rapid oscillation (vx/vy flipping -# between -1 and +1 every step) that stalls the physical dog. 0 = no -# smoothing (raw policy), 1 = frozen. 0.3 keeps ~30% of previous action. 
-ACTION_SMOOTH = 0.3 -prev_action = np.zeros(2, dtype=np.float32) - - -def norm_angle(a: float) -> float: - while a > math.pi: a -= 2 * math.pi - while a < -math.pi: a += 2 * math.pi - return a - - -def in_pen(x: float, y: float) -> bool: - return PEN_X[0] < x < PEN_X[1] and PEN_Y[0] < y < PEN_Y[1] - - -def build_obs(dog_pos: np.ndarray, - sheep_dict: dict, - n_sheep: int, - dog_heading: float = 0.0) -> np.ndarray: - """ - Build the 18-dim flock observation — identical to HerdingEnv._obs(). - - sheep_dict: {name: (x, y)} for ALL known sheep (penned or not). - dog_heading: dog's current world-frame heading in radians. - """ - D = 2 * FIELD - - # Split active vs penned - active_pos = np.array( - [v for v in sheep_dict.values() if not in_pen(*v)], - dtype=np.float32 - ) - n_active = len(active_pos) - - if n_active > 0: - com = active_pos.mean(axis=0) - d_from_com = np.linalg.norm(active_pos - com, axis=1) - sorted_idx = np.argsort(d_from_com)[::-1] - radius = float(d_from_com[sorted_idx[0]]) - def nth(n): - return active_pos[sorted_idx[n]] if len(sorted_idx) > n else com - far1, far2, far3 = nth(0), nth(1), nth(2) - else: - com = PEN_CENTER.copy() - radius = 0.0 - far1 = far2 = far3 = PEN_CENTER.copy() - - frac_active = n_active / max(n_sheep, 1) - - return np.array([ - dog_pos[0] / FIELD, dog_pos[1] / FIELD, - (com[0] - dog_pos[0]) / D, (com[1] - dog_pos[1]) / D, - (far1[0] - com[0]) / D, (far1[1] - com[1]) / D, - (far2[0] - com[0]) / D, (far2[1] - com[1]) / D, - (far3[0] - com[0]) / D, (far3[1] - com[1]) / D, - (PEN_CENTER[0] - com[0]) / D, (PEN_CENTER[1] - com[1]) / D, - (PEN_CENTER[0] - far1[0]) / D, (PEN_CENTER[1] - far1[1]) / D, - radius / D, - frac_active, - math.cos(dog_heading), math.sin(dog_heading), - ], dtype=np.float32) - - -# ── Webots setup ───────────────────────────────────────────────────────────── -robot = Robot() -timestep = int(robot.getBasicTimeStep()) - -# Drive motors -left_motor = robot.getDevice("left wheel motor") -right_motor = 
robot.getDevice("right wheel motor") -left_motor.setPosition(float("inf")) -right_motor.setPosition(float("inf")) -left_motor.setVelocity(0.0) -right_motor.setVelocity(0.0) -MOTOR_MAX = left_motor.getMaxVelocity() - -# Sensors -gps = robot.getDevice("gps"); gps.enable(timestep) -compass = robot.getDevice("compass"); compass.enable(timestep) -receiver = robot.getDevice("receiver"); receiver.enable(timestep) -emitter = robot.getDevice("emitter") - -# Cosmetic -left_ear = robot.getDevice("left ear motor") -right_ear = robot.getDevice("right ear motor") -left_ear.setPosition(float("inf")); right_ear.setPosition(float("inf")) -left_ear.setVelocity(0.0); right_ear.setVelocity(0.0) -ear_phase = 0.0 - -# Number of sheep (from controllerArgs or default) -try: - n_sheep = int(sys.argv[1]) -except (IndexError, ValueError): - n_sheep = 3 - -# ── Load model ─────────────────────────────────────────────────────────────── -print(f"[RL dog] Loading model from {MODEL_PATH}") -print(f"[RL dog] Loading vecnorm from {VECNORM_PATH}") - -dummy_env = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep)]) -vecnorm = VecNormalize.load(VECNORM_PATH, dummy_env) -vecnorm.training = False -vecnorm.norm_reward = False - -model = PPO.load(MODEL_PATH, device="cpu") -print(f"[RL dog] Model loaded — running with n_sheep={n_sheep}") - -# ── Runtime state ───────────────────────────────────────────────────────────── -sheep_positions: dict = {} # {name: (x, y)} — updated every step from receiver -step_count = 0 - -# Debug CSV — written every step when DOG_DEBUG=1 -debug_file = None -if DEBUG_ENABLED: - import csv - debug_file = open(DEBUG_CSV, "w", newline="") - debug_writer = csv.writer(debug_file) - debug_writer.writerow([ - "step", "dog_x", "dog_y", "heading", - "sheep_xs", "sheep_ys", "n_active", "n_penned", - "raw_obs", "norm_obs", "vx", "vy", - ]) - print(f"[RL dog] DEBUG logging to {DEBUG_CSV}") - - -def bearing() -> float: - """Current robot heading in world frame (radians).""" - n = 
compass.getValues() - return math.atan2(n[0], n[1]) - - -def drive(action_vx: float, action_vy: float) -> None: - """Convert (vx, vy) policy action to differential wheel speeds.""" - speed_ms = math.sqrt(action_vx ** 2 + action_vy ** 2) * DOG_SPEED - if speed_ms < 0.05: - left_motor.setVelocity(0.0) - right_motor.setVelocity(0.0) - return - - target_heading = math.atan2(action_vy, action_vx) - err = norm_angle(target_heading - bearing()) - - fwd_ms = speed_ms * max(0.0, math.cos(err)) - fwd_rad = fwd_ms / WHEEL_R - turn = K_TURN * err # rad/s correction - - l = max(-MOTOR_MAX, min(MOTOR_MAX, fwd_rad - turn)) - r = max(-MOTOR_MAX, min(MOTOR_MAX, fwd_rad + turn)) - left_motor.setVelocity(l) - right_motor.setVelocity(r) - - -# ── Main loop ───────────────────────────────────────────────────────────────── -while robot.step(timestep) != -1: - step_count += 1 - - # 1. Drain receiver — update sheep position table - while receiver.getQueueLength() > 0: - try: - msg = receiver.getString() - parts = msg.split(":") - if parts[0] == "sheep" and len(parts) == 4: - sheep_positions[parts[1]] = (float(parts[2]), float(parts[3])) - except Exception: - pass - receiver.nextPacket() - - # 2. Dog GPS - gps_vals = gps.getValues() - dog_pos = np.array([gps_vals[0], gps_vals[1]], dtype=np.float32) - - # 3. Build and normalise observation (heading from compass) - raw_obs = build_obs(dog_pos, sheep_positions, n_sheep, - dog_heading=bearing()) - obs_norm = vecnorm.normalize_obs(raw_obs[np.newaxis]) # (1, 13) - - # 4. Policy inference + smoothing - action, _ = model.predict(obs_norm, deterministic=True) - raw_a = np.array([float(action[0][0]), float(action[0][1])], dtype=np.float32) - if ACTION_SMOOTH > 0: - smoothed = ACTION_SMOOTH * prev_action + (1.0 - ACTION_SMOOTH) * raw_a - prev_action[:] = smoothed - vx, vy = float(smoothed[0]), float(smoothed[1]) - else: - vx, vy = float(raw_a[0]), float(raw_a[1]) - - # 5. Drive - drive(vx, vy) - - # 6. 
Broadcast dog position so sheep can compute flee forces - emitter.send(f"dog:{dog_pos[0]:.4f}:{dog_pos[1]:.4f}") - - # 7. Ear animation - ear_phase += 0.12 - ep = EAR_AMPLITUDE * math.sin(ear_phase) - left_ear.setVelocity(EAR_RATE); right_ear.setVelocity(EAR_RATE) - left_ear.setPosition( ep); right_ear.setPosition(-ep) - - # Periodic status - if step_count % 100 == 0: - n_in_pen = sum(1 for x, y in sheep_positions.values() if in_pen(x, y)) - print(f"[RL dog] step={step_count} known_sheep={len(sheep_positions)}" - f" penned={n_in_pen}/{n_sheep} dog=({dog_pos[0]:.2f},{dog_pos[1]:.2f})" - f" action=({vx:.2f}, {vy:.2f})") - - # Debug CSV row - if debug_file is not None: - n_active = sum(1 for x, y in sheep_positions.values() if not in_pen(x, y)) - n_in_pen = len(sheep_positions) - n_active - debug_writer.writerow([ - step_count, f"{dog_pos[0]:.4f}", f"{dog_pos[1]:.4f}", - f"{bearing():.4f}", - ";".join(f"{v[0]:.3f}" for v in sheep_positions.values()), - ";".join(f"{v[1]:.3f}" for v in sheep_positions.values()), - n_active, n_in_pen, - ";".join(f"{x:.4f}" for x in raw_obs), - ";".join(f"{x:.4f}" for x in obs_norm[0]), - f"{vx:.4f}", f"{vy:.4f}", - ]) - if step_count % 200 == 0: - debug_file.flush() diff --git a/controllers/shepherd_dog_rl/vecnorm.pkl b/controllers/shepherd_dog_rl/vecnorm.pkl deleted file mode 100644 index 695f22c..0000000 Binary files a/controllers/shepherd_dog_rl/vecnorm.pkl and /dev/null differ diff --git a/docs/project.md b/docs/project.md index eea5a9a..19160d6 100644 --- a/docs/project.md +++ b/docs/project.md @@ -6,28 +6,28 @@ - Nelson Neto ## (i) Title and General objectives -**RL-Based Autonomous Shepherd Robot for Livestock Herding** +**Autonomous Shepherd Robot for Livestock Herding (Strömbom)** - Implement effective herding behaviors through proximity and movement strategies - Build a 3D environment with realistic robot dynamics and LIDAR-based perception -- Develop a mobile robot capable of autonomously guiding a flock of sheep into a 
designated target area using Reinforcement Learning +- Develop a mobile robot capable of autonomously guiding a flock of sheep into a designated target area using the Strömbom heuristic approach # Group G25 - (ii) Intermediate Goals ## Intermediate goals - Set up the Webots simulation environment with an open field and target zone -- Implement lightweight Gymnasium-based 2D herding environment +- Implement lightweight 2D herding environment for algorithm evaluation - Design a Sheep and Dog robot -- Implement a sheep flocking model for fast RL iteration +- Implement a sheep flocking model for fast Strömbom iteration - Validate LiDAR sensor feedback for sheep detection and distance estimation # Group G25 - Course Project (Final) Goals ## (iii) Main goals -- State-of-the-art survey on shepherding algorithms and multi-agent RL herding -- Train the robot using PPO to successfully herd a single sheep into the goal +- State-of-the-art survey on shepherding algorithms with focus on Strömbom herding +- Implement and tune Strömbom controller to successfully herd a single sheep into the goal - Achieve fully autonomous herding of multiple sheep and a full flock into the target area - Optimize robot trajectory to minimize the time required to group the flock - Ensure zero collisions between the robot and the sheep during the task @@ -35,7 +35,7 @@ - Article, demo video, and final presentation ## (iv) Extra Merit -- Curriculum Learning (scaling from 1 sheep to a flock) +- Progressive evaluation (scaling from 1 sheep to a flock) - Comparison of performance between Differential Drive and Mecanum wheels - Robustness testing under sensor noise or varying sheep speeds, configurations and parameters - Multi-shepherd cooperative mode: 2 dogs learn role specialization (collector vs. 
driver) @@ -46,11 +46,10 @@ ## (v) Tools - Webots for 3D physics simulation with ROS2 integration via `webots_ros2` package -- Stable-Baselines3 for the PPO algorithm implementation -- Gymnasium (OpenAI) for the RL environment wrapper (lightweight 2D herding env for fast RL training) +- Gymnasium (OpenAI) for the simulation wrapper and evaluation tooling - Python as the primary programming language (sheep flocking model, reward shaping, evaluation) ## (vi) Limitations -- Computational Power: Training time might be high for complex flock behaviors +- Computational Power: Large batch evaluation and parameter sweeps can still be time-consuming - Sim-to-Real Gap: No real-world validation of the herding controller; project is simulation-only (2D + Webots 3D) - Model Complexity: Simplified sheep behavior (scripted) may not account for all biological livestock nuances \ No newline at end of file diff --git a/herding/__init__.py b/herding/__init__.py new file mode 100644 index 0000000..781f353 --- /dev/null +++ b/herding/__init__.py @@ -0,0 +1,8 @@ +"""Shared core for the shepherd herding project. + +This package is the single source of truth for world geometry, sheep +flocking dynamics, differential-drive kinematics, observation building, +and the Strömbom heuristic. It is imported both by the Webots +controllers (for inference) and by the Gymnasium training environment +(for fast PPO rollouts), so the two paths cannot drift apart. +""" diff --git a/herding/diffdrive.py b/herding/diffdrive.py new file mode 100644 index 0000000..9b854a0 --- /dev/null +++ b/herding/diffdrive.py @@ -0,0 +1,70 @@ +"""Differential-drive kinematics matching the Webots robot specs. + +The Webots controllers and the training env both use these helpers so the +sim and the real (Webots) physics agree to first order. They do not model +slip, wheel acceleration limits, or contact forces — Webots does that for +us at inference time. 
The training env has to be close enough that a +policy trained against this kinematic model still works when handed off +to ODE physics. +""" + +import math + + +def kinematics_step(x, y, h, w_left, w_right, wheel_radius, wheel_base, dt): + """Integrate one step of differential-drive forward kinematics. + + Inputs + ------ + x, y : robot position (m) + h : robot heading (rad), 0 = +x axis + w_left, w_right : wheel angular velocities (rad/s) + wheel_radius, wheel_base : robot dimensions (m) + dt : timestep (s) + + Returns (new_x, new_y, new_h). + """ + v = (w_right + w_left) * wheel_radius * 0.5 + omega = (w_right - w_left) * wheel_radius / wheel_base + new_x = x + v * math.cos(h) * dt + new_y = y + v * math.sin(h) * dt + new_h = math.atan2(math.sin(h + omega * dt), math.cos(h + omega * dt)) + return new_x, new_y, new_h + + +def velocity_to_wheels(vx, vy, h, max_linear, wheel_radius, max_wheel_omega, + k_turn=4.0): + """Convert a desired (vx, vy) intent in [-1, 1]^2 to wheel speeds. + + Mirrors ``drive_action`` in controllers/shepherd_dog/shepherd_dog.py: + forward speed scales by ``cos(err)`` (clamped to ±90°), and a P + controller on heading error contributes the wheel-rate differential. + """ + speed_ms = math.hypot(vx, vy) * max_linear + if speed_ms < 1e-3: + return 0.0, 0.0 + target_h = math.atan2(vy, vx) + err = math.atan2(math.sin(target_h - h), math.cos(target_h - h)) + clamped_err = max(-math.pi / 2, min(math.pi / 2, err)) + fwd_ms = speed_ms * math.cos(clamped_err) + fwd_rad = fwd_ms / wheel_radius + turn = k_turn * err + left = max(-max_wheel_omega, min(max_wheel_omega, fwd_rad - turn)) + right = max(-max_wheel_omega, min(max_wheel_omega, fwd_rad + turn)) + return left, right + + +def heading_speed_to_wheels(heading, speed_motor, h, max_wheel_omega, + k_turn=4.0): + """Sheep variant: speed already expressed in motor (wheel rad/s) units. 
+ + Matches the existing sheep controller (``controllers/sheep/sheep.py``) + where ``speed = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))`` and + these constants are wheel angular velocities, not linear m/s. + """ + err = math.atan2(math.sin(heading - h), math.cos(heading - h)) + fwd = max(0.0, math.cos(err)) * speed_motor + turn = k_turn * err + left = max(-max_wheel_omega, min(max_wheel_omega, fwd - turn)) + right = max(-max_wheel_omega, min(max_wheel_omega, fwd + turn)) + return left, right diff --git a/herding/flocking_sim.py b/herding/flocking_sim.py new file mode 100644 index 0000000..61dcd52 --- /dev/null +++ b/herding/flocking_sim.py @@ -0,0 +1,178 @@ +"""Reynolds-style sheep flocking dynamics. + +This is the per-sheep behavioural step used both by the Webots sheep +controller (scalar, one sheep at a time) and by the training environment +(loop over sheep). The numerics are adapted from the original +``controllers/sheep/flocking.py`` and retuned for the new external-pen +layout: the south stone wall is intact except in the gate column, so +sheep can only reach the pen by walking through that 3-m corridor. + +Force stack each step (summed → heading + speed): + flee — quadratic ramp away from dog within FLEE_DIST + cohesion — drift toward flock centre, strengthened while fleeing + separation — inverse-distance push from peers + walls — soft repulsion + hard escape band against field walls, + except inside the gate column where the south wall is + absent + wander — small persistent drift for natural idle motion + +A sheep latches to ``penned`` the first time it crosses the gate plane +into the gate column (handled by callers via ``geometry.is_penned_position``); +once latched, ``penned=True`` is passed in here and the force stack +switches to in-pen containment + jitter.
+""" + +import math +import random + +from herding.geometry import ( + FIELD_X, FIELD_Y, + PEN_X, PEN_Y, + GATE_X, +) + +# --- Speed and force constants --- +# All speeds here are in wheel rad/s (motor units), matching the existing +# sheep controller. Conversion to m/s = speed * SHEEP_WHEEL_RADIUS. +MAX_SPEED = 22.0 +FLEE_SPEED = 20.0 +WANDER_SPEED = 3.0 + +WALL_MARGIN = 5.0 +WALL_HARD_MARGIN = 1.0 +WALL_HARD_GAIN = 50.0 + +FLEE_DIST = 7.0 +SEPARATION_DIST = 2.5 +COHESION_DIST = 8.0 + +PEN_MARGIN = 0.8 + + +def _peers_iter(peers): + """Accept either a {name: (x, y)} dict or an iterable of (x, y) tuples.""" + if isinstance(peers, dict): + return list(peers.values()) + return list(peers) + + +def compute_heading_speed(x, y, penned, dog_xy, peers, wander_angle, rng=None): + """Return ``(heading, speed, new_wander_angle)`` for one sheep step. + + ``speed`` is in wheel rad/s (motor units), bounded by ``[WANDER_SPEED, + FLEE_SPEED]``. ``heading`` is the world-frame target heading the sheep + should aim for (atan2 convention). + + ``rng`` is an optional ``random.Random``-compatible object used for + the wander-jitter. If ``None``, falls back to Python's global module + (matches Webots controller usage). Pass an env-owned RNG to make + rollouts deterministic given a seed. + """ + fx, fy = 0.0, 0.0 + peer_list = _peers_iter(peers) + rnd = rng if rng is not None else random + + if penned: + # --- Pen containment: bounce off the four pen walls --- + pm = PEN_MARGIN + if x < PEN_X[0] + pm: + fx += ((PEN_X[0] + pm - x) / pm) * 15.0 + if x > PEN_X[1] - pm: + fx -= ((x - (PEN_X[1] - pm)) / pm) * 15.0 + if y < PEN_Y[0] + pm: + fy += ((PEN_Y[0] + pm - y) / pm) * 15.0 + if y > PEN_Y[1] - pm: + fy -= ((y - (PEN_Y[1] - pm)) / pm) * 15.0 + + # Mild peer separation — penned sheep crowd the corner otherwise. 
+ for px, py in peer_list: + dx, dy = px - x, py - y + d = math.hypot(dx, dy) + if 0.05 < d < SEPARATION_DIST: + push = (SEPARATION_DIST - d) / d + fx -= (dx / d) * push * 2.5 + fy -= (dy / d) * push * 2.5 + + if rnd.random() < 0.02: + wander_angle += rnd.uniform(-0.6, 0.6) + fx += math.cos(wander_angle) * 0.5 + fy += math.sin(wander_angle) * 0.5 + + else: + # --- Free-roaming sheep in the field --- + fleeing = False + if dog_xy is not None: + ddx = dog_xy[0] - x + ddy = dog_xy[1] - y + dist = math.hypot(ddx, ddy) + if 0.01 < dist < FLEE_DIST: + fleeing = True + t = 1.0 - dist / FLEE_DIST + s = t * t * 20.0 + fx -= (ddx / dist) * s + fy -= (ddy / dist) * s + + # Cohesion — drift toward flock CoM (peers within COHESION_DIST). + # Cohesion is *stronger* under flee than at rest (the + # predator-confusion / safety-in-numbers effect — sheep huddle when + # threatened). This is what makes shepherding work: the flock stays + # as one unit through the narrow gate instead of fragmenting. + cx, cy, cn = 0.0, 0.0, 0 + for px, py in peer_list: + d = math.hypot(px - x, py - y) + if 0.3 < d < COHESION_DIST: + cx += px + cy += py + cn += 1 + if cn > 0: + # Cohesion needs to be comparable to flee at close range to keep + # the flock together through narrow obstacles like the 3m gate. + # Flee at 2m has magnitude ~10; cohesion at peer-distance 5m + # with w=1.5 contributes ~7.5 — same order, so the flock + # translates as a unit instead of fragmenting under pressure. + w = 1.5 if fleeing else 0.6 + fx += (cx / cn - x) * w + fy += (cy / cn - y) * w + + # Separation — inverse-distance push from peers. + for px, py in peer_list: + ddx, ddy = px - x, py - y + d = math.hypot(ddx, ddy) + if 0.05 < d < SEPARATION_DIST: + push = (SEPARATION_DIST - d) / d + fx -= (ddx / d) * push * 2.5 + fy -= (ddy / d) * push * 2.5 + + # Wall soft repulsion. The south wall is absent inside the gate + # column so sheep can be driven through it by the dog. 
+ if x < FIELD_X[0] + WALL_MARGIN: + fx += ((FIELD_X[0] + WALL_MARGIN - x) / WALL_MARGIN) * 6.0 + if x > FIELD_X[1] - WALL_MARGIN: + fx -= ((x - (FIELD_X[1] - WALL_MARGIN)) / WALL_MARGIN) * 6.0 + if y > FIELD_Y[1] - WALL_MARGIN: + fy -= ((y - (FIELD_Y[1] - WALL_MARGIN)) / WALL_MARGIN) * 6.0 + if y < FIELD_Y[0] + WALL_MARGIN and not (GATE_X[0] <= x <= GATE_X[1]): + fy += ((FIELD_Y[0] + WALL_MARGIN - y) / WALL_MARGIN) * 6.0 + + if not fleeing: + if random.random() < 0.02: + wander_angle += random.uniform(-0.6, 0.6) + fx += math.cos(wander_angle) * 0.5 + fy += math.sin(wander_angle) * 0.5 + + # --- Hard escape band — overrides everything when very close to a wall --- + m, g = WALL_HARD_MARGIN, WALL_HARD_GAIN + if x - FIELD_X[0] < m: + fx = max(fx, g * (1.0 - (x - FIELD_X[0]) / m)) + if FIELD_X[1] - x < m: + fx = min(fx, -g * (1.0 - (FIELD_X[1] - x) / m)) + if FIELD_Y[1] - y < m: + fy = min(fy, -g * (1.0 - (FIELD_Y[1] - y) / m)) + # South wall hard escape only when not in the gate column and not penned. + if (not penned) and (y - FIELD_Y[0] < m) and not (GATE_X[0] <= x <= GATE_X[1]): + fy = max(fy, g * (1.0 - (y - FIELD_Y[0]) / m)) + + heading = math.atan2(fy, fx) + mag = math.hypot(fx, fy) + speed = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0)) + return heading, speed, wander_angle diff --git a/herding/geometry.py b/herding/geometry.py new file mode 100644 index 0000000..3846e09 --- /dev/null +++ b/herding/geometry.py @@ -0,0 +1,99 @@ +"""World geometry and robot specs. + +All coordinates are in meters. (0, 0) is the centre of the field, +x is +east, +y is north. Z is up but unused here. These constants must match +``worlds/field.wbt`` and the proto files; if the world changes, change +this file and only this file. + +Pen layout (post-refactor) +-------------------------- +The pen is *external* to the field, accessed through a 3 m gate cut into +the south stone wall at y = -15. 
Sheep entering through the gate end up +in a fenced rectangle south of the field; the dog stays in the field +(soft-limited above DOG_SOUTH_LIMIT during training and inference). + + field +y north + +-----------+ + | | + | | + | ...... | + +---||||----+ y = -15 (south wall, gate at x ∈ [10, 13]) + |||| + |pen| y ∈ [-22, -15] + +---+ +""" + +import math + +# --- Field (square, stone-walled) --- +FIELD_X = (-15.0, 15.0) +FIELD_Y = (-15.0, 15.0) + +# Conservative inside bounds — sheep/dog should not graze the wall. +FIELD_INSIDE_MARGIN = 0.5 + +# --- Pen (external, south of the field) --- +PEN_X = (10.0, 13.0) +PEN_Y = (-22.0, -15.0) +PEN_CENTER = (0.5 * (PEN_X[0] + PEN_X[1]), 0.5 * (PEN_Y[0] + PEN_Y[1])) +# The point the dog drives the flock toward: the gate centre on the field side. +PEN_ENTRY = (0.5 * (PEN_X[0] + PEN_X[1]), -15.0) + +# --- Gate (the hole in the south stone wall) --- +GATE_X = PEN_X +GATE_Y = -15.0 + +# --- Robot specs (must match proto files) --- +# Dog (controllers/shepherd_dog/, protos/ShepherdDog.proto) +DOG_WHEEL_RADIUS = 0.038 # m +DOG_WHEEL_BASE = 0.28 # m, axle-to-axle +DOG_MAX_WHEEL_OMEGA = 70.0 # rad/s +DOG_MAX_LINEAR = DOG_WHEEL_RADIUS * DOG_MAX_WHEEL_OMEGA # ~2.66 m/s + +# Sheep (controllers/sheep/, protos/Sheep.proto) +SHEEP_WHEEL_RADIUS = 0.031 # m +SHEEP_WHEEL_BASE = 0.20 # m +SHEEP_MAX_WHEEL_OMEGA = 25.0 # rad/s +SHEEP_MAX_LINEAR = SHEEP_WHEEL_RADIUS * SHEEP_MAX_WHEEL_OMEGA # ~0.78 m/s + +# --- Webots step --- +WEBOTS_DT = 0.016 # seconds, matches WorldInfo.basicTimeStep = 16 in field.wbt + +# --- Dog "virtual south wall" (training keeps dog out of the pen) --- +# At inference the controller also clips to this so a slightly miscalibrated +# policy doesn't accidentally drive into the pen and trap the sheep. 
+DOG_SOUTH_LIMIT = -14.5 + +# --- Maximum supported flock size --- +MAX_SHEEP = 10 + + +def in_pen(x: float, y: float) -> bool: + """True if (x, y) lies inside the external pen rectangle.""" + return PEN_X[0] < x < PEN_X[1] and PEN_Y[0] < y < PEN_Y[1] + + +def in_field(x: float, y: float, margin: float = 0.0) -> bool: + return (FIELD_X[0] + margin <= x <= FIELD_X[1] - margin + and FIELD_Y[0] + margin <= y <= FIELD_Y[1] - margin) + + +def in_gate_corridor(x: float, y: float, margin: float = 0.0) -> bool: + """True if (x, y) lies in the column of the gate (between field and pen).""" + return (PEN_X[0] - margin <= x <= PEN_X[1] + margin + and PEN_Y[0] - margin <= y <= GATE_Y + margin) + + +def is_penned_position(x: float, y: float, latch_margin: float = 0.2) -> bool: + """A sheep latches to "penned" once it crosses the gate plane south. + + True iff x is inside the gate column (with a small margin) AND + y has dipped below the gate line. Once latched, the sheep is held by + in-pen forces and will not exit on its own. + """ + return (PEN_X[0] - latch_margin <= x <= PEN_X[1] + latch_margin + and y <= GATE_Y) + + +def distance_to_pen_entry(x: float, y: float) -> float: + return math.hypot(x - PEN_ENTRY[0], y - PEN_ENTRY[1]) diff --git a/herding/obs.py b/herding/obs.py new file mode 100644 index 0000000..117cf4e --- /dev/null +++ b/herding/obs.py @@ -0,0 +1,137 @@ +"""Observation builder for the shepherd dog policy. + +Order-invariant 32-D feature vector — the policy generalises across +flock sizes 1..MAX_SHEEP because individual sheep coordinates never +appear in the observation by index, only summary statistics, a polar +histogram, and two "named" sheep (closest-to-pen and rearmost-from-pen). + +The two named sheep matter for the sequential-driving teacher: it +targets the closest-to-pen sheep specifically, so the policy needs +that channel to mimic the teacher. 
+ +Layout (all components normalised so values stay roughly in [-1, 1]): + + idx field + ----- ---------------------------------------------------------- + 0..3 dog pose: x/15, y/15, cos(heading), sin(heading) + 4..5 active-sheep CoM x/15, y/15 + 6..8 flock dispersion: max-radius/15, std_x/15, std_y/15 + 9..11 vector dog→CoM: dx/30, dy/30, dist/30 + 12..14 vector dog→pen-entry: dx/30, dy/30, dist/30 + 15..16 vector furthest-sheep→CoM: dx/15, dy/15 + 17..18 min sheep-to-wall, min dog-to-wall (both /15) + 19 active-sheep count / MAX_SHEEP + 20..27 8-bin polar histogram of active sheep around the dog, + rotation-aware (binned in dog-relative frame), normalised + so the bins sum to 1. + 28..29 vector dog→closest-to-pen sheep: dx/15, dy/15 + 30..31 vector dog→rearmost (furthest-from-pen) sheep: dx/15, dy/15 +""" + +import math +import numpy as np + +from herding.geometry import ( + FIELD_X, FIELD_Y, PEN_ENTRY, MAX_SHEEP, +) + +OBS_DIM = 32 + + +def build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list, + n_max: int = MAX_SHEEP) -> np.ndarray: + """Assemble the dog policy's observation vector. + + Parameters + ---------- + dog_xy : tuple (x, y) of the dog's GPS position (m) + dog_heading : dog heading in rad + sheep_xy_list : iterable of (x, y) for ALL known sheep + sheep_penned_list : parallel iterable of bool — True if sheep is penned + n_max : maximum supported flock size used for the count normaliser + """ + dog_x, dog_y = dog_xy + obs = np.zeros(OBS_DIM, dtype=np.float32) + + obs[0] = dog_x / 15.0 + obs[1] = dog_y / 15.0 + obs[2] = math.cos(dog_heading) + obs[3] = math.sin(dog_heading) + + active = [(x, y) for (x, y), p + in zip(sheep_xy_list, sheep_penned_list) if not p] + n = len(active) + + pdx0, pdy0 = PEN_ENTRY[0] - dog_x, PEN_ENTRY[1] - dog_y + obs[12] = pdx0 / 30.0 + obs[13] = pdy0 / 30.0 + obs[14] = math.hypot(pdx0, pdy0) / 30.0 + + if n == 0: + # All sheep penned — terminal observation. 
+ obs[19] = 0.0 + return obs + + arr = np.asarray(active, dtype=np.float32) + com_x = float(arr[:, 0].mean()) + com_y = float(arr[:, 1].mean()) + rel = arr - np.array([com_x, com_y], dtype=np.float32) + dists = np.hypot(rel[:, 0], rel[:, 1]) + radius = float(dists.max()) + std_x = float(arr[:, 0].std()) + std_y = float(arr[:, 1].std()) + + obs[4] = com_x / 15.0 + obs[5] = com_y / 15.0 + obs[6] = radius / 15.0 + obs[7] = std_x / 15.0 + obs[8] = std_y / 15.0 + + cdx, cdy = com_x - dog_x, com_y - dog_y + obs[9] = cdx / 30.0 + obs[10] = cdy / 30.0 + obs[11] = math.hypot(cdx, cdy) / 30.0 + + far_idx = int(np.argmax(dists)) + obs[15] = float(rel[far_idx, 0]) / 15.0 + obs[16] = float(rel[far_idx, 1]) / 15.0 + + min_sheep_wall = min( + float(np.min(arr[:, 0] - FIELD_X[0])), + float(np.min(FIELD_X[1] - arr[:, 0])), + float(np.min(arr[:, 1] - FIELD_Y[0])), + float(np.min(FIELD_Y[1] - arr[:, 1])), + ) + min_dog_wall = min( + dog_x - FIELD_X[0], FIELD_X[1] - dog_x, + dog_y - FIELD_Y[0], FIELD_Y[1] - dog_y, + ) + obs[17] = min_sheep_wall / 15.0 + obs[18] = float(min_dog_wall) / 15.0 + obs[19] = n / n_max + + # 8-bin polar histogram in the dog's body frame. + rel_dx = arr[:, 0] - dog_x + rel_dy = arr[:, 1] - dog_y + angles = np.arctan2(rel_dy, rel_dx) - dog_heading + angles = np.arctan2(np.sin(angles), np.cos(angles)) + bins = np.floor((angles + math.pi) / (2 * math.pi) * 8).astype(int) + bins = np.clip(bins, 0, 7) + hist = np.bincount(bins, minlength=8).astype(np.float32) + hist /= max(1, n) + obs[20:28] = hist + + # Closest-to-pen sheep (the sequential teacher's target) and rearmost + # (furthest-from-pen, the natural "next target" once the closest is + # penned). Both expressed as offset from dog. These two channels make + # BC tractable — without them the obs doesn't uniquely identify which + # sheep the teacher is steering toward. 
+ pen_dists = np.hypot(arr[:, 0] - PEN_ENTRY[0], arr[:, 1] - PEN_ENTRY[1]) + closest_idx = int(np.argmin(pen_dists)) + rearmost_idx = int(np.argmax(pen_dists)) + obs[28] = (float(arr[closest_idx, 0]) - dog_x) / 15.0 + obs[29] = (float(arr[closest_idx, 1]) - dog_y) / 15.0 + obs[30] = (float(arr[rearmost_idx, 0]) - dog_x) / 15.0 + obs[31] = (float(arr[rearmost_idx, 1]) - dog_y) / 15.0 + + return obs diff --git a/herding/sequential.py b/herding/sequential.py new file mode 100644 index 0000000..3fd1cf0 --- /dev/null +++ b/herding/sequential.py @@ -0,0 +1,98 @@ +"""Sequential single-target shepherd dog algorithm. + +Strömbom drives the flock's centre of mass; with N sheep and a narrow +3 m gate, this fails because the flock is wider than the gate and CoM +driving abandons stragglers. Real sheepdogs solve this differently: +they pick *one* sheep at a time, drive it through, return for the next. + +This module implements that "pin-and-push" approach. + +Algorithm (one step): +1. Active sheep = those still in the field (not yet penned). +2. Target = the active sheep currently closest to the pen entry. +3. Drive position = ``target + Δ · unit(target − pen_entry)`` — + directly behind the target relative to the goal. +4. Output unit vector pointing the dog at the drive position. + +Once the target crosses the gate it latches as penned and is removed +from the active set; the next-closest unpenned sheep becomes the +target. The algorithm naturally "queues" sheep through the gate. + +Empirically (with our flocking dynamics) this scales linearly with +flock size and works up to at least n=10 within a 15 000-step budget. 
+""" + +import math + +from herding.geometry import GATE_Y, PEN_ENTRY, in_pen + + +DELTA_DRIVE = 1.5 # standoff behind the target sheep +APPROACH_GAIN = 1.0 # action magnitude scale (1 = full speed) + + +def _unit(x, y): + d = math.hypot(x, y) + if d < 1e-6: + return 0.0, 0.0 + return x / d, y / d + + +def _is_active(x, y) -> bool: + return (not in_pen(x, y)) and y > GATE_Y + + +def compute_action(dog_xy, sheep_positions, pen_target=PEN_ENTRY): + """Return ``(vx, vy, mode)`` where mode encodes the current target. + + Compatible with the Strömbom call signature so it can be drop-in + swapped in the dog controller and the env's imitation reward. + """ + active = [(name, x, y) for name, (x, y) in sheep_positions.items() + if _is_active(x, y)] + if not active: + return 0.0, 0.0, "idle" + + # Pick target = sheep closest to pen entry. Stable choice: as one + # sheep approaches and crosses the gate it stays the target until + # latched; then the next-closest takes over. + name, sx, sy = min( + active, + key=lambda s: math.hypot(s[1] - pen_target[0], s[2] - pen_target[1]), + ) + + # Drive position behind the target along the (target → pen) line. 
+ ux, uy = _unit(sx - pen_target[0], sy - pen_target[1]) + tx = sx + DELTA_DRIVE * ux + ty = sy + DELTA_DRIVE * uy + + ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1]) + return APPROACH_GAIN * ax, APPROACH_GAIN * ay, f"drive:{name}" + + +def compute_action_debug(dog_xy, sheep_positions, pen_target=PEN_ENTRY): + """Debug variant returning ``(vx, vy, mode, debug_dict)``.""" + active = [(name, x, y) for name, (x, y) in sheep_positions.items() + if _is_active(x, y)] + if not active: + return 0.0, 0.0, "idle", { + "n_active": 0, "target_name": "", + "target_x": 0.0, "target_y": 0.0, + "drive_x": dog_xy[0], "drive_y": dog_xy[1], + } + + name, sx, sy = min( + active, + key=lambda s: math.hypot(s[1] - pen_target[0], s[2] - pen_target[1]), + ) + + ux, uy = _unit(sx - pen_target[0], sy - pen_target[1]) + tx = sx + DELTA_DRIVE * ux + ty = sy + DELTA_DRIVE * uy + ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1]) + + return APPROACH_GAIN * ax, APPROACH_GAIN * ay, f"drive:{name}", { + "n_active": len(active), "target_name": name, + "target_x": sx, "target_y": sy, + "drive_x": tx, "drive_y": ty, + } diff --git a/herding/strombom.py b/herding/strombom.py new file mode 100644 index 0000000..767da9b --- /dev/null +++ b/herding/strombom.py @@ -0,0 +1,114 @@ +"""Strömbom collect/drive heuristic for the shepherd dog. + +Adapted from the original ``controllers/shepherd_dog/strombom.py`` and +updated for the external pen layout. Used as a baseline controller and +as the fallback when the RL policy isn't available. + +Reference: Strömbom et al. 2014, "Solving the shepherding problem". +""" + +import math + +from herding.geometry import PEN_ENTRY, GATE_Y, in_pen + +# Algorithm parameters. 
DELTA_DRIVE / DELTA_COLLECT were tightened from +# the original (4.0 / 2.5) because the new external pen sits ~26 m from +# typical sheep spawn locations — at the old 4 m standoff, the flee force +# (quadratic ramp, 3.7 at 4 m vs ~10 at 2 m) couldn't move sheep through +# the path inside the 3000-step episode budget. +# +# F_FACTOR was 2.0 in the original Strömbom paper; raised to 4.0 here so +# the dog stays in *drive* mode much longer. With our tighter cohesion +# (flocking_sim.py), partially-collected flocks consolidate naturally +# during a drive, and we don't waste 80% of the time budget on a slow +# "collect" pre-phase. +F_FACTOR = 4.0 +DELTA_COLLECT = 1.5 +DELTA_DRIVE = 2.0 + + +def _unit(x, y): + d = math.hypot(x, y) + if d < 1e-6: + return 0.0, 0.0 + return x / d, y / d + + +def _is_active(x, y) -> bool: + """A sheep is "active" if it's still in the field — not in or below + the gate plane (we treat anything south of the gate as committed to + the pen and stop trying to herd it).""" + return (not in_pen(x, y)) and y > GATE_Y + + +def compute_action(dog_xy, sheep_positions, pen_target=PEN_ENTRY): + """Return ``(vx, vy, mode)`` — mode in {idle, collect, drive}. + + ``sheep_positions`` is a ``{name: (x, y)}`` mapping (matches the + Webots controller's representation). + """ + active = [(x, y) for (x, y) in sheep_positions.values() if _is_active(x, y)] + if not active: + return 0.0, 0.0, "idle" + + n = len(active) + com_x = sum(p[0] for p in active) / n + com_y = sum(p[1] for p in active) / n + dists = [math.hypot(p[0] - com_x, p[1] - com_y) for p in active] + radius = max(dists) + + if radius > F_FACTOR * math.sqrt(n): + # Collect: aim at a point behind the furthest sheep, opposite the CoM. + idx = max(range(n), key=lambda i: dists[i]) + sx, sy = active[idx] + ux, uy = _unit(sx - com_x, sy - com_y) + tx, ty = sx + DELTA_COLLECT * ux, sy + DELTA_COLLECT * uy + mode = "collect" + else: + # Drive: aim at a point behind the flock CoM relative to the goal. 
+ ux, uy = _unit(com_x - pen_target[0], com_y - pen_target[1]) + tx, ty = com_x + DELTA_DRIVE * ux, com_y + DELTA_DRIVE * uy + mode = "drive" + + ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1]) + return ax, ay, mode + + +def compute_action_debug(dog_xy, sheep_positions, pen_target=PEN_ENTRY): + """Variant of compute_action that also returns a small debug dict. + + Kept for parity with the legacy controller's CSV logger. + """ + active = [(x, y) for (x, y) in sheep_positions.values() if _is_active(x, y)] + if not active: + return 0.0, 0.0, "idle", { + "n_active": 0, "radius": 0.0, "threshold": 0.0, + "com_x": 0.0, "com_y": 0.0, + "target_x": dog_xy[0], "target_y": dog_xy[1], + } + + n = len(active) + com_x = sum(p[0] for p in active) / n + com_y = sum(p[1] for p in active) / n + dists = [math.hypot(p[0] - com_x, p[1] - com_y) for p in active] + radius = max(dists) + threshold = F_FACTOR * math.sqrt(n) + + if radius > threshold: + idx = max(range(n), key=lambda i: dists[i]) + sx, sy = active[idx] + ux, uy = _unit(sx - com_x, sy - com_y) + tx, ty = sx + DELTA_COLLECT * ux, sy + DELTA_COLLECT * uy + mode = "collect" + else: + ux, uy = _unit(com_x - pen_target[0], com_y - pen_target[1]) + tx, ty = com_x + DELTA_DRIVE * ux, com_y + DELTA_DRIVE * uy + mode = "drive" + + ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1]) + dbg = { + "n_active": n, "radius": radius, "threshold": threshold, + "com_x": com_x, "com_y": com_y, + "target_x": tx, "target_y": ty, + } + return ax, ay, mode, dbg diff --git a/plan.md b/plan.md new file mode 100644 index 0000000..8a7ce27 --- /dev/null +++ b/plan.md @@ -0,0 +1,458 @@ +# RL-Driven Shepherd Herding — Implementation Plan + +This plan turns the existing Strömbom-only Webots project into a dual-mode +shepherd controller (RL primary, Strömbom fallback), with a fast Gymnasium +training environment that mirrors the Webots dynamics tightly enough for +sim-to-sim transfer. Stable-Baselines3 PPO is the learner. + +--- + +## 1. 
Current state (audit) + +### World geometry — `worlds/field.wbt` +- Field bounded by stone walls at **x,y ∈ [−15, +15]**. Inside-usable area is + ~[−14.5, 14.5] (`X_MIN/MAX` in `flocking.py`). +- **Pen is *inside* the field**: x ∈ [10, 13], y ∈ [−15, −8], with the + opening on its **north** side at y = −8 (post-and-rail fence W/E; open N). +- South stone wall has a **gate at x ∈ [10, 13], y = −15** (split wall + + gate posts at x=10 and x=13). So sheep that get penned end up between the + fence (N side at y=−8) and the south stone wall (with the wooden gate at + y=−15 currently slightly ajar). The pen is effectively an L-shape inside + the field, not external. +- Spawns: dog at origin (0, 0), 3 sheep around (3, ±2) and (4, 0). Two more + sheep are commented out. + +### Robots — protos +- **Sheep** (`protos/Sheep.proto`): differential drive, wheel radius 0.031 m, + axle half-width 0.10 m → wheel base 0.20 m. `maxVelocity = 25 rad/s` → + max linear ≈ **0.78 m/s**. Sensors: GPS, Compass, Emitter+Receiver on + channel 1. `supervisor = TRUE` (used to repaint wool pink on pen entry). +- **ShepherdDog** (`protos/ShepherdDog.proto`): differential drive, wheel + radius 0.038 m, axle half-width 0.14 m → wheel base 0.28 m. + `maxVelocity = 70 rad/s` → max linear ≈ **2.66 m/s**. Sensors: GPS, + Compass, Gyro, Accelerometer, **Lidar** (front-only, FOV 2.44 rad ≈ 140°, + 180 rays, range 0.10–12 m, noise 0.005), Emitter+Receiver on channel 1, + cosmetic ear/tail motors. + +### Sheep controller — `controllers/sheep/{sheep.py,flocking.py}` +- Reynolds-style boid stack: flee (quadratic ramp inside FLEE_DIST=7 m), + cohesion (within 8 m), separation (within 2.5 m), wall soft repulsion + (margin 5 m), wall hard escape (margin 1 m, gain 50), wander. +- Pen-aware: sheep below the gate line but outside the gate corridor get a + northward "deadzone" assist; on first entry into the pen rectangle, + sheep latches `penned=True`, repaints pink, and switches to in-pen + containment + jitter. 
+- Driver: heading-error PD on diff-drive (k=4), forward velocity scaled by + `cos(err)`, MAX_SPEED=22 (motor units, capped by proto's 25 rad/s). +- Stuck detector: if displacement < 0.05 m for 20 steps, drives toward + field origin to escape wall-pin (a known differential-drive failure mode). + +### Dog controller — `controllers/shepherd_dog/{shepherd_dog.py,strombom.py}` +- Strömbom collect/drive heuristic. CoM-radius gating + `radius > F·√n` with F=2 selects collect (push furthest sheep inward) vs + drive (push CoM toward the pen entry point at (11.5, −8.0)). +- Deadzone rescue: when a sheep is below the gate line and outside the + pen's x-corridor, the dog repositions to a "behind the sheep, opposite + the pen" stand-off so the sheep's flee vector points back through the + gate. Variants 0/1 alternate lateral offset to break corner cycles. +- Stuck-rescue, EMA action smoothing, target-deadband, RESCUE_SPEED_CAP, + cooldown — all empirical fixes for diff-drive oscillation. +- Logs full per-step debug to `dog_behavior_log.csv` (currently 7 MB — + add to `.gitignore`). + +### Deleted training scaffolding (per `git status`) +- `controllers/shepherd_dog_rl/{shepherd_dog_rl.py, final_model.zip, vecnorm.pkl, plot_debug.py}` +- `training/{config.json, herding_env.py, parity_test.py, requirements.txt, train.py, train_at.py, viz.py, runs/.gitkeep}` + +A previous attempt existed; we'll redesign rather than resurrect, keeping +only the lessons (parity-tested env, VecNormalize wrapper, eval cadence). + +--- + +## 2. Design decisions + +### 2.1 Pen location — keep inside-field with N gate +The user offered moving the pen *external* (through a wall hole). 
Tradeoffs: + +| Option | Pros | Cons | +|---|---|---| +| **(A) Keep inside-field** (current) | World already built; Strömbom logic already tuned; gate corridor is short | Dog must navigate around three pen walls; adds geometric clutter | +| (B) External pen via wall hole | Cleaner field — dog only sees sheep + outer walls; pen as goal region beyond a 3 m hole at y=−15 | Requires editing `field.wbt` (split south wall, add external pen walls beyond y<−15); existing rescue/deadzone logic must be retuned; outside-field flocking constants don't currently apply | + +**Recommendation: keep (A)** for parity with the working Strömbom controller, +but add a **simplification**: widen the pen entrance from 3 m (x ∈ [10, 13]) +to 4 m (x ∈ [9.5, 13.5]) and raise the entrance line from y=−8 to y=−7.5 +to give the dog more turning room. Optional later: adopt option (B), the +external pen, as a curriculum extension (Section 7). + +### 2.2 Where to train + +PPO on Webots directly is too slow (real-time stepping, single env, slow +reset). The previous training scaffolding used a Python 2D sim — that is +the right approach. Constraints for sim-to-sim transfer: + +1. **Use the exact same flocking math**: import `controllers/sheep/flocking.py` + from the env, do not reimplement. +2. **Use the same world constants**: import `controllers/shepherd_dog/strombom.py` + for pen geometry and Strömbom baseline. +3. **Model differential drive faithfully**: match wheel-radius, base, and + max wheel-velocity from the proto files. Heading update from + `(ω_R − ω_L)·r / b`, position from `(ω_R + ω_L)·r / 2`. +4. **Match Webots step**: `basicTimeStep = 16 ms`. The sheep controller runs + at every basic step; the env will use the same `dt = 0.016 s`. +5. **Lidar deferred**: dog policy will use a *symbolic* observation + (positions of dog + sheep, plus pen geometry) — not raw lidar — for the + first iteration. Lidar-from-pixels is a much harder learning problem + and isn't required for the herding task. 
(See Section 7 for an + optional later upgrade.) + +### 2.3 Action space for the dog + +Two viable choices: + +- **(a) High-level velocity vector** `(vx, vy) ∈ [−1, 1]²`. The same + representation Strömbom emits today; the existing + `drive_action(vx, vy, ...)` function in `shepherd_dog.py` converts this + to wheel speeds. Decouples the policy from low-level diff-drive + oscillations and enables direct A/B against Strömbom. +- (b) Direct wheel speeds `(ω_L, ω_R) ∈ [−1, 1]²`. More expressive but the + policy must learn diff-drive control from scratch — which is exactly + the source of the wall-stuck and oscillation pain we're trying to + avoid. + +**Recommendation: (a)** — high-level `(vx, vy)`. Reuses the well-tuned +`drive_action` controller, which already handles `cos(err)` clamping and +turn gain. RL focuses on *strategy*, not actuation. + +### 2.4 Observation space for the dog + +Symbolic, fixed-size, normalized to [−1, 1]: + +| Field | Dim | Notes | +|---|---|---| +| Dog (x, y, cos h, sin h) | 4 | Position normalized by 15 | +| Sheep CoM (x, y) | 2 | Of *active* (not-penned) sheep | +| Sheep dispersion (radius, std-x, std-y) | 3 | Strömbom collect-vs-drive features | +| Vector dog→CoM (dx, dy, dist) | 3 | Helps the value function | +| Vector dog→pen-entry (dx, dy, dist) | 3 | | +| Vector furthest-sheep→CoM (dx, dy) | 2 | Strömbom collect target hint | +| Min sheep-to-wall distance + min dog-to-wall | 2 | Safety signal | +| Active sheep count / N_max | 1 | | +| 8-bin polar histogram of sheep around dog | 8 | Order-invariant flock shape | + +Total: **28 features**. Order-invariant by construction (histogram + summary +stats), so the policy generalizes across flock sizes 1..N_max. + +### 2.5 Reward + +Sparse-only is too hard at flock scale; we shape conservatively. 
+ +``` +r_t = w_pen · ΔN_penned # +1 per newly penned sheep + + w_progress· (d_CoM_pen[t-1] − d_CoM_pen[t]) # closer-to-pen progress + + w_compact· (R[t-1] − R[t]) # tighter flock progress + − w_time · 1 # constant time penalty + − w_wall · I(min_wall_dist < 1.0 m) # dog too close to wall + − w_collide· I(dog within 0.3 m of any sheep) # avoid contact + + w_done · I(all sheep penned) # terminal bonus +``` + +Initial weights: `w_pen=2.0, w_progress=0.5, w_compact=0.2, w_time=0.005, +w_wall=0.01, w_collide=0.05, w_done=10.0`. Tune via 1-sheep curriculum +first — if the dog learns 1-sheep cleanly, the weights are sane. + +### 2.6 Episode + +- Max steps: 3000 (≈ 48 s at dt=16 ms — generous). +- Termination: all sheep penned (success), dog/sheep stuck > 600 steps with + no progress (failure), step limit (timeout). +- Reset: domain-randomized — sheep count ∈ {1..N_max}, sheep positions + uniform in field minus pen+gate corridor, dog at origin ± U(−2, 2). + +### 2.7 Curriculum + +| Stage | N_sheep | Duration (steps) | Pass criterion | +|---|---|---|---| +| 0 | 1 | 0.5 M | success ≥ 90 % | +| 1 | 2 | 1.0 M | success ≥ 80 % | +| 2 | 3 | 1.5 M | success ≥ 70 % | +| 3 | 1..3 mixed | 2.0 M | mean reward stable | +| 4 (optional) | 5 | 2.0 M | success ≥ 60 % | + +Implemented by changing only `n_sheep` in the env reset. + +--- + +## 3. Repository layout (new) + +``` +project/ +├── controllers/ +│ ├── sheep/ # unchanged +│ ├── shepherd_dog/ # Strömbom controller (renamed entry) +│ │ ├── shepherd_dog.py # mode-switch wrapper: RL | strombom +│ │ ├── strombom.py # unchanged (canonical Strömbom) +│ │ └── policy_loader.py # NEW: loads SB3 zip + VecNormalize +│ └── ... 
+├── herding/ # NEW: Python package, importable from env + controller +│ ├── __init__.py +│ ├── geometry.py # field/pen constants, in_pen(), wall helpers (single source of truth) +│ ├── flocking_sim.py # vectorised numpy port of flocking.py for fast batched sheep +│ ├── diffdrive.py # diff-drive integrator matching the proto specs +│ └── obs.py # observation builder shared by env and Webots controller +├── training/ # NEW +│ ├── herding_env.py # gymnasium.Env, single-agent (the dog) +│ ├── parity_test.py # asserts env trajectory ≈ Webots trajectory for fixed seeds +│ ├── train_ppo.py # SB3 PPO entry point +│ ├── eval.py # rollout + metrics (success rate, time-to-pen) +│ ├── configs/ +│ │ ├── ppo_default.yaml +│ │ └── curriculum.yaml +│ ├── runs/ # tensorboard + checkpoints (.gitignored) +│ └── requirements.txt +├── docs/ +│ └── project.md # unchanged +├── plan.md # this file +└── ... +``` + +`herding/` becomes the **single source of truth** for geometry and dynamics. +The Webots controllers and the training env both import from it, so when a +constant changes in one place it changes everywhere — eliminating the +sim/Webots-drift class of bugs. + +This means the existing `controllers/sheep/flocking.py` and +`controllers/shepherd_dog/strombom.py` become thin shims that re-export +from `herding/`. Webots controllers can import `herding/` because Webots +adds the project root to `sys.path` at controller startup; we'll verify. + +--- + +## 4. The Gymnasium environment — `training/herding_env.py` + +```python +class HerdingEnv(gymnasium.Env): + metadata = {"render_modes": ["rgb_array", "human"]} + + def __init__(self, n_sheep=3, max_steps=3000, dt=0.016, seed=None): + self.action_space = Box(low=-1, high=1, shape=(2,), dtype=np.float32) + self.observation_space = Box(low=-1, high=1, shape=(28,), dtype=np.float32) + ... + + def reset(self, *, seed=None, options=None): + # Random sheep positions in field \ pen corridor, dog near origin. 
+ # Optional curriculum: options["n_sheep"] overrides. + ... + + def step(self, action): + vx, vy = action # high-level velocity intent + # Convert to wheel speeds via the same drive_action inverse used in Webots + wL, wR = self._diffdrive_inverse(vx, vy, self.dog_state) + self.dog_state = self._integrate_diffdrive(self.dog_state, wL, wR, self.dt) + # Step every sheep one boid step (vectorized in flocking_sim.py) + self.sheep_state = self._step_sheep(self.sheep_state, self.dog_state) + # Update penned set, compute reward, observation, done flags + ... +``` + +Key points: +- **Vectorised sheep update**: re-implements `flocking.py` in numpy so 100 + parallel envs with 5 sheep each take ms, not seconds. Numerical parity + with the scalar version is asserted in `parity_test.py`. +- **Same diff-drive integrator** for the dog as Webots will see at + inference. Wall + pen-fence collisions clamp position (a Webots-realistic + no-pass-through approximation). +- **Domain randomization** in reset: sheep count, spawn positions, sheep + flock-parameter jitter (±10 % on FLEE_DIST, COHESION_DIST, etc.) for + robustness. + +--- + +## 5. Training pipeline — `training/train_ppo.py` + +- **Algorithm**: SB3 `PPO` with `MlpPolicy`, `n_steps=2048`, `batch_size=256`, + `n_epochs=10`, `gamma=0.995`, `gae_lambda=0.95`, `clip_range=0.2`, + `ent_coef=0.005`, `vf_coef=0.5`, `learning_rate=3e-4`. +- **Vec envs**: `SubprocVecEnv` × 16 parallel envs (the env is pure numpy + so subprocs are CPU-cheap). +- **Normalization**: `VecNormalize(norm_obs=True, norm_reward=True, + clip_obs=10.0)`. Pickled alongside the policy zip — both required at + inference. +- **Callbacks**: + - `CheckpointCallback` every 100 k steps. + - `EvalCallback` on a separate eval env (no normalization-update) every + 50 k steps; logs success rate and time-to-pen to TensorBoard. + - Custom `CurriculumCallback`: bumps `n_sheep` when eval success rate + crosses the stage threshold for 3 consecutive evals. 
+- **Determinism for debugging**: seed-pinned eval env so regressions are + catchable. + +--- + +## 6. Webots integration — RL inference path + +`controllers/shepherd_dog/shepherd_dog.py` becomes a thin wrapper: + +```python +MODE = os.environ.get("HERDING_MODE", "rl") # "rl" | "strombom" + +if MODE == "rl": + policy = policy_loader.load("training/runs/best/policy.zip", + "training/runs/best/vecnormalize.pkl") + obs_fn = build_obs # from herding/obs.py +else: + obs_fn = None # strombom path uses sheep_positions directly + +while robot.step(timestep) != -1: + receive_messages() + if MODE == "rl": + obs = obs_fn(dog_xy, dog_heading, sheep_positions, ...) + action, _ = policy.predict(obs, deterministic=True) + vx, vy = action.tolist() + else: + vx, vy, mode, dbg = compute_action_debug(dog_xy, sheep_positions, PEN_ENTRY) + # plus existing rescue/cooldown/EMA layer + drive_action(vx, vy, ...) +``` + +A **safety supervisor** wraps the RL output: if `obs` indicates the dog is +< 0.6 m from a wall, override with the existing wall-escape behavior +(reverse + turn). This is a hard guarantee diff-drive needs because PPO +may not discover wall-escape reliably from on-policy data. + +`policy_loader.py` handles the SB3 import lazily so the controller still +works with `MODE=strombom` even if SB3 is not installed in the Webots +Python environment. + +--- + +## 7. Optional extensions (post-baseline) + +- **External pen** (Section 2.1 option B): edit `field.wbt` to extend the + south wall hole into an external L-shaped pen with its own walls; update + `herding/geometry.py`; retrain stage 3 only. +- **Lidar observation**: replace symbolic obs with 36-bin downsampled + lidar + ego state; train end-to-end. Useful as the "extra merit" + dimension in the project doc. +- **Two-dog mode**: make env multi-agent, train with `MAPPO`-style shared + critic or independent PPO. The proto already supports multiple dog + instances; world only needs a second `ShepherdDog` node. 
+- **Mecanum comparison**: swap the dog proto for a mecanum variant; same + policy, different `_integrate_diffdrive` (becomes holonomic). +- **Sheep flock size scaling**: 5, 10, 20 — the obs is order-invariant so + the same policy generalises; just extend the curriculum further. + +--- + +## 8. Risks & mitigations + +| Risk | Mitigation | +|---|---| +| Sim-to-Webots gap (sheep dynamics, wall friction) | `parity_test.py` asserts trajectory match within tolerance for fixed seeds; if it fails, fix the env, not the policy | +| Dog learns to wall-pin sheep against fence | Add `w_collide` penalty + min-sheep-to-wall term in obs; curriculum from 1 sheep first | +| PPO oscillation collapses into spinning | Action smoothing in env step (EMA on `(vx, vy)`, mirroring `ACTION_SMOOTH=0.35` from Strömbom controller); add a small `‖a_t − a_{t-1}‖` penalty to the reward | +| Pen approach failures (sheep refuse gate) | Reuse the existing `deadzone_rescue` as a *scripted fallback* triggered when a sheep has been deadzoned > 200 steps — RL handles the common case, the scripted fallback handles the corner cases | +| Gym version mismatch (gymnasium vs gym) | Lock to `gymnasium>=0.29`, `stable-baselines3>=2.3` in requirements | + +--- + +## 9. Milestones (suggested order of implementation) + +1. **M0 — Refactor** (no behavior change): create `herding/` package, move + constants out of `flocking.py`/`strombom.py`, leave shims; verify + Webots still runs Strömbom unchanged. Add `dog_behavior_log.csv` to + `.gitignore`. +2. **M1 — Env & parity**: `herding_env.py`, `parity_test.py`. Asserts + sheep + dog trajectories match Webots within tolerance for 5 fixed + seeds. *Done when the parity test is green.* +3. **M2 — PPO baseline**: train Stage 0 (1 sheep) for 0.5 M steps; eval + in env at ≥ 90 % success. +4. **M3 — Webots inference**: load Stage 0 policy in `shepherd_dog.py` + with `HERDING_MODE=rl`; verify the dog herds 1 sheep into the pen in + the actual Webots world. *This is the sim-to-sim transfer gate.* +5. 
**M4 — Curriculum**: stages 1–3, ~5 M steps total, with checkpoints + and eval logs. +6. **M5 — Strömbom comparison**: run both controllers on a fixed eval + suite (same seeds, 1/2/3 sheep), log success rate and time-to-pen. + This is a deliverable for the project's "quantitative evaluation" + goal. +7. **M6 — Documentation**: a short README in `training/` showing how to + train, evaluate, and switch modes in Webots. + +Each milestone is independently demoable. M0–M3 is the critical path to +"RL works in Webots"; M4–M6 polishes it for the project deliverable. + +--- + +## 10. Decisions (locked in by implementation) + +- **Pen layout**: option B (external pen). The pen sits south of the + field at x ∈ [10, 13], y ∈ [-22, -15] and is reached through the + existing 3 m gap in the south stone wall. The old in-field + quarantine fence is gone and the wooden gate is modeled as + swung-open and parked on the west gate post so the corridor is + unobstructed. This kills the deadzone class entirely. +- **Flock size**: 1..10 sheep, sampled uniformly each reset. The order- + invariant observation (CoM, dispersion, polar histogram) lets a + single policy generalise across the whole range. A curriculum widens + ``max_n_sheep`` from 1 to 10 over training to keep early exploration + tractable. +- **Single-sheep mode**: handled by the same policy (n_sheep=1 is the + first stage of the curriculum and stays in the training distribution + throughout). No separate model. +- **Hardware**: GPU for training. SubprocVecEnv × 16 on CPU feeds an + MlpPolicy on GPU; ~2–3 h for the full curriculum. + +## 11. 
What was built + +``` +herding/ # single source of truth, importable from both + # the training env and the Webots controllers + geometry.py # field/pen constants, latch helpers, robot specs + flocking_sim.py # Reynolds boid step (matches Webots controller) + diffdrive.py # diff-drive kinematics + velocity↔wheels + obs.py # 32-D order-invariant observation builder + # (28 summary features + 4 teacher-target offsets) + strombom.py # collect/drive heuristic (baseline + fallback) + +worlds/field.wbt # external pen south of field, 10 sheep slots, + # gate parked open, in-field fence removed + +controllers/sheep/sheep.py # imports from herding/, latches on + # is_penned_position +controllers/shepherd_dog/ + shepherd_dog.py # mode switch (HERDING_MODE=rl|strombom), + # safety supervisor for DOG_SOUTH_LIMIT + policy_loader.py # lazy SB3 zip + VecNormalize loader + strombom.py # shim re-exporting herding.strombom + +training/ + herding_env.py # gymnasium.Env, action smoothing, reward shaping + train_ppo.py # SB3 PPO with VecNormalize, eval, checkpoints, + # curriculum callback + eval.py # success-rate / time-to-pen across n_sheep + parity_test.py # shape, determinism, baseline-rollout smoke test + configs/ppo_default.yaml + requirements.txt + README.md # how to train, evaluate, switch modes in Webots +``` + +## 12. To run + +```bash +# 1. Install deps (CUDA-enabled torch wheel for GPU) +pip install -r training/requirements.txt + +# 2. Smoke test +python -m training.parity_test + +# 3. Train (5 M steps, ~2–3 h on a single GPU) +python -m training.train_ppo --out-dir training/runs/baseline + +# 4. Evaluate vs Strömbom +python -m training.eval --policy training/runs/baseline/best +python -m training.eval --policy strombom + +# 5. 
Run in Webots +export HERDING_MODE=rl +export HERDING_POLICY_DIR=$PWD/training/runs/baseline/best +webots worlds/field.wbt +``` diff --git a/tools/collect_demos.py b/tools/collect_demos.py new file mode 100644 index 0000000..7a767d6 --- /dev/null +++ b/tools/collect_demos.py @@ -0,0 +1,117 @@ +"""Collect (obs, action) demonstrations from the sequential teacher. + +Runs the sequential algorithm across a grid of (n_sheep, seed) combos +at full difficulty, logs the (observation, action) pair every Nth step, +and saves successful trajectories to a numpy ``.npz`` for behavior +cloning. Failed trajectories are dropped by default — we only want to +teach the policy from good examples. + +Usage:: + + python -m tools.collect_demos --out training/demos.npz +""" + +from __future__ import annotations + +import argparse +import os +import sys +import time +from pathlib import Path + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +import numpy as np + +from herding.geometry import PEN_ENTRY +from herding.sequential import compute_action +from training.herding_env import HerdingEnv + + +def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int): + env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, + difficulty=1.0, seed=seed) + obs, _ = env.reset(seed=seed) + obs_list, action_list = [], [] + for step in range(max_steps): + positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i])) + for i in range(env.n_sheep) if not env.sheep_penned[i]} + if not positions: + break + vx, vy, _mode = compute_action( + (env.dog_x, env.dog_y), positions, PEN_ENTRY, + ) + action = np.array([vx, vy], dtype=np.float32) + if step % subsample == 0: + obs_list.append(obs.copy()) + action_list.append(action.copy()) + obs, _r, term, trunc, _info = env.step(action) + if term or trunc: + break + success = bool(env.sheep_penned.all()) + return ( 
+ np.asarray(obs_list, dtype=np.float32), + np.asarray(action_list, dtype=np.float32), + success, + env.steps, + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--out", default="training/demos.npz") + parser.add_argument("--n-sheep-list", default="1,2,3,5,8,10") + parser.add_argument("--seeds-per-n", type=int, default=15) + parser.add_argument("--max-steps", type=int, default=30000) + parser.add_argument("--subsample", type=int, default=5, + help="Keep every Nth (obs, action) pair.") + parser.add_argument("--keep-failures", action="store_true", + help="Include partial-success trajectories. Default off.") + args = parser.parse_args() + + n_sheep_list = [int(x) for x in args.n_sheep_list.split(",")] + print(f"[demos] grid: n_sheep={n_sheep_list}, seeds={args.seeds_per_n}, " + f"max_steps={args.max_steps}, subsample={args.subsample}") + + all_obs, all_actions, all_meta = [], [], [] + t_start = time.time() + n_success = 0; n_total = 0 + + for n in n_sheep_list: + for seed in range(args.seeds_per_n): + obs, actions, success, total_steps = collect_one( + n, seed, args.max_steps, args.subsample, + ) + n_total += 1 + if success: + n_success += 1 + keep = success or args.keep_failures + if keep and len(obs) > 0: + all_obs.append(obs) + all_actions.append(actions) + all_meta.append((n, seed, len(obs), int(success), total_steps)) + tag = "✓" if success else "✗" + print(f" [{tag}] n={n:>2d} seed={seed:>2d} steps={total_steps:>6d} " + f"logged={len(obs):>5d}") + + if not all_obs: + raise RuntimeError("No trajectories kept — try --keep-failures.") + + obs = np.concatenate(all_obs, axis=0) + actions = np.concatenate(all_actions, axis=0) + meta = np.array(all_meta, dtype=np.int32) + + Path(args.out).parent.mkdir(parents=True, exist_ok=True) + np.savez(args.out, obs=obs, actions=actions, meta=meta) + + elapsed = time.time() - t_start + print(f"\n=== {n_success}/{n_total} trajectories successful ({100*n_success/n_total:.0f}%) ===") + print(f"=== 
{len(obs)} transitions saved to {args.out} ===") + print(f"=== obs={obs.shape}, actions={actions.shape}, elapsed={elapsed:.0f}s ===") + + +if __name__ == "__main__": + main() diff --git a/tools/run_webots.sh b/tools/run_webots.sh new file mode 100755 index 0000000..cf26b74 --- /dev/null +++ b/tools/run_webots.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Launch Webots with N sheep enabled and the chosen controller mode. +# Generates a temporary world file in worlds/field_test.wbt with sheep +# beyond N commented out, sets the env vars the dog controller reads, +# then execs Webots on it. +# +# Usage: +# tools/run_webots.sh [N] [MODE] +# N : number of active sheep (1..10), default 10 +# MODE : "rl" | "strombom" | "sequential", default "rl" +# +# Examples: +# tools/run_webots.sh 10 rl # BC-trained RL policy, 10 sheep +# tools/run_webots.sh 5 sequential # the analytic teacher, 5 sheep +# tools/run_webots.sh 3 strombom # canonical baseline, 3 sheep +# +# Notes: +# * The RL mode loads training/runs/bc_pretrained/policy.zip by default. +# Override via HERDING_POLICY_DIR=/path/to/run env var. +# * Conda env "tir" must be active (provides stable-baselines3 + torch). + +set -e +N=${1:-10} +MODE=${2:-rl} + +if (( N < 1 || N > 10 )); then + echo "N must be 1..10, got $N" >&2; exit 1 +fi +case "$MODE" in + rl|strombom|sequential) ;; + *) echo "MODE must be rl|strombom|sequential, got '$MODE'" >&2; exit 1 ;; +esac + +ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )" +SRC="$ROOT/worlds/field.wbt" +DST="$ROOT/worlds/field_test.wbt" + +cp "$SRC" "$DST" +# Comment out sheep N+1..10 by prefixing the matching Sheep { ... } line. 
+for i in $(seq $((N+1)) 10); do + sed -i "s|^Sheep .* \"sheep${i}\".*|# &|" "$DST" +done + +active=$(grep -c '^Sheep' "$DST") +echo "------------------------------------------------------------" +echo "World : $DST" +echo "Mode : $MODE" +echo "Sheep : $active active" +echo "Policy dir : ${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_pretrained}" +echo "------------------------------------------------------------" + +# Webots strips HERDING_* env vars from controller subprocesses in some +# setups, so we also write a runtime config file the controller reads. +RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_pretrained}" +cat > "$ROOT/herding_runtime.cfg" </best/`. +2. In Webots, set the dog controller's environment variables: + + ```bash + export HERDING_MODE=rl + export HERDING_POLICY_DIR=$(pwd)/training/runs/baseline/best + webots worlds/field.wbt + ``` + + Or set them via Webots' controller args / a `.wbproj` if you prefer. + +3. To force the Strömbom baseline (same world, same controller): + + ```bash + export HERDING_MODE=strombom + webots worlds/field.wbt + ``` + +If `HERDING_MODE=rl` but the policy can't be loaded (SB3 not installed, +zip missing, etc.), the controller logs the error and falls back to +Strömbom automatically. + +## Curriculum knobs + +The default schedule in `configs/ppo_default.yaml` widens +`max_n_sheep` over training. Each reset samples `n_sheep ~ U[1, +max_n_sheep]`, so the final policy has seen every flock size from 1 to +10 in proportion. To pin a specific size, instantiate the env with +`HerdingEnv(n_sheep=N)` (see `eval.py`). + +## Reward shaping + +Weights live in class attributes on `HerdingEnv`. Tune from the 1-sheep +curriculum first — if the dog can't herd a single sheep cleanly, raising +`W_PROGRESS` or lowering `W_TIME` is usually the fix. For multi-sheep +collapse modes (dog spins between sheep), increase `W_COMPACT` so +tightening the flock pays. 
diff --git a/training/__init__.py b/training/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/training/bc_pretrain.py b/training/bc_pretrain.py new file mode 100644 index 0000000..3a82147 --- /dev/null +++ b/training/bc_pretrain.py @@ -0,0 +1,218 @@ +"""Behavior cloning of the sequential teacher into an SB3-compatible policy. + +Trains the policy network (mean-action head) of an SB3 ``MlpPolicy`` to +mimic the demonstrations collected by ``tools.collect_demos``. The +saved zip is loadable via ``PPO.load(...)`` and can be passed to +``train_ppo.py --resume`` for fine-tuning. + +Why this works: the teacher (sequential single-target driving) solves +n=10 at 80%+ in our env. BC gives the RL a competent starting policy, +so PPO doesn't have to discover behavior from scratch — it only has to +*refine* the teacher's strategy via the sparse pen reward. + +Usage:: + + python -m training.bc_pretrain \\ + --demos training/demos.npz \\ + --out training/runs/bc_pretrained +""" + +from __future__ import annotations + +import argparse +import os +import sys +import time +from pathlib import Path + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader, TensorDataset + +from stable_baselines3 import PPO +from stable_baselines3.common.vec_env import DummyVecEnv + +from training.herding_env import HerdingEnv + + +def build_model(net_arch_pi, net_arch_vf, log_std_init: float): + """Build a fresh SB3 PPO with the same architecture as train_ppo. + + We only need the policy to load weights into; PPO's training-loop + plumbing isn't used during BC. 
+ """ + env = DummyVecEnv([lambda: HerdingEnv()]) + model = PPO( + "MlpPolicy", env, + policy_kwargs=dict( + net_arch=dict(pi=net_arch_pi, vf=net_arch_vf), + log_std_init=log_std_init, + ), + verbose=0, + ) + return model, env + + +def policy_forward_mean(policy, obs_batch): + """Return the policy's deterministic mean action for a batch. + + SB3's ActorCriticPolicy doesn't expose this directly — it goes + through a Distribution wrapper. We replicate the forward path: + extract_features → mlp_extractor → action_net. + """ + features = policy.extract_features(obs_batch) + if isinstance(features, tuple): + # SB3 ≥ 2.0 sometimes returns (pi_features, vf_features) + pi_features = features[0] + else: + pi_features = features + latent_pi, _latent_vf = policy.mlp_extractor(pi_features) + return policy.action_net(latent_pi) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--demos", default="training/demos.npz") + parser.add_argument("--out", default="training/runs/bc_pretrained") + parser.add_argument("--epochs", type=int, default=60) + parser.add_argument("--batch-size", type=int, default=256) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--val-split", type=float, default=0.1) + parser.add_argument("--net-arch", default="256,256", + help="Comma-separated hidden layer widths.") + parser.add_argument("--log-std-init", type=float, default=0.5) + parser.add_argument("--cos-weight", type=float, default=1.0, + help="Weight on (1 - cosine similarity) loss term. 
" + "MSE alone shrinks policy output toward zero " + "(zero-magnitude action minimises mean squared " + "error against ±1 targets); cos loss keeps " + "the action pointed correctly even at small " + "magnitudes.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--device", default="cpu") + args = parser.parse_args() + + torch.manual_seed(args.seed) + np.random.seed(args.seed) + + # --- Load demos --- + print(f"[bc] loading demos from {args.demos}") + data = np.load(args.demos) + obs = data["obs"].astype(np.float32) + actions = data["actions"].astype(np.float32) + meta = data["meta"] + print(f"[bc] obs={obs.shape} actions={actions.shape} trajectories={len(meta)}") + if obs.size == 0: + raise RuntimeError("Empty demo file.") + + # Action sanity check — sequential outputs unit vectors. + a_norms = np.linalg.norm(actions, axis=1) + print(f"[bc] action L2 norm: mean={a_norms.mean():.3f} " + f"min={a_norms.min():.3f} max={a_norms.max():.3f}") + + # --- Train/val split --- + n = len(obs) + perm = np.random.permutation(n) + n_val = int(n * args.val_split) + val_idx, train_idx = perm[:n_val], perm[n_val:] + print(f"[bc] train={len(train_idx)} val={len(val_idx)}") + + obs_t = torch.from_numpy(obs) + act_t = torch.from_numpy(actions) + train_loader = DataLoader( + TensorDataset(obs_t[train_idx], act_t[train_idx]), + batch_size=args.batch_size, shuffle=True, + ) + val_loader = DataLoader( + TensorDataset(obs_t[val_idx], act_t[val_idx]), + batch_size=args.batch_size, shuffle=False, + ) + + # --- Build model --- + net_arch_pi = [int(x) for x in args.net_arch.split(",")] + net_arch_vf = net_arch_pi[:] + model, _env = build_model(net_arch_pi, net_arch_vf, args.log_std_init) + policy = model.policy.to(args.device) + optimizer = optim.Adam(policy.parameters(), lr=args.lr) + + # --- Train --- + print(f"[bc] training: epochs={args.epochs} batch={args.batch_size} " + f"lr={args.lr} device={args.device}") + t_start = time.time() + best_val = float("inf") + + 
def combined_loss(pred, target): + mse = nn.functional.mse_loss(pred, target) + p_norm = pred.norm(dim=1).clamp_min(1e-6) + t_norm = target.norm(dim=1).clamp_min(1e-6) + cos_sim = (pred * target).sum(dim=1) / (p_norm * t_norm) + cos_loss = (1.0 - cos_sim).mean() + return mse + args.cos_weight * cos_loss, mse.item(), cos_sim.mean().item() + + for epoch in range(args.epochs): + policy.train() + train_loss_total, train_mse_total, train_cos_total, train_count = 0.0, 0.0, 0.0, 0 + for ob_batch, act_batch in train_loader: + ob_batch = ob_batch.to(args.device) + act_batch = act_batch.to(args.device) + optimizer.zero_grad() + mean_action = policy_forward_mean(policy, ob_batch) + loss, mse_val, cos_val = combined_loss(mean_action, act_batch) + loss.backward() + optimizer.step() + bs = ob_batch.size(0) + train_loss_total += loss.item() * bs + train_mse_total += mse_val * bs + train_cos_total += cos_val * bs + train_count += bs + train_mse = train_mse_total / max(1, train_count) + train_cos = train_cos_total / max(1, train_count) + + policy.eval() + val_total, val_count = 0.0, 0 + cos_sim_total = 0.0 + with torch.no_grad(): + for ob_batch, act_batch in val_loader: + ob_batch = ob_batch.to(args.device) + act_batch = act_batch.to(args.device) + mean_action = policy_forward_mean(policy, ob_batch) + bs = ob_batch.size(0) + val_total += nn.functional.mse_loss( + mean_action, act_batch, reduction="sum", + ).item() + # Cosine similarity in action space — useful sanity for + # "is the policy pointing the same way as the teacher?". 
+ m_norm = mean_action.norm(dim=1).clamp_min(1e-6) + a_norm = act_batch.norm(dim=1).clamp_min(1e-6) + cos = (mean_action * act_batch).sum(dim=1) / (m_norm * a_norm) + cos_sim_total += cos.sum().item() + val_count += bs + val_mse = val_total / max(1, val_count) / actions.shape[1] + cos_sim = cos_sim_total / max(1, val_count) + print(f" epoch {epoch+1:>2d}/{args.epochs} " + f"train_mse={train_mse:.4f} train_cos={train_cos:+.3f} " + f"val_mse={val_mse:.4f} val_cos={cos_sim:+.3f}") + if val_mse < best_val: + best_val = val_mse + + elapsed = time.time() - t_start + print(f"[bc] done in {elapsed:.0f}s best_val_mse={best_val:.4f}") + + # --- Save --- + out_dir = Path(args.out) + out_dir.mkdir(parents=True, exist_ok=True) + model.save(out_dir / "policy.zip") + print(f"[bc] saved policy to {out_dir / 'policy.zip'}") + print(f"\n[bc] verify with: " + f"python -m training.eval --policy {out_dir}") + + +if __name__ == "__main__": + main() diff --git a/training/config.json b/training/config.json deleted file mode 100644 index 1bc7fce..0000000 --- a/training/config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "W_PER_SHEEP": 2.0, - "W_ALIGN": 0.05, - "W_PEN_BONUS": 10.0, - "W_COMPLETE": 100.0, - "W_STEP_COST": 0.02, - "W_COMPACT": 0.0, - "W_WALL_TOUCH": 0.0, - "WALL_TOUCH_BUFFER": 0.4, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": true, - "ENTRY_AWARE": true, - "ent_coef": 0.02 -} diff --git a/training/configs/.gitkeep b/training/configs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/training/configs/ppo_default.yaml b/training/configs/ppo_default.yaml new file mode 100644 index 0000000..d2dcff4 --- /dev/null +++ b/training/configs/ppo_default.yaml @@ -0,0 +1,52 @@ +# PPO hyperparameters for the herding env. Tuned for a 28-D obs / 2-D +# continuous action space with 16 parallel envs on GPU. 
These are SB3 +# defaults nudged toward longer credit assignment (gamma=0.995) and a +# slightly higher entropy bonus to keep exploration alive while curriculum +# expands the flock size. + +# --- PPO --- +learning_rate: 3.0e-4 +n_steps: 2048 # rollout length per env before each update +batch_size: 256 +n_epochs: 10 +gamma: 0.995 +gae_lambda: 0.95 +clip_range: 0.2 +ent_coef: 0.05 # was 0.01 — earlier runs collapsed to ~0 actions +vf_coef: 0.5 +max_grad_norm: 0.5 +target_kl: null # disable early-stop on KL + +# --- Network --- +policy: MlpPolicy +net_arch_pi: [128, 128] +net_arch_vf: [128, 128] +log_std_init: 0.5 # std≈1.6 instead of default 1.0 — more exploration + +# --- Training schedule --- +total_timesteps: 10_000_000 +n_envs: 16 +checkpoint_freq: 500_000 # in env steps +eval_freq: 100_000 # in env steps +n_eval_episodes: 20 + +# --- Curriculum (max-n_sheep schedule, in env steps) --- +# Each entry: at step s, raise the env's max_n_sheep to k. The env samples +# uniformly from [1, max_n_sheep] each reset, so this widens the +# distribution gradually rather than swapping fixed sizes. +# +# State-space curriculum: difficulty controls sheep spawn area +# (0 = sheep spawn just north of gate, 1 = sheep spawn anywhere in field). +# Plus the existing flock-size curriculum. +# +# The two together let the policy first learn "what penning looks like" +# in a regime where random exploration reliably triggers it, then +# gradually generalise to the deployment distribution. 
+curriculum: + - { step: 0, max_n_sheep: 1, difficulty: 0.0 } + - { step: 1_000_000, max_n_sheep: 1, difficulty: 0.3 } + - { step: 2_000_000, max_n_sheep: 2, difficulty: 0.5 } + - { step: 4_000_000, max_n_sheep: 3, difficulty: 0.8 } + - { step: 6_000_000, max_n_sheep: 5, difficulty: 1.0 } + - { step: 8_000_000, max_n_sheep: 8, difficulty: 1.0 } + - { step: 9_000_000, max_n_sheep: 10, difficulty: 1.0 } diff --git a/training/demos.npz b/training/demos.npz new file mode 100644 index 0000000..b84e4b2 Binary files /dev/null and b/training/demos.npz differ diff --git a/training/eval.py b/training/eval.py new file mode 100644 index 0000000..af3af36 --- /dev/null +++ b/training/eval.py @@ -0,0 +1,136 @@ +"""Evaluate a PPO policy or an analytic baseline (Strömbom, sequential). + +Reports success rate and time-to-pen across a fixed seed grid for each +flock size 1..MAX_SHEEP. Used to produce the M5 quantitative comparison +table mentioned in plan.md. + +Usage:: + + python -m training.eval --policy training/runs/latest/best + python -m training.eval --policy strombom +""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path +from statistics import mean, stdev + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +import numpy as np + +from herding.geometry import MAX_SHEEP, PEN_ENTRY +from herding.strombom import compute_action as strombom_action +from herding.sequential import compute_action as sequential_action +from training.herding_env import HerdingEnv + + +def rollout(env: HerdingEnv, predict_fn, max_steps: int) -> dict: + obs, _ = env.reset() + success = False + for t in range(max_steps): + action = predict_fn(env, obs) + obs, _r, terminated, truncated, info = env.step(action) + if terminated or truncated: + success = bool(info.get("is_success", False)) + return {"success": success, 
"steps": info.get("steps", t + 1), + "n_penned": info.get("n_penned", 0)} + return {"success": False, "steps": max_steps, "n_penned": int(env.sheep_penned.sum())} + + +def make_analytic_predictor(action_fn): + def _predict(env, _obs): + positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i])) + for i in range(env.n_sheep) + if not env.sheep_penned[i]} + vx, vy, _mode = action_fn((env.dog_x, env.dog_y), positions, PEN_ENTRY) + return np.array([vx, vy], dtype=np.float32) + return _predict + + +# Backwards-compat alias. +def make_strombom_predictor(): + return make_analytic_predictor(strombom_action) + + +def make_policy_predictor(model, vecnorm): + def _predict(_env, obs): + if vecnorm is not None: + obs_b = vecnorm.normalize_obs(np.asarray(obs, dtype=np.float32).reshape(1, -1)) + else: + obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1) + action, _ = model.predict(obs_b, deterministic=True) + return action[0] + return _predict + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--policy", required=True, + help="Either 'strombom' or path to an SB3 run directory.") + parser.add_argument("--n-seeds", type=int, default=10) + parser.add_argument("--max-steps", type=int, default=5000) + parser.add_argument("--max-flock", type=int, default=MAX_SHEEP) + # 1.0 = deployment distribution (sheep anywhere in field). + # Lower values use the training-curriculum spawn band (sheep near gate). + parser.add_argument("--difficulty", type=float, default=1.0) + args = parser.parse_args() + + if args.policy == "strombom": + predict = make_analytic_predictor(strombom_action) + elif args.policy == "sequential": + predict = make_analytic_predictor(sequential_action) + else: + from stable_baselines3 import PPO + run = Path(args.policy) + # Resolve to a zip: directory of checkpoints, or a direct zip path. 
+ if run.is_file(): + zip_path = run + else: + for name in ("best_model.zip", "policy.zip", "final.zip"): + if (run / name).exists(): + zip_path = run / name + break + else: + raise FileNotFoundError( + f"No checkpoint found in {run} (tried best_model.zip, " + f"policy.zip, final.zip)" + ) + model = PPO.load(str(zip_path), device="auto") + vecnorm = None + vn_path = run / "vecnormalize.pkl" + if not vn_path.exists() and run.parent.name != "best": + vn_path = run.parent / "vecnormalize.pkl" + if vn_path.exists(): + import pickle + with open(vn_path, "rb") as f: + vecnorm = pickle.load(f) + vecnorm.training = False + vecnorm.norm_reward = False + predict = make_policy_predictor(model, vecnorm) + + print(f"{'n_sheep':>8} {'success%':>10} {'mean_steps':>12} {'mean_penned':>12}") + print("-" * 46) + for n in range(1, args.max_flock + 1): + successes, steps, penned = [], [], [] + for seed in range(args.n_seeds): + env = HerdingEnv(n_sheep=n, max_steps=args.max_steps, + difficulty=args.difficulty, seed=seed) + r = rollout(env, predict, args.max_steps) + successes.append(int(r["success"])) + steps.append(r["steps"]) + penned.append(r["n_penned"]) + sr = 100.0 * mean(successes) + ms = mean(steps) + mp = mean(penned) + print(f"{n:>8d} {sr:>9.1f}% {ms:>12.0f} {mp:>12.2f}") + + +if __name__ == "__main__": + main() diff --git a/training/herding_env.py b/training/herding_env.py index 3cc9fd2..a350500 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -1,773 +1,421 @@ -""" -2D herding environment for PPO training (Gymnasium-compatible). +"""Gymnasium environment for the shepherd-dog herding task. -The dog agent (action: 2D velocity vector) must herd n_sheep into the -quarantine pen. Sheep dynamics mirror the Webots controller exactly: -flee (quadratic ramp), separation (inverse-distance), cohesion, wall -avoidance, and wander. +Single-agent: the agent is the dog. 
Sheep are environment-controlled +flocking agents whose dynamics are imported verbatim from +``herding.flocking_sim`` so a policy trained here transfers to Webots +without re-tuning. Differential-drive kinematics for both dog and sheep +match the proto specs (wheel radius, base, max wheel ω) via +``herding.diffdrive``. -Coordinate system matches the Webots world file: - field : x ∈ [-15, 15], y ∈ [-15, 15] - pen : x ∈ [10, 13], y ∈ [-15, -8] (SE corner, open north) +Action space +------------ +Box(-1, 1, (2,)) — the dog's desired (vx, vy) velocity *intent*. This +matches the high-level action representation the Webots controller +already uses; the env converts (vx, vy) → wheel speeds with the same +formula. -Observation (16-dim, fixed regardless of n_sheep): - dog position (2), flock COM relative to dog (2), top-3 farthest active - sheep relative to dog (6), pen relative to COM (2), pen relative to - farthest sheep (2), flock radius (1), fraction penned (1). +Observation space +----------------- +Box(-inf, inf, (28,)) — the order-invariant feature vector built by +``herding.obs.build_obs``. See ``herding/obs.py`` for the layout. -Permutation-invariant by design: curriculum stages share the same obs dim -so VecNormalize statistics transfer as n_sheep advances. +Reset +----- +``options["n_sheep"]`` (1..MAX_SHEEP) overrides the default flock size +for the next episode. If absent, flock size is sampled uniformly from +[1, max_n_sheep] each reset, where ``max_n_sheep`` can be raised over +training time by an outer callback. + +Reward +------ +Sparse + shaping (see :func:`HerdingEnv._compute_reward` for weights). 
+ + +100.0 per newly penned sheep (W_PEN_DELTA) + +20.0 · progress toward the pen (W_PROGRESS, dense distance shaping) + ±0.5 · cosine similarity to the Strömbom action (W_IMITATE) + - time penalty per step (W_TIME; currently 0.0) + - wall and collision penalties (W_WALL, W_COLLISION; currently 0.0) + +500.0 terminal bonus when all sheep penned (W_DONE) """ -import csv -import numpy as np +from __future__ import annotations + +import math +import os +import random +import sys +from typing import Optional + import gymnasium as gym +import numpy as np from gymnasium import spaces +# Make herding/ importable when run from anywhere. +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +from herding.diffdrive import ( + heading_speed_to_wheels, kinematics_step, velocity_to_wheels, +) +from herding.flocking_sim import ( + FLEE_SPEED, MAX_SPEED, WANDER_SPEED, compute_heading_speed, +) +from herding.geometry import ( + DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_BASE, + DOG_WHEEL_RADIUS, FIELD_X, FIELD_Y, GATE_X, MAX_SHEEP, + PEN_ENTRY, PEN_X, PEN_Y, + SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS, + WEBOTS_DT, is_penned_position, +) +from herding.obs import OBS_DIM, build_obs +from herding.strombom import compute_action as strombom_action + class HerdingEnv(gym.Env): - metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 30} + """Single-agent shepherd-dog herding env. 
- # ----------------------------------------------------------------------- - # World constants — must match Webots world file - # ----------------------------------------------------------------------- - MAX_SHEEP = 10 - FIELD = 15.0 # field wall geometry in world file - SHEEP_WALL_INNER = 14.5 # sheep.py wall checks use ±14.5 - PEN_X = (10.0, 13.0) - PEN_Y = (-15.0, -8.0) - PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32) - PEN_ENTRY = np.array([11.5, -8.0], dtype=np.float32) # north entrance face center + Each step is one Webots ``basicTimeStep`` (16 ms). Episodes terminate + when all sheep are penned, or after ``max_steps`` steps (truncation). + """ - # ----------------------------------------------------------------------- - # Dynamics — calibrated to match Webots robot specs - # ----------------------------------------------------------------------- - DOG_SPEED = 2.5 # m/s - SHEEP_FLEE_V = 0.62 # m/s (20 rad/s * 0.031 m wheel radius in sheep.py) - SHEEP_WANDER_V = 0.093 # m/s (3 rad/s * 0.031 m wheel radius in sheep.py) - DT = 0.1 # seconds per step + metadata = {"render_modes": []} - # Differential-drive dog dynamics — mirrors shepherd_dog_rl.py drive(): - # speed_ms = ||a|| * DOG_SPEED - # err = wrap(target_heading - heading) - # fwd_ms = speed_ms * max(0, cos(err)) - # fwd_rad = fwd_ms / DOG_WHEEL_R - # turn = DOG_K_TURN * err - # l = clamp(fwd_rad - turn), r = clamp(fwd_rad + turn) - # Then integrated as unicycle kinematics using wheel geometry. - DOG_K_TURN = 4.0 # rad/s per rad (matches Webots controller) - DOG_WHEEL_R = 0.038 # m (ShepherdDog.proto wheel radius) - DOG_AXLE_TRACK = 0.28 # m (wheel anchors at y=±0.14 in proto) - DOG_MOTOR_MAX = 70.0 # rad/s (ShepherdDog.proto motor maxVelocity) - DOG_STOP_THRESHOLD = 0.05 # ||action|| below this → dog stops in place + # Reward shaping weights. 
Re-tuned after the first run got stuck at + # 0% success: progress reward must dominate the time penalty by a + # large margin, and the pen-event bonus must be big enough that PPO's + # advantage estimator can credit-assign across the long path that + # leads to it. Per-step shaping is bounded by the clamps inside + # _compute_reward. + # Drastically simplified after two runs got stuck farming a position + # bonus instead of penning sheep. Reward now is essentially: + # • huge jackpot for actually penning sheep (+100 per pen, +500 done) + # • small dense gradient: per-sheep mean distance to pen + # No position shaping (gameable), no compactness shaping (gameable), + # no engagement bonus (gameable). The terminal per-unpenned penalty + # forbids "good enough" partial herds. + # We have a working analytic baseline (Strömbom, 100 % on easy mode). + # Use it as a teacher: per-step bonus proportional to the cosine + # similarity between the policy's action and what Strömbom would do + # at the same state. This drags the policy out of "do nothing" local + # optima without locking it to the teacher — PPO can still find + # improvements over Strömbom because pen jackpots dominate. 
+ W_PEN_DELTA = 100.0 + W_PROGRESS = 20.0 + W_IMITATE = 0.5 # per-step max ±0.5 (action cosine sim, [-1, 1]) + W_TIME = 0.0 + W_WALL = 0.0 + W_COLLISION = 0.0 + W_DONE = 500.0 - # Differential-drive sheep dynamics — mirrors sheep.py drive(): - SHEEP_K_TURN = 4.0 # rad/s per rad heading error (sheep.py k=4.0) - SHEEP_WHEEL_R = 0.031 # m (Sheep.proto wheel radius) - SHEEP_AXLE_TRACK = 0.20 # m (wheel anchors at y=+/-0.10 in proto) - SHEEP_MOTOR_MAX = 22.0 # rad/s (sheep.py MAX_SPEED clamp) - - # Sub-stepping: 6 x ~16.7ms ≈ 100ms per env step (Webots basicTimeStep=16ms) - N_SUBSTEPS = 6 - - # Peer communication lag — sheep broadcast every 3 Webots steps - PEER_BROADCAST_INTERVAL = 3 - - # Action smoothing EMA alpha; 0 = disabled (smoothing applied at Webots inference) + # Action smoothing during training: 0 = none. The Webots controller + # still applies its own EMA at inference for actuator stability, so + # the policy doesn't need to learn smoothness explicitly. ACTION_SMOOTH = 0.0 - # Boid parameters — identical to sheep.py - FLEE_DIST = 7.0 - SEPARATION_DIST = 2.5 - COHESION_DIST = 8.0 - WALL_MARGIN = 3.5 + # Episode budget. ~80 s of sim time at dt=0.016. The new external-pen + # layout has paths up to ~28 m from spawn to pen entry; at sheep flee + # speed ~0.4 m/s, that's 70 s minimum. 3000 steps (48 s) was leaving + # the dog with no margin for collect-then-drive on multi-sheep cases. + DEFAULT_MAX_STEPS = 5000 - # ----------------------------------------------------------------------- - # Reward weights (simple per-sheep progress — no phases, no gating) - # ----------------------------------------------------------------------- - W_PER_SHEEP = 2.0 # progress: sum of per-sheep distance-to-pen reductions - W_ALIGN = 0.05 # gated on action magnitude — dog only earns it when moving. - # Without gating this created a sit-still trap from n_sheep≥2. 
- W_PEN_BONUS = 10.0 # per sheep penned - W_COMPLETE = 100.0 # all sheep penned - W_STEP_COST = 0.02 # time penalty — strong enough to punish doing nothing - W_SOUTH = 0.01 # per-sheep per-metre penalty for active sheep below the pen - # entrance (y < PEN_Y[1]=-8). Keeps the dog from letting - # sheep drift into the dead zone below the open face where - # they must reverse direction (north) to enter — hard to - # recover. 0.01 ≈ half step_cost per metre below per sheep. - W_COMPACT = 0.0 # reward for flock-radius reduction (off by default) - W_WALL_TOUCH = 0.01 # per-sheep max penalty at wall surface. Linear ramp - # within WALL_TOUCH_BUFFER. Covers field outer walls and - # pen W/E/S walls. Kept small (≈ step_cost/2) so it - # nudges away from walls without dominating progress. - WALL_TOUCH_BUFFER = 0.4 # metres from wall where penalty starts ramping - ALIGN_SHAPE = "standoff" # "standoff" (peaks at IDEAL) | "near" (peaks at 0) - ALIGN_GATED = True # gate alignment on action magnitude - ENTRY_AWARE = False # When True, targets PEN_ENTRY (entrance face) instead - # of PEN_CENTER for progress/obs. Intended to fix wall- - # corralling but collapsed n_sheep≥2 success rate. - # The wall-touch gradient penalty handles wall avoidance - # without breaking the core herding signal. + # Distance under which the dog is considered "colliding" with a sheep. + COLLISION_DIST = 0.30 - # Initial sheep spawn: first sheep placed anywhere; rest within CLUSTER_RADIUS - # of it. Set to None for legacy uniform-scatter behaviour. - # Cluster radius ≤ COHESION_DIST (8m) so boid cohesion keeps the flock together. 
- INIT_CLUSTER_RADIUS = 5.0 - - def __init__(self, n_sheep: int = 1, max_steps: int = 2000, - render_mode: str = None, random_n_sheep: bool = False, - reward_cfg: dict = None): + def __init__( + self, + n_sheep: Optional[int] = None, + max_n_sheep: int = MAX_SHEEP, + max_steps: int = DEFAULT_MAX_STEPS, + difficulty: float = 0.0, + seed: Optional[int] = None, + ): super().__init__() - assert 1 <= n_sheep <= self.MAX_SHEEP - self.n_sheep = n_sheep - self.max_steps = max_steps - self.render_mode = render_mode - self.random_n_sheep = random_n_sheep # if True, randomise n_sheep each reset - - # Override class-default reward weights / shape with per-instance config - # so sweeps can ship configs into subprocess envs via pickled make_env. - if reward_cfg: - for k, v in reward_cfg.items(): - if not hasattr(self.__class__, k): - raise ValueError(f"unknown reward_cfg key: {k}") - setattr(self, k, v) - - # Fixed 18-dim observation regardless of n_sheep: - # dog_pos(2) + rel_com(2) + rel_far1(2) + rel_far2(2) + rel_far3(2) - # + com_to_pen(2) + far1_to_pen(2) + radius(1) + frac_penned(1) - # + cos(heading)(1) + sin(heading)(1) ← new, for wheeled dynamics + self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32) self.observation_space = spaces.Box( - low=-np.inf, high=np.inf, shape=(18,), dtype=np.float32 + low=-np.inf, high=np.inf, shape=(OBS_DIM,), dtype=np.float32, ) - # Action: desired velocity (vx, vy) ∈ [-1, 1]², scaled by DOG_SPEED - self.action_space = spaces.Box( - low=-1.0, high=1.0, shape=(2,), dtype=np.float32 - ) + # If n_sheep is None, env will sample uniformly from [1, max_n_sheep] + # on every reset — this is the default for curriculum-free training. + self._fixed_n_sheep = n_sheep + self._max_n_sheep = max_n_sheep + self.max_steps = max_steps + # difficulty ∈ [0, 1]: 0 = sheep spawn next to the gate (easy), + # 1 = sheep spawn anywhere in the field (hard, the deployment + # distribution). Curriculum bumps this from 0 → 1 over training. 
+ self._difficulty = float(difficulty) + self._initial_seed = seed - # Runtime state (populated by reset) - self._step_count = 0 - self._prev_penned = 0 - self._prev_pen_dist_sum = 0.0 - self.dog_pos = np.zeros(2, dtype=np.float32) - self.dog_heading = 0.0 # radians, world frame - self.sheep_pos = np.zeros((self.MAX_SHEEP, 2), dtype=np.float32) - self.sheep_heading = np.zeros(self.MAX_SHEEP, dtype=np.float32) - self.penned = np.ones(self.MAX_SHEEP, dtype=bool) - self.wander_ang = np.zeros(self.MAX_SHEEP, dtype=np.float32) - self._delayed_sheep_pos = np.zeros((self.MAX_SHEEP, 2), dtype=np.float32) - self._prev_action = np.zeros(2, dtype=np.float32) + # State (initialized in reset) + self.dog_x = self.dog_y = self.dog_heading = 0.0 + self.sheep_x = np.zeros(0, dtype=np.float32) + self.sheep_y = np.zeros(0, dtype=np.float32) + self.sheep_h = np.zeros(0, dtype=np.float32) + self.sheep_penned = np.zeros(0, dtype=bool) + self.sheep_wander = np.zeros(0, dtype=np.float32) - self._fig = None - # Differential-drive debug CSV for sim/Webots parity checks. - # Always on by design. - self._dog_debug_file = open("dog_debug.csv", "w", newline="") - self._dog_debug_writer = csv.writer(self._dog_debug_file) - self._dog_debug_writer.writerow([ - "step", "act_x", "act_y", "act_mag", "heading", "target_heading", - "heading_err", "fwd_speed", "left_w", "right_w", "v", "w", - "dog_x", "dog_y", - ]) + self.prev_action = np.zeros(2, dtype=np.float32) + self.smoothed_action = np.zeros(2, dtype=np.float32) + self.steps = 0 + self.n_sheep = 0 + self.prev_n_penned = 0 + self.prev_d_pen = 0.0 + self.prev_radius = 0.0 - # ------------------------------------------------------------------ - # Curriculum interface - # ------------------------------------------------------------------ + # Env-owned RNG for the flocking wander-jitter, seeded fresh on each + # reset so determinism is preserved without touching the global + # random module. 
+ self._py_rng = random.Random() - def set_n_sheep(self, n: int): - """Advance curriculum difficulty; takes effect on next reset().""" - assert 1 <= n <= self.MAX_SHEEP - self.n_sheep = n + # ---- public knobs (used by curriculum callback) ---- + def set_max_n_sheep(self, value: int) -> None: + self._max_n_sheep = int(np.clip(value, 1, MAX_SHEEP)) - # ------------------------------------------------------------------ - # Gymnasium API - # ------------------------------------------------------------------ + def set_difficulty(self, value: float) -> None: + self._difficulty = float(np.clip(value, 0.0, 1.0)) - def reset(self, seed=None, options=None): + def set_imitate_weight(self, value: float) -> None: + """Override W_IMITATE (instance-level) — used to disable the + Strömbom imitation reward during BC fine-tuning, when the policy + already mimics a stronger teacher (sequential).""" + self.W_IMITATE = float(value) + + # ---- gym API ---- + def reset(self, *, seed=None, options=None): super().reset(seed=seed) - self._step_count = 0 - self._prev_penned = 0 + # Re-seed the flocking RNG from np_random so flocking jitter is + # reproducible alongside everything else the env samples. + self._py_rng.seed(int(self.np_random.integers(0, 2**31 - 1))) + opts = options or {} - if self.random_n_sheep: - self.n_sheep = int(self.np_random.integers(1, self.MAX_SHEEP + 1)) - - # Active sheep (0 .. n_sheep-1): random non-pen positions - self.sheep_pos[:] = self.PEN_CENTER - self.penned[:] = True - - # Spawn first sheep anywhere; subsequent sheep clustered around it - # so boid cohesion (active within 8m) keeps the flock together. - # Without clustering, sheep can start 25m apart and never coalesce — - # task becomes intractable for n_sheep ≥ 2. 
- placed = 0 - cluster_center = None - radius = self.INIT_CLUSTER_RADIUS - while placed < self.n_sheep: - if placed == 0 or radius is None: - p = self.np_random.uniform(-12.0, 12.0, size=(2,)).astype(np.float32) - else: - offset = self.np_random.uniform(-radius, radius, size=(2,)) - p = (cluster_center + offset).astype(np.float32) - p = np.clip(p, -12.0, 12.0) - if not self._in_pen(p): - self.sheep_pos[placed] = p - self.penned[placed] = False - if placed == 0: - cluster_center = p.copy() - placed += 1 - - # Dog: 50% of resets start already behind the flock (anti-pen side, - # within flee range) to give early training aligned experiences. - # Use the flock COM as the reference (not sheep[0]) so the bias - # generalizes from 1-sheep to multi-sheep without putting the dog - # in front of or inside the flock. - if self.np_random.random() < 0.5: - active_pts = self.sheep_pos[:self.n_sheep][~self.penned[:self.n_sheep]] - ref = active_pts.mean(axis=0) if len(active_pts) else self.sheep_pos[0] - away = ref - self.PEN_CENTER - d = float(np.linalg.norm(away)) - if d > 0.1: - away = away / d - offset = away * self.np_random.uniform(2.0, self.FLEE_DIST * 0.8) - self.dog_pos = np.clip( - (ref + offset).astype(np.float32), -self.FIELD, self.FIELD - ) + if "n_sheep" in opts and opts["n_sheep"] is not None: + self.n_sheep = int(opts["n_sheep"]) + elif self._fixed_n_sheep is not None: + self.n_sheep = int(self._fixed_n_sheep) else: - self.dog_pos = self.np_random.uniform( - -self.FIELD * 0.8, self.FIELD * 0.8, size=(2,) - ).astype(np.float32) + self.n_sheep = int(self.np_random.integers(1, self._max_n_sheep + 1)) - # Random initial heading so the policy learns to handle any orientation. - self.dog_heading = float(self.np_random.uniform(-np.pi, np.pi)) + # Dog spawns near origin with random heading. 
+ self.dog_x = float(self.np_random.uniform(-2.5, 2.5)) + self.dog_y = float(self.np_random.uniform(-2.5, 2.5)) + self.dog_heading = float(self.np_random.uniform(-math.pi, math.pi)) - self.sheep_heading = self.np_random.uniform( - -np.pi, np.pi, size=(self.MAX_SHEEP,) - ).astype(np.float32) + # Sheep spawn region scales with difficulty: + # 0.0 → narrow box just north of the gate (x ∈ [7, 14], y ∈ [-12, -6]) + # 1.0 → full field (x ∈ [-13, 13], y ∈ [-12, 13]) + # Linear interpolation between the two for intermediate values. + d = self._difficulty + sx_lo = 7.0 - d * 20.0 # → -13 at d=1 + sx_hi = 14.0 - d * 1.0 # → 13 at d=1 + sy_lo = -12.0 + d * 0.0 # → -12 at d=1 + sy_hi = -6.0 + d * 19.0 # → 13 at d=1 - self.wander_ang = self.np_random.uniform( - -np.pi, np.pi, size=(self.MAX_SHEEP,) - ).astype(np.float32) + sxs, sys_, shs, sws = [], [], [], [] + for _ in range(self.n_sheep): + for _try in range(100): + sx = float(self.np_random.uniform(sx_lo, sx_hi)) + sy = float(self.np_random.uniform(sy_lo, sy_hi)) + # Reject too close to dog or to other sheep. + if math.hypot(sx - self.dog_x, sy - self.dog_y) < 3.0: + continue + if any(math.hypot(sx - x, sy - y) < 1.5 + for x, y in zip(sxs, sys_)): + continue + # Reject inside the gate column already (they'd start "penned"). 
+ if PEN_X[0] <= sx <= PEN_X[1] and sy < -8.0: + continue + break + sxs.append(sx); sys_.append(sy) + shs.append(float(self.np_random.uniform(-math.pi, math.pi))) + sws.append(float(self.np_random.uniform(-math.pi, math.pi))) - self._delayed_sheep_pos[:self.n_sheep] = self.sheep_pos[:self.n_sheep].copy() - self._prev_action = np.zeros(2, dtype=np.float32) + self.sheep_x = np.asarray(sxs, dtype=np.float32) + self.sheep_y = np.asarray(sys_, dtype=np.float32) + self.sheep_h = np.asarray(shs, dtype=np.float32) + self.sheep_wander = np.asarray(sws, dtype=np.float32) + self.sheep_penned = np.zeros(self.n_sheep, dtype=bool) - # Initialise per-sheep pen-distance sum for progress reward - active = ~self.penned[:self.n_sheep] - target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER - if active.any(): - self._prev_pen_dist_sum = float( - np.linalg.norm( - self.sheep_pos[:self.n_sheep][active] - target, axis=1 - ).sum() - ) - com0 = self.sheep_pos[:self.n_sheep][active].mean(axis=0) - self._prev_radius = float( - np.linalg.norm(self.sheep_pos[:self.n_sheep][active] - com0, axis=1).max() - ) - else: - self._prev_pen_dist_sum = 0.0 - self._prev_radius = 0.0 + self.prev_action = np.zeros(2, dtype=np.float32) + self.smoothed_action = np.zeros(2, dtype=np.float32) + self.steps = 0 + self.prev_n_penned = 0 + self.prev_d_pen, self.prev_radius = self._flock_metrics() - return self._obs(), {} + obs = self._build_obs() + info = {"n_sheep": self.n_sheep} + return obs, info def step(self, action): - self._step_count += 1 + action = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0) - act = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0) + # EMA smoothing — the Webots controller does this too. 
+ self.smoothed_action = ( + self.ACTION_SMOOTH * self.prev_action + + (1.0 - self.ACTION_SMOOTH) * action + ) + self.prev_action = self.smoothed_action.copy() + vx, vy = float(self.smoothed_action[0]), float(self.smoothed_action[1]) - # Action smoothing EMA — matches shepherd_dog_rl.py ACTION_SMOOTH - if self.ACTION_SMOOTH > 0: - act = self.ACTION_SMOOTH * self._prev_action + (1.0 - self.ACTION_SMOOTH) * act - self._prev_action = act.copy() + # Safety supervisor mirrored from the controller — keeps the dog + # north of the gate so the policy can't strand itself in the pen. + if self.dog_y < DOG_SOUTH_LIMIT and vy < 0.0: + vx, vy = 0.0, 1.0 - act_mag = float(np.linalg.norm(act)) - sub_dt = self.DT / self.N_SUBSTEPS - dog_dbg = { - "target_heading": float(self.dog_heading), - "err": 0.0, "fwd_speed": 0.0, - "left_w": 0.0, "right_w": 0.0, "v": 0.0, "w": 0.0, - } - - for _sub in range(self.N_SUBSTEPS): - # Snapshot peer positions every 3 sub-steps (mirrors sheep broadcast) - if _sub % self.PEER_BROADCAST_INTERVAL == 0: - self._delayed_sheep_pos[:self.n_sheep] = self.sheep_pos[:self.n_sheep].copy() - - # Dog differential-drive sub-step - dbg = self._step_dog_substep(act, sub_dt) - if dbg["v"] != 0.0 or dbg["w"] != 0.0: - dog_dbg = dbg - - # Sheep dynamics sub-step - for i in range(self.n_sheep): - self.sheep_pos[i] = self._step_sheep(i, sub_dt) - if self._in_pen(self.sheep_pos[i]): - self.penned[i] = True - - n_penned = int(self.penned[:self.n_sheep].sum()) - newly_penned = n_penned - self._prev_penned - self._prev_penned = n_penned - - reward, rcomps = self._reward(n_penned, newly_penned, act) - terminated = n_penned == self.n_sheep - truncated = self._step_count >= self.max_steps - info = {"n_penned": n_penned, "n_sheep": self.n_sheep, - "rcomps": rcomps, "dog_dyn": dog_dbg} - - self._dog_debug_writer.writerow([ - self._step_count, - float(act[0]), float(act[1]), act_mag, - float(self.dog_heading), dog_dbg["target_heading"], dog_dbg["err"], - dog_dbg["fwd_speed"], 
dog_dbg["left_w"], dog_dbg["right_w"], - dog_dbg["v"], dog_dbg["w"], - float(self.dog_pos[0]), float(self.dog_pos[1]), - ]) - if self._step_count % 200 == 0: - self._dog_debug_file.flush() - - if self.render_mode == "human": - self.render() - - return self._obs(), float(reward), terminated, truncated, info - - def render(self): - import matplotlib.pyplot as plt - import matplotlib.patches as mpatches - - if self._fig is None: - plt.ion() - self._fig, self._ax = plt.subplots(figsize=(6, 6)) - - ax = self._ax - ax.clear() - ax.set_xlim(-16, 16); ax.set_ylim(-16, 16) - ax.set_aspect("equal"); ax.set_facecolor("#dcedc8") - - ax.add_patch(mpatches.Rectangle( - (-15, -15), 30, 30, fill=False, edgecolor="#795548", linewidth=2 - )) - pw = self.PEN_X[1] - self.PEN_X[0] - ph = self.PEN_Y[1] - self.PEN_Y[0] - ax.add_patch(mpatches.Rectangle( - (self.PEN_X[0], self.PEN_Y[0]), pw, ph, - facecolor="#ffe082", edgecolor="#795548", linewidth=2 - )) - ax.text(11.5, -11.5, "pen", ha="center", va="center", - fontsize=8, color="#795548") - - com, radius, _ = self._flock_stats() - ax.add_patch(plt.Circle(com, radius, color="steelblue", - fill=False, linestyle="--", linewidth=1)) - ax.plot(*com, "+", color="steelblue", markersize=10) + # --- Step the dog --- + wL, wR = velocity_to_wheels( + vx, vy, self.dog_heading, + max_linear=DOG_MAX_LINEAR, + wheel_radius=DOG_WHEEL_RADIUS, + max_wheel_omega=DOG_MAX_WHEEL_OMEGA, + k_turn=4.0, + ) + self.dog_x, self.dog_y, self.dog_heading = kinematics_step( + self.dog_x, self.dog_y, self.dog_heading, + wL, wR, DOG_WHEEL_RADIUS, DOG_WHEEL_BASE, WEBOTS_DT, + ) + # Clip dog to field bounds and out of pen — same as the Webots stone walls. 
+ self.dog_x = float(np.clip(self.dog_x, FIELD_X[0] + 0.3, FIELD_X[1] - 0.3)) + self.dog_y = float(np.clip(self.dog_y, DOG_SOUTH_LIMIT, FIELD_Y[1] - 0.3)) + # --- Step each sheep --- for i in range(self.n_sheep): - if i >= self.n_sheep: - continue - color = "deeppink" if self.penned[i] else "white" - ax.plot(*self.sheep_pos[i], "o", color=color, markersize=11, - markeredgecolor="#555", markeredgewidth=1.5) + self._step_one_sheep(i) - ax.plot(*self.dog_pos, "s", color="#4e342e", markersize=13, - markeredgecolor="black", markeredgewidth=1.5) + # --- Update penned state --- + for i in range(self.n_sheep): + if (not self.sheep_penned[i] + and is_penned_position(self.sheep_x[i], self.sheep_y[i])): + self.sheep_penned[i] = True - ax.set_title( - f"step {self._step_count} | " - f"penned {int(self.penned[:self.n_sheep].sum())}/{self.n_sheep} | " - f"r={radius:.1f}m", - fontsize=11 - ) - self._fig.canvas.draw() - self._fig.canvas.flush_events() - plt.pause(0.001) + # --- Reward, termination --- + d_pen, radius = self._flock_metrics() + reward = self._compute_reward(d_pen, radius, action=action) + self.prev_d_pen = d_pen + self.prev_radius = radius + self.prev_n_penned = int(self.sheep_penned.sum()) - def close(self): - if self._fig is not None: - import matplotlib.pyplot as plt - plt.close(self._fig) - self._fig = None - self._dog_debug_file.close() + self.steps += 1 + all_penned = bool(self.sheep_penned.all()) + terminated = all_penned + truncated = self.steps >= self.max_steps + if all_penned: + reward += self.W_DONE + # No timeout penalty: a per-unpenned penalty made "do nothing" + # strictly preferable to noisy-random under reward-progress shaping + # (random sometimes pushes sheep away → negative progress, then + # always ate the timeout penalty), which collapsed exploration to + # tiny actions. The pen jackpot alone provides the directional + # signal once exploration is wide enough to find it. 
- # ------------------------------------------------------------------ - # Internals - # ------------------------------------------------------------------ - - def _in_pen(self, pos: np.ndarray) -> bool: - return (self.PEN_X[0] < pos[0] < self.PEN_X[1] and - self.PEN_Y[0] < pos[1] < self.PEN_Y[1]) - - def _sheep_drive(self, i: int, target_heading: float, speed_rad: float, - dt: float) -> np.ndarray: - """Differential-drive integration for sheep i over one sub-step dt. - - Mirrors sheep.py drive(): heading error -> cos(err) forward scaling -> - wheel speeds with saturation -> unicycle kinematics. - """ - heading = float(self.sheep_heading[i]) - err = (target_heading - heading + np.pi) % (2 * np.pi) - np.pi - - fwd_rad = speed_rad * max(0.0, float(np.cos(err))) - turn = self.SHEEP_K_TURN * err - - left_w = np.clip(fwd_rad - turn, -self.SHEEP_MOTOR_MAX, self.SHEEP_MOTOR_MAX) - right_w = np.clip(fwd_rad + turn, -self.SHEEP_MOTOR_MAX, self.SHEEP_MOTOR_MAX) - - v = self.SHEEP_WHEEL_R * 0.5 * (right_w + left_w) - w = (self.SHEEP_WHEEL_R / self.SHEEP_AXLE_TRACK) * (right_w - left_w) - - self.sheep_heading[i] = float( - ((heading + w * dt) + np.pi) % (2 * np.pi) - np.pi - ) - step_vec = np.array( - [np.cos(self.sheep_heading[i]), np.sin(self.sheep_heading[i])], - dtype=np.float32 - ) - return (self.sheep_pos[i] + step_vec * v * dt).astype(np.float32) - - def _step_dog_substep(self, act: np.ndarray, dt: float) -> dict: - """Move the dog one sub-step with differential-drive kinematics. - - Returns debug dict with wheel/velocity info. 
- """ - old_dog = self.dog_pos.copy() - act_mag = float(np.linalg.norm(act)) - dog_dbg = { - "target_heading": float(self.dog_heading), - "err": 0.0, "fwd_speed": 0.0, - "left_w": 0.0, "right_w": 0.0, "v": 0.0, "w": 0.0, + obs = self._build_obs() + info = { + "n_sheep": self.n_sheep, + "n_penned": self.prev_n_penned, + "is_success": all_penned, + "steps": self.steps, } + return obs, float(reward), terminated, truncated, info - if act_mag < self.DOG_STOP_THRESHOLD: - return dog_dbg - - target_heading = float(np.arctan2(act[1], act[0])) - err = (target_heading - self.dog_heading + np.pi) % (2 * np.pi) - np.pi - - target_speed = act_mag * self.DOG_SPEED - fwd_speed = target_speed * max(0.0, float(np.cos(err))) - fwd_rad = fwd_speed / self.DOG_WHEEL_R - turn = self.DOG_K_TURN * err - - left_w = np.clip(fwd_rad - turn, -self.DOG_MOTOR_MAX, self.DOG_MOTOR_MAX) - right_w = np.clip(fwd_rad + turn, -self.DOG_MOTOR_MAX, self.DOG_MOTOR_MAX) - - v = self.DOG_WHEEL_R * 0.5 * (right_w + left_w) - w = (self.DOG_WHEEL_R / self.DOG_AXLE_TRACK) * (right_w - left_w) - dog_dbg.update({ - "target_heading": target_heading, "err": float(err), - "fwd_speed": float(fwd_speed), "left_w": float(left_w), - "right_w": float(right_w), "v": float(v), "w": float(w), - }) - - self.dog_heading = float( - ((self.dog_heading + w * dt) + np.pi) % (2 * np.pi) - np.pi + # ---- internals ---- + def _step_one_sheep(self, i: int) -> None: + x, y = float(self.sheep_x[i]), float(self.sheep_y[i]) + peers = [(float(self.sheep_x[j]), float(self.sheep_y[j])) + for j in range(self.n_sheep) if j != i] + heading, speed_motor, new_wander = compute_heading_speed( + x, y, + penned=bool(self.sheep_penned[i]), + dog_xy=(self.dog_x, self.dog_y), + peers=peers, + wander_angle=float(self.sheep_wander[i]), + rng=self._py_rng, ) - step_vec = np.array( - [np.cos(self.dog_heading), np.sin(self.dog_heading)], - dtype=np.float32 + self.sheep_wander[i] = new_wander + + wL, wR = heading_speed_to_wheels( + heading, speed_motor, 
float(self.sheep_h[i]), + max_wheel_omega=SHEEP_MAX_WHEEL_OMEGA, k_turn=4.0, ) - new_dog = np.clip( - self.dog_pos + step_vec * v * dt, -self.FIELD, self.FIELD, + nx, ny, nh = kinematics_step( + x, y, float(self.sheep_h[i]), wL, wR, + SHEEP_WHEEL_RADIUS, SHEEP_WHEEL_BASE, WEBOTS_DT, ) - # Pen wall collision - px0, px1 = self.PEN_X - py0, py1 = self.PEN_Y - if py0 < new_dog[1] < py1: - if old_dog[0] < px0 <= new_dog[0]: - new_dog[0] = px0 - 1e-3 - elif old_dog[0] > px0 >= new_dog[0]: - new_dog[0] = px0 + 1e-3 - if old_dog[0] > px1 >= new_dog[0]: - new_dog[0] = px1 + 1e-3 - elif old_dog[0] < px1 <= new_dog[0]: - new_dog[0] = px1 - 1e-3 - self.dog_pos = new_dog.astype(np.float32) - return dog_dbg + # Wall clipping — matches Webots stone walls, except in the gate column + # where the south wall is absent. + nx = float(np.clip(nx, FIELD_X[0] + 0.2, FIELD_X[1] - 0.2)) + in_gate_col = PEN_X[0] <= nx <= PEN_X[1] + if in_gate_col: + ny = float(np.clip(ny, PEN_Y[0] + 0.2, FIELD_Y[1] - 0.2)) + else: + ny = float(np.clip(ny, FIELD_Y[0] + 0.2, FIELD_Y[1] - 0.2)) - def _flock_stats(self): - """Return (COM, radius, mean_dispersion) over active sheep.""" - active_mask = ~self.penned[:self.n_sheep] + self.sheep_x[i] = nx + self.sheep_y[i] = ny + self.sheep_h[i] = nh + + def _flock_metrics(self): + """(per-sheep mean distance to pen entry, max-radius). + + Using the per-sheep mean instead of CoM-distance ensures stragglers + keep contributing to the progress signal — the dog can't game the + shaping by herding the bulk of the flock and abandoning one + outlier (CoM moves toward pen, but mean-distance doesn't). 
+ """ + active_mask = ~self.sheep_penned if not active_mask.any(): - return self.PEN_CENTER.copy(), 0.0, 0.0 - pts = self.sheep_pos[:self.n_sheep][active_mask] - com = pts.mean(axis=0) - dists = np.linalg.norm(pts - com, axis=1) - return com, float(dists.max()), float(dists.mean()) - - def _obs(self) -> np.ndarray: - com, radius, _ = self._flock_stats() - active_mask = ~self.penned[:self.n_sheep] - - if active_mask.any(): - pts = self.sheep_pos[:self.n_sheep][active_mask] - dists = np.linalg.norm(pts - com, axis=1) - sorted_idx = np.argsort(dists)[::-1] # farthest first - # Top-3 stragglers; pad with COM when fewer active sheep exist - def nth(n): - return pts[sorted_idx[n]] if len(sorted_idx) > n else com - far1, far2, far3 = nth(0), nth(1), nth(2) + return 0.0, 0.0 + xs = self.sheep_x[active_mask] + ys = self.sheep_y[active_mask] + per_sheep_d = np.hypot(xs - PEN_ENTRY[0], ys - PEN_ENTRY[1]) + d_pen = float(per_sheep_d.mean()) + com_x, com_y = float(xs.mean()), float(ys.mean()) + if active_mask.sum() == 1: + radius = 0.0 else: - far1 = far2 = far3 = self.PEN_CENTER.copy() + radius = float(np.hypot(xs - com_x, ys - com_y).max()) + return d_pen, radius - S = self.FIELD - D = 2 * self.FIELD + def _compute_reward(self, d_pen: float, radius: float, action=None) -> float: + """Sparse + per-sheep distance shaping + Strömbom imitation. - # far1/far2/far3 expressed relative to COM, not dog. - # For 1 sheep: far1-COM = far2-COM = far3-COM = [0,0] → cleanly ignorable. - # For 3+ sheep: non-zero vectors tell the dog where each straggler is - # within the group, without conflicting with weights trained on 1 sheep. - # Pen reference for the policy. Aligned with the reward target so the - # policy isn't forced to learn an implicit offset between what it sees - # ("pen is here") and what it's rewarded for ("get sheep close to here"). 
- pen_ref = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER - return np.array([ - self.dog_pos[0] / S, self.dog_pos[1] / S, - (com[0] - self.dog_pos[0]) / D, (com[1] - self.dog_pos[1]) / D, - (far1[0] - com[0]) / D, (far1[1] - com[1]) / D, - (far2[0] - com[0]) / D, (far2[1] - com[1]) / D, - (far3[0] - com[0]) / D, (far3[1] - com[1]) / D, - (pen_ref[0] - com[0]) / D, (pen_ref[1] - com[1]) / D, - (pen_ref[0] - far1[0]) / D, (pen_ref[1] - far1[1]) / D, - radius / D, - active_mask.sum() / self.n_sheep, - float(np.cos(self.dog_heading)), - float(np.sin(self.dog_heading)), - ], dtype=np.float32) + d_pen is the *mean* distance over active sheep, so progress only + accrues when ALL active sheep get closer to the pen on average — + the dog can't farm it by herding one sheep while ignoring others. - def _reward(self, n_penned: int, newly_penned: int, action: np.ndarray): - active = ~self.penned[:self.n_sheep] + The imitation term is computed by querying Strömbom for the + recommended action at the *current* (post-step) state and + rewarding cosine similarity with what the policy actually did. + """ + n_penned = int(self.sheep_penned.sum()) + delta_pen = n_penned - self.prev_n_penned - # Per-sheep progress toward pen: fires whenever any sheep moves closer. - # Naturally rewards keeping the flock together and pushing toward pen: - # dog behind flock → all sheep flee toward pen → all contribute positive reward. - # Dog from wrong side → sheep scatter away from pen → negative reward. 
- target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER - if active.any(): - pen_dists = np.linalg.norm( - self.sheep_pos[:self.n_sheep][active] - target, axis=1 - ) - cur_sum = float(pen_dists.sum()) - r_progress = (self._prev_pen_dist_sum - cur_sum) * self.W_PER_SHEEP - self._prev_pen_dist_sum = cur_sum - else: - r_progress = 0.0 + d_progress = max(-5.0, min(5.0, self.prev_d_pen - d_pen)) + r = self.W_PEN_DELTA * delta_pen + self.W_PROGRESS * d_progress - com, _, _ = self._flock_stats() - com_dist = float(np.linalg.norm(com - target)) - d_dog_com = float(np.linalg.norm(self.dog_pos - com)) - if d_dog_com > 0.1 and com_dist > 0.1: - pen_dir = (target - com) / com_dist - dog_dir = (self.dog_pos - com) / d_dog_com - cosine = -float(np.dot(pen_dir, dog_dir)) - if self.ALIGN_SHAPE == "standoff": - IDEAL = 0.5 * (self.SEPARATION_DIST + self.FLEE_DIST) - HALF = self.FLEE_DIST - IDEAL - proximity = max(0.0, 1.0 - abs(d_dog_com - IDEAL) / HALF) - else: # "near" - proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST) - move_gate = (min(1.0, float(np.linalg.norm(action))) - if self.ALIGN_GATED else 1.0) - alignment = cosine * proximity * move_gate * self.W_ALIGN - else: - alignment = 0.0 + if action is not None and self.W_IMITATE > 0.0: + positions = { + f"s{i}": (float(self.sheep_x[i]), float(self.sheep_y[i])) + for i in range(self.n_sheep) if not self.sheep_penned[i] + } + if positions: + sx, sy, _mode = strombom_action( + (self.dog_x, self.dog_y), positions, PEN_ENTRY, + ) + a_norm = math.hypot(float(action[0]), float(action[1])) + s_norm = math.hypot(sx, sy) + if a_norm > 1e-3 and s_norm > 1e-3: + cos_sim = (float(action[0]) * sx + float(action[1]) * sy) / (a_norm * s_norm) + r += self.W_IMITATE * cos_sim - # Wall-touch penalty: distance-based gradient covering ALL solid surfaces - # the sheep can hit — the four field outer walls (always present) plus - # the three solid pen walls (west, east, south). 
Linearly ramps from 0 - # at buffer edge to W_WALL_TOUCH at the wall surface. Goal: sheep should - # never end up pinned against any wall (transfer concern: Webots fences - # have pillars that can physically trap sheep). - if self.W_WALL_TOUCH and active.any(): - pts = self.sheep_pos[:self.n_sheep][active] - px0, px1 = self.PEN_X - py0, py1 = self.PEN_Y - F = self.FIELD - buf = self.WALL_TOUCH_BUFFER - far = buf + 1.0 - # Field outer walls — sheep is always inside [-F, F]^2. - d_fw = pts[:, 0] - (-F) # distance to west field wall - d_fe = F - pts[:, 0] # east field wall - d_fs = pts[:, 1] - (-F) # south field wall - d_fn = F - pts[:, 1] # north field wall - # Pen W/E/S walls — only relevant approached from outside. - d_pw = np.where((pts[:, 0] < px0) & (pts[:, 1] > py0) & (pts[:, 1] < py1), - px0 - pts[:, 0], far) - d_pe = np.where((pts[:, 0] > px1) & (pts[:, 1] > py0) & (pts[:, 1] < py1), - pts[:, 0] - px1, far) - d_ps = np.where((pts[:, 1] < py0) & (pts[:, 0] > px0) & (pts[:, 0] < px1), - py0 - pts[:, 1], far) - d_min = np.minimum.reduce([d_fw, d_fe, d_fs, d_fn, d_pw, d_pe, d_ps]) - penalties = np.maximum(0.0, 1.0 - d_min / buf) * self.W_WALL_TOUCH - r_wall_touch = -float(penalties.sum()) - else: - r_wall_touch = 0.0 + return float(r) - # South penalty: discourage active sheep from drifting below the pen - # entrance (y < PEN_Y[1]) while OUTSIDE the pen's x-range. Sheep at - # y<-8 with x∈[PEN_X] are entering through the gate — that's desired. - # The dead zone is y<-8 and x outside [PEN_X]: stuck against pen walls, - # must reverse direction (north) to reach the entrance — hard to recover. 
- if self.W_SOUTH and active.any(): - pts = self.sheep_pos[:self.n_sheep][active] - depth = np.maximum(0.0, self.PEN_Y[1] - pts[:, 1]) - outside_pen_x = (pts[:, 0] < self.PEN_X[0]) | (pts[:, 0] > self.PEN_X[1]) - r_south = -float((depth * outside_pen_x).sum()) * self.W_SOUTH - else: - r_south = 0.0 - - # Compactness shaping: reward decreases in flock radius (active sheep only) - if self.W_COMPACT and active.any(): - cur_radius = float(np.linalg.norm( - self.sheep_pos[:self.n_sheep][active] - com, axis=1 - ).max()) - r_compact = (self._prev_radius - cur_radius) * self.W_COMPACT - self._prev_radius = cur_radius - else: - r_compact = 0.0 - - r_pen_bonus = newly_penned * self.W_PEN_BONUS - r_step_cost = -self.W_STEP_COST - r_complete = self.W_COMPLETE if n_penned == self.n_sheep else 0.0 - reward = (r_progress + alignment + r_south + r_compact + r_wall_touch - + r_pen_bonus + r_step_cost + r_complete) - rcomps = { - "progress": float(r_progress), - "alignment": float(alignment), - "south": float(r_south), - "compact": float(r_compact), - "wall_touch": float(r_wall_touch), - "pen_bonus": float(r_pen_bonus), - "step_cost": float(r_step_cost), - "complete": float(r_complete), - } - return reward, rcomps - - def _step_sheep(self, i: int, sub_dt: float) -> np.ndarray: - """Apply one sub-step of boid dynamics to sheep i (mirrors sheep.py).""" - old_pos = self.sheep_pos[i].copy() - pos = old_pos.copy() - fx, fy = 0.0, 0.0 - if self.penned[i]: - pm = 0.8 # PEN_MARGIN in sheep.py - px0, px1 = self.PEN_X - py0, py1 = self.PEN_Y - x, y = float(pos[0]), float(pos[1]) - if x < px0 + pm: fx += ((px0 + pm - x) / pm) * 15.0 - if x > px1 - pm: fx -= ((x - (px1 - pm)) / pm) * 15.0 - if y < py0 + pm: fy += ((py0 + pm - y) / pm) * 15.0 - if y > py1 - pm: fy -= ((y - (py1 - pm)) / pm) * 15.0 - - for j in range(self.n_sheep): - if j == i or not self.penned[j]: - continue - dv = self._delayed_sheep_pos[j] - pos - dj = float(np.linalg.norm(dv)) - if 0.05 < dj < self.SEPARATION_DIST: - push = 
(self.SEPARATION_DIST - dj) / dj - fx -= (dv[0] / dj) * push * 2.5 - fy -= (dv[1] / dj) * push * 2.5 - - if self.np_random.random() < 0.02: - self.wander_ang[i] += float(self.np_random.uniform(-0.6, 0.6)) - fx += float(np.cos(self.wander_ang[i])) * 0.5 - fy += float(np.sin(self.wander_ang[i])) * 0.5 - - force = np.array([fx, fy], dtype=np.float32) - mag = float(np.linalg.norm(force)) - if mag > 0.01: - target_heading = float(np.arctan2(fy, fx)) - speed_rad = max(3.0, min(20.0, mag * 3.0)) - pos = self._sheep_drive(i, target_heading, speed_rad, sub_dt) - pos = np.clip(pos, -self.FIELD, self.FIELD) - return pos.astype(np.float32) - - fleeing = False - - # Flee from dog — quadratic ramp - diff = self.dog_pos - pos - dist = float(np.linalg.norm(diff)) - if 0.01 < dist < self.FLEE_DIST: - t = 1.0 - dist / self.FLEE_DIST - s = t * t * 20.0 - fx -= (diff[0] / dist) * s - fy -= (diff[1] / dist) * s - fleeing = True - - # Repel unpenned sheep from pen side-wall exteriors (sheep.py PEN_EXT_MARGIN). 
- if self.PEN_Y[0] < pos[1] < self.PEN_Y[1]: - pem = 0.8 - if self.PEN_X[0] - pem < pos[0] < self.PEN_X[0]: - fx -= ((pos[0] - (self.PEN_X[0] - pem)) / pem) * 6.0 - if self.PEN_X[1] < pos[0] < self.PEN_X[1] + pem: - fx += ((self.PEN_X[1] + pem - pos[0]) / pem) * 6.0 - - # Separation (inverse-distance) + Cohesion — uses delayed peer positions - cx, cy, cn = 0.0, 0.0, 0 - for j in range(self.n_sheep): - if j == i or self.penned[j]: - continue - dv = self._delayed_sheep_pos[j] - pos - dj = float(np.linalg.norm(dv)) - if 0.3 < dj < self.COHESION_DIST: - cx += self._delayed_sheep_pos[j][0] - cy += self._delayed_sheep_pos[j][1] - cn += 1 - if 0.05 < dj < self.SEPARATION_DIST: - push = (self.SEPARATION_DIST - dj) / dj - fx -= (dv[0] / dj) * push * 2.5 - fy -= (dv[1] / dj) * push * 2.5 - if cn > 0: - w = 0.08 if fleeing else 0.15 - fx += (cx / cn - pos[0]) * w - fy += (cy / cn - pos[1]) * w - - # Wall avoidance - m, F = self.WALL_MARGIN, self.SHEEP_WALL_INNER - if pos[0] < -F + m: fx += ((-F + m - pos[0]) / m) * 6.0 - if pos[0] > F - m: fx -= ((pos[0] - (F - m)) / m) * 6.0 - if pos[1] < -F + m: fy += ((-F + m - pos[1]) / m) * 6.0 - if pos[1] > F - m: fy -= ((pos[1] - (F - m)) / m) * 6.0 - - - # Hard-stop clamp: mirrors sheep.py — zero any force driving further - # into the wall within 0.5 m so the flee force cannot pin the sheep. 
- HS = 0.5 - if pos[0] < -F + HS and fx < 0: fx = 0.0 - if pos[0] > F - HS and fx > 0: fx = 0.0 - if pos[1] < -F + HS and fy < 0: fy = 0.0 - if pos[1] > F - HS and fy > 0: fy = 0.0 - - # Wander — suppressed while fleeing - if not fleeing: - if self.np_random.random() < 0.02: - self.wander_ang[i] += float(self.np_random.uniform(-0.6, 0.6)) - fx += float(np.cos(self.wander_ang[i])) * 0.5 - fy += float(np.sin(self.wander_ang[i])) * 0.5 - - # Integrate via differential-drive (mirrors sheep.py speed mapping + drive()) - force = np.array([fx, fy]) - mag = float(np.linalg.norm(force)) - if mag > 0.01: - target_heading = float(np.arctan2(fy, fx)) - speed_rad = max(3.0, min(20.0, mag * 3.0)) # sheep.py line 229 - pos = self._sheep_drive(i, target_heading, speed_rad, sub_dt) - pos = np.clip(pos, -self.FIELD, self.FIELD) - - # Pen solid wall collision — mirrors Webots geometry. - px0, px1 = self.PEN_X[0], self.PEN_X[1] - py0, py1 = self.PEN_Y[0], self.PEN_Y[1] - entered_from_north = ( - old_pos[1] >= py1 and pos[1] < py1 and px0 < pos[0] < px1 + def _build_obs(self) -> np.ndarray: + sheep_xy_list = list(zip(self.sheep_x.tolist(), self.sheep_y.tolist())) + sheep_penned_list = self.sheep_penned.tolist() + return build_obs( + (self.dog_x, self.dog_y), self.dog_heading, + sheep_xy_list, sheep_penned_list, + n_max=self._max_n_sheep, ) - if not entered_from_north: - # Block crossing through west wall from outside - if old_pos[0] < px0 <= pos[0] and py0 < pos[1] < py1: - pos = np.array([px0 - 1e-3, pos[1]], dtype=np.float32) - # Block crossing through east wall from outside - if old_pos[0] > px1 >= pos[0] and py0 < pos[1] < py1: - pos = np.array([px1 + 1e-3, pos[1]], dtype=np.float32) - # Block crossing through south wall from outside - if old_pos[1] < py0 <= pos[1] and px0 < pos[0] < px1: - pos = np.array([pos[0], py0 - 1e-3], dtype=np.float32) - - return pos.astype(np.float32) diff --git a/training/parity_test.py b/training/parity_test.py index e54fb74..57b6c9d 100644 --- 
a/training/parity_test.py +++ b/training/parity_test.py @@ -1,318 +1,96 @@ -""" -Parity test: verify 2D training env matches Webots controller implementations. +"""Parity smoke-test for the herding env. -Tests: -1. Observation building: HerdingEnv._obs() vs shepherd_dog_rl.build_obs() -2. Dog drive: HerdingEnv._step_dog_substep() vs shepherd_dog_rl.drive() math -3. Sheep drive: HerdingEnv._sheep_drive() vs sheep.py drive() math +Verifies (a) all imports resolve, (b) the env's reset/step contract is +correct, (c) deterministic seeds give deterministic trajectories, and +(d) the Strömbom baseline can drive the env without crashing. + +Run:: + + python -m training.parity_test """ -import sys +from __future__ import annotations + import os -import math +import sys + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + import numpy as np -# Make imports work from project root -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -sys.path.insert(0, os.path.join(os.path.dirname(__file__))) -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "controllers", "shepherd_dog_rl")) - -from herding_env import HerdingEnv - -# Re-implement the Webots functions standalone (no Webots dependency) -FIELD = 15.0 -PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32) -PEN_ENTRY = np.array([11.5, -8.0], dtype=np.float32) -PEN_X = (10.0, 13.0) -PEN_Y = (-15.0, -8.0) -ENTRY_AWARE = True +from herding.geometry import MAX_SHEEP, PEN_ENTRY +from herding.obs import OBS_DIM +from herding.strombom import compute_action +from training.herding_env import HerdingEnv -def webots_build_obs(dog_pos, sheep_positions, n_sheep, dog_heading): - """Standalone version of shepherd_dog_rl.py build_obs().""" - D = 2 * FIELD - active_pos = np.array( - [p for p in sheep_positions - if not (PEN_X[0] < p[0] < PEN_X[1] and PEN_Y[0] < p[1] < PEN_Y[1])], - 
dtype=np.float32 - ) - n_active = len(active_pos) - if n_active > 0: - com = active_pos.mean(axis=0) - d_from_com = np.linalg.norm(active_pos - com, axis=1) - sorted_idx = np.argsort(d_from_com)[::-1] - radius = float(d_from_com[sorted_idx[0]]) - def nth(n): - return active_pos[sorted_idx[n]] if len(sorted_idx) > n else com - far1, far2, far3 = nth(0), nth(1), nth(2) - else: - com = PEN_CENTER.copy() - radius = 0.0 - far1 = far2 = far3 = PEN_CENTER.copy() - frac_active = n_active / max(n_sheep, 1) - pen_ref = PEN_ENTRY if ENTRY_AWARE else PEN_CENTER - return np.array([ - dog_pos[0] / FIELD, dog_pos[1] / FIELD, - (com[0] - dog_pos[0]) / D, (com[1] - dog_pos[1]) / D, - (far1[0] - com[0]) / D, (far1[1] - com[1]) / D, - (far2[0] - com[0]) / D, (far2[1] - com[1]) / D, - (far3[0] - com[0]) / D, (far3[1] - com[1]) / D, - (pen_ref[0] - com[0]) / D, (pen_ref[1] - com[1]) / D, - (pen_ref[0] - far1[0]) / D, (pen_ref[1] - far1[1]) / D, - radius / D, - frac_active, - math.cos(dog_heading), math.sin(dog_heading), - ], dtype=np.float32) +def test_obs_action_shapes(): + env = HerdingEnv(n_sheep=3, seed=0) + obs, info = env.reset() + assert obs.shape == (OBS_DIM,), obs.shape + assert obs.dtype == np.float32 + obs2, r, term, trunc, info = env.step(np.array([0.5, 0.0], dtype=np.float32)) + assert obs2.shape == (OBS_DIM,) + assert isinstance(r, float) + assert isinstance(term, bool) and isinstance(trunc, bool) + print("[ok] shapes") -def webots_dog_drive(heading, speed_ms, wheel_r=0.038, k_turn=4.0, - motor_max=70.0, axle_track=0.28): - """Standalone version of shepherd_dog_rl.py drive() kinematics. +def test_reset_determinism(): + """Reset with the same seed should give the same initial observation. - Returns (v_linear, omega, left_w, right_w). + We don't require step-determinism — PPO doesn't need it, and chasing + bit-exactness through the flocking jitter isn't worth the complexity. 
""" - err = math.atan2(math.sin(heading), math.cos(heading)) - fwd_ms = speed_ms * max(0.0, math.cos(err)) - fwd_rad = fwd_ms / wheel_r - turn = k_turn * err - l = max(-motor_max, min(motor_max, fwd_rad - turn)) - r = max(-motor_max, min(motor_max, fwd_rad + turn)) - v = wheel_r * 0.5 * (r + l) - w = (wheel_r / axle_track) * (r - l) - return v, w, l, r + env_a = HerdingEnv(n_sheep=3, seed=42) + env_b = HerdingEnv(n_sheep=3, seed=42) + obs_a, _ = env_a.reset(seed=42) + obs_b, _ = env_b.reset(seed=42) + assert np.allclose(obs_a, obs_b), "Reset is non-deterministic for same seed" + print("[ok] reset determinism") -def webots_sheep_drive(heading, speed_rad, wheel_r=0.031, k_turn=4.0, - motor_max=22.0, axle_track=0.20): - """Standalone version of sheep.py drive() kinematics.""" - err = math.atan2(math.sin(heading), math.cos(heading)) - fwd = speed_rad * max(0.0, math.cos(err)) - k = 4.0 - l = max(-motor_max, min(motor_max, fwd - k * err)) - r = max(-motor_max, min(motor_max, fwd + k * err)) - v = wheel_r * 0.5 * (r + l) - w = (wheel_r / axle_track) * (r - l) - return v, w, l, r +def test_curriculum_n_sheep_varies(): + env = HerdingEnv(seed=0) + sizes = set() + for _ in range(40): + _, info = env.reset() + sizes.add(info["n_sheep"]) + assert 1 in sizes + assert max(sizes) <= MAX_SHEEP + print(f"[ok] curriculum sampling — saw n_sheep in {sorted(sizes)}") -def test_obs_parity(): - """Test that build_obs matches between 2D env and Webots controller.""" - print("=== Test 1: Observation Parity ===") - env = HerdingEnv(n_sheep=3) - # Set ENTRY_AWARE to match our webots constant - env.ENTRY_AWARE = ENTRY_AWARE - env.reset(seed=42) - - # Manually set positions for a controlled test - env.dog_pos = np.array([5.0, 3.0], dtype=np.float32) - env.dog_heading = 1.2 - env.sheep_pos[0] = np.array([0.0, 0.0], dtype=np.float32) - env.sheep_pos[1] = np.array([2.0, -1.0], dtype=np.float32) - env.sheep_pos[2] = np.array([11.5, -11.5], dtype=np.float32) # penned - env.penned[0] = False - 
env.penned[1] = False - env.penned[2] = True - - obs_2d = env._obs() - - # Build equivalent Webots observation - sheep_positions = [ - env.sheep_pos[0].tolist(), - env.sheep_pos[1].tolist(), - env.sheep_pos[2].tolist(), - ] - obs_webots = webots_build_obs( - env.dog_pos, sheep_positions, 3, env.dog_heading - ) - - max_diff = float(np.max(np.abs(obs_2d - obs_webots))) - print(f" Max element-wise diff: {max_diff:.2e}") - if max_diff < 1e-6: - print(" PASS: Observations match") - else: - print(" FAIL: Observations differ!") - for i in range(18): - if abs(obs_2d[i] - obs_webots[i]) > 1e-6: - print(f" dim {i}: 2d={obs_2d[i]:.6f} webots={obs_webots[i]:.6f}") - return max_diff < 1e-6 +def test_strombom_drives_env(): + """Quick functional check that the analytic baseline can play the env + without exploding. Not a success-rate test — just no errors / NaNs.""" + env = HerdingEnv(n_sheep=2, max_steps=400, seed=1) + obs, _ = env.reset() + for t in range(400): + positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i])) + for i in range(env.n_sheep) + if not env.sheep_penned[i]} + if not positions: + break + vx, vy, _mode = compute_action((env.dog_x, env.dog_y), positions, PEN_ENTRY) + obs, r, term, trunc, info = env.step(np.array([vx, vy], dtype=np.float32)) + assert np.isfinite(obs).all(), f"NaN/Inf in obs at step {t}" + assert np.isfinite(r), f"NaN reward at step {t}" + if term or trunc: + break + print(f"[ok] strombom rollout — final n_penned={int(env.sheep_penned.sum())}/{env.n_sheep} after {env.steps} steps") -def test_dog_drive_parity(): - """Test that dog diff-drive matches Webots controller.""" - print("\n=== Test 2: Dog Drive Parity ===") - env = HerdingEnv(n_sheep=1) - env.reset(seed=42) - - all_pass = True - test_cases = [ - # (heading_error, speed_ms) — target_heading relative to current heading - (0.0, 2.5), # aligned, full speed - (0.5, 2.5), # 30deg error - (1.5, 2.5), # ~86deg error - (3.14, 2.5), # ~180deg error — should spin in place - (0.0, 0.5), 
# aligned, slow - (0.3, 1.0), # small error, medium speed - ] - - for heading_err, speed_ms in test_cases: - env.dog_heading = 0.0 - target_heading = heading_err - action = np.array([ - math.cos(target_heading), math.sin(target_heading) - ], dtype=np.float32) * (speed_ms / env.DOG_SPEED) - - # 2D env step - dbg = env._step_dog_substep(action, 0.016) - v_2d = dbg["v"] - w_2d = dbg["w"] - l_2d = dbg["left_w"] - r_2d = dbg["right_w"] - - # Webots equivalent - v_w, w_w, l_w, r_w = webots_dog_drive(heading_err, speed_ms) - - diffs = { - "v": abs(v_2d - v_w), - "w": abs(w_2d - w_w), - "left": abs(l_2d - l_w), - "right": abs(r_2d - r_w), - } - max_diff = max(diffs.values()) - ok = max_diff < 1e-6 - status = "PASS" if ok else "FAIL" - print(f" err={heading_err:.2f} spd={speed_ms:.1f}: {status} (max_diff={max_diff:.2e})") - if not ok: - for k, d in diffs.items(): - if d > 1e-6: - print(f" {k}: 2d={eval(k+'_2d'):.6f} webots={eval(k+'_w'):.6f}") - all_pass = False - - return all_pass - - -def test_sheep_drive_parity(): - """Test that sheep diff-drive matches Webots sheep controller.""" - print("\n=== Test 3: Sheep Drive Parity ===") - env = HerdingEnv(n_sheep=1) - env.reset(seed=42) - - all_pass = True - test_cases = [ - # (heading_error, speed_rad) - (0.0, 20.0), # aligned, flee speed - (0.0, 3.0), # aligned, wander speed - (0.5, 15.0), # moderate error - (1.57, 10.0), # 90deg — should spin in place - (3.14, 20.0), # 180deg — should spin in place fast - (0.2, 8.0), # small error, medium speed - ] - - for heading_err, speed_rad in test_cases: - env.sheep_heading[0] = 0.0 - env.sheep_pos[0] = np.array([0.0, 0.0], dtype=np.float32) - target_heading = heading_err - - # 2D env - new_pos = env._sheep_drive(0, target_heading, speed_rad, 0.016) - v_2d_raw = float(np.linalg.norm(new_pos - np.array([0.0, 0.0]))) / 0.016 - # Re-derive v, w from the internal state - heading_2d = env.sheep_heading[0] - - # Webots equivalent - v_w, w_w, l_w, r_w = webots_sheep_drive(heading_err, 
speed_rad) - - # For 2D, compute the same intermediate values - err_2d = (target_heading - 0.0 + np.pi) % (2 * np.pi) - np.pi - fwd_2d = speed_rad * max(0.0, math.cos(err_2d)) - turn_2d = 4.0 * err_2d - l_2d = max(-22.0, min(22.0, fwd_2d - turn_2d)) - r_2d = max(-22.0, min(22.0, fwd_2d + turn_2d)) - - diffs = { - "left": abs(l_2d - l_w), - "right": abs(r_2d - r_w), - } - max_diff = max(diffs.values()) - ok = max_diff < 1e-6 - status = "PASS" if ok else "FAIL" - print(f" err={heading_err:.2f} spd={speed_rad:.1f}: {status} (max_diff={max_diff:.2e})") - if not ok: - for k, d in diffs.items(): - if d > 1e-6: - print(f" {k}: 2d={l_2d if k=='left' else r_2d:.6f} webots={l_w if k=='left' else r_w:.6f}") - all_pass = False - - return all_pass - - -def test_full_trajectory_parity(): - """Test that running identical actions produces matching trajectories.""" - print("\n=== Test 4: Full Trajectory Parity (dog only) ===") - # Run 50 steps with a fixed action, compare dog heading/position - # at each step between 2D env kinematics and pure Webots kinematics. 
- env = HerdingEnv(n_sheep=1) - env.reset(seed=42) - env.dog_pos = np.array([0.0, 0.0], dtype=np.float32) - env.dog_heading = 0.0 - env.ENTRY_AWARE = ENTRY_AWARE - - action = np.array([0.8, -0.6], dtype=np.float32) # magnitude 1.0 - dt = 0.016667 # sub_dt - - # Webots-side tracking - wb_heading = 0.0 - wb_x, wb_y = 0.0, 0.0 - - max_heading_diff = 0.0 - max_pos_diff = 0.0 - - for step in range(50): - # 2D env sub-step - env._step_dog_substep(action, dt) - - # Webots-side computation - speed_ms = 1.0 * 2.5 - target_heading = math.atan2(-0.6, 0.8) - err = math.atan2(math.sin(target_heading - wb_heading), - math.cos(target_heading - wb_heading)) - fwd_ms = speed_ms * max(0.0, math.cos(err)) - fwd_rad = fwd_ms / 0.038 - turn = 4.0 * err - l = max(-70.0, min(70.0, fwd_rad - turn)) - r = max(-70.0, min(70.0, fwd_rad + turn)) - v = 0.038 * 0.5 * (r + l) - w = (0.038 / 0.28) * (r - l) - wb_heading = math.atan2(math.sin(wb_heading + w * dt), - math.cos(wb_heading + w * dt)) - wb_x += math.cos(wb_heading) * v * dt - wb_y += math.sin(wb_heading) * v * dt - - heading_diff = abs(env.dog_heading - wb_heading) - pos_diff = math.hypot(env.dog_pos[0] - wb_x, env.dog_pos[1] - wb_y) - max_heading_diff = max(max_heading_diff, heading_diff) - max_pos_diff = max(max_pos_diff, pos_diff) - - print(f" Max heading diff over 50 steps: {max_heading_diff:.2e} rad") - print(f" Max position diff over 50 steps: {max_pos_diff:.2e} m") - ok = max_pos_diff < 1e-4 - print(f" {'PASS' if ok else 'FAIL'}: Trajectories match") - return ok +def main(): + test_obs_action_shapes() + test_reset_determinism() + test_curriculum_n_sheep_varies() + test_strombom_drives_env() + print("\nAll parity checks passed.") if __name__ == "__main__": - results = [] - results.append(("Obs parity", test_obs_parity())) - results.append(("Dog drive parity", test_dog_drive_parity())) - results.append(("Sheep drive parity", test_sheep_drive_parity())) - results.append(("Trajectory parity", test_full_trajectory_parity())) - - 
print("\n" + "=" * 50) - print("RESULTS") - print("=" * 50) - all_pass = True - for name, passed in results: - print(f" {name}: {'PASS' if passed else 'FAIL'}") - if not passed: - all_pass = False - print(f"\nOverall: {'ALL PASS' if all_pass else 'SOME FAILURES'}") - env.close() + main() diff --git a/training/requirements.txt b/training/requirements.txt index d192563..708f4cc 100644 --- a/training/requirements.txt +++ b/training/requirements.txt @@ -1,6 +1,8 @@ -gymnasium>=0.29 -stable-baselines3>=2.3 -torch>=2.2 -numpy>=1.26 -matplotlib>=3.8 -tensorboard>=2.16 +# Pin major versions; SB3 2.x requires gymnasium and torch >= 1.13. +gymnasium>=0.29,<2.0 +stable-baselines3[extra]>=2.3,<3.0 +torch>=2.1 +numpy>=1.24 +pyyaml>=6.0 +tensorboard>=2.14 +tqdm>=4.66 diff --git a/training/runs/.gitkeep b/training/runs/.gitkeep index 8b13789..e69de29 100644 --- a/training/runs/.gitkeep +++ b/training/runs/.gitkeep @@ -1 +0,0 @@ - diff --git a/training/train.py b/training/train.py deleted file mode 100644 index 94a1f44..0000000 --- a/training/train.py +++ /dev/null @@ -1,392 +0,0 @@ -""" -PPO training for the herding task with curriculum learning. - -Trains from scratch through a 1→max_sheep curriculum, evaluates after each -stage, and auto-generates trajectory/timeseries plots plus a summary chart. 
- -Usage ------ - python train.py # defaults from config.json - python train.py --config my_config.json --max-sheep 5 - python train.py --max-sheep 3 --steps-per-stage 1000000 - -Outputs (in runs//): - config.json resolved config - final_model.zip trained PPO model - vecnorm.pkl VecNormalize statistics - stage_results.json per-stage evaluation metrics - success_rate.png summary bar chart - eval/ trajectory & timeseries plots per sheep count -""" - -import argparse -import json -import os -import time -from copy import deepcopy - -import numpy as np -from stable_baselines3 import PPO -from stable_baselines3.common.callbacks import BaseCallback -from stable_baselines3.common.vec_env import ( - DummyVecEnv, - SubprocVecEnv, - VecNormalize, -) - -from herding_env import HerdingEnv -from viz import ( - run_and_record, - plot_trajectory, - plot_timeseries, - plot_success_rate, - save_episode_gif, -) - - -# ── Callbacks ──────────────────────────────────────────────────────────────── - -class ProgressCallback(BaseCallback): - """One-line progress summary every `freq` env steps.""" - - def __init__(self, stage_label: str, freq: int = 100_000): - super().__init__() - self.stage_label = stage_label - self.freq = freq - self._last = 0 - self._ep_returns = [] - self._ep_success = [] - self._total_eps = 0 - self._total_success = 0 - self._cur_ret = None - - def _on_step(self) -> bool: - rewards = self.locals.get("rewards") - dones = self.locals.get("dones") - infos = self.locals.get("infos", []) - if rewards is None or dones is None: - return True - if self._cur_ret is None or len(self._cur_ret) != len(rewards): - self._cur_ret = np.zeros(len(rewards), dtype=np.float64) - self._cur_ret += np.asarray(rewards, dtype=np.float64) - for i, d in enumerate(dones): - if not d: - continue - self._ep_returns.append(float(self._cur_ret[i])) - info = infos[i] if i < len(infos) else {} - success = int(info.get("n_penned", 0) == info.get("n_sheep", -1)) - self._ep_success.append(success) - 
self._total_eps += 1 - self._total_success += success - self._cur_ret[i] = 0.0 - if len(self._ep_returns) > 50: - self._ep_returns.pop(0) - self._ep_success.pop(0) - if self.num_timesteps - self._last >= self.freq: - self._last = self.num_timesteps - n = len(self._ep_returns) - mean_r = float(np.mean(self._ep_returns)) if n else float("nan") - win_sr = float(np.mean(self._ep_success)) if n else float("nan") - cum_sr = (self._total_success / self._total_eps - if self._total_eps else float("nan")) - print(f" ... [{self.stage_label} | " - f"{self.num_timesteps:>7,} steps | " - f"ret(last {n})={mean_r:+.2f} " - f"win_sr={win_sr*100:.0f}% cum_sr={cum_sr*100:.0f}%]", - flush=True) - return True - - -# ── Environment factory ────────────────────────────────────────────────────── - -def make_env(n_sheep, seed, max_steps, reward_cfg=None): - def _init(): - env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, - reward_cfg=reward_cfg) - env.reset(seed=seed) - return env - return _init - - -# ── Failure-mode classification ────────────────────────────────────────────── - -COMPACT_RADIUS = 5.0 - - -def _classify(ep_radii, ep_com_dists, n_penned, n_sheep): - if n_penned == n_sheep: - return "SUCCESS" - if min(ep_radii) > COMPACT_RADIUS: - return "NEVER_COMPACT" - first = next(i for i, r in enumerate(ep_radii) if r <= COMPACT_RADIUS) - if min(ep_com_dists[first:]) > 3.0: - return "COMPACT_CANT_DRIVE" - if n_penned == 0: - return "DROVE_NO_SHEEP" - return f"PARTIAL_{n_penned}of{n_sheep}" - - -# ── Evaluation ─────────────────────────────────────────────────────────────── - -def evaluate(model, vn_template, n_sheep, n_episodes, max_steps, - reward_cfg=None): - """Evaluate at a given sheep count; returns metrics dict.""" - raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, reward_cfg)]) - vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False) - vn.obs_rms = deepcopy(vn_template.obs_rms) - vn.ret_rms = deepcopy(vn_template.ret_rms) - - successes = 0 - ep_lens = 
[] - min_pen_list = [] - action_mags = [] - failure_counts = {} - rc_sums = {} - rc_n = 0 - - for _ in range(n_episodes): - obs = vn.reset() - done = False - steps = 0 - min_pen = float("inf") - mags = [] - ep_radii = [] - ep_com_dists = [] - while not done: - action, _ = model.predict(obs, deterministic=True) - obs, _, dones, infos = vn.step(action) - done = dones[0] - inner = vn.envs[0] - com, radius, _ = inner._flock_stats() - min_pen = min(min_pen, float(np.linalg.norm(com - inner.PEN_CENTER))) - mags.append(float(np.linalg.norm(action[0]))) - ep_radii.append(radius) - ep_com_dists.append(float(np.linalg.norm(com - inner.PEN_CENTER))) - steps += 1 - rc = infos[0].get("rcomps") - if rc: - for k, v in rc.items(): - rc_sums[k] = rc_sums.get(k, 0.0) + v - rc_n += 1 - n_penned = infos[0].get("n_penned", 0) - success = n_penned == n_sheep - successes += int(success) - ep_lens.append(steps) - min_pen_list.append(min_pen) - action_mags.extend(mags) - mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep) - failure_counts[mode] = failure_counts.get(mode, 0) + 1 - - vn.close() - - result = { - "sr": successes / n_episodes, - "mean_len": float(np.mean(ep_lens)), - "mean_min_pen": float(np.mean(min_pen_list)), - "mean_act": float(np.mean(action_mags)) if action_mags else 0.0, - "failure_modes": failure_counts, - } - if rc_n > 0: - result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()} - return result - - - -# ── CLI ────────────────────────────────────────────────────────────────────── - -DEFAULT_CONFIG = { - "W_PER_SHEEP": 2.0, - "W_ALIGN": 0.05, - "W_PEN_BONUS": 10.0, - "W_COMPLETE": 100.0, - "W_STEP_COST": 0.02, - "W_SOUTH": 0.01, - "W_COMPACT": 0.0, - "W_WALL_TOUCH": 0.04, - "WALL_TOUCH_BUFFER": 0.3, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": True, - "ENTRY_AWARE": True, - "ent_coef": 0.02, -} - - -def parse_args(): - p = argparse.ArgumentParser( - description="PPO training for herding task with curriculum learning") - p.add_argument("--config", 
type=str, default=None, - help="JSON config file (reward weights + ent_coef)") - p.add_argument("--max-sheep", type=int, default=10) - p.add_argument("--steps-per-stage", type=int, default=1_500_000) - p.add_argument("--n-envs", type=int, default=8) - p.add_argument("--max-steps", type=int, default=2500) - p.add_argument("--eval-episodes", type=int, default=30) - p.add_argument("--run-dir", type=str, default=None) - p.add_argument("--no-gif", action="store_true", - help="Skip per-stage GIF rendering (PNGs still produced).") - p.add_argument("--gif-fps", type=int, default=20) - p.add_argument("--gif-skip", type=int, default=3, - help="Keep every Nth frame (smaller GIF; default 3).") - return p.parse_args() - - -# ── Main ───────────────────────────────────────────────────────────────────── - -def main(): - args = parse_args() - - # Load config: --config overrides, else auto-load config.json if present - cfg = dict(DEFAULT_CONFIG) - config_path = args.config - if config_path is None and os.path.exists("config.json"): - config_path = "config.json" - if config_path: - with open(config_path) as f: - cfg.update(json.load(f)) - print(f"Config loaded from {config_path}") - - rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)} - - # Run directory - run_dir = args.run_dir or os.path.join( - "runs", time.strftime("%Y%m%d_%H%M%S")) - eval_dir = os.path.join(run_dir, "eval") - os.makedirs(eval_dir, exist_ok=True) - with open(os.path.join(run_dir, "config.json"), "w") as f: - json.dump(cfg, f, indent=2) - - print(f"Config: {cfg}") - print(f"Run dir: {run_dir}") - print(f"Curriculum: 1 → {args.max_sheep} sheep, " - f"{args.steps_per_stage:,} steps/stage\n") - - # Training envs - train_env = SubprocVecEnv([ - make_env(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg) - for i in range(args.n_envs) - ]) - vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, - clip_obs=10.0) - - # Model — force CPU (PPO with MLP runs faster on CPU than GPU; SB3 warns - # 
about this otherwise). - model = PPO( - "MlpPolicy", vn, - learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10, - gamma=0.995, gae_lambda=0.95, clip_range=0.2, - ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5, - policy_kwargs=dict(net_arch=[256, 256]), - device="cpu", - verbose=0, - ) - - # Curriculum training - stage_results = [] - t0 = time.time() - - try: - for n in range(1, args.max_sheep + 1): - if n == 1: - print(f"\n[Stage n_sheep=1] training {args.steps_per_stage:,} steps") - model.learn( - total_timesteps=args.steps_per_stage, - reset_num_timesteps=True, - callback=ProgressCallback("1 sheep", freq=100_000), - ) - else: - # Mixed transition: half envs stay at n-1, half advance to n, - # for the first half of the stage budget. This prevents the - # n+1 task's noisy early gradients from destroying the n policy - # (catastrophic forgetting) before it has a chance to adapt. - half = max(1, args.n_envs // 2) - for i in range(half): - vn.env_method("set_n_sheep", n - 1, indices=[i]) - for i in range(half, args.n_envs): - vn.env_method("set_n_sheep", n, indices=[i]) - mix_steps = args.steps_per_stage // 2 - full_steps = args.steps_per_stage - mix_steps - print(f"\n[Stage n_sheep={n}] mixed ({n-1}/{n} sheep) " - f"{mix_steps:,} steps") - model.learn( - total_timesteps=mix_steps, - reset_num_timesteps=False, - callback=ProgressCallback(f"{n-1}→{n} mix", freq=100_000), - ) - vn.env_method("set_n_sheep", n) - print(f"[Stage n_sheep={n}] full ({n} sheep) {full_steps:,} steps") - model.learn( - total_timesteps=full_steps, - reset_num_timesteps=False, - callback=ProgressCallback(f"{n} sheep", freq=100_000), - ) - - # Evaluate - print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps") - r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg) - print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% " - f"mean_len={r['mean_len']:.0f} " - f"mean_min_pen={r['mean_min_pen']:.1f}m " - f"mean_act={r['mean_act']:.2f}") - - # 
Failure-mode breakdown - if r["failure_modes"]: - modes = " ".join( - f"{k}={v}" for k, v in sorted( - r["failure_modes"].items(), key=lambda x: -x[1])) - print(f" failure modes: {modes}") - - # Reward breakdown - if "reward_per_step" in r: - rps = r["reward_per_step"] - print(f" reward/step: " + " ".join( - f"{k}={v:+.4f}" for k, v in rps.items())) - - # Episode visualisation: trajectory + timeseries + animated GIF - hist = run_and_record(model, vn, n, args.max_steps, rcfg, - seed=1000 + n) - tag = "success" if hist["success"] else "fail" - plot_trajectory( - hist, - os.path.join(eval_dir, f"traj_{n}s_{tag}.png")) - plot_timeseries( - hist, - os.path.join(eval_dir, f"ts_{n}s_{tag}.png")) - if not args.no_gif: - save_episode_gif( - hist, - os.path.join(eval_dir, f"ep_{n}s_{tag}.gif"), - fps=args.gif_fps, skip=args.gif_skip) - - r["n_sheep"] = n - stage_results.append(r) - - # Save artefacts - model.save(os.path.join(run_dir, "final_model")) - vn.save(os.path.join(run_dir, "vecnorm.pkl")) - with open(os.path.join(run_dir, "stage_results.json"), "w") as f: - json.dump(stage_results, f, indent=2) - - finally: - try: - vn.close() - except Exception: - pass - - # Summary - elapsed = (time.time() - t0) / 60 - print("\n" + "=" * 70) - print(" TRAINING SUMMARY") - print("=" * 70) - for r in stage_results: - print(f" n_sheep={r['n_sheep']} sr={r['sr']*100:>3.0f}% " - f"len={r['mean_len']:>5.0f} min_pen={r['mean_min_pen']:>5.1f}m " - f"act={r['mean_act']:.2f}") - print(f"\n Total time: {elapsed:.1f} min") - print(f" Artefacts: {run_dir}/") - - plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png")) - print(f" Plots: {run_dir}/success_rate.png, {eval_dir}/") - - -if __name__ == "__main__": - main() diff --git a/training/train_at.py b/training/train_at.py deleted file mode 100644 index 2289496..0000000 --- a/training/train_at.py +++ /dev/null @@ -1,412 +0,0 @@ -""" -PPO training with attention-based policy (train_at.py). 
- -Key difference from train.py ------------------------------ -- Observation exposes ALL sheep as individual per-sheep tokens rather than - only the top-3 farthest. The policy therefore has complete flock visibility - at any sheep count — no hidden sheep even at n=10. -- A TransformerFeaturesExtractor processes the sheep tokens with multi-head - self-attention (permutation-invariant), then mean-pools over valid tokens - and concatenates the result with global dog/pen features. -- Curriculum transition uses the same mixed-env approach as train.py: half - the envs stay at n-1 for the first half of each new stage to suppress - catastrophic forgetting. - -Observation layout (7 + MAX_SHEEP*6 = 67 dims, fixed) -------------------------------------------------------- - Global (7): - dog_x / FIELD, dog_y / FIELD, - cos(heading), sin(heading), - (pen_x - dog_x) / D, (pen_y - dog_y) / D, - n_active / n_sheep - - Per sheep i (6): - (sheep_x - dog_x) / D, (sheep_y - dog_y) / D, ← pos rel to dog - (pen_x - sheep_x) / D, (pen_y - sheep_y) / D, ← sheep-to-pen - is_active 1.0 if not penned, else 0.0 - is_valid 1.0 if i < n_sheep, else 0.0 (padding sentinel) - - After VecNormalize, is_valid for real sheep normalises > 0 and for - padding tokens < 0 (because mean ∈ (0,1)), so a threshold of 0 cleanly - separates real from padded without any extra bookkeeping. 
- -Usage ------ - python train_at.py # defaults from config.json - python train_at.py --max-sheep 10 --steps-per-stage 2000000 - python train_at.py --embed-dim 128 --n-heads 4 --n-layers 3 -""" - -import argparse -import json -import os -import time -from copy import deepcopy - -import numpy as np -import torch -import torch.nn as nn -from gymnasium import spaces -from stable_baselines3 import PPO -from stable_baselines3.common.torch_layers import BaseFeaturesExtractor -from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize - -from herding_env import HerdingEnv -from train import ProgressCallback, _classify, COMPACT_RADIUS, DEFAULT_CONFIG -from viz import ( - run_and_record, plot_trajectory, plot_timeseries, - plot_success_rate, save_episode_gif, -) - - -# ── Per-sheep token observation environment ─────────────────────────────────── - -class HerdingEnvAt(HerdingEnv): - """ - HerdingEnv with a per-sheep token observation for the attention policy. - Everything else (dynamics, reward, curriculum interface) is inherited. 
- """ - - OBS_GLOBAL = 7 - OBS_SHEEP = 6 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - obs_dim = self.OBS_GLOBAL + self.MAX_SHEEP * self.OBS_SHEEP - self.observation_space = spaces.Box( - low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32 - ) - - def _obs(self) -> np.ndarray: - S = self.FIELD - D = 2.0 * self.FIELD - pen_ref = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER - active_mask = ~self.penned[:self.n_sheep] - n_active = int(active_mask.sum()) - - global_feats = np.array([ - self.dog_pos[0] / S, - self.dog_pos[1] / S, - float(np.cos(self.dog_heading)), - float(np.sin(self.dog_heading)), - (pen_ref[0] - self.dog_pos[0]) / D, - (pen_ref[1] - self.dog_pos[1]) / D, - n_active / max(self.n_sheep, 1), - ], dtype=np.float32) - - sheep_feats = np.zeros((self.MAX_SHEEP, self.OBS_SHEEP), dtype=np.float32) - for i in range(self.n_sheep): - pos = self.sheep_pos[i] - sheep_feats[i] = [ - (pos[0] - self.dog_pos[0]) / D, - (pos[1] - self.dog_pos[1]) / D, - (pen_ref[0] - pos[0]) / D, - (pen_ref[1] - pos[1]) / D, - float(not self.penned[i]), - 1.0, # is_valid: this sheep exists - ] - # i >= n_sheep: all zeros, is_valid=0 → masked out in attention - - return np.concatenate([global_feats, sheep_feats.ravel()]) - - -# ── Attention features extractor ────────────────────────────────────────────── - -class ShepherdAttentionExtractor(BaseFeaturesExtractor): - """ - Multi-head self-attention over per-sheep tokens, mean-pooled over valid - (non-padding) tokens and concatenated with global dog/pen features. - - After VecNormalize: - real sheep → is_valid_norm > 0 (normalised from 1.0) - padding → is_valid_norm ≤ 0 (normalised from 0.0) - so threshold at 0 is always correct regardless of curriculum stage. 
- """ - - GLOBAL_DIM = HerdingEnvAt.OBS_GLOBAL # 7 - SHEEP_DIM = HerdingEnvAt.OBS_SHEEP # 6 - MAX_SHEEP = HerdingEnv.MAX_SHEEP # 10 - VALID_IDX = 5 # index of is_valid within each token - - def __init__(self, observation_space, embed_dim: int = 64, - n_heads: int = 4, n_layers: int = 2, ff_dim: int = 128): - super().__init__(observation_space, - features_dim=self.GLOBAL_DIM + embed_dim) - self.sheep_embed = nn.Linear(self.SHEEP_DIM, embed_dim) - encoder_layer = nn.TransformerEncoderLayer( - d_model=embed_dim, nhead=n_heads, dim_feedforward=ff_dim, - dropout=0.0, batch_first=True, - ) - self.transformer = nn.TransformerEncoder(encoder_layer, - num_layers=n_layers, - enable_nested_tensor=False) - - def forward(self, obs: torch.Tensor) -> torch.Tensor: - B = obs.shape[0] - global_feats = obs[:, :self.GLOBAL_DIM] # (B, 7) - tokens = obs[:, self.GLOBAL_DIM:].view( - B, self.MAX_SHEEP, self.SHEEP_DIM) # (B, 10, 6) - - # is_valid after VecNorm: real > 0, padding ≤ 0 - is_valid_norm = tokens[:, :, self.VALID_IDX] # (B, 10) - key_padding_mask = is_valid_norm <= 0.0 # True → ignore - - x = self.sheep_embed(tokens) # (B, 10, E) - x = self.transformer(x, src_key_padding_mask=key_padding_mask) - - valid_w = (is_valid_norm > 0.0).float().unsqueeze(-1) # (B, 10, 1) - pooled = (x * valid_w).sum(1) / valid_w.sum(1).clamp(min=1.0) - - return torch.cat([global_feats, pooled], dim=1) # (B, 7+E) - - -# ── Environment factory ─────────────────────────────────────────────────────── - -def make_env_at(n_sheep, seed, max_steps, reward_cfg=None): - def _init(): - env = HerdingEnvAt(n_sheep=n_sheep, max_steps=max_steps, - reward_cfg=reward_cfg) - env.reset(seed=seed) - return env - return _init - - -# ── Evaluation ──────────────────────────────────────────────────────────────── - -def evaluate_at(model, vn_template, n_sheep, n_episodes, max_steps, - reward_cfg=None): - raw = DummyVecEnv([make_env_at(n_sheep, 9999, max_steps, reward_cfg)]) - vn = VecNormalize(raw, norm_obs=True, 
norm_reward=False, training=False) - vn.obs_rms = deepcopy(vn_template.obs_rms) - vn.ret_rms = deepcopy(vn_template.ret_rms) - - successes = 0 - ep_lens, min_pen_list, action_mags = [], [], [] - failure_counts, rc_sums = {}, {} - rc_n = 0 - - for _ in range(n_episodes): - obs = vn.reset() - done = False - steps, min_pen = 0, float("inf") - mags, ep_radii, ep_com_dists = [], [], [] - while not done: - action, _ = model.predict(obs, deterministic=True) - obs, _, dones, infos = vn.step(action) - done = dones[0] - inner = vn.envs[0] - com, radius, _ = inner._flock_stats() - min_pen = min(min_pen, - float(np.linalg.norm(com - inner.PEN_CENTER))) - mags.append(float(np.linalg.norm(action[0]))) - ep_radii.append(radius) - ep_com_dists.append(float(np.linalg.norm(com - inner.PEN_CENTER))) - steps += 1 - rc = infos[0].get("rcomps") - if rc: - for k, v in rc.items(): - rc_sums[k] = rc_sums.get(k, 0.0) + v - rc_n += 1 - n_penned = infos[0].get("n_penned", 0) - successes += int(n_penned == n_sheep) - ep_lens.append(steps) - min_pen_list.append(min_pen) - action_mags.extend(mags) - mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep) - failure_counts[mode] = failure_counts.get(mode, 0) + 1 - - vn.close() - result = { - "sr": successes / n_episodes, - "mean_len": float(np.mean(ep_lens)), - "mean_min_pen": float(np.mean(min_pen_list)), - "mean_act": float(np.mean(action_mags)) if action_mags else 0.0, - "failure_modes": failure_counts, - } - if rc_n > 0: - result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()} - return result - - -# ── CLI ─────────────────────────────────────────────────────────────────────── - -def parse_args(): - p = argparse.ArgumentParser( - description="PPO + attention training for herding task") - p.add_argument("--config", type=str, default=None) - p.add_argument("--max-sheep", type=int, default=10) - p.add_argument("--steps-per-stage", type=int, default=1_500_000) - p.add_argument("--n-envs", type=int, default=8) - 
p.add_argument("--max-steps", type=int, default=2500) - p.add_argument("--eval-episodes", type=int, default=30) - p.add_argument("--run-dir", type=str, default=None) - p.add_argument("--no-gif", action="store_true") - p.add_argument("--gif-fps", type=int, default=20) - p.add_argument("--gif-skip", type=int, default=3) - # Attention architecture - p.add_argument("--embed-dim", type=int, default=64, - help="Transformer embedding dimension (default 64)") - p.add_argument("--n-heads", type=int, default=4, - help="Number of attention heads (default 4)") - p.add_argument("--n-layers", type=int, default=2, - help="Number of transformer encoder layers (default 2)") - p.add_argument("--ff-dim", type=int, default=128, - help="Transformer feed-forward dim (default 128)") - return p.parse_args() - - -# ── Main ────────────────────────────────────────────────────────────────────── - -def main(): - args = parse_args() - - cfg = dict(DEFAULT_CONFIG) - config_path = args.config - if config_path is None and os.path.exists("config.json"): - config_path = "config.json" - if config_path: - with open(config_path) as f: - cfg.update(json.load(f)) - print(f"Config loaded from {config_path}") - - rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)} - - run_dir = args.run_dir or os.path.join( - "runs", "at_" + time.strftime("%Y%m%d_%H%M%S")) - eval_dir = os.path.join(run_dir, "eval") - os.makedirs(eval_dir, exist_ok=True) - with open(os.path.join(run_dir, "config.json"), "w") as f: - json.dump(cfg, f, indent=2) - - print(f"Config: {cfg}") - print(f"Run dir: {run_dir}") - print(f"Curriculum: 1 → {args.max_sheep} sheep, " - f"{args.steps_per_stage:,} steps/stage") - print(f"Transformer: embed={args.embed_dim} heads={args.n_heads} " - f"layers={args.n_layers} ff={args.ff_dim}\n") - - train_env = SubprocVecEnv([ - make_env_at(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg) - for i in range(args.n_envs) - ]) - vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, 
clip_obs=10.0) - - model = PPO( - "MlpPolicy", vn, - learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10, - gamma=0.995, gae_lambda=0.95, clip_range=0.2, - ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5, - policy_kwargs=dict( - features_extractor_class=ShepherdAttentionExtractor, - features_extractor_kwargs=dict( - embed_dim=args.embed_dim, - n_heads=args.n_heads, - n_layers=args.n_layers, - ff_dim=args.ff_dim, - ), - net_arch=[256, 256], - ), - device="cpu", - verbose=0, - ) - - stage_results = [] - t0 = time.time() - - try: - for n in range(1, args.max_sheep + 1): - if n == 1: - print(f"\n[Stage n_sheep=1] training {args.steps_per_stage:,} steps") - model.learn( - total_timesteps=args.steps_per_stage, - reset_num_timesteps=True, - callback=ProgressCallback("1 sheep", freq=100_000), - ) - else: - half = max(1, args.n_envs // 2) - mix_steps = args.steps_per_stage // 2 - full_steps = args.steps_per_stage - mix_steps - - for i in range(half): - vn.env_method("set_n_sheep", n - 1, indices=[i]) - for i in range(half, args.n_envs): - vn.env_method("set_n_sheep", n, indices=[i]) - - print(f"\n[Stage n_sheep={n}] mixed ({n-1}/{n} sheep) " - f"{mix_steps:,} steps") - model.learn( - total_timesteps=mix_steps, - reset_num_timesteps=False, - callback=ProgressCallback(f"{n-1}→{n} mix", freq=100_000), - ) - - vn.env_method("set_n_sheep", n) - print(f"[Stage n_sheep={n}] full ({n} sheep) {full_steps:,} steps") - model.learn( - total_timesteps=full_steps, - reset_num_timesteps=False, - callback=ProgressCallback(f"{n} sheep", freq=100_000), - ) - - print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps") - r = evaluate_at(model, vn, n, args.eval_episodes, - args.max_steps, rcfg) - print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% " - f"mean_len={r['mean_len']:.0f} " - f"mean_min_pen={r['mean_min_pen']:.1f}m " - f"mean_act={r['mean_act']:.2f}") - if r["failure_modes"]: - modes = " ".join( - f"{k}={v}" for k, v in sorted( - 
r["failure_modes"].items(), key=lambda x: -x[1])) - print(f" failure modes: {modes}") - if "reward_per_step" in r: - rps = r["reward_per_step"] - print(" reward/step: " + " ".join( - f"{k}={v:+.4f}" for k, v in rps.items())) - - hist = run_and_record( - model, vn, n, args.max_steps, rcfg, - seed=1000 + n, make_env_fn=make_env_at, - ) - tag = "success" if hist["success"] else "fail" - plot_trajectory(hist, os.path.join(eval_dir, f"traj_{n}s_{tag}.png")) - plot_timeseries(hist, os.path.join(eval_dir, f"ts_{n}s_{tag}.png")) - if not args.no_gif: - save_episode_gif( - hist, - os.path.join(eval_dir, f"ep_{n}s_{tag}.gif"), - fps=args.gif_fps, skip=args.gif_skip) - - r["n_sheep"] = n - stage_results.append(r) - - model.save(os.path.join(run_dir, "final_model")) - vn.save(os.path.join(run_dir, "vecnorm.pkl")) - with open(os.path.join(run_dir, "stage_results.json"), "w") as f: - json.dump(stage_results, f, indent=2) - - finally: - try: - vn.close() - except Exception: - pass - - elapsed = (time.time() - t0) / 60 - print("\n" + "=" * 70) - print(" TRAINING SUMMARY (attention policy)") - print("=" * 70) - for r in stage_results: - print(f" n_sheep={r['n_sheep']} sr={r['sr']*100:>3.0f}% " - f"len={r['mean_len']:>5.0f} " - f"min_pen={r['mean_min_pen']:>5.1f}m " - f"act={r['mean_act']:.2f}") - print(f"\n Total time: {elapsed:.1f} min") - print(f" Artefacts: {run_dir}/") - plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png")) - print(f" Plots: {run_dir}/success_rate.png, {eval_dir}/") - - -if __name__ == "__main__": - main() diff --git a/training/train_ppo.py b/training/train_ppo.py new file mode 100644 index 0000000..4a674b0 --- /dev/null +++ b/training/train_ppo.py @@ -0,0 +1,267 @@ +"""Train a PPO shepherd-dog policy on ``HerdingEnv`` with curriculum. + +Defaults to 16 parallel ``SubprocVecEnv`` workers feeding a GPU policy. 
+Saves checkpoints, the best-eval model, and the VecNormalize stats — +all three are needed at inference time by the Webots controller. + +Usage:: + + python -m training.train_ppo \ + --config training/configs/ppo_default.yaml \ + --out-dir training/runs/baseline + +To resume from a checkpoint:: + + python -m training.train_ppo --resume training/runs/baseline/checkpoints/ppo_500000_steps.zip +""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +import yaml + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +import numpy as np +import torch as th +from stable_baselines3 import PPO +from stable_baselines3.common.callbacks import ( + BaseCallback, CheckpointCallback, EvalCallback, +) +from stable_baselines3.common.monitor import Monitor +from stable_baselines3.common.vec_env import ( + DummyVecEnv, SubprocVecEnv, VecNormalize, +) + +from training.herding_env import HerdingEnv + + +# -------------------------------------------------------------------------- +# Env factories +# -------------------------------------------------------------------------- + +def _make_env(rank: int, seed: int = 0): + def _thunk(): + env = HerdingEnv(seed=seed + rank) + env = Monitor(env, info_keywords=("is_success", "n_sheep", "n_penned")) + return env + return _thunk + + +# -------------------------------------------------------------------------- +# Curriculum callback +# -------------------------------------------------------------------------- + +class CurriculumCallback(BaseCallback): + """Drive the env's flock-size + state-space difficulty curriculum. + + Schedule entries: {step, max_n_sheep, difficulty}. The largest entry + whose step <= num_timesteps wins; both knobs update together. 
+ """ + + def __init__(self, schedule, vec_envs, verbose: int = 0): + super().__init__(verbose) + self.schedule = sorted(schedule, key=lambda d: d["step"]) + # Accept a list of envs so the eval env tracks training difficulty. + self.vec_envs = vec_envs if isinstance(vec_envs, (list, tuple)) else [vec_envs] + self._last_n = None + self._last_d = None + + def _call(self, method, value): + for v in self.vec_envs: + try: + v.env_method(method, value) + except AttributeError: + v.venv.env_method(method, value) + + def _on_step(self) -> bool: + t = self.num_timesteps + n = self.schedule[0]["max_n_sheep"] + d = self.schedule[0].get("difficulty", 1.0) + for entry in self.schedule: + if t >= entry["step"]: + n = entry["max_n_sheep"] + d = entry.get("difficulty", 1.0) + if n != self._last_n: + self._call("set_max_n_sheep", n) + self._last_n = n + if d != self._last_d: + self._call("set_difficulty", d) + self._last_d = d + if self.verbose: + print(f"[curriculum] t={t} → max_n_sheep={n} difficulty={d}") + return True + + +# -------------------------------------------------------------------------- +# Main +# -------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--config", default=os.path.join(_HERE, "configs", "ppo_default.yaml")) + parser.add_argument("--out-dir", default=os.path.join(_HERE, "runs", "latest")) + parser.add_argument("--n-envs", type=int, default=None, + help="Override config n_envs.") + parser.add_argument("--total-timesteps", type=int, default=None, + help="Override config total_timesteps.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--resume", type=str, default=None, + help="Path to a SB3 zip to resume from.") + # SB3 recommends CPU for MlpPolicy — GPU helps CNN policies, not MLPs + # of this size. Override with --device cuda if you really want it. 
+ parser.add_argument("--device", default="cpu") + parser.add_argument("--no-vecnorm", action="store_true", + help="Disable VecNormalize wrapper. Required when " + "resuming from a BC-pretrained policy that " + "wasn't trained under it.") + parser.add_argument("--no-curriculum", action="store_true", + help="Skip curriculum callback (resumed policy is " + "already competent across the distribution).") + parser.add_argument("--imitate-weight", type=float, default=None, + help="Override env W_IMITATE. Set to 0 to disable " + "Strömbom imitation reward.") + parser.add_argument("--difficulty", type=float, default=None, + help="Override env difficulty (0=easy, 1=hard). " + "Used in BC fine-tune to skip easy curriculum.") + parser.add_argument("--log-std", type=float, default=None, + help="Override the policy's log_std after load. " + "BC trained with std≈1.6 (log_std=0.5) which " + "is too noisy for fine-tune. Use -1.5 (std≈0.22) " + "to keep PPO close to the BC mean while still " + "exploring locally.") + parser.add_argument("--learning-rate", type=float, default=None, + help="Override config learning rate. 
For BC " + "fine-tune, 5e-5 is much safer than the 3e-4 " + "default.") + args = parser.parse_args() + + with open(args.config) as f: + cfg = yaml.safe_load(f) + + n_envs = args.n_envs or cfg["n_envs"] + total_timesteps = args.total_timesteps or cfg["total_timesteps"] + + out = Path(args.out_dir) + out.mkdir(parents=True, exist_ok=True) + (out / "checkpoints").mkdir(exist_ok=True) + (out / "best").mkdir(exist_ok=True) + (out / "evals").mkdir(exist_ok=True) + + print(f"[train] out={out} n_envs={n_envs} total={total_timesteps} device={args.device}") + + # --- Train env (vectorised, optionally normalised) --- + env_fns = [_make_env(i, seed=args.seed) for i in range(n_envs)] + venv = SubprocVecEnv(env_fns) if n_envs > 1 else DummyVecEnv(env_fns) + eval_venv = DummyVecEnv([_make_env(99, seed=args.seed + 999)]) + if not args.no_vecnorm: + venv = VecNormalize(venv, norm_obs=True, norm_reward=False, clip_obs=10.0) + eval_venv = VecNormalize(eval_venv, norm_obs=True, norm_reward=False, + clip_obs=10.0, training=False) + eval_venv.obs_rms = venv.obs_rms + else: + print("[train] VecNormalize disabled (resumed policy was trained without it).") + + # Apply env-level overrides (used by BC fine-tune to disable Strömbom + # imitation and start at full deployment difficulty). 
+ def _env_call(method, value): + for v in (venv, eval_venv): + try: + v.env_method(method, value) + except AttributeError: + v.venv.env_method(method, value) + + if args.imitate_weight is not None: + _env_call("set_imitate_weight", args.imitate_weight) + print(f"[train] W_IMITATE overridden to {args.imitate_weight}") + if args.difficulty is not None: + _env_call("set_difficulty", args.difficulty) + print(f"[train] difficulty pinned to {args.difficulty}") + + # --- Model --- + policy_kwargs = dict( + net_arch=dict(pi=cfg["net_arch_pi"], vf=cfg["net_arch_vf"]), + log_std_init=cfg.get("log_std_init", 0.0), + ) + + if args.resume: + print(f"[train] resuming from {args.resume}") + custom_objects = {} + if args.learning_rate is not None: + custom_objects["learning_rate"] = args.learning_rate + model = PPO.load(args.resume, env=venv, device=args.device, + tensorboard_log=str(out / "tb"), + custom_objects=custom_objects or None) + if args.log_std is not None: + import torch as _th + with _th.no_grad(): + model.policy.log_std.fill_(args.log_std) + print(f"[train] log_std overridden to {args.log_std} " + f"(std≈{2.71828 ** args.log_std:.2f})") + if args.learning_rate is not None: + print(f"[train] learning_rate overridden to {args.learning_rate}") + else: + model = PPO( + cfg["policy"], venv, + learning_rate=cfg["learning_rate"], + n_steps=cfg["n_steps"], + batch_size=cfg["batch_size"], + n_epochs=cfg["n_epochs"], + gamma=cfg["gamma"], + gae_lambda=cfg["gae_lambda"], + clip_range=cfg["clip_range"], + ent_coef=cfg["ent_coef"], + vf_coef=cfg["vf_coef"], + max_grad_norm=cfg["max_grad_norm"], + target_kl=cfg.get("target_kl"), + policy_kwargs=policy_kwargs, + tensorboard_log=str(out / "tb"), + seed=args.seed, + device=args.device, + verbose=1, + ) + + # --- Callbacks --- + ckpt_cb = CheckpointCallback( + save_freq=max(1, cfg["checkpoint_freq"] // n_envs), + save_path=str(out / "checkpoints"), name_prefix="ppo", + save_vecnormalize=True, + ) + eval_cb = EvalCallback( + eval_venv, 
+ best_model_save_path=str(out / "best"), + log_path=str(out / "evals"), + eval_freq=max(1, cfg["eval_freq"] // n_envs), + n_eval_episodes=cfg["n_eval_episodes"], + deterministic=True, + ) + callbacks = [ckpt_cb, eval_cb] + if not args.no_curriculum and "curriculum" in cfg and cfg["curriculum"]: + callbacks.append(CurriculumCallback( + cfg["curriculum"], [venv, eval_venv], verbose=1, + )) + elif args.no_curriculum: + print("[train] curriculum disabled — env knobs left at their current values.") + + # --- Train --- + model.learn(total_timesteps=total_timesteps, callback=callbacks, + progress_bar=True) + + # --- Save final model + VecNormalize stats --- + model.save(out / "final.zip") + venv.save(str(out / "vecnormalize.pkl")) + # The EvalCallback already wrote best_model.zip into out/best/ — drop the + # VecNormalize stats next to it for the controller to pick up. + venv.save(str(out / "best" / "vecnormalize.pkl")) + print(f"[train] done. saved to {out}") + + +if __name__ == "__main__": + main() diff --git a/training/viz.py b/training/viz.py deleted file mode 100644 index 1b3ada2..0000000 --- a/training/viz.py +++ /dev/null @@ -1,342 +0,0 @@ -""" -All visualization for the herding policy: trajectory plots, timeseries plots, -success-rate bar chart, and animated GIFs. - -Used both by train.py (auto-rendered after each curriculum stage) and as a CLI -to render a fresh episode against a saved model. 
- -CLI usage: - python viz.py --run-dir runs/v1 --n-sheep 5 - python viz.py --run-dir runs/v1 --n-sheep 10 --no-gif - python viz.py --model runs/v1/final_model.zip --vecnorm runs/v1/vecnorm.pkl \\ - --n-sheep 3 --out-dir vis_v1_3sheep -""" -import argparse -import os -import json -from copy import deepcopy - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import matplotlib.patches as mpatches -import matplotlib.animation as animation -from matplotlib.collections import LineCollection -import numpy as np -from stable_baselines3 import PPO -from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize - -from herding_env import HerdingEnv - - -# ── Palette ────────────────────────────────────────────────────────────────── - -SHEEP_COLORS = [ - "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", - "#a65628", "#f781bf", "#999999", "#66c2a5", "#fc8d62", -] -DOG_COLOR = "#4e342e" - - -# ── Common drawing primitives ──────────────────────────────────────────────── - -def draw_field(ax): - ax.set_xlim(-16, 16) - ax.set_ylim(-16, 16) - ax.set_aspect("equal") - ax.set_facecolor("#dcedc8") - ax.add_patch(mpatches.Rectangle( - (-15, -15), 30, 30, fill=False, edgecolor="#795548", lw=2)) - ax.add_patch(mpatches.Rectangle( - (10, -15), 3, 7, facecolor="#ffe082", edgecolor="#795548", lw=2)) - ax.text(11.5, -11.5, "pen", ha="center", va="center", - fontsize=8, color="#795548") - - -def faded_path(ax, xs, ys, color, lw=1.5, label=None): - n = len(xs) - if n < 2: - return - points = np.array([xs, ys]).T.reshape(-1, 1, 2) - segs = np.concatenate([points[:-1], points[1:]], axis=1) - alphas = np.linspace(0.15, 1.0, len(segs)) - colors = [(*matplotlib.colors.to_rgb(color), a) for a in alphas] - ax.add_collection(LineCollection(segs, colors=colors, linewidth=lw)) - if label: - ax.plot([], [], color=color, lw=lw, label=label) - - -# ── Episode rollout ────────────────────────────────────────────────────────── - -def make_eval_env(n_sheep, seed, 
max_steps, reward_cfg=None): - def _init(): - env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, - reward_cfg=reward_cfg) - env.reset(seed=seed) - return env - return _init - - -def run_and_record(model, vn_template, n_sheep, max_steps, - reward_cfg=None, seed=42, make_env_fn=None): - """Run one deterministic episode and return full trajectory history.""" - _factory = make_env_fn or make_eval_env - raw = DummyVecEnv([_factory(n_sheep, seed, max_steps, reward_cfg)]) - vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False) - vn.obs_rms = deepcopy(vn_template.obs_rms) - vn.ret_rms = deepcopy(vn_template.ret_rms) - - obs = vn.reset() - inner = vn.envs[0] - done = False - - dog_xs, dog_ys = [], [] - sheep_xs = [[] for _ in range(n_sheep)] - sheep_ys = [[] for _ in range(n_sheep)] - sheep_penned = [[] for _ in range(n_sheep)] - radii = [] - pen_dists = [[] for _ in range(n_sheep)] - action_mags = [] - rewards = [] - penned_at = [None] * n_sheep - step = 0 - - while not done: - action, _ = model.predict(obs, deterministic=True) - obs, reward, dones, infos = vn.step(action) - done = dones[0] - step += 1 - - dog_xs.append(float(inner.dog_pos[0])) - dog_ys.append(float(inner.dog_pos[1])) - com, radius, _ = inner._flock_stats() - radii.append(radius) - rewards.append(float(reward[0])) - action_mags.append(float(np.linalg.norm(action[0]))) - for i in range(n_sheep): - sheep_xs[i].append(float(inner.sheep_pos[i][0])) - sheep_ys[i].append(float(inner.sheep_pos[i][1])) - sheep_penned[i].append(bool(inner.penned[i])) - pen_dists[i].append( - float(np.linalg.norm(inner.sheep_pos[i] - inner.PEN_CENTER))) - if inner.penned[i] and penned_at[i] is None: - penned_at[i] = step - - n_penned = infos[0].get("n_penned", 0) - vn.close() - - return dict( - dog_xs=dog_xs, dog_ys=dog_ys, - sheep_xs=sheep_xs, sheep_ys=sheep_ys, - sheep_penned=sheep_penned, - radii=radii, pen_dists=pen_dists, - action_mags=action_mags, rewards=rewards, - penned_at=penned_at, - 
n_penned=n_penned, n_sheep=n_sheep, - success=n_penned == n_sheep, steps=step, - ) - - -# ── Static plots ───────────────────────────────────────────────────────────── - -def plot_trajectory(hist, out_path): - fig, ax = plt.subplots(figsize=(7, 7)) - draw_field(ax) - for i in range(hist["n_sheep"]): - c = SHEEP_COLORS[i % len(SHEEP_COLORS)] - xs, ys = hist["sheep_xs"][i], hist["sheep_ys"][i] - faded_path(ax, xs, ys, c, lw=1.2, label=f"sheep {i+1}") - ax.plot(xs[0], ys[0], "o", color=c, ms=7, zorder=4) - end = hist["penned_at"][i] if hist["penned_at"][i] is not None else -1 - ax.plot(xs[end], ys[end], "*", color=c, ms=11, zorder=5) - faded_path(ax, hist["dog_xs"], hist["dog_ys"], DOG_COLOR, lw=2.0, - label="dog") - ax.plot(hist["dog_xs"][0], hist["dog_ys"][0], "s", color=DOG_COLOR, - ms=10, zorder=5) - ax.plot(hist["dog_xs"][-1], hist["dog_ys"][-1], "D", color=DOG_COLOR, - ms=10, zorder=5) - result = ("SUCCESS" if hist["success"] - else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})") - ax.set_title(f"n={hist['n_sheep']} {result} {hist['steps']} steps", - fontsize=12) - ax.legend(loc="upper left", fontsize=8) - plt.tight_layout() - fig.savefig(out_path, dpi=120) - plt.close(fig) - - -def plot_timeseries(hist, out_path): - t = np.arange(hist["steps"]) - fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True) - - axes[0].plot(t, hist["radii"], color="steelblue") - axes[0].axhline(5.0, color="orange", ls="--", lw=1, label="compact (5m)") - axes[0].set_ylabel("flock radius (m)") - axes[0].legend(fontsize=8) - axes[0].set_title("Flock radius") - - for i in range(hist["n_sheep"]): - c = SHEEP_COLORS[i % len(SHEEP_COLORS)] - axes[1].plot(t, hist["pen_dists"][i], color=c, lw=1, - label=f"sheep {i+1}") - if hist["penned_at"][i] is not None: - axes[1].axvline(hist["penned_at"][i], color=c, ls=":", lw=1) - axes[1].set_ylabel("dist to pen (m)") - axes[1].legend(fontsize=7, ncol=min(hist["n_sheep"], 5)) - axes[1].set_title("Per-sheep distance to pen") - - axes[2].plot(t, 
hist["action_mags"], color="tomato", lw=1) - axes[2].axhline(1.0, color="gray", ls="--", lw=1, label="max") - axes[2].set_ylabel("action ||(vx,vy)||") - axes[2].set_ylim(0, 1.5) - axes[2].set_title("Dog action magnitude") - axes[2].legend(fontsize=8) - - axes[3].plot(t, hist["rewards"], color="purple", lw=1, alpha=0.7) - axes[3].axhline(0, color="black", lw=0.5) - axes[3].set_ylabel("reward") - axes[3].set_xlabel("step") - axes[3].set_title("Reward per step") - - result = ("SUCCESS" if hist["success"] - else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})") - fig.suptitle(f"n_sheep={hist['n_sheep']} {result} {hist['steps']} steps", - fontsize=13) - plt.tight_layout() - fig.savefig(out_path, dpi=120) - plt.close(fig) - - -def plot_success_rate(stage_results, out_path): - fig, ax = plt.subplots(figsize=(8, 4)) - ns = [r["n_sheep"] for r in stage_results] - srs = [r["sr"] * 100 for r in stage_results] - bars = ax.bar(ns, srs, color="steelblue", edgecolor="white") - ax.set_xlabel("Sheep count") - ax.set_ylabel("Success rate (%)") - ax.set_ylim(0, 105) - ax.axhline(90, color="orange", ls="--", lw=1, label="90% target") - for bar, sr in zip(bars, srs): - ax.text(bar.get_x() + bar.get_width() / 2, - bar.get_height() + 1, f"{sr:.0f}%", - ha="center", fontsize=9) - ax.legend() - ax.set_title("Evaluation success rate per sheep count") - plt.tight_layout() - fig.savefig(out_path, dpi=120) - plt.close(fig) - - -# ── Animated GIF ───────────────────────────────────────────────────────────── - -def save_episode_gif(hist, out_path, fps=20, skip=3): - """Render hist as an animated GIF. 
`skip` keeps every Nth frame (smaller file).""" - n_sheep = hist["n_sheep"] - frames = list(range(0, hist["steps"], max(1, skip))) - if frames[-1] != hist["steps"] - 1: - frames.append(hist["steps"] - 1) - - fig, ax = plt.subplots(figsize=(6, 6)) - draw_field(ax) - title = ax.text(0, 16.5, "", ha="center", fontsize=11) - dog_marker, = ax.plot([], [], "s", color=DOG_COLOR, ms=12, - markeredgecolor="black", markeredgewidth=1.5, - zorder=5) - sheep_markers = [] - for i in range(n_sheep): - c = SHEEP_COLORS[i % len(SHEEP_COLORS)] - m, = ax.plot([], [], "o", color=c, ms=10, - markeredgecolor="#333", markeredgewidth=1, zorder=4) - sheep_markers.append(m) - dog_trail, = ax.plot([], [], color=DOG_COLOR, lw=1.0, alpha=0.5) - - def update(k): - title.set_text( - f"n={n_sheep} step {k+1}/{hist['steps']} " - f"penned {sum(hist['sheep_penned'][i][k] for i in range(n_sheep))}/{n_sheep}") - dog_marker.set_data([hist["dog_xs"][k]], [hist["dog_ys"][k]]) - dog_trail.set_data(hist["dog_xs"][:k+1], hist["dog_ys"][:k+1]) - for i, m in enumerate(sheep_markers): - m.set_data([hist["sheep_xs"][i][k]], [hist["sheep_ys"][i][k]]) - penned = hist["sheep_penned"][i][k] - m.set_color("deeppink" if penned else SHEEP_COLORS[i % len(SHEEP_COLORS)]) - return [title, dog_marker, dog_trail, *sheep_markers] - - anim = animation.FuncAnimation( - fig, update, frames=frames, interval=1000 / fps, blit=False) - anim.save(out_path, writer=animation.PillowWriter(fps=fps), dpi=80) - plt.close(fig) - - -# ── CLI ────────────────────────────────────────────────────────────────────── - -def _resolve_paths(args): - if args.run_dir: - model_path = os.path.join(args.run_dir, "final_model.zip") - vn_path = os.path.join(args.run_dir, "vecnorm.pkl") - cfg_path = os.path.join(args.run_dir, "config.json") - else: - model_path = args.model - vn_path = args.vecnorm - cfg_path = args.config - return model_path, vn_path, cfg_path - - -def main(): - p = argparse.ArgumentParser( - description="Render trajectory + timeseries + 
GIF for a saved policy.") - p.add_argument("--run-dir", type=str, default=None, - help="Run directory containing final_model.zip + vecnorm.pkl + config.json") - p.add_argument("--model", type=str, default=None) - p.add_argument("--vecnorm", type=str, default=None) - p.add_argument("--config", type=str, default=None) - p.add_argument("--n-sheep", type=int, default=3) - p.add_argument("--seed", type=int, default=42) - p.add_argument("--max-steps", type=int, default=2500) - p.add_argument("--out-dir", type=str, default=None) - p.add_argument("--no-gif", action="store_true", - help="Skip the animated GIF (PNG-only is faster).") - p.add_argument("--gif-fps", type=int, default=20) - p.add_argument("--gif-skip", type=int, default=3) - args = p.parse_args() - - model_path, vn_path, cfg_path = _resolve_paths(args) - if not (model_path and vn_path): - p.error("either --run-dir or both --model and --vecnorm are required") - - rcfg = None - if cfg_path and os.path.exists(cfg_path): - with open(cfg_path) as f: - cfg = json.load(f) - rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)} - - out_dir = args.out_dir or os.path.join( - os.path.dirname(os.path.abspath(model_path)), - f"vis_{args.n_sheep}s") - os.makedirs(out_dir, exist_ok=True) - - print(f"Loading model: {model_path}") - print(f"Loading vecnorm: {vn_path}") - model = PPO.load(model_path, device="cpu") - - raw = DummyVecEnv([make_eval_env(args.n_sheep, args.seed, args.max_steps, rcfg)]) - vn = VecNormalize.load(vn_path, raw) - - print(f"Rolling out n_sheep={args.n_sheep} (seed={args.seed})...") - hist = run_and_record(model, vn, args.n_sheep, args.max_steps, - reward_cfg=rcfg, seed=args.seed) - result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})" - print(f" {result} in {hist['steps']} steps") - - plot_trajectory(hist, os.path.join(out_dir, "trajectory.png")) - plot_timeseries(hist, os.path.join(out_dir, "timeseries.png")) - print(f" saved trajectory.png + timeseries.png 
to {out_dir}/") - if not args.no_gif: - gif_path = os.path.join(out_dir, "episode.gif") - print(f" rendering GIF (fps={args.gif_fps}, skip={args.gif_skip})...") - save_episode_gif(hist, gif_path, fps=args.gif_fps, skip=args.gif_skip) - print(f" saved {gif_path}") - - -if __name__ == "__main__": - main() diff --git a/worlds/.field.wbproj b/worlds/.field.wbproj index 176c39a..255f061 100644 --- a/worlds/.field.wbproj +++ b/worlds/.field.wbproj @@ -1,5 +1,5 @@ Webots Project File version R2025a -perspectives: 000000ff00000000fd00000002000000010000011c00000298fc0200000001fb0000001400540065007800740045006400690074006f00720100000000000002980000003f00ffffff000000030000084300000238fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000008430000006900ffffff000007250000029800000001000000020000000100000008fc00000000 +perspectives: 000000ff00000000fd00000002000000010000011c000001bcfc0200000001fb0000001400540065007800740045006400690074006f00720100000000000001bc0000003f00ffffff00000003000005c600000220fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000005c60000006900ffffff000004a8000001bc00000001000000020000000100000008fc00000000 simulationViewPerspectives: 000000ff000000010000000200000100000006250100000002010000000100 sceneTreePerspectives: 000000ff00000001000000030000001f000000c0000000000100000002010000000200 maximizedDockId: -1 diff --git a/worlds/field.wbt b/worlds/field.wbt index 7865026..f3f3b18 100644 --- a/worlds/field.wbt +++ b/worlds/field.wbt @@ -10,7 +10,7 @@ EXTERNPROTO "../protos/Sheep.proto" # World WorldInfo { info [ - "RL-Based Autonomous Shepherd Robot" + "Autonomous Shepherd Robot (Strömbom)" "Group G25" ] title "Shepherd Herding" @@ -106,19 +106,26 @@ Solid { translation -2.5 -15 0.84 children [ Shape { appearance USE CAP geometry Solid { translation 14 -15 0.40 children [ Shape { appearance USE STONE_A geometry Box { size 2.0 0.16 0.80 } } ] boundingObject Box { size 2.0 0.16 0.80 } } Solid { 
translation 14 -15 0.84 children [ Shape { appearance USE CAP geometry Box { size 2.1 0.26 0.07 } } ] boundingObject Box { size 2.1 0.26 0.07 } } # Gate posts -Solid { translation 10 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } -Solid { translation 13 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } -# Outer gate (wooden, slightly ajar, Z-brace) -Solid { translation 11.5 -15.08 0.55 rotation 0 0 1 0.25 children [ +Solid { translation 10 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] } +Solid { translation 13 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] } +# Outer gate — fully open, hinged on the west gate post. Modeled as a swung-back +# wooden gate parallel to the south wall, on the west side, so the 3m corridor +# between gate posts (x=10..13, y=-15) is unobstructed. +Solid { translation 8.6 -15.05 0.55 rotation 0 0 1 0 children [ Shape { appearance USE WOOD geometry Box { size 2.80 0.05 1.00 } } - Transform { translation 0 0.02 0 rotation 0 1 0 0.34 children [ Shape { appearance DEF FPOST PBRAppearance { baseColor 0.35 0.22 0.10 roughness 0.90 } geometry Box { size 2.97 0.04 0.06 } } ] } + # FPOST appearance DEF lives here so the external pen below can USE it. 
+ Transform { translation 0 0.02 0 rotation 0 1 0 0.34 children [ + Shape { appearance DEF FPOST PBRAppearance { baseColor 0.35 0.22 0.10 roughness 0.90 } geometry Box { size 2.97 0.04 0.06 } } + ] } ] boundingObject Box { size 2.80 0.08 1.00 } } -# ==================== QUARANTINE PEN (wooden post-and-rail fence, inside field) ==================== -# Flow: main field → inner gate → quarantine area → outer gate → outside +# ==================== EXTERNAL PEN (south of field, accessed through south-wall gate) ==================== +# Flow: main field → south-wall gate (x ∈ [10, 13], y = -15) → external pen +# The pen is a wooden post-and-rail rectangle south of the field, x ∈ [10, 13], +# y ∈ [-22, -15], open on the north side (the gate hole is the entrance). -# West wall (x=10, ~7m along Y) -Solid { translation 10 -11.46 0.55 children [ +# Pen west wall (x=10, y from -22 to -15, length 7m) +Solid { translation 10 -18.5 0.55 children [ Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } @@ -130,8 +137,8 @@ Solid { translation 10 -11.46 0.55 children [ Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] } ] boundingObject Box { size 0.14 6.92 1.10 } } -# East wall (x=13) -Solid { translation 13 -11.46 0.55 children [ +# Pen east wall (x=13, y from -22 to -15, length 7m) +Solid { translation 13 -18.5 0.55 children [ Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 0 0 children [ Shape { appearance USE FPOST 
geometry Box { size 0.12 0.12 1.10 } } ] } @@ -143,39 +150,50 @@ Solid { translation 13 -11.46 0.55 children [ Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] } ] boundingObject Box { size 0.14 6.92 1.10 } } -# North wall - open entrance (no wall, just corner posts) -Solid { translation 10 -8 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] boundingObject Box { size 0.12 0.12 1.10 } } -Solid { translation 13 -8 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] boundingObject Box { size 0.12 0.12 1.10 } } +# Pen south wall (y=-22, x from 10 to 13, length 3m, closes the back of the pen) +Solid { translation 11.5 -22 0.55 children [ + Transform { translation -1.5 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } + Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } + Transform { translation 1.5 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } + Transform { translation 0 0 -0.38 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] } + Transform { translation 0 0 -0.05 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] } + Transform { translation 0 0 0.30 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] } + Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 2.92 0.14 0.04 } } ] } +] boundingObject Box { size 2.92 0.14 1.10 } } + +# Pen north corner posts at the gate opening (no wall — sheep enter here from the field) +Solid { translation 10 -15.0 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } +Solid { translation 13 -15.0 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } # Corner pillars -Solid { 
translation 15 15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } -Solid { translation 15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } -Solid { translation -15 15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } -Solid { translation -15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } +Solid { translation 15 15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] } +Solid { translation 15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] } +Solid { translation -15 15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] } +Solid { translation -15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] } # Mid-pillars every 5 m — East -Solid { translation 15 10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation 15 5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 
} } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation 15 0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation 15 -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation 15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } +Solid { translation 15 10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation 15 5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation 15 0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation 15 -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation 15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } # West -Solid { translation -15 10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation -15 5 0.53 children [ Shape { 
appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation -15 0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation -15 -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation -15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } +Solid { translation -15 10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation -15 5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation -15 0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation -15 -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation -15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } # North -Solid { translation 10 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 
1.06 } } -Solid { translation 5 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation 0 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation -5 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation -10 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } +Solid { translation 10 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation 5 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation 0 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation -5 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation -10 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } # South -Solid { translation 5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 
0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation 0 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation -5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } -Solid { translation -10 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } +Solid { translation 5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation 0 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation -5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } +Solid { translation -10 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] } # ==================== BARN 1 — Gambrel/Dutch style (NE, outside fence) ==================== # Body 10×7×4, weathered gray-brown wood, gambrel roof, large double doors @@ -503,28 +521,16 @@ ShepherdDog { } # ==================== SHEEP ==================== -Sheep { - translation 3 2 0.5 - name "sheep1" - controller "sheep" -} -Sheep { - translation 3 -2 0.5 - name "sheep2" - controller "sheep" -} -Sheep { - translation 4 0 0.5 - name "sheep3" - controller "sheep" -} -# Sheep { -# translation 3.5 1 0.5 -# name "sheep4" 
-# controller "sheep" -# } -# Sheep { -# translation 3.5 -1 0.5 -# name "sheep5" -# controller "sheep" -# } +# Up to 10 sheep, scattered through the field's central/north zone. Comment +# out trailing slots to test smaller flock sizes; the dog policy is trained +# to handle 1..10 sheep so any prefix works. +Sheep { translation 3.0 2.0 0.5 name "sheep1" controller "sheep" } +Sheep { translation 3.0 -2.0 0.5 name "sheep2" controller "sheep" } +Sheep { translation 4.0 0.0 0.5 name "sheep3" controller "sheep" } +Sheep { translation -3.0 4.0 0.5 name "sheep4" controller "sheep" } +Sheep { translation -5.0 -2.0 0.5 name "sheep5" controller "sheep" } +Sheep { translation 6.0 5.0 0.5 name "sheep6" controller "sheep" } +Sheep { translation -6.0 6.0 0.5 name "sheep7" controller "sheep" } +Sheep { translation 0.0 8.0 0.5 name "sheep8" controller "sheep" } +Sheep { translation -8.0 0.0 0.5 name "sheep9" controller "sheep" } +Sheep { translation 7.0 -4.0 0.5 name "sheep10" controller "sheep" }