Checkpoint 2

This commit is contained in:
Johnny Fernandes
2026-05-07 22:00:10 +01:00
parent 90aa3bbcb4
commit 1bb9415414
37 changed files with 3068 additions and 2912 deletions
+16 -12
View File
@@ -4,18 +4,22 @@
# Python # Python
__pycache__/ __pycache__/
*.pyc
# Training .venv/
training/**/events.out.tfevents.*
training/**/checkpoints/
training/runs/**
!training/runs/.gitkeep
# Controller runtime artefacts
controllers/shepherd_dog_rl/debug*.csv
controllers/shepherd_dog_rl/debug_out*/
controllers/shepherd_dog_rl/final_model*.zip
controllers/shepherd_dog_rl/vecnorm*.pkl
# Optional env parity debug # Optional env parity debug
dog_debug.csv dog_debug.csv
# Webots controller scratch
controllers/shepherd_dog/dog_behavior_log.csv
# Training artefacts
training/runs/*
!training/runs/.gitkeep
*.zip
*.pkl
# TensorBoard
events.out.tfevents.*
worlds/field_test.wbt
herding_runtime.cfg
+71 -156
View File
@@ -1,45 +1,36 @@
""" """Sheep flocking controller (Webots).
Sheep flocking controller (Webots, Reynolds boids variant).
Each sheep broadcasts its GPS position every 3 steps on channel 1 and Each sheep broadcasts its GPS position every 3 steps on channel 1 and
listens for the dog and peer sheep positions. Peers are keyed by robot listens for the dog and peer sheep positions. The behavioural step is
name so each neighbour has exactly one current entry in the dict. delegated to ``herding.flocking_sim.compute_heading_speed`` so the
training environment and Webots run identical sheep dynamics.
Force stack each step (summed then converted to a heading + speed): Pen behaviour: a sheep latches to ``penned`` the first time it crosses
flee — away from dog, quadratic ramp, dominant when close the south-wall gate plane into the gate corridor. Once latched it turns
cohesion — toward flock centre, halved while fleeing pink (via the exposed ``woolColor`` PROTO field) and the force stack
separation — inverse-distance push, prevents physical overlap switches to in-pen containment.
walls — linear repulsion from field boundary
wander — small persistent drift for natural idle motion
Pen behaviour: on first entry into the quarantine pen the sheep latches
permanently — it turns pink (via the exposed woolColor PROTO field) and
the normal force stack is replaced by pen-confinement forces only.
""" """
import random
import math import math
import os
import random
import sys
# --- Make the shared herding/ package importable from this controller dir ---
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
from controller import Supervisor from controller import Supervisor
# --------------------------------------------------------------------------- from herding.diffdrive import heading_speed_to_wheels
# Tuning constants from herding.flocking_sim import MAX_SPEED, compute_heading_speed
# --------------------------------------------------------------------------- from herding.geometry import (
SHEEP_MAX_WHEEL_OMEGA,
is_penned_position,
)
MAX_SPEED = 22.0 # rad/s hard clamp on both motors
FLEE_SPEED = 20.0 # rad/s upper bound while panicking
WANDER_SPEED = 3.0 # rad/s lower bound during calm wandering
X_MIN, X_MAX = -14.5, 14.5 # stone wall inner edges (metres)
Y_MIN, Y_MAX = -14.5, 14.5
WALL_MARGIN = 3.5 # avoidance starts this far from the wall
FLEE_DIST = 7.0 # dog within this radius triggers flee (metres)
SEPARATION_DIST = 2.5 # inverse-distance push active inside this radius
COHESION_DIST = 8.0 # pull toward flock centre active inside this radius
PEN_X_MIN, PEN_X_MAX = 10.0, 13.0 # quarantine pen extents (metres)
PEN_Y_MIN, PEN_Y_MAX = -15.0, -8.0 # open entrance at y=-8, gate at y=-15
PEN_MARGIN = 0.8 # confinement force starts this far from pen wall
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Device setup # Device setup
@@ -56,178 +47,102 @@ left_motor.setPosition(float("inf"))
right_motor.setPosition(float("inf")) right_motor.setPosition(float("inf"))
left_motor.setVelocity(0.0) left_motor.setVelocity(0.0)
right_motor.setVelocity(0.0) right_motor.setVelocity(0.0)
MOTOR_MAX = min(left_motor.getMaxVelocity(), SHEEP_MAX_WHEEL_OMEGA)
gps = robot.getDevice("gps"); gps.enable(timestep) gps = robot.getDevice("gps"); gps.enable(timestep)
compass = robot.getDevice("compass"); compass.enable(timestep) compass = robot.getDevice("compass"); compass.enable(timestep)
receiver = robot.getDevice("receiver"); receiver.enable(timestep) receiver = robot.getDevice("receiver"); receiver.enable(timestep)
emitter = robot.getDevice("emitter") emitter = robot.getDevice("emitter")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Helpers # Helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def norm_angle(a):
return math.atan2(math.sin(a), math.cos(a))
def bearing(): def bearing():
# Compass returns north direction in sensor frame; for this Z-up world # Compass returns north direction in sensor frame; for this Z-up world
# with north = +Y, atan2(n[0], n[1]) gives the standard math angle # with north = +Y, atan2(n[0], n[1]) gives the standard math angle
# (0 = east, π/2 = north) matching atan2(fy, fx) used for heading. # (0 = east, π/2 = north) matching atan2(fy, fx) used for headings.
n = compass.getValues() n = compass.getValues()
return math.atan2(n[0], n[1]) return math.atan2(n[0], n[1])
def drive(heading, speed): def drive(heading, speed_motor):
err = norm_angle(heading - bearing()) left_w, right_w = heading_speed_to_wheels(
# Scale forward component by cos(err): at 90° error fwd→0 so the robot heading, min(speed_motor, MAX_SPEED), bearing(), MOTOR_MAX, k_turn=4.0
# spins in place to realign rather than driving sideways at full speed. )
fwd = speed * max(0.0, math.cos(err)) left_motor.setVelocity(left_w)
k = 4.0 right_motor.setVelocity(right_w)
left_motor.setVelocity( max(-MAX_SPEED, min(MAX_SPEED, fwd - k * err)))
right_motor.setVelocity(max(-MAX_SPEED, min(MAX_SPEED, fwd + k * err)))
def paint_pink(): def paint_pink():
# woolColor is declared as a PROTO field with IS binding to the DEF WOOL # woolColor is declared as a PROTO field with IS binding to the DEF WOOL
# PBRAppearance baseColor. Changing it here propagates to every USE WOOL # PBRAppearance baseColor; setting it propagates to every USE WOOL shape.
# shape on the body. Direct field access avoids PROTO-internal opacity.
self_node.getField("woolColor").setSFColor([1.0, 0.55, 0.72]) self_node.getField("woolColor").setSFColor([1.0, 0.55, 0.72])
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# State # State
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
wander_angle = random.uniform(-math.pi, math.pi) wander_angle = random.uniform(-math.pi, math.pi)
step = 0 step_count = 0
dog_x = None dog_x, dog_y = None, None
dog_y = None
peers = {} # name → (x, y), one entry per neighbour, cleared every 30 steps peers = {} # name → (x, y), one entry per neighbour, cleared every 30 steps
penned = False penned = False
# Stuck detection: differential-drive sheep can pin against a wall and need
# a forced reverse-and-rotate to escape. If displacement < STUCK_DIST for
# STUCK_STEPS consecutive steps, drive toward field centre.
_prev_x, _prev_y = None, None
_stuck_count = 0
STUCK_STEPS = 20
STUCK_DIST = 0.05
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Main loop # Main loop
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
while robot.step(timestep) != -1: while robot.step(timestep) != -1:
step += 1 step_count += 1
pos = gps.getValues() pos = gps.getValues()
x, y = pos[0], pos[1] x, y = pos[0], pos[1]
# Pen entry: one-way latch, never unset # Pen entry: one-way latch. Penned sheep get pink wool and switch behaviour.
if not penned and PEN_X_MIN < x < PEN_X_MAX and PEN_Y_MIN < y < PEN_Y_MAX: if not penned and is_penned_position(x, y):
penned = True penned = True
paint_pink() paint_pink()
# Refresh peer table (clear before receiving so fresh data is never lost) # Refresh peer table clear before receiving so fresh data is never lost.
if step % 30 == 0: if step_count % 30 == 0:
peers.clear() peers.clear()
while receiver.getQueueLength() > 0: while receiver.getQueueLength() > 0:
msg = receiver.getString() msg = receiver.getString()
receiver.nextPacket() receiver.nextPacket()
p = msg.split(":") parts = msg.split(":")
if p[0] == "dog" and len(p) >= 3: if parts[0] == "dog" and len(parts) >= 3:
dog_x, dog_y = float(p[1]), float(p[2]) dog_x, dog_y = float(parts[1]), float(parts[2])
elif p[0] == "sheep" and len(p) >= 4 and p[1] != name: elif parts[0] == "sheep" and len(parts) >= 4 and parts[1] != name:
peers[p[1]] = (float(p[2]), float(p[3])) peers[parts[1]] = (float(parts[2]), float(parts[3]))
fx, fy = 0.0, 0.0 dog_xy = (dog_x, dog_y) if dog_x is not None and dog_y is not None else None
heading, speed, wander_angle = compute_heading_speed(
x=x, y=y, penned=penned, dog_xy=dog_xy, peers=peers,
wander_angle=wander_angle,
)
# Repel unpenned sheep from the exterior of the pen's side walls so they # Stuck detection — safety net for differential-drive wall pinning.
# don't get pinned by flee forces. Only fires when strictly outside the pen if _prev_x is not None:
# (x < PEN_X_MIN or x > PEN_X_MAX) at pen height (y in pen y-range). moved = math.hypot(x - _prev_x, y - _prev_y)
# Entrance is open on the north (y > PEN_Y_MAX) — no force there. _stuck_count = _stuck_count + 1 if moved < STUCK_DIST else 0
PEN_EXT_MARGIN = 0.8 if _stuck_count >= STUCK_STEPS:
if not penned and PEN_Y_MIN < y < PEN_Y_MAX: heading = math.atan2(-y, -x) # always points away from the boundary
if PEN_X_MIN - PEN_EXT_MARGIN < x < PEN_X_MIN: speed = MAX_SPEED
fx -= ((x - (PEN_X_MIN - PEN_EXT_MARGIN)) / PEN_EXT_MARGIN) * 6.0 _stuck_count = 0
if PEN_X_MAX < x < PEN_X_MAX + PEN_EXT_MARGIN: _prev_x, _prev_y = x, y
fx += ((PEN_X_MAX + PEN_EXT_MARGIN - x) / PEN_EXT_MARGIN) * 6.0
if penned:
# Inside pen: wander freely, strong boundary forces prevent exit,
# separation still active to avoid collisions with other penned sheep.
pm = PEN_MARGIN
if x < PEN_X_MIN + pm: fx += ((PEN_X_MIN + pm - x) / pm) * 15.0
if x > PEN_X_MAX - pm: fx -= ((x - (PEN_X_MAX - pm)) / pm) * 15.0
if y < PEN_Y_MIN + pm: fy += ((PEN_Y_MIN + pm - y) / pm) * 15.0
if y > PEN_Y_MAX - pm: fy -= ((y - (PEN_Y_MAX - pm)) / pm) * 15.0
for px, py in peers.values():
dx, dy = px - x, py - y
d = math.hypot(dx, dy)
if 0.05 < d < SEPARATION_DIST:
push = (SEPARATION_DIST - d) / d
fx -= (dx / d) * push * 2.5
fy -= (dy / d) * push * 2.5
if random.random() < 0.02:
wander_angle += random.uniform(-0.6, 0.6)
fx += math.cos(wander_angle) * 0.5
fy += math.sin(wander_angle) * 0.5
else:
fleeing = False
# Flee — quadratic ramp so force grows rapidly as the dog closes in
if dog_x is not None:
dx = dog_x - x
dy = dog_y - y
dist = math.hypot(dx, dy)
if 0.01 < dist < FLEE_DIST:
fleeing = True
t = 1.0 - dist / FLEE_DIST
s = t * t * 20.0
fx -= (dx / dist) * s
fy -= (dy / dist) * s
# Cohesion — halved while fleeing to reduce mid-panic collisions
cx, cy, cn = 0.0, 0.0, 0
for px, py in peers.values():
d = math.hypot(px - x, py - y)
if 0.3 < d < COHESION_DIST:
cx += px; cy += py; cn += 1
if cn > 0:
w = 0.08 if fleeing else 0.15
fx += (cx / cn - x) * w
fy += (cy / cn - y) * w
# Separation — inverse-distance: huge when nearly overlapping, fades quickly
for px, py in peers.values():
dx, dy = px - x, py - y
d = math.hypot(dx, dy)
if 0.05 < d < SEPARATION_DIST:
push = (SEPARATION_DIST - d) / d
fx -= (dx / d) * push * 2.5
fy -= (dy / d) * push * 2.5
# Walls
if x < X_MIN + WALL_MARGIN: fx += ((X_MIN + WALL_MARGIN - x) / WALL_MARGIN) * 6.0
if x > X_MAX - WALL_MARGIN: fx -= ((x - (X_MAX - WALL_MARGIN)) / WALL_MARGIN) * 6.0
if y < Y_MIN + WALL_MARGIN: fy += ((Y_MIN + WALL_MARGIN - y) / WALL_MARGIN) * 6.0
if y > Y_MAX - WALL_MARGIN: fy -= ((y - (Y_MAX - WALL_MARGIN)) / WALL_MARGIN) * 6.0
# Wander — suppressed while fleeing so drift cannot deflect the flee heading
if not fleeing:
if random.random() < 0.02:
wander_angle += random.uniform(-0.6, 0.6)
fx += math.cos(wander_angle) * 0.5
fy += math.sin(wander_angle) * 0.5
# Hard-stop clamp: within 0.5 m of a wall, zero any force component that
# would push further into it. Prevents the flee force from pinning a sheep
# against the boundary when the dog approaches from outside.
HS = 0.5
if x < X_MIN + HS and fx < 0: fx = 0.0
if x > X_MAX - HS and fx > 0: fx = 0.0
if y < Y_MIN + HS and fy < 0: fy = 0.0
if y > Y_MAX - HS and fy > 0: fy = 0.0
heading = math.atan2(fy, fx)
mag = math.hypot(fx, fy)
speed = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))
drive(heading, speed) drive(heading, speed)
if step % 3 == 0: if step_count % 3 == 0:
emitter.send(f"sheep:{name}:{x:.4f}:{y:.4f}") emitter.send(f"sheep:{name}:{x:.4f}:{y:.4f}")
+78
View File
@@ -0,0 +1,78 @@
"""Lazy loader for the SB3 PPO policy used by the dog controller.
Importing stable-baselines3 inside the Webots Python interpreter is only
needed when ``HERDING_MODE=rl``; the Strömbom mode runs without it. This
loader keeps SB3 out of the import path until you actually ask for the RL
policy, so users without SB3 installed can still run the Strömbom
baseline.
The policy + VecNormalize statistics are saved together by
``training/train_ppo.py``:
runs/<name>/best/best_model.zip # SB3 PPO checkpoint
runs/<name>/best/vecnormalize.pkl # observation-normaliser stats
Pass either the directory or the explicit zip path.
"""
import os
from pathlib import Path
class PolicyHandle:
    """Bundle a loaded PPO model with its optional VecNormalize statistics.

    The controller hands ``predict`` a single raw observation and gets a
    single action back; batching and observation normalisation happen here.
    """

    def __init__(self, model, vecnorm):
        # model:   object exposing ``predict(obs, deterministic=...)``.
        # vecnorm: object exposing ``normalize_obs(obs)``, or None to feed the
        #          raw observation straight to the model.
        self.model = model
        self.vecnorm = vecnorm

    def predict(self, obs):
        """Return the deterministic action for one observation.

        ``obs`` is flattened into a float32 batch of shape ``(1, obs_dim)``,
        normalised when a normaliser is attached, and the first (only) row
        of the model's action batch is returned.
        """
        # Imported lazily so importing this module stays dependency-free.
        import numpy as np

        # Both the normaliser and the model are fed a batched observation.
        obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1)
        if self.vecnorm is not None:
            obs_b = self.vecnorm.normalize_obs(obs_b)
        action, _ = self.model.predict(obs_b, deterministic=True)
        return action[0]
def load(model_path: str, vecnorm_path: str | None = None) -> PolicyHandle:
"""Load a PPO model (and optional VecNormalize) from disk.
``model_path`` may be the .zip checkpoint or a directory containing
``best_model.zip`` (and optionally ``vecnormalize.pkl``).
"""
p = Path(model_path)
if p.is_dir():
zip_candidates = [p / "best_model.zip", p / "final.zip", p / "policy.zip"]
zip_path = next((z for z in zip_candidates if z.exists()), None)
if zip_path is None:
raise FileNotFoundError(
f"No PPO zip found in {p} (looked for best_model.zip, final.zip, policy.zip)"
)
if vecnorm_path is None:
vn = p / "vecnormalize.pkl"
if vn.exists():
vecnorm_path = str(vn)
else:
zip_path = p
# Imports deferred so the Strömbom path doesn't require SB3.
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize
model = PPO.load(str(zip_path), device="auto")
vecnorm = None
if vecnorm_path and os.path.exists(vecnorm_path):
# VecNormalize.load needs a venv to attach to; we only need its stats
# at inference, so we reconstruct the wrapper manually.
import pickle
with open(vecnorm_path, "rb") as f:
vecnorm = pickle.load(f)
vecnorm.training = False
vecnorm.norm_reward = False
return PolicyHandle(model=model, vecnorm=vecnorm)
+245 -50
View File
@@ -1,14 +1,182 @@
""" """Shepherd Dog controller (Webots).
Shepherd Dog controller (Webots, manual keyboard control).
WASD / arrow keys drive the robot. +/- adjust speed in 10 % increments. Runs in one of two modes selected by the ``HERDING_MODE`` environment
GPS position is broadcast every step on channel 1 so sheep controllers variable:
can compute flee forces. Ears wag continuously via sinusoidal position
targets — purely cosmetic. HERDING_MODE=rl → load an SB3 PPO policy from
HERDING_POLICY_DIR (default
training/runs/latest/best) and use its
(vx, vy) action each step.
HERDING_MODE=strombom → use the analytic Strömbom collect/drive
heuristic. This is the fallback if the RL
policy can't be loaded (e.g. SB3 not
installed in the Webots Python env, or no
checkpoint yet).
Both modes share the same low-level differential-drive controller
(``herding.diffdrive.velocity_to_wheels`` + clamped forward speed), so
switching modes does not retune the actuation layer.
A safety supervisor enforces the "dog stays out of the pen" invariant:
if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is
overridden with a north-driving correction. This is a hard guarantee
the policy cannot escape.
""" """
import math import math
from controller import Robot, Keyboard import os
import sys
# --- Make the shared herding/ package importable from this controller dir ---
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
from controller import Robot
from herding.diffdrive import velocity_to_wheels
from herding.geometry import (
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA,
DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS,
PEN_ENTRY,
)
from herding.obs import build_obs
from herding.sequential import compute_action_debug as sequential_action_debug
from herding.strombom import compute_action_debug as strombom_action_debug
# ---------------------------------------------------------------------------
# Mode selection
# ---------------------------------------------------------------------------
def _load_runtime_config():
    """Parse ``herding_runtime.cfg`` from the project root into a dict.

    Webots strips HERDING_* env vars in some configurations, so the launcher
    writes this small ``key=value`` file as a fallback channel for the mode
    and policy directory.

    Keys are stripped and upper-cased, values are stripped. Blank lines,
    lines starting with ``#`` and lines without ``=`` are ignored. A missing
    or unreadable file yields an empty dict.
    """
    cfg_path = os.path.join(_PROJECT_ROOT, "herding_runtime.cfg")
    if not os.path.exists(cfg_path):
        return {}

    settings = {}
    try:
        with open(cfg_path) as handle:
            for raw in handle:
                entry = raw.strip()
                if entry == "" or entry[0] == "#":
                    continue
                key, sep, value = entry.partition("=")
                if not sep:
                    # No "=" on this line, so it is not a setting.
                    continue
                settings[key.strip().upper()] = value.strip()
    except OSError:
        # Best effort: any read failure means "no overrides".
        return {}
    return settings
_runtime_cfg = _load_runtime_config()
MODE = (os.environ.get("HERDING_MODE")
or _runtime_cfg.get("HERDING_MODE")
or "rl").lower()
def _resolve_policy_dir() -> str:
    """Pick the directory the trained policy is loaded from.

    Search order (first existing directory wins):
      1. HERDING_POLICY_DIR from the environment, else from the runtime
         config file
      2. training/runs/bc_pretrained/   (BC-only checkpoint)
      3. training/runs/bc_ppo/best/     (PPO fine-tuned best)
      4. training/runs/latest/best/     (legacy default)

    If none exists, the configured value is returned anyway (or the first
    fallback when nothing was configured), so the error message names a
    concrete path.
    """
    configured = (os.environ.get("HERDING_POLICY_DIR")
                  or _runtime_cfg.get("HERDING_POLICY_DIR"))
    if configured and os.path.isdir(configured):
        return configured

    runs_root = os.path.join(_PROJECT_ROOT, "training", "runs")
    fallbacks = [
        os.path.join(runs_root, "bc_pretrained"),
        os.path.join(runs_root, "bc_ppo", "best"),
        os.path.join(runs_root, "latest", "best"),
    ]
    existing = next((d for d in fallbacks if os.path.isdir(d)), None)
    if existing is not None:
        return existing

    # Nothing on disk: hand back something informative for the error path.
    return configured or fallbacks[0]
POLICY_DIR = _resolve_policy_dir()
policy_handle = None
if MODE == "rl":
print(f"[dog] HERDING_MODE={MODE} HERDING_POLICY_DIR(env)="
f"{os.environ.get('HERDING_POLICY_DIR', '<unset>')}")
print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists="
f"{os.path.isdir(POLICY_DIR)}")
if os.path.isdir(POLICY_DIR):
try:
entries = sorted(os.listdir(POLICY_DIR))
except OSError:
entries = []
print(f"[dog] dir contents: {entries}")
try:
from policy_loader import load as _load_policy
policy_handle = _load_policy(POLICY_DIR)
print(f"[dog] RL policy loaded from {POLICY_DIR}")
except Exception as exc:
print(f"[dog] RL policy load failed ({exc!r}); falling back to Strömbom.")
MODE = "strombom"
if MODE not in ("rl", "strombom", "sequential"):
print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
MODE = "strombom"
print(f"[dog] running in mode={MODE}")
# ---------------------------------------------------------------------------
# Action smoothing + safety supervisor
# ---------------------------------------------------------------------------
ACTION_SMOOTH = 0.35
prev_action = (0.0, 0.0)
def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
    """Override southward actions when the dog is at the south barrier.

    * South of ``DOG_SOUTH_LIMIT`` with any southward ``vy``: replaced by a
      pure northward action ``(0, 1)``.
    * Within 0.5 of the limit with ``vy < -0.2``: ``vx`` is halved and ``vy``
      is raised by 0.5, floored at zero.
    * Otherwise the action passes through unchanged.

    ``dog_x`` is accepted but not consulted.
    """
    heading_south = vy < 0.0
    if heading_south and dog_y < DOG_SOUTH_LIMIT:
        return (0.0, 1.0)

    in_buffer = dog_y < DOG_SOUTH_LIMIT + 0.5
    if in_buffer and vy < -0.2:
        return (vx * 0.5, max(0.0, vy + 0.5))

    return (vx, vy)
# ---------------------------------------------------------------------------
# Driving
# ---------------------------------------------------------------------------
def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float):
    """Convert a planar velocity command into wheel speeds and apply them.

    Commands with magnitude below 1e-3 stop both wheels. Otherwise the
    current heading is read from the compass and the command is passed to
    ``velocity_to_wheels`` together with the dog's drive limits.
    """
    if math.hypot(vx, vy) < 1e-3:
        wheel_speeds = (0.0, 0.0)
    else:
        north = compass.getValues()
        heading = math.atan2(north[0], north[1])
        wheel_speeds = velocity_to_wheels(
            vx, vy, heading,
            max_linear=DOG_MAX_LINEAR,
            wheel_radius=DOG_WHEEL_RADIUS,
            max_wheel_omega=motor_max,
            k_turn=4.0,
        )
    left_w, right_w = wheel_speeds
    left_motor.setVelocity(left_w)
    right_motor.setVelocity(right_w)
# ---------------------------------------------------------------------------
# Webots devices
# ---------------------------------------------------------------------------
robot = Robot() robot = Robot()
timestep = int(robot.getBasicTimeStep()) timestep = int(robot.getBasicTimeStep())
@@ -19,70 +187,97 @@ left_motor.setPosition(float("inf"))
right_motor.setPosition(float("inf")) right_motor.setPosition(float("inf"))
left_motor.setVelocity(0.0) left_motor.setVelocity(0.0)
right_motor.setVelocity(0.0) right_motor.setVelocity(0.0)
MOTOR_MAX = min(left_motor.getMaxVelocity(), DOG_MAX_WHEEL_OMEGA)
lidar = robot.getDevice("lidar")
lidar.enable(timestep)
lidar.enablePointCloud()
gps = robot.getDevice("gps"); gps.enable(timestep) gps = robot.getDevice("gps"); gps.enable(timestep)
compass = robot.getDevice("compass"); compass.enable(timestep) compass = robot.getDevice("compass"); compass.enable(timestep)
emitter = robot.getDevice("emitter")
receiver = robot.getDevice("receiver"); receiver.enable(timestep) receiver = robot.getDevice("receiver"); receiver.enable(timestep)
emitter = robot.getDevice("emitter")
# Cosmetic ear motors — ignored by control logic but keep them animated.
left_ear = robot.getDevice("left ear motor") left_ear = robot.getDevice("left ear motor")
right_ear = robot.getDevice("right ear motor") right_ear = robot.getDevice("right ear motor")
left_ear.setPosition(float("inf")) left_ear.setPosition(float("inf"))
right_ear.setPosition(float("inf")) right_ear.setPosition(float("inf"))
left_ear.setVelocity(0.0) left_ear.setVelocity(0.0)
right_ear.setVelocity(0.0) right_ear.setVelocity(0.0)
keyboard = robot.getKeyboard()
keyboard.enable(timestep)
MOTOR_MAX = left_motor.getMaxVelocity()
speed_level = 0.5 # fraction of MOTOR_MAX; adjusted by +/-
EAR_AMPLITUDE = 0.35 # rad, peak ear deflection
EAR_RATE = 8.0 # rad/s, how fast the ears are driven
ear_phase = 0.0 ear_phase = 0.0
EAR_AMPLITUDE = 0.35
EAR_RATE = 8.0
# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------
# {name: (x, y)} — kept across all sheep ever heard from. Sheep that drift
# into the pen are tracked by ``penned_set`` so observations and Strömbom
# agree on which ones still need herding.
sheep_positions: dict = {}
penned_set: set = set()
step_count = 0
from herding.geometry import is_penned_position
while robot.step(timestep) != -1: while robot.step(timestep) != -1:
speed = MOTOR_MAX * speed_level step_count += 1
turn = speed * 0.6 # differential turn radius
left_vel = 0.0 while receiver.getQueueLength() > 0:
right_vel = 0.0 msg = receiver.getString()
key = keyboard.getKey() receiver.nextPacket()
while key > 0: parts = msg.split(":")
if key in (ord('W'), Keyboard.UP): if len(parts) == 4 and parts[0] == "sheep":
left_vel = speed try:
right_vel = speed x, y = float(parts[2]), float(parts[3])
elif key in (ord('S'), Keyboard.DOWN): except ValueError:
left_vel = -speed continue
right_vel = -speed sheep_positions[parts[1]] = (x, y)
elif key in (ord('A'), Keyboard.LEFT): if parts[1] not in penned_set and is_penned_position(x, y):
left_vel = -turn penned_set.add(parts[1])
right_vel = turn
elif key in (ord('D'), Keyboard.RIGHT):
left_vel = turn
right_vel = -turn
elif key in (ord('+'), ord('=')):
speed_level = min(1.0, speed_level + 0.1)
print(f"Speed: {speed_level:.0%} ({MOTOR_MAX * speed_level:.1f} rad/s)")
elif key in (ord('-'), ord('_')):
speed_level = max(0.1, speed_level - 0.1)
print(f"Speed: {speed_level:.0%} ({MOTOR_MAX * speed_level:.1f} rad/s)")
key = keyboard.getKey()
left_motor.setVelocity(left_vel)
right_motor.setVelocity(right_vel)
pos = gps.getValues() pos = gps.getValues()
emitter.send(f"dog:{pos[0]}:{pos[1]}") dog_xy = (pos[0], pos[1])
n = compass.getValues()
dog_heading = math.atan2(n[0], n[1])
# ---- Action selection ----
if MODE == "rl" and policy_handle is not None:
sheep_xy_list = list(sheep_positions.values())
sheep_names = list(sheep_positions.keys())
sheep_penned_list = [s in penned_set for s in sheep_names]
obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list)
action = policy_handle.predict(obs)
vx, vy = float(action[0]), float(action[1])
elif MODE == "sequential":
vx, vy, _mode_str, _dbg = sequential_action_debug(
dog_xy, sheep_positions, PEN_ENTRY,
)
else:
# Strömbom (canonical baseline).
vx, vy, _mode_str, _dbg = strombom_action_debug(
dog_xy, sheep_positions, PEN_ENTRY,
)
# EMA smoothing — reduces oscillation from policy or Strömbom flips.
vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx
vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy
# Safety: dog must never enter the pen.
vx, vy = safety_clamp(vx, vy, dog_xy[0], dog_xy[1])
prev_action = (vx, vy)
drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX)
emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}")
# Cosmetic ear wiggle — purely visual.
ear_phase += 0.12 ear_phase += 0.12
ear_pos = EAR_AMPLITUDE * math.sin(ear_phase) ear_pos = EAR_AMPLITUDE * math.sin(ear_phase)
left_ear.setVelocity(EAR_RATE) left_ear.setVelocity(EAR_RATE)
right_ear.setVelocity(EAR_RATE) right_ear.setVelocity(EAR_RATE)
left_ear.setPosition(ear_pos) left_ear.setPosition(ear_pos)
right_ear.setPosition(-ear_pos) right_ear.setPosition(-ear_pos)
if step_count % 200 == 0:
n_active = sum(1 for s in sheep_positions if s not in penned_set)
print(f"[dog mode={MODE}] step={step_count} known={len(sheep_positions)} "
f"penned={len(penned_set)} active={n_active} action=({vx:+.2f}, {vy:+.2f})")
Binary file not shown.
-153
View File
@@ -1,153 +0,0 @@
"""
Render Webots-side debug trajectory from debug.csv.
The shepherd_dog_rl controller writes per-step state to debug.csv when
DOG_DEBUG=1. This script reads it and produces:
trajectory.png — dog path + sheep paths overlaid on the field
obs.png — raw and VecNormalize-normalised observation values over time
actions.png — vx, vy time series
Run:
python plot_debug.py # uses debug.csv next to this file
python plot_debug.py --csv path/to.csv --out-dir somewhere/
"""
import argparse
import csv
import os
import sys
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
def load_csv(path):
    """Read the debug CSV at ``path`` and return its rows as a list of dicts.

    Each row maps the header names to the raw string cell values. The
    process exits with an ``empty CSV`` message when the file has no data
    rows.
    """
    # newline="" lets the csv module do its own line-ending handling, so
    # newlines embedded in quoted fields survive unchanged.
    with open(path, newline="") as f:
        rows = list(csv.DictReader(f))
    if not rows:
        sys.exit(f"empty CSV: {path}")
    return rows
def parse_floats(s):
    """Split a ``;``-separated string into floats, skipping empty fields."""
    return list(map(float, filter(None, s.split(";"))))
def plot_trajectory(rows, out_path):
    """Draw the dog and sheep paths over the field outline and save a PNG.

    ``rows`` are per-step dicts; the columns read here are ``dog_x``,
    ``dog_y``, ``sheep_xs``, ``sheep_ys`` and (from the last row only)
    ``n_penned``. The figure is written to ``out_path`` and then closed.
    """
    wall_colour = "#795548"
    dog_colour = "#4e342e"

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.set_xlim(-16, 16)
    ax.set_ylim(-16, 16)
    ax.set_aspect("equal")
    ax.set_facecolor("#dcedc8")

    # Field outline, then the filled pen rectangle with its caption.
    field = mpatches.Rectangle((-15, -15), 30, 30,
                               fill=False, edgecolor=wall_colour, lw=2)
    pen = mpatches.Rectangle((10, -15), 3, 7,
                             facecolor="#ffe082", edgecolor=wall_colour, lw=2)
    ax.add_patch(field)
    ax.add_patch(pen)
    ax.text(11.5, -11.5, "pen", ha="center", va="center", fontsize=8)

    # Dog path: square marks the first sample, diamond the last.
    dog_track_x = [float(row["dog_x"]) for row in rows]
    dog_track_y = [float(row["dog_y"]) for row in rows]
    ax.plot(dog_track_x, dog_track_y, color=dog_colour, lw=1.5, alpha=0.7, label="dog")
    ax.plot(dog_track_x[0], dog_track_y[0], "s", color=dog_colour, ms=10)
    ax.plot(dog_track_x[-1], dog_track_y[-1], "D", color=dog_colour, ms=10)

    # Sheep: each row stores all sheep for one step, so transpose the rows
    # into one track per sheep. The last row decides how many are drawn.
    sheep_x_rows = [parse_floats(row["sheep_xs"]) for row in rows]
    sheep_y_rows = [parse_floats(row["sheep_ys"]) for row in rows]
    if sheep_x_rows and sheep_x_rows[-1]:
        palette = ("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00",
                   "#a65628", "#f781bf", "#999999", "#66c2a5", "#fc8d62")
        for idx in range(len(sheep_x_rows[-1])):
            # Steps that recorded fewer sheep simply contribute no sample.
            track_x = [step[idx] for step in sheep_x_rows if idx < len(step)]
            track_y = [step[idx] for step in sheep_y_rows if idx < len(step)]
            if not track_x:
                continue
            colour = palette[idx % len(palette)]
            ax.plot(track_x, track_y, color=colour, lw=0.8, alpha=0.6,
                    label=f"sheep {idx+1}")
            ax.plot(track_x[0], track_y[0], "o", color=colour, ms=6)
            ax.plot(track_x[-1], track_y[-1], "*", color=colour, ms=10)

    n_in_pen = int(rows[-1]["n_penned"])
    ax.set_title(f"Webots trajectory {len(rows)} steps penned={n_in_pen}",
                 fontsize=12)
    ax.legend(loc="upper left", fontsize=7, ncol=2)
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
def plot_actions(rows, out_path):
    """Plot ``vx``, ``vy`` and the action magnitude per step and save a PNG.

    Three stacked panels share the step axis. The magnitude panel carries
    reference lines at 1.0 and at sqrt(2).
    """
    steps = np.arange(len(rows))
    vx = np.array([float(row["vx"]) for row in rows])
    vy = np.array([float(row["vy"]) for row in rows])
    magnitude = np.sqrt(vx ** 2 + vy ** 2)

    fig, axes = plt.subplots(3, 1, figsize=(12, 7), sharex=True)

    # The two component panels differ only in data, colour and label.
    components = ((axes[0], vx, "tab:red", "vx"),
                  (axes[1], vy, "tab:blue", "vy"))
    for panel, series, colour, label in components:
        panel.plot(steps, series, color=colour, lw=0.8)
        panel.set_ylabel(label)
        panel.axhline(0, color="black", lw=0.4)
        panel.set_ylim(-1.1, 1.1)

    mag_panel = axes[2]
    mag_panel.plot(steps, magnitude, color="tab:purple", lw=0.8)
    mag_panel.set_ylabel("||action||")
    mag_panel.axhline(np.sqrt(2), color="orange", ls="--", lw=1, label="saturated √2")
    mag_panel.axhline(1.0, color="gray", ls="--", lw=1)
    mag_panel.set_xlabel("step")
    mag_panel.legend(fontsize=8)

    fig.suptitle("Webots action time series")
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
def plot_obs(rows, out_path):
    """Plot each observation dimension (raw vs. normalised) and save a PNG.

    Reads the ``norm_obs`` and ``raw_obs`` columns of ``rows`` (``;``-joined
    floats per step) and draws one panel per dimension. Returns without
    writing anything when no normalised observations were logged.
    """
    norm = np.array([parse_floats(r["norm_obs"]) for r in rows])
    raw = np.array([parse_floats(r["raw_obs"]) for r in rows])
    if norm.size == 0:
        return
    n_dims = norm.shape[1]
    labels = [
        "dog_x", "dog_y", "com-dog_x", "com-dog_y",
        "far1-com_x", "far1-com_y", "far2-com_x", "far2-com_y",
        "far3-com_x", "far3-com_y", "pen-com_x", "pen-com_y",
        "pen-far1_x", "pen-far1_y", "radius", "frac_active",
    ]
    # Trim to the logged width, and fall back to generic names for any
    # dimension beyond the known ones so wider observations still plot
    # instead of raising IndexError.
    labels = labels[:n_dims] + [f"obs[{i}]" for i in range(len(labels), n_dims)]
    t = np.arange(norm.shape[0])
    fig, axes = plt.subplots(n_dims, 1, figsize=(11, 1.0 * n_dims), sharex=True)
    if n_dims == 1:
        # plt.subplots returns a bare Axes for a single panel.
        axes = [axes]
    for i in range(n_dims):
        axes[i].plot(t, raw[:, i], color="tab:gray", lw=0.6, alpha=0.6, label="raw")
        axes[i].plot(t, norm[:, i], color="tab:red", lw=0.8, label="normalised")
        axes[i].set_ylabel(labels[i], fontsize=8)
        axes[i].tick_params(labelsize=7)
        if i == 0:
            axes[i].legend(fontsize=7, loc="upper right")
    axes[-1].set_xlabel("step")
    fig.suptitle("Observation values over time (raw vs VecNormalize-normalised)")
    plt.tight_layout()
    fig.savefig(out_path, dpi=110)
    plt.close(fig)
def main():
    """Command-line entry point: load the debug CSV and render all plots.

    ``--csv`` defaults to ``debug.csv`` and ``--out-dir`` to ``debug_out``,
    both next to this script. The output directory is created if missing.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    parser = argparse.ArgumentParser()
    parser.add_argument("--csv", default=os.path.join(script_dir, "debug.csv"))
    parser.add_argument("--out-dir",
                        default=os.path.join(script_dir, "debug_out"))
    args = parser.parse_args()

    # Load first so a missing/unreadable CSV fails before any directory is made.
    rows = load_csv(args.csv)
    os.makedirs(args.out_dir, exist_ok=True)
    print(f"loaded {len(rows)} rows from {args.csv}")

    renderers = (
        (plot_trajectory, "trajectory.png"),
        (plot_actions, "actions.png"),
        (plot_obs, "obs.png"),
    )
    for render, filename in renderers:
        render(rows, os.path.join(args.out_dir, filename))
    print(f"saved trajectory.png + actions.png + obs.png to {args.out_dir}/")
if __name__ == "__main__":
main()
@@ -1,285 +0,0 @@
"""
Shepherd Dog RL controller — runs a trained SB3 PPO policy inside Webots.
Setup
-----
1. Copy your trained files into this directory:
controllers/shepherd_dog_rl/final_model.zip
controllers/shepherd_dog_rl/vecnorm.pkl
2. In field.wbt, set the ShepherdDog robot's controller field to
"shepherd_dog_rl". You can do this in the Webots GUI:
click the robot → Controller → shepherd_dog_rl
3. Optional: set controllerArgs to the number of sheep (e.g. ["5"]) if it
differs from the default of 3.
The controller reads GPS (dog position), Compass (dog heading) and Receiver
(sheep broadcasts), builds the same 18-dim flock observation the training env
used (16 flock features plus cos/sin of the dog heading), normalises it with
the saved VecNormalize stats, and converts the (vx, vy) policy output into
differential wheel speeds.
Debug logging
-------------
Leave the module constant DEBUG_ENABLED set to True (the default) to write a
per-step CSV (dog pos, sheep positions, raw obs, normalised obs, action) to
debug.csv alongside this script; set it to False to disable logging. Use
plot_debug.py to render trajectories from it.
"""
import sys
import os
import math
import struct
import numpy as np
# ── make training code importable ───────────────────────────────────────────
_HERE = os.path.dirname(os.path.abspath(__file__))
_TRAINING = os.path.join(_HERE, "..", "..", "training")
sys.path.insert(0, _TRAINING)
from controller import Robot
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from herding_env import HerdingEnv
# ── constants (must match herding_env.py) ───────────────────────────────────
# FIELD normalises the dog position in build_obs(); relative offsets are
# normalised by 2 * FIELD.
# NOTE(review): presumably the field half-width in metres — confirm against
# herding_env.py.
FIELD = 15.0
# Stand-in for the flock centre and far points once no sheep is active.
PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
# Open-interval pen bounds tested by in_pen().
PEN_X = (10.0, 13.0)
PEN_Y = (-15.0, -8.0)
DOG_SPEED = 2.5 # m/s
WHEEL_R = 0.038 # wheel radius (metres) — from ShepherdDog.proto
K_TURN = 4.0 # heading-error gain (rad/s per rad)
# Ear animation: amplitude of the sinusoidal position target and the velocity
# set on the ear motors each step.
EAR_AMPLITUDE = 0.35
EAR_RATE = 8.0
# ── model paths ─────────────────────────────────────────────────────────────
# All paths are resolved next to this script.
MODEL_PATH = os.path.join(_HERE, "final_model.zip")
VECNORM_PATH = os.path.join(_HERE, "vecnorm.pkl")
DEBUG_CSV = os.path.join(_HERE, "debug.csv")
DEBUG_ENABLED = True # set False to disable debug.csv logging
# ── action smoothing ─────────────────────────────────────────────────────────
# EMA on policy output to suppress the rapid oscillation (vx/vy flipping
# between -1 and +1 every step) that stalls the physical dog. 0 = no
# smoothing (raw policy), 1 = frozen. 0.3 keeps ~30% of previous action.
ACTION_SMOOTH = 0.3
# EMA state: the last smoothed action, updated in place by the main loop.
prev_action = np.zeros(2, dtype=np.float32)
def norm_angle(a: float) -> float:
    """Wrap an angle in radians into [-pi, pi] by whole-turn shifts."""
    full_turn = 2 * math.pi
    while True:
        if a > math.pi:
            a -= full_turn
        elif a < -math.pi:
            a += full_turn
        else:
            return a
def in_pen(x: float, y: float) -> bool:
    """Return True when (x, y) lies strictly inside the pen rectangle."""
    x_lo, x_hi = PEN_X[0], PEN_X[1]
    inside_x = x_lo < x < x_hi
    # The y test is only evaluated when the x test passes.
    return inside_x and PEN_Y[0] < y < PEN_Y[1]
def build_obs(dog_pos: np.ndarray,
              sheep_dict: dict,
              n_sheep: int,
              dog_heading: float = 0.0) -> np.ndarray:
    """
    Assemble the 18-dim flock observation fed to the policy.

    dog_pos:     dog (x, y) position.
    sheep_dict:  {name: (x, y)} for ALL known sheep (penned or not).
    n_sheep:     flock size used to normalise the active fraction.
    dog_heading: dog's current world-frame heading in radians.

    Sheep inside the pen are ignored. With no active sheep the flock
    centre and the three "far" points all collapse to PEN_CENTER.
    """
    span = 2 * FIELD

    free = [pos for pos in sheep_dict.values() if not in_pen(*pos)]
    active_pos = np.array(free, dtype=np.float32)
    n_active = len(active_pos)

    if n_active == 0:
        com = PEN_CENTER.copy()
        radius = 0.0
        far_points = [PEN_CENTER.copy()] * 3
    else:
        com = active_pos.mean(axis=0)
        spread = np.linalg.norm(active_pos - com, axis=1)
        # Indices of active sheep, furthest from the centre first.
        order = np.argsort(spread)[::-1]
        radius = float(spread[order[0]])
        far_points = [active_pos[i] for i in order[:3]]
        # Fewer than three active sheep: pad with the centre itself.
        far_points += [com] * (3 - len(far_points))
    far1, far2, far3 = far_points

    def scaled(head, tail):
        """(head - tail) per axis, normalised by the field span."""
        return [(head[0] - tail[0]) / span, (head[1] - tail[1]) / span]

    features = [dog_pos[0] / FIELD, dog_pos[1] / FIELD]
    features += scaled(com, dog_pos)
    features += scaled(far1, com)
    features += scaled(far2, com)
    features += scaled(far3, com)
    features += scaled(PEN_CENTER, com)
    features += scaled(PEN_CENTER, far1)
    features += [
        radius / span,
        n_active / max(n_sheep, 1),
        math.cos(dog_heading),
        math.sin(dog_heading),
    ]
    return np.array(features, dtype=np.float32)
# ── Webots setup ─────────────────────────────────────────────────────────────
robot = Robot()
timestep = int(robot.getBasicTimeStep())
# Drive motors: infinite position target puts them in velocity-control mode,
# starting at rest.
left_motor = robot.getDevice("left wheel motor")
right_motor = robot.getDevice("right wheel motor")
left_motor.setPosition(float("inf"))
right_motor.setPosition(float("inf"))
left_motor.setVelocity(0.0)
right_motor.setVelocity(0.0)
# Wheel-speed saturation limit used by drive().
MOTOR_MAX = left_motor.getMaxVelocity()
# Sensors (all sampled once per controller step)
gps = robot.getDevice("gps"); gps.enable(timestep)
compass = robot.getDevice("compass"); compass.enable(timestep)
receiver = robot.getDevice("receiver"); receiver.enable(timestep)
emitter = robot.getDevice("emitter")
# Cosmetic
left_ear = robot.getDevice("left ear motor")
right_ear = robot.getDevice("right ear motor")
left_ear.setPosition(float("inf")); right_ear.setPosition(float("inf"))
left_ear.setVelocity(0.0); right_ear.setVelocity(0.0)
ear_phase = 0.0
# Number of sheep: first controller argument, falling back to 3 when it is
# missing or not an integer.
try:
    n_sheep = int(sys.argv[1])
except (IndexError, ValueError):
    n_sheep = 3
# ── Load model ───────────────────────────────────────────────────────────────
print(f"[RL dog] Loading model from {MODEL_PATH}")
print(f"[RL dog] Loading vecnorm from {VECNORM_PATH}")
# VecNormalize.load() needs a vectorised env to attach to; the env is never
# stepped here, it only carries the spaces.
dummy_env = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep)])
vecnorm = VecNormalize.load(VECNORM_PATH, dummy_env)
# Inference mode: keep the saved statistics fixed and leave rewards alone.
vecnorm.training = False
vecnorm.norm_reward = False
model = PPO.load(MODEL_PATH, device="cpu")
print(f"[RL dog] Model loaded — running with n_sheep={n_sheep}")
# ── Runtime state ─────────────────────────────────────────────────────────────
sheep_positions: dict = {} # {name: (x, y)} — updated every step from receiver
step_count = 0
# Debug CSV — one row written per step when DEBUG_ENABLED is True.
# debug_writer only exists in that case; guard uses on `debug_file is not None`.
debug_file = None
if DEBUG_ENABLED:
    import csv
    debug_file = open(DEBUG_CSV, "w", newline="")
    debug_writer = csv.writer(debug_file)
    debug_writer.writerow([
        "step", "dog_x", "dog_y", "heading",
        "sheep_xs", "sheep_ys", "n_active", "n_penned",
        "raw_obs", "norm_obs", "vx", "vy",
    ])
    print(f"[RL dog] DEBUG logging to {DEBUG_CSV}")
def bearing() -> float:
    """Return the robot's current world-frame heading in radians.

    Derived from the first two components of the compass reading.
    """
    north = compass.getValues()
    first, second = north[0], north[1]
    return math.atan2(first, second)
def drive(action_vx: float, action_vy: float) -> None:
    """Turn a (vx, vy) policy action into left/right wheel velocities.

    The action direction is the target heading and its magnitude scales
    DOG_SPEED. Forward speed is attenuated by cos(heading error), floored
    at zero, while a proportional term on the error steers the wheels.
    Commands below 0.05 m/s stop both motors.
    """
    def saturate(rate: float) -> float:
        """Clamp a wheel rate to the motor's velocity limit."""
        return max(-MOTOR_MAX, min(MOTOR_MAX, rate))

    speed_ms = math.sqrt(action_vx ** 2 + action_vy ** 2) * DOG_SPEED
    if speed_ms < 0.05:
        for motor in (left_motor, right_motor):
            motor.setVelocity(0.0)
        return

    heading_error = norm_angle(math.atan2(action_vy, action_vx) - bearing())
    forward_ms = speed_ms * max(0.0, math.cos(heading_error))
    forward_rate = forward_ms / WHEEL_R      # m/s -> wheel rad/s
    steer = K_TURN * heading_error           # rad/s correction

    left_cmd = saturate(forward_rate - steer)
    right_cmd = saturate(forward_rate + steer)
    left_motor.setVelocity(left_cmd)
    right_motor.setVelocity(right_cmd)
# ── Main loop ─────────────────────────────────────────────────────────────────
# Each step: read sheep broadcasts, build + normalise the observation, run the
# policy, smooth the action, drive, broadcast the dog position, animate the
# ears, and optionally log. The try/finally guarantees the debug CSV is closed
# (and its buffered tail written) when the simulation ends or an error occurs.
try:
    while robot.step(timestep) != -1:
        step_count += 1

        # 1. Drain receiver — update sheep position table
        while receiver.getQueueLength() > 0:
            try:
                msg = receiver.getString()
                parts = msg.split(":")
                if parts[0] == "sheep" and len(parts) == 4:
                    sheep_positions[parts[1]] = (float(parts[2]), float(parts[3]))
            except ValueError:
                # Best effort: skip packets that are not valid text or whose
                # coordinates do not parse (UnicodeDecodeError is a ValueError).
                pass
            # Always advance, otherwise a bad packet would stall the queue.
            receiver.nextPacket()

        # 2. Dog GPS
        gps_vals = gps.getValues()
        dog_pos = np.array([gps_vals[0], gps_vals[1]], dtype=np.float32)

        # 3. Build and normalise observation (heading from compass)
        heading = bearing()
        raw_obs = build_obs(dog_pos, sheep_positions, n_sheep,
                            dog_heading=heading)
        obs_norm = vecnorm.normalize_obs(raw_obs[np.newaxis])  # (1, 18)

        # 4. Policy inference + smoothing
        action, _ = model.predict(obs_norm, deterministic=True)
        raw_a = np.array([float(action[0][0]), float(action[0][1])], dtype=np.float32)
        if ACTION_SMOOTH > 0:
            # Exponential moving average; prev_action carries the state.
            smoothed = ACTION_SMOOTH * prev_action + (1.0 - ACTION_SMOOTH) * raw_a
            prev_action[:] = smoothed
            vx, vy = float(smoothed[0]), float(smoothed[1])
        else:
            vx, vy = float(raw_a[0]), float(raw_a[1])

        # 5. Drive
        drive(vx, vy)

        # 6. Broadcast dog position so sheep can compute flee forces
        emitter.send(f"dog:{dog_pos[0]:.4f}:{dog_pos[1]:.4f}")

        # 7. Ear animation
        ear_phase += 0.12
        ep = EAR_AMPLITUDE * math.sin(ear_phase)
        left_ear.setVelocity(EAR_RATE); right_ear.setVelocity(EAR_RATE)
        left_ear.setPosition( ep); right_ear.setPosition(-ep)

        # Periodic status
        if step_count % 100 == 0:
            n_in_pen = sum(1 for x, y in sheep_positions.values() if in_pen(x, y))
            print(f"[RL dog] step={step_count} known_sheep={len(sheep_positions)}"
                  f" penned={n_in_pen}/{n_sheep} dog=({dog_pos[0]:.2f},{dog_pos[1]:.2f})"
                  f" action=({vx:.2f}, {vy:.2f})")

        # Debug CSV row
        if debug_file is not None:
            n_active = sum(1 for x, y in sheep_positions.values() if not in_pen(x, y))
            n_in_pen = len(sheep_positions) - n_active
            debug_writer.writerow([
                step_count, f"{dog_pos[0]:.4f}", f"{dog_pos[1]:.4f}",
                f"{heading:.4f}",
                ";".join(f"{v[0]:.3f}" for v in sheep_positions.values()),
                ";".join(f"{v[1]:.3f}" for v in sheep_positions.values()),
                n_active, n_in_pen,
                ";".join(f"{x:.4f}" for x in raw_obs),
                ";".join(f"{x:.4f}" for x in obs_norm[0]),
                f"{vx:.4f}", f"{vy:.4f}",
            ])
            # Periodic flush so a hard kill loses at most 200 rows.
            if step_count % 200 == 0:
                debug_file.flush()
finally:
    if debug_file is not None:
        debug_file.close()
Binary file not shown.
+9 -10
View File
@@ -6,28 +6,28 @@
- Nelson Neto <up202108117@up.pt> - Nelson Neto <up202108117@up.pt>
## (i) Title and General objectives ## (i) Title and General objectives
**RL-Based Autonomous Shepherd Robot for Livestock Herding** **Autonomous Shepherd Robot for Livestock Herding (Strömbom)**
- Implement effective herding behaviors through proximity and movement strategies - Implement effective herding behaviors through proximity and movement strategies
- Build a 3D environment with realistic robot dynamics and LIDAR-based perception - Build a 3D environment with realistic robot dynamics and LIDAR-based perception
- Develop a mobile robot capable of autonomously guiding a flock of sheep into a designated target area using Reinforcement Learning - Develop a mobile robot capable of autonomously guiding a flock of sheep into a designated target area using the Strömbom heuristic approach
# Group G25 - (ii) Intermediate Goals # Group G25 - (ii) Intermediate Goals
## Intermediate goals ## Intermediate goals
- Set up the Webots simulation environment with an open field and target zone - Set up the Webots simulation environment with an open field and target zone
- Implement lightweight Gymnasium-based 2D herding environment - Implement lightweight 2D herding environment for algorithm evaluation
- Design a Sheep and Dog robot - Design a Sheep and Dog robot
- Implement a sheep flocking model for fast RL iteration - Implement a sheep flocking model for fast Strömbom iteration
- Validate LiDAR sensor feedback for sheep detection and distance estimation - Validate LiDAR sensor feedback for sheep detection and distance estimation
# Group G25 - Course Project (Final) Goals # Group G25 - Course Project (Final) Goals
## (iii) Main goals ## (iii) Main goals
- State-of-the-art survey on shepherding algorithms and multi-agent RL herding - State-of-the-art survey on shepherding algorithms with focus on Strömbom herding
- Train the robot using PPO to successfully herd a single sheep into the goal - Implement and tune Strömbom controller to successfully herd a single sheep into the goal
- Achieve fully autonomous herding of multiple sheep and a full flock into the target area - Achieve fully autonomous herding of multiple sheep and a full flock into the target area
- Optimize robot trajectory to minimize the time required to group the flock - Optimize robot trajectory to minimize the time required to group the flock
- Ensure zero collisions between the robot and the sheep during the task - Ensure zero collisions between the robot and the sheep during the task
@@ -35,7 +35,7 @@
- Article, demo video, and final presentation - Article, demo video, and final presentation
## (iv) Extra Merit ## (iv) Extra Merit
- Curriculum Learning (scaling from 1 sheep to a flock) - Progressive evaluation (scaling from 1 sheep to a flock)
- Comparison of performance between Differential Drive and Mecanum wheels - Comparison of performance between Differential Drive and Mecanum wheels
- Robustness testing under sensor noise or varying sheep speeds, configurations and parameters - Robustness testing under sensor noise or varying sheep speeds, configurations and parameters
- Multi-shepherd cooperative mode: 2 dogs learn role specialization (collector vs. driver) - Multi-shepherd cooperative mode: 2 dogs learn role specialization (collector vs. driver)
@@ -46,11 +46,10 @@
## (v) Tools ## (v) Tools
- Webots for 3D physics simulation with ROS2 integration via `webots_ros2` package - Webots for 3D physics simulation with ROS2 integration via `webots_ros2` package
- Stable-Baselines3 for the PPO algorithm implementation - Gymnasium (OpenAI) for the simulation wrapper and evaluation tooling
- Gymnasium (OpenAI) for the RL environment wrapper (lightweight 2D herding env for fast RL training)
- Python as the primary programming language (sheep flocking model, reward shaping, evaluation) - Python as the primary programming language (sheep flocking model, reward shaping, evaluation)
## (vi) Limitations ## (vi) Limitations
- Computational Power: Training time might be high for complex flock behaviors - Computational Power: Large batch evaluation and parameter sweeps can still be time-consuming
- Sim-to-Real Gap: No real-world validation of the herding controller; project is simulation-only (2D + Webots 3D) - Sim-to-Real Gap: No real-world validation of the herding controller; project is simulation-only (2D + Webots 3D)
- Model Complexity: Simplified sheep behavior (scripted) may not account for all biological livestock nuances - Model Complexity: Simplified sheep behavior (scripted) may not account for all biological livestock nuances
+8
View File
@@ -0,0 +1,8 @@
"""Shared core for the shepherd herding project.
This package is the single source of truth for world geometry, sheep
flocking dynamics, differential-drive kinematics, observation building,
and the Strömbom heuristic. It is imported both by the Webots
controllers (for inference) and by the Gymnasium training environment
(for fast PPO rollouts), so the two paths cannot drift apart.
"""
+70
View File
@@ -0,0 +1,70 @@
"""Differential-drive kinematics matching the Webots robot specs.
The Webots controllers and the training env both use these helpers so the
sim and the real (Webots) physics agree to first order. They do not model
slip, wheel acceleration limits, or contact forces — Webots does that for
us at inference time. The training env has to be close enough that a
policy trained against this kinematic model still works when handed off
to ODE physics.
"""
import math
def kinematics_step(x, y, h, w_left, w_right, wheel_radius, wheel_base, dt):
    """Advance a differential-drive pose by one explicit Euler step.

    Inputs
    ------
    x, y                     : robot position (m)
    h                        : robot heading (rad), 0 = +x axis
    w_left, w_right          : wheel angular velocities (rad/s)
    wheel_radius, wheel_base : robot dimensions (m)
    dt                       : timestep (s)

    The translation uses the heading at the start of the step; the new
    heading is wrapped through atan2.

    Returns (new_x, new_y, new_h).
    """
    linear = (w_right + w_left) * wheel_radius * 0.5
    angular = (w_right - w_left) * wheel_radius / wheel_base

    step_x = linear * math.cos(h) * dt
    step_y = linear * math.sin(h) * dt

    turned = h + angular * dt
    wrapped = math.atan2(math.sin(turned), math.cos(turned))
    return x + step_x, y + step_y, wrapped
def velocity_to_wheels(vx, vy, h, max_linear, wheel_radius, max_wheel_omega,
                       k_turn=4.0):
    """Map a (vx, vy) intent in [-1, 1]^2 to (left, right) wheel rates.

    The intent direction is the target heading and its magnitude scales
    ``max_linear``. Forward speed is multiplied by the cosine of the
    heading error (error clamped to ±90° for that cosine), and a
    proportional term ``k_turn * err`` is subtracted from the left wheel
    and added to the right. Both outputs are saturated at
    ``±max_wheel_omega``. Intents slower than 1e-3 m/s return (0.0, 0.0).
    """
    def saturate(rate):
        return max(-max_wheel_omega, min(max_wheel_omega, rate))

    commanded_ms = math.hypot(vx, vy) * max_linear
    if commanded_ms < 1e-3:
        return 0.0, 0.0

    offset = math.atan2(vy, vx) - h
    err = math.atan2(math.sin(offset), math.cos(offset))   # wrapped error
    quarter_turn = math.pi / 2
    bounded = max(-quarter_turn, min(quarter_turn, err))

    forward_rate = commanded_ms * math.cos(bounded) / wheel_radius
    steer = k_turn * err
    return saturate(forward_rate - steer), saturate(forward_rate + steer)
def heading_speed_to_wheels(heading, speed_motor, h, max_wheel_omega,
                            k_turn=4.0):
    """Sheep variant: the commanded speed is already in wheel rad/s.

    ``heading`` is the target world heading and ``h`` the current one.
    Forward drive is ``speed_motor`` scaled by cos(heading error), floored
    at zero so the robot never reverses; the proportional steering term
    ``k_turn * err`` is subtracted from the left wheel and added to the
    right. Both outputs are saturated at ``±max_wheel_omega``.
    """
    def saturate(rate):
        return max(-max_wheel_omega, min(max_wheel_omega, rate))

    offset = heading - h
    err = math.atan2(math.sin(offset), math.cos(offset))
    forward = max(0.0, math.cos(err)) * speed_motor
    steer = k_turn * err
    return saturate(forward - steer), saturate(forward + steer)
+178
View File
@@ -0,0 +1,178 @@
"""Reynolds-style sheep flocking dynamics.
This is the per-sheep behavioural step used both by the Webots sheep
controller (scalar, one sheep at a time) and by the training environment
(loop over sheep). The numerics are adapted from the original
``controllers/sheep/flocking.py`` and retuned for the new external-pen
layout: the south stone wall is intact except in the gate column, so
sheep can only reach the pen by walking through that 3-m corridor.
Force stack each step (summed → heading + speed):
flee — quadratic ramp away from dog within FLEE_DIST
cohesion — drift toward flock centre, strengthened while fleeing (weight 1.5 vs 0.6 at rest)
separation — inverse-distance push from peers
walls — soft repulsion + hard escape band against field walls,
except inside the gate column where the south wall is
absent
wander — small persistent drift for natural idle motion
A sheep latches to ``penned`` the first time it crosses the gate plane
into the gate column (handled by callers via ``geometry.is_penned_position``);
once latched, ``penned=True`` is passed in here and the force stack
switches to in-pen containment + jitter.
"""
import math
import random
from herding.geometry import (
FIELD_X, FIELD_Y,
PEN_X, PEN_Y,
GATE_X,
)
# --- Speed and force constants ---
# All speeds here are in wheel rad/s (motor units), matching the existing
# sheep controller. Conversion to m/s = speed * SHEEP_WHEEL_RADIUS.
# NOTE(review): MAX_SPEED is not referenced by compute_heading_speed below;
# confirm whether another module still uses it.
MAX_SPEED = 22.0
# Upper and lower clamp of the speed returned by compute_heading_speed.
FLEE_SPEED = 20.0
WANDER_SPEED = 3.0
# Distance from a field wall at which the soft repulsion starts ramping up.
WALL_MARGIN = 5.0
# Width and peak force of the hard escape band right next to a field wall.
WALL_HARD_MARGIN = 1.0
WALL_HARD_GAIN = 50.0
# Dog distance below which a free sheep flees.
FLEE_DIST = 7.0
# Peers closer than this push the sheep away.
SEPARATION_DIST = 2.5
# Peers closer than this contribute to the cohesion centre.
COHESION_DIST = 8.0
# Distance from a pen wall at which a penned sheep is pushed back inward.
PEN_MARGIN = 0.8
def _peers_iter(peers):
"""Accept either a {name: (x, y)} dict or an iterable of (x, y) tuples."""
if isinstance(peers, dict):
return list(peers.values())
return list(peers)
def compute_heading_speed(x, y, penned, dog_xy, peers, wander_angle, rng=None):
    """Return ``(heading, speed, new_wander_angle)`` for one sheep step.

    Parameters
    ----------
    x, y         : sheep position (m)
    penned       : True once the sheep has latched as penned; switches the
                   force stack to in-pen containment
    dog_xy       : (x, y) of the dog, or None when unknown
    peers        : ``{name: (x, y)}`` dict or iterable of (x, y) peer positions
    wander_angle : persistent wander direction (rad) carried between steps
    rng          : optional ``random.Random``-compatible object

    ``speed`` is in wheel rad/s (motor units), bounded by ``[WANDER_SPEED,
    FLEE_SPEED]``. ``heading`` is the world-frame target heading the sheep
    should aim for (atan2 convention).

    ``rng`` is used for *all* wander jitter, penned or free. If ``None``,
    falls back to Python's global ``random`` module (matches Webots
    controller usage). Pass an env-owned RNG to make rollouts deterministic
    given a seed.
    """
    fx, fy = 0.0, 0.0
    peer_list = _peers_iter(peers)
    rnd = rng if rng is not None else random

    if penned:
        # --- Pen containment: bounce off the four pen walls ---
        pm = PEN_MARGIN
        if x < PEN_X[0] + pm:
            fx += ((PEN_X[0] + pm - x) / pm) * 15.0
        if x > PEN_X[1] - pm:
            fx -= ((x - (PEN_X[1] - pm)) / pm) * 15.0
        if y < PEN_Y[0] + pm:
            fy += ((PEN_Y[0] + pm - y) / pm) * 15.0
        if y > PEN_Y[1] - pm:
            fy -= ((y - (PEN_Y[1] - pm)) / pm) * 15.0

        # Mild peer separation — penned sheep crowd the corner otherwise.
        for px, py in peer_list:
            dx, dy = px - x, py - y
            d = math.hypot(dx, dy)
            if 0.05 < d < SEPARATION_DIST:
                push = (SEPARATION_DIST - d) / d
                fx -= (dx / d) * push * 2.5
                fy -= (dy / d) * push * 2.5

        if rnd.random() < 0.02:
            wander_angle += rnd.uniform(-0.6, 0.6)
        fx += math.cos(wander_angle) * 0.5
        fy += math.sin(wander_angle) * 0.5
    else:
        # --- Free-roaming sheep in the field ---
        fleeing = False
        if dog_xy is not None:
            ddx = dog_xy[0] - x
            ddy = dog_xy[1] - y
            dist = math.hypot(ddx, ddy)
            if 0.01 < dist < FLEE_DIST:
                fleeing = True
                # Quadratic ramp: zero at FLEE_DIST, 20 at contact.
                t = 1.0 - dist / FLEE_DIST
                s = t * t * 20.0
                fx -= (ddx / dist) * s
                fy -= (ddy / dist) * s

        # Cohesion — drift toward flock CoM (peers within COHESION_DIST).
        # Cohesion is *stronger* under flee than at rest (the
        # predator-confusion / safety-in-numbers effect — sheep huddle when
        # threatened). This is what makes shepherding work: the flock stays
        # as one unit through the narrow gate instead of fragmenting.
        cx, cy, cn = 0.0, 0.0, 0
        for px, py in peer_list:
            d = math.hypot(px - x, py - y)
            if 0.3 < d < COHESION_DIST:
                cx += px
                cy += py
                cn += 1
        if cn > 0:
            # Cohesion needs to be comparable to flee at close range to keep
            # the flock together through narrow obstacles like the 3m gate.
            # Flee at 2m has magnitude ~10; cohesion at peer-distance 5m
            # with w=1.5 contributes ~7.5 — same order, so the flock
            # translates as a unit instead of fragmenting under pressure.
            w = 1.5 if fleeing else 0.6
            fx += (cx / cn - x) * w
            fy += (cy / cn - y) * w

        # Separation — inverse-distance push from peers.
        for px, py in peer_list:
            ddx, ddy = px - x, py - y
            d = math.hypot(ddx, ddy)
            if 0.05 < d < SEPARATION_DIST:
                push = (SEPARATION_DIST - d) / d
                fx -= (ddx / d) * push * 2.5
                fy -= (ddy / d) * push * 2.5

        # Wall soft repulsion. The south wall is absent inside the gate
        # column so sheep can be driven through it by the dog.
        if x < FIELD_X[0] + WALL_MARGIN:
            fx += ((FIELD_X[0] + WALL_MARGIN - x) / WALL_MARGIN) * 6.0
        if x > FIELD_X[1] - WALL_MARGIN:
            fx -= ((x - (FIELD_X[1] - WALL_MARGIN)) / WALL_MARGIN) * 6.0
        if y > FIELD_Y[1] - WALL_MARGIN:
            fy -= ((y - (FIELD_Y[1] - WALL_MARGIN)) / WALL_MARGIN) * 6.0
        if y < FIELD_Y[0] + WALL_MARGIN and not (GATE_X[0] <= x <= GATE_X[1]):
            fy += ((FIELD_Y[0] + WALL_MARGIN - y) / WALL_MARGIN) * 6.0

        if not fleeing:
            # Draw from ``rnd`` (not the global module) so a caller-supplied
            # RNG fully controls the jitter and seeded rollouts reproduce.
            if rnd.random() < 0.02:
                wander_angle += rnd.uniform(-0.6, 0.6)
            fx += math.cos(wander_angle) * 0.5
            fy += math.sin(wander_angle) * 0.5

    # --- Hard escape band — overrides everything when very close to a wall ---
    m, g = WALL_HARD_MARGIN, WALL_HARD_GAIN
    if x - FIELD_X[0] < m:
        fx = max(fx, g * (1.0 - (x - FIELD_X[0]) / m))
    if FIELD_X[1] - x < m:
        fx = min(fx, -g * (1.0 - (FIELD_X[1] - x) / m))
    if FIELD_Y[1] - y < m:
        fy = min(fy, -g * (1.0 - (FIELD_Y[1] - y) / m))
    # South wall hard escape only when not in the gate column and not penned.
    if (not penned) and (y - FIELD_Y[0] < m) and not (GATE_X[0] <= x <= GATE_X[1]):
        fy = max(fy, g * (1.0 - (y - FIELD_Y[0]) / m))

    heading = math.atan2(fy, fx)
    mag = math.hypot(fx, fy)
    # Force magnitude maps linearly to motor speed, clamped to the sheep's
    # idle and flee speeds.
    speed = max(WANDER_SPEED, min(FLEE_SPEED, mag * 3.0))
    return heading, speed, wander_angle
+99
View File
@@ -0,0 +1,99 @@
"""World geometry and robot specs.
All coordinates are in meters. (0, 0) is the centre of the field, +x is
east, +y is north. Z is up but unused here. These constants must match
``worlds/field.wbt`` and the proto files; if the world changes, change
this file and only this file.
Pen layout (post-refactor)
--------------------------
The pen is *external* to the field, accessed through a 3 m gate cut into
the south stone wall at y = -15. Sheep entering through the gate end up
in a fenced rectangle south of the field; the dog stays in the field
(soft-limited above DOG_SOUTH_LIMIT during training and inference).
field +y north
+-----------+
| |
| |
| ...... |
+---||||----+ y = -15 (south wall, gate at x ∈ [10, 13])
||||
|pen| y ∈ [-22, -15]
+---+
"""
import math
# --- Field (square, stone-walled) ---
# (min, max) extent along each axis, in metres.
FIELD_X = (-15.0, 15.0)
FIELD_Y = (-15.0, 15.0)
# Conservative inside bounds — sheep/dog should not graze the wall.
FIELD_INSIDE_MARGIN = 0.5
# --- Pen (external, south of the field) ---
# (min, max) extent along each axis; the pen's north edge (PEN_Y[1]) sits on
# the field's south wall.
PEN_X = (10.0, 13.0)
PEN_Y = (-22.0, -15.0)
PEN_CENTER = (0.5 * (PEN_X[0] + PEN_X[1]), 0.5 * (PEN_Y[0] + PEN_Y[1]))
# The point the dog drives the flock toward: the gate centre on the field side.
PEN_ENTRY = (0.5 * (PEN_X[0] + PEN_X[1]), -15.0)
# --- Gate (the hole in the south stone wall) ---
# The gate spans the full pen width; GATE_Y is the y of the wall it is cut in.
GATE_X = PEN_X
GATE_Y = -15.0
# --- Robot specs (must match proto files) ---
# Dog (controllers/shepherd_dog/, protos/ShepherdDog.proto)
DOG_WHEEL_RADIUS = 0.038 # m
DOG_WHEEL_BASE = 0.28 # m, axle-to-axle
DOG_MAX_WHEEL_OMEGA = 70.0 # rad/s
DOG_MAX_LINEAR = DOG_WHEEL_RADIUS * DOG_MAX_WHEEL_OMEGA # ~2.66 m/s
# Sheep (controllers/sheep/, protos/Sheep.proto)
SHEEP_WHEEL_RADIUS = 0.031 # m
SHEEP_WHEEL_BASE = 0.20 # m
SHEEP_MAX_WHEEL_OMEGA = 25.0 # rad/s
SHEEP_MAX_LINEAR = SHEEP_WHEEL_RADIUS * SHEEP_MAX_WHEEL_OMEGA # ~0.78 m/s
# --- Webots step ---
WEBOTS_DT = 0.016 # seconds, matches WorldInfo.basicTimeStep = 16 in field.wbt
# --- Dog "virtual south wall" (training keeps dog out of the pen) ---
# At inference the controller also clips to this so a slightly miscalibrated
# policy doesn't accidentally drive into the pen and trap the sheep.
# It lies 0.5 m north of the south wall at y = -15.
DOG_SOUTH_LIMIT = -14.5
# --- Maximum supported flock size ---
MAX_SHEEP = 10
def in_pen(x: float, y: float) -> bool:
    """Return True when (x, y) is strictly inside the external pen rectangle.

    Points exactly on a pen edge count as outside.
    """
    x_lo, x_hi = PEN_X[0], PEN_X[1]
    inside_x = x_lo < x < x_hi
    # The y test is only evaluated when the x test passes.
    return inside_x and PEN_Y[0] < y < PEN_Y[1]
def in_field(x: float, y: float, margin: float = 0.0) -> bool:
    """Return True when (x, y) is inside the field shrunk by ``margin``.

    Bounds are inclusive: a point exactly ``margin`` from a wall is inside.
    """
    west = FIELD_X[0] + margin
    east = FIELD_X[1] - margin
    inside_x = west <= x <= east
    # The y test is only evaluated when the x test passes.
    return (inside_x
            and FIELD_Y[0] + margin <= y <= FIELD_Y[1] - margin)
def in_gate_corridor(x: float, y: float, margin: float = 0.0) -> bool:
    """Return True when (x, y) is in the gate column, grown by ``margin``.

    The column spans the pen's x-range and runs from the pen's southern
    edge up to the gate line; all bounds are inclusive.
    """
    west = PEN_X[0] - margin
    east = PEN_X[1] + margin
    inside_x = west <= x <= east
    # The y test is only evaluated when the x test passes.
    return (inside_x
            and PEN_Y[0] - margin <= y <= GATE_Y + margin)
def is_penned_position(x: float, y: float, latch_margin: float = 0.2) -> bool:
    """Decide whether a sheep at (x, y) should latch to "penned".

    True iff x lies within the gate column widened by ``latch_margin`` on
    each side AND y is on or south of the gate line. Callers keep the latch
    once set; this function only tests the current position.
    """
    west = PEN_X[0] - latch_margin
    east = PEN_X[1] + latch_margin
    in_column = west <= x <= east
    # The gate-line test is only evaluated when the column test passes.
    return in_column and y <= GATE_Y
def distance_to_pen_entry(x: float, y: float) -> float:
    """Return the Euclidean distance (m) from (x, y) to PEN_ENTRY."""
    offset_x = x - PEN_ENTRY[0]
    offset_y = y - PEN_ENTRY[1]
    return math.hypot(offset_x, offset_y)
+137
View File
@@ -0,0 +1,137 @@
"""Observation builder for the shepherd dog policy.
Order-invariant 32-D feature vector — the policy generalises across
flock sizes 1..MAX_SHEEP because individual sheep coordinates never
appear in the observation by index, only summary statistics, a polar
histogram, and two "named" sheep (closest-to-pen and rearmost-from-pen).
The two named sheep matter for the sequential-driving teacher: it
targets the closest-to-pen sheep specifically, so the policy needs
that channel to mimic the teacher.
Layout (all components normalised so values stay roughly in [-1, 1]):
idx field
----- ----------------------------------------------------------
0..3 dog pose: x/15, y/15, cos(heading), sin(heading)
4..5 active-sheep CoM x/15, y/15
6..8 flock dispersion: max-radius/15, std_x/15, std_y/15
9..11 vector dog→CoM: dx/30, dy/30, dist/30
12..14 vector dog→pen-entry: dx/30, dy/30, dist/30
15..16 vector furthest-sheep→CoM: dx/15, dy/15
17..18 min sheep-to-wall, min dog-to-wall (both /15)
19 active-sheep count / MAX_SHEEP
20..27 8-bin polar histogram of active sheep around the dog,
rotation-aware (binned in dog-relative frame), normalised
so the bins sum to 1.
28..29 vector dog→closest-to-pen sheep: dx/15, dy/15
30..31 vector dog→rearmost (furthest-from-pen) sheep: dx/15, dy/15
"""
import math
import numpy as np
from herding.geometry import (
FIELD_X, FIELD_Y, PEN_ENTRY, MAX_SHEEP,
)
OBS_DIM = 32
def build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list,
              n_max: int = MAX_SHEEP) -> np.ndarray:
    """Build the 32-D observation vector for the dog policy.

    Parameters
    ----------
    dog_xy : tuple (x, y) of the dog's GPS position (m)
    dog_heading : dog heading in rad
    sheep_xy_list : iterable of (x, y) for ALL known sheep
    sheep_penned_list : parallel iterable of bool — True if sheep is penned
    n_max : maximum supported flock size used for the count normaliser

    Only un-penned sheep contribute. When none remain, every flock-related
    slot stays zero and only the dog pose and dog→pen-entry vector are set.
    """
    dog_x, dog_y = dog_xy
    obs = np.zeros(OBS_DIM, dtype=np.float32)

    # 0..3 — dog pose.
    obs[0:4] = (
        dog_x / 15.0,
        dog_y / 15.0,
        math.cos(dog_heading),
        math.sin(dog_heading),
    )

    # 12..14 — dog → pen entry (available even with an empty flock).
    to_pen_x = PEN_ENTRY[0] - dog_x
    to_pen_y = PEN_ENTRY[1] - dog_y
    obs[12:15] = (
        to_pen_x / 30.0,
        to_pen_y / 30.0,
        math.hypot(to_pen_x, to_pen_y) / 30.0,
    )

    free = []
    for (sx, sy), is_penned in zip(sheep_xy_list, sheep_penned_list):
        if not is_penned:
            free.append((sx, sy))
    n = len(free)
    if n == 0:
        # All sheep penned — terminal observation; slot 19 is already 0.
        return obs

    arr = np.asarray(free, dtype=np.float32)
    xs, ys = arr[:, 0], arr[:, 1]
    com_x = float(xs.mean())
    com_y = float(ys.mean())
    rel = arr - np.array([com_x, com_y], dtype=np.float32)
    dists = np.hypot(rel[:, 0], rel[:, 1])

    # 4..8 — flock centre and dispersion.
    obs[4:9] = (
        com_x / 15.0,
        com_y / 15.0,
        float(dists.max()) / 15.0,
        float(xs.std()) / 15.0,
        float(ys.std()) / 15.0,
    )

    # 9..11 — dog → flock centre.
    to_com_x = com_x - dog_x
    to_com_y = com_y - dog_y
    obs[9:12] = (
        to_com_x / 30.0,
        to_com_y / 30.0,
        math.hypot(to_com_x, to_com_y) / 30.0,
    )

    # 15..16 — offset of the sheep furthest from the centre.
    outlier = int(np.argmax(dists))
    obs[15:17] = (
        float(rel[outlier, 0]) / 15.0,
        float(rel[outlier, 1]) / 15.0,
    )

    # 17..18 — smallest clearance to any field wall, for sheep and for dog.
    sheep_clearance = min(
        float(np.min(xs - FIELD_X[0])),
        float(np.min(FIELD_X[1] - xs)),
        float(np.min(ys - FIELD_Y[0])),
        float(np.min(FIELD_Y[1] - ys)),
    )
    dog_clearance = min(
        dog_x - FIELD_X[0], FIELD_X[1] - dog_x,
        dog_y - FIELD_Y[0], FIELD_Y[1] - dog_y,
    )
    obs[17] = sheep_clearance / 15.0
    obs[18] = float(dog_clearance) / 15.0

    # 19 — active count, normalised.
    obs[19] = n / n_max

    # 20..27 — 8-bin polar histogram in the dog's body frame.
    bearings = np.arctan2(ys - dog_y, xs - dog_x) - dog_heading
    bearings = np.arctan2(np.sin(bearings), np.cos(bearings))
    sector = np.floor((bearings + math.pi) / (2 * math.pi) * 8).astype(int)
    sector = np.clip(sector, 0, 7)     # bearing == +pi lands in the last bin
    hist = np.bincount(sector, minlength=8).astype(np.float32)
    hist /= max(1, n)
    obs[20:28] = hist

    # 28..31 — the two "named" sheep, as offsets from the dog: the one
    # closest to the pen entry (the sequential teacher's target) and the
    # one furthest from it (the natural next target).
    pen_dists = np.hypot(xs - PEN_ENTRY[0], ys - PEN_ENTRY[1])
    nearest = int(np.argmin(pen_dists))
    rearmost = int(np.argmax(pen_dists))
    obs[28:32] = (
        (float(arr[nearest, 0]) - dog_x) / 15.0,
        (float(arr[nearest, 1]) - dog_y) / 15.0,
        (float(arr[rearmost, 0]) - dog_x) / 15.0,
        (float(arr[rearmost, 1]) - dog_y) / 15.0,
    )
    return obs
+98
View File
@@ -0,0 +1,98 @@
"""Sequential single-target shepherd dog algorithm.
Strömbom drives the flock's centre of mass; with N sheep and a narrow
3 m gate, this fails because the flock is wider than the gate and CoM
driving abandons stragglers. Real sheepdogs solve this differently:
they pick *one* sheep at a time, drive it through, return for the next.
This module implements that "pin-and-push" approach.
Algorithm (one step):
1. Active sheep = those still in the field (not yet penned).
2. Target = the active sheep currently closest to the pen entry.
3. Drive position = ``target + Δ · unit(target − pen_entry)`` —
directly behind the target relative to the goal.
4. Output unit vector pointing the dog at the drive position.
Once the target crosses the gate it latches as penned and is removed
from the active set; the next-closest unpenned sheep becomes the
target. The algorithm naturally "queues" sheep through the gate.
Empirically (with our flocking dynamics) this scales linearly with
flock size and works up to at least n=10 within a 15 000-step budget.
"""
import math
from herding.geometry import GATE_Y, PEN_ENTRY, in_pen
# Distance the drive point is placed beyond the target sheep, on the side
# facing away from the pen target (world units, presumably metres — TODO
# confirm against herding.geometry).
DELTA_DRIVE = 1.5 # standoff behind the target sheep
# Scalar applied to the unit direction vector returned by compute_action.
APPROACH_GAIN = 1.0 # action magnitude scale (1 = full speed)
def _unit(x, y):
d = math.hypot(x, y)
if d < 1e-6:
return 0.0, 0.0
return x / d, y / d
def _is_active(x, y) -> bool:
    """Tell whether a sheep at ``(x, y)`` should still be herded.

    A sheep is active when ``in_pen`` rejects the position and it lies
    strictly north of the gate plane (``y > GATE_Y``).
    """
    if in_pen(x, y):
        return False
    return y > GATE_Y
def _plan_drive(sheep_positions, pen_target):
    """Select the target sheep and the drive point behind it.

    Shared by ``compute_action`` and ``compute_action_debug`` so the two
    entry points cannot disagree about target selection.

    Returns ``None`` when no sheep is active; otherwise the tuple
    ``(n_active, name, sx, sy, tx, ty)`` where ``(sx, sy)`` is the target
    sheep and ``(tx, ty)`` the drive position.
    """
    active = [(name, x, y) for name, (x, y) in sheep_positions.items()
              if _is_active(x, y)]
    if not active:
        return None
    # Target = active sheep closest to the pen entry. Stable choice: as one
    # sheep approaches and crosses the gate it stays the target until it
    # drops out of the active set; then the next-closest takes over.
    name, sx, sy = min(
        active,
        key=lambda s: math.hypot(s[1] - pen_target[0], s[2] - pen_target[1]),
    )
    # Drive position: DELTA_DRIVE beyond the target along the
    # (pen → target) direction, i.e. behind the sheep relative to the goal.
    ux, uy = _unit(sx - pen_target[0], sy - pen_target[1])
    tx = sx + DELTA_DRIVE * ux
    ty = sy + DELTA_DRIVE * uy
    return len(active), name, sx, sy, tx, ty


def compute_action(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
    """Return ``(vx, vy, mode)`` where mode encodes the current target.

    ``sheep_positions`` is a ``{name: (x, y)}`` mapping. ``mode`` is
    ``"idle"`` (with a zero vector) when no sheep is active, otherwise
    ``"drive:<name>"`` for the chosen target.

    Compatible with the Strömbom call signature so it can be drop-in
    swapped in the dog controller and the env's imitation reward.
    """
    plan = _plan_drive(sheep_positions, pen_target)
    if plan is None:
        return 0.0, 0.0, "idle"
    _n_active, name, _sx, _sy, tx, ty = plan
    ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1])
    return APPROACH_GAIN * ax, APPROACH_GAIN * ay, f"drive:{name}"


def compute_action_debug(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
    """Debug variant returning ``(vx, vy, mode, debug_dict)``.

    The first three values match ``compute_action``. ``debug_dict`` holds
    ``n_active``, ``target_name``, ``target_x``/``target_y`` (the chosen
    sheep) and ``drive_x``/``drive_y`` (the point the dog steers to). When
    idle, the drive point is reported as the dog's own position.
    """
    plan = _plan_drive(sheep_positions, pen_target)
    if plan is None:
        return 0.0, 0.0, "idle", {
            "n_active": 0, "target_name": "",
            "target_x": 0.0, "target_y": 0.0,
            "drive_x": dog_xy[0], "drive_y": dog_xy[1],
        }
    n_active, name, sx, sy, tx, ty = plan
    ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1])
    return APPROACH_GAIN * ax, APPROACH_GAIN * ay, f"drive:{name}", {
        "n_active": n_active, "target_name": name,
        "target_x": sx, "target_y": sy,
        "drive_x": tx, "drive_y": ty,
    }
+114
View File
@@ -0,0 +1,114 @@
"""Strömbom collect/drive heuristic for the shepherd dog.
Adapted from the original ``controllers/shepherd_dog/strombom.py`` and
updated for the external pen layout. Used as a baseline controller and
as the fallback when the RL policy isn't available.
Reference: Strömbom et al. 2014, "Solving the shepherding problem".
"""
import math
from herding.geometry import PEN_ENTRY, GATE_Y, in_pen
# Algorithm parameters. DELTA_DRIVE / DELTA_COLLECT were tightened from
# the original (4.0 / 2.5) because the new external pen sits ~26 m from
# typical sheep spawn locations — at the old 4 m standoff, the flee force
# (quadratic ramp, 3.7 at 4 m vs ~10 at 2 m) couldn't move sheep through
# the path inside the 3000-step episode budget.
#
# F_FACTOR was 2.0 in the original Strömbom paper; raised to 4.0 here so
# the dog stays in *drive* mode much longer. With our tighter cohesion
# (flocking_sim.py), partially-collected flocks consolidate naturally
# during a drive, and we don't waste 80% of the time budget on a slow
# "collect" pre-phase.
# Collect/drive switch: the dog collects while the flock radius (max
# distance of an active sheep from the CoM) exceeds F_FACTOR * sqrt(n_active).
F_FACTOR = 4.0
# Collect mode: standoff beyond the furthest sheep, on the side away from
# the CoM.
DELTA_COLLECT = 1.5
# Drive mode: standoff beyond the flock CoM, on the side away from the pen
# target.
DELTA_DRIVE = 2.0
def _unit(x, y):
d = math.hypot(x, y)
if d < 1e-6:
return 0.0, 0.0
return x / d, y / d
def _is_active(x, y) -> bool:
    """Report whether the sheep at ``(x, y)`` is still in the field.

    Anything inside the pen, or on/south of the gate plane
    (``y <= GATE_Y``), is treated as committed to the pen and is no
    longer herded.
    """
    if in_pen(x, y):
        return False
    return y > GATE_Y
def _plan_target(sheep_positions, pen_target):
    """Run the collect/drive decision and return the dog's steering target.

    Shared by ``compute_action`` and ``compute_action_debug`` so the two
    entry points cannot drift apart.

    Returns ``None`` when no sheep is active; otherwise the tuple
    ``(mode, tx, ty, n, radius, threshold, com_x, com_y)``.
    """
    active = [(x, y) for (x, y) in sheep_positions.values() if _is_active(x, y)]
    if not active:
        return None

    n = len(active)
    com_x = sum(p[0] for p in active) / n
    com_y = sum(p[1] for p in active) / n
    dists = [math.hypot(p[0] - com_x, p[1] - com_y) for p in active]
    radius = max(dists)
    threshold = F_FACTOR * math.sqrt(n)

    if radius > threshold:
        # Collect: aim at a point behind the furthest sheep, opposite the CoM.
        idx = max(range(n), key=lambda i: dists[i])
        sx, sy = active[idx]
        ux, uy = _unit(sx - com_x, sy - com_y)
        tx, ty = sx + DELTA_COLLECT * ux, sy + DELTA_COLLECT * uy
        mode = "collect"
    else:
        # Drive: aim at a point behind the flock CoM relative to the goal.
        ux, uy = _unit(com_x - pen_target[0], com_y - pen_target[1])
        tx, ty = com_x + DELTA_DRIVE * ux, com_y + DELTA_DRIVE * uy
        mode = "drive"
    return mode, tx, ty, n, radius, threshold, com_x, com_y


def compute_action(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
    """Return ``(vx, vy, mode)`` — mode in {idle, collect, drive}.

    ``sheep_positions`` is a ``{name: (x, y)}`` mapping (matches the
    Webots controller's representation). ``(vx, vy)`` is a unit vector
    from the dog toward the steering target, or ``(0.0, 0.0)`` when idle
    or when the dog is already on the target.
    """
    plan = _plan_target(sheep_positions, pen_target)
    if plan is None:
        return 0.0, 0.0, "idle"
    mode, tx, ty = plan[:3]
    ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1])
    return ax, ay, mode


def compute_action_debug(dog_xy, sheep_positions, pen_target=PEN_ENTRY):
    """Variant of compute_action that also returns a small debug dict.

    Returns ``(vx, vy, mode, dbg)``; the first three values match
    ``compute_action``. ``dbg`` holds ``n_active``, ``radius``,
    ``threshold``, ``com_x``/``com_y`` and ``target_x``/``target_y``. When
    idle, the target is reported as the dog's own position.

    Kept for parity with the legacy controller's CSV logger.
    """
    plan = _plan_target(sheep_positions, pen_target)
    if plan is None:
        return 0.0, 0.0, "idle", {
            "n_active": 0, "radius": 0.0, "threshold": 0.0,
            "com_x": 0.0, "com_y": 0.0,
            "target_x": dog_xy[0], "target_y": dog_xy[1],
        }
    mode, tx, ty, n, radius, threshold, com_x, com_y = plan
    ax, ay = _unit(tx - dog_xy[0], ty - dog_xy[1])
    dbg = {
        "n_active": n, "radius": radius, "threshold": threshold,
        "com_x": com_x, "com_y": com_y,
        "target_x": tx, "target_y": ty,
    }
    return ax, ay, mode, dbg
+458
View File
@@ -0,0 +1,458 @@
# RL-Driven Shepherd Herding — Implementation Plan
This plan turns the existing Strömbom-only Webots project into a dual-mode
shepherd controller (RL primary, Strömbom fallback), with a fast Gymnasium
training environment that mirrors the Webots dynamics tightly enough for
sim-to-sim transfer. Stable-Baselines3 PPO is the learner.
---
## 1. Current state (audit)
### World geometry — `worlds/field.wbt`
- Field bounded by stone walls at **x,y ∈ [-15, +15]**. Inside-usable area is
~[-14.5, 14.5] (`X_MIN/MAX` in `flocking.py`).
- **Pen is *inside* the field**: x ∈ [10, 13], y ∈ [-15, -8], with the
opening on its **north** side at y = -8 (post-and-rail fence W/E; open N).
- South stone wall has a **gate at x ∈ [10, 13], y = -15** (split wall +
gate posts at x=10 and x=13). So sheep that get penned end up between the
fence (N side at y=-8) and the south stone wall (with the wooden gate at
y=-15 currently slightly ajar). The pen is effectively an L-shape inside
the field, not external.
- Spawns: dog at origin (0, 0), 3 sheep around (3, ±2) and (4, 0). Two more
sheep are commented out.
### Robots — protos
- **Sheep** (`protos/Sheep.proto`): differential drive, wheel radius 0.031 m,
axle half-width 0.10 m → wheel base 0.20 m. `maxVelocity = 25 rad/s` →
max linear ≈ **0.78 m/s**. Sensors: GPS, Compass, Emitter+Receiver on
channel 1. `supervisor = TRUE` (used to repaint wool pink on pen entry).
- **ShepherdDog** (`protos/ShepherdDog.proto`): differential drive, wheel
radius 0.038 m, axle half-width 0.14 m → wheel base 0.28 m.
`maxVelocity = 70 rad/s` → max linear ≈ **2.66 m/s**. Sensors: GPS,
Compass, Gyro, Accelerometer, **Lidar** (front-only, FOV 2.44 rad ≈ 140°,
180 rays, range 0.10 to 12 m, noise 0.005), Emitter+Receiver on channel 1,
cosmetic ear/tail motors.
### Sheep controller — `controllers/sheep/{sheep.py,flocking.py}`
- Reynolds-style boid stack: flee (quadratic ramp inside FLEE_DIST=7 m),
cohesion (within 8 m), separation (within 2.5 m), wall soft repulsion
(margin 5 m), wall hard escape (margin 1 m, gain 50), wander.
- Pen-aware: sheep below the gate line but outside the gate corridor get a
northward "deadzone" assist; on first entry into the pen rectangle,
sheep latches `penned=True`, repaints pink, and switches to in-pen
containment + jitter.
- Driver: heading-error PD on diff-drive (k=4), forward velocity scaled by
`cos(err)`, MAX_SPEED=22 (motor units, capped by proto's 25 rad/s).
- Stuck detector: if displacement < 0.05 m for 20 steps, drives toward
field origin to escape wall-pin (a known differential-drive failure mode).
### Dog controller — `controllers/shepherd_dog/{shepherd_dog.py,strombom.py}`
- Strömbom collect/drive heuristic. CoM-radius gating
`radius > F·√n` with F=2 selects collect (push furthest sheep inward) vs
drive (push CoM toward the pen entry point at (11.5, -8.0)).
- Deadzone rescue: when a sheep is below the gate line and outside the
pen's x-corridor, the dog repositions to a "behind the sheep, opposite
the pen" stand-off so the sheep's flee vector points back through the
gate. Variants 0/1 alternate lateral offset to break corner cycles.
- Stuck-rescue, EMA action smoothing, target-deadband, RESCUE_SPEED_CAP,
cooldown — all empirical fixes for diff-drive oscillation.
- Logs full per-step debug to `dog_behavior_log.csv` (currently 7 MB —
add to `.gitignore`).
### Deleted training scaffolding (per `git status`)
- `controllers/shepherd_dog_rl/{shepherd_dog_rl.py, final_model.zip, vecnorm.pkl, plot_debug.py}`
- `training/{config.json, herding_env.py, parity_test.py, requirements.txt, train.py, train_at.py, viz.py, runs/.gitkeep}`
A previous attempt existed; we'll redesign rather than resurrect, keeping
only the lessons (parity-tested env, VecNormalize wrapper, eval cadence).
---
## 2. Design decisions
### 2.1 Pen location — keep inside-field with N gate
The user offered moving the pen *external* (through a wall hole). Tradeoffs:
| Option | Pros | Cons |
|---|---|---|
| **(A) Keep inside-field** (current) | World already built; Strömbom logic already tuned; gate corridor is short | Dog must navigate around three pen walls; adds geometric clutter |
| (B) External pen via wall hole | Cleaner field — dog only sees sheep + outer walls; pen as goal region beyond a 3 m hole at y=-15 | Requires editing `field.wbt` (split south wall, add external pen walls beyond y<-15); existing rescue/deadzone logic must be retuned; outside-field flocking constants don't currently apply |
**Recommendation: keep (A)** for parity with the working Strömbom controller,
but add a **simplification**: widen the pen entrance from 3 m (x ∈ [10, 13])
to 4 m (x ∈ [9.5, 13.5]) and raise the entrance line from y=-8 to y=-7.5
to give the dog more turning room. Optional later: gate B as a curriculum
extension (Section 7).
### 2.2 Where to train
PPO on Webots directly is too slow (real-time stepping, single env, slow
reset). The previous training scaffolding used a Python 2D sim — that is
the right approach. Constraints for sim-to-sim transfer:
1. **Use the exact same flocking math**: import `controllers/sheep/flocking.py`
from the env, do not reimplement.
2. **Use the same world constants**: import `controllers/shepherd_dog/strombom.py`
for pen geometry and Strömbom baseline.
3. **Model differential drive faithfully**: match wheel-radius, base, and
max wheel-velocity from the proto files. Heading update from
`(ω_R - ω_L)·r / b`, position from `(ω_R + ω_L)·r / 2`.
4. **Match Webots step**: `basicTimeStep = 16 ms`. The sheep controller runs
at every basic step; the env will use the same `dt = 0.016 s`.
5. **Lidar deferred**: dog policy will use a *symbolic* observation
(positions of dog + sheep, plus pen geometry) — not raw lidar — for the
first iteration. Lidar-from-pixels is a much harder learning problem
and isn't required for the herding task. (See Section 7 for an
optional later upgrade.)
### 2.3 Action space for the dog
Two viable choices:
- **(a) High-level velocity vector** `(vx, vy) ∈ [-1, 1]²`. The same
representation Strömbom emits today; the existing
`drive_action(vx, vy, ...)` function in `shepherd_dog.py` converts this
to wheel speeds. Decouples the policy from low-level diff-drive
oscillations and enables direct A/B against Strömbom.
- (b) Direct wheel speeds `(ω_L, ω_R) ∈ [-1, 1]²`. More expressive but the
policy must learn diff-drive control from scratch — which is exactly
the source of the wall-stuck and oscillation pain we're trying to
avoid.
**Recommendation: (a)** — high-level `(vx, vy)`. Reuses the well-tuned
`drive_action` controller, which already handles `cos(err)` clamping and
turn gain. RL focuses on *strategy*, not actuation.
### 2.4 Observation space for the dog
Symbolic, fixed-size, normalized to [-1, 1]:
| Field | Dim | Notes |
|---|---|---|
| Dog (x, y, cos h, sin h) | 4 | Position normalized by 15 |
| Sheep CoM (x, y) | 2 | Of *active* (not-penned) sheep |
| Sheep dispersion (radius, std-x, std-y) | 3 | Strömbom collect-vs-drive features |
| Vector dog→CoM (dx, dy, dist) | 3 | Helps the value function |
| Vector dog→pen-entry (dx, dy, dist) | 3 | |
| Vector furthest-sheep→CoM (dx, dy) | 2 | Strömbom collect target hint |
| Min sheep-to-wall distance + min dog-to-wall | 2 | Safety signal |
| Active sheep count / N_max | 1 | |
| 8-bin polar histogram of sheep around dog | 8 | Order-invariant flock shape |
Total: **28 features**. Order-invariant by construction (histogram + summary
stats), so the policy generalizes across flock sizes 1..N_max.
### 2.5 Reward
Sparse-only is too hard at flock scale; we shape conservatively.
```
r_t =   w_pen     · ΔN_penned                           # +1 per newly penned sheep
      + w_progress· (d_CoM_pen[t-1] - d_CoM_pen[t])     # closer-to-pen progress
      + w_compact · (R[t-1] - R[t])                     # tighter flock progress
      - w_time    · 1                                   # constant time penalty
      - w_wall    · I(min_wall_dist < 1.0 m)            # dog too close to wall
      - w_collide · I(dog within 0.3 m of any sheep)    # avoid contact
      + w_done    · I(all sheep penned)                 # terminal bonus
```
Initial weights: `w_pen=2.0, w_progress=0.5, w_compact=0.2, w_time=0.005,
w_wall=0.01, w_collide=0.05, w_done=10.0`. Tune via 1-sheep curriculum
first — if the dog learns 1-sheep cleanly, the weights are sane.
### 2.6 Episode
- Max steps: 3000 (≈ 48 s at dt=16 ms — generous).
- Termination: all sheep penned (success), dog/sheep stuck > 600 steps with
no progress (failure), step limit (timeout).
- Reset: domain-randomized — sheep count ∈ {1..N_max}, sheep positions
uniform in field minus pen+gate corridor, dog at origin ± U(-2, 2).
### 2.7 Curriculum
| Stage | N_sheep | Duration (steps) | Pass criterion |
|---|---|---|---|
| 0 | 1 | 0.5 M | success ≥ 90 % |
| 1 | 2 | 1.0 M | success ≥ 80 % |
| 2 | 3 | 1.5 M | success ≥ 70 % |
| 3 | 1..3 mixed | 2.0 M | mean reward stable |
| 4 (optional) | 5 | 2.0 M | success ≥ 60 % |
Implemented by changing only `n_sheep` in the env reset.
---
## 3. Repository layout (new)
```
project/
├── controllers/
│ ├── sheep/ # unchanged
│ ├── shepherd_dog/ # Strömbom controller (renamed entry)
│ │ ├── shepherd_dog.py # mode-switch wrapper: RL | strombom
│ │ ├── strombom.py # unchanged (canonical Strömbom)
│ │ └── policy_loader.py # NEW: loads SB3 zip + VecNormalize
│ └── ...
├── herding/ # NEW: Python package, importable from env + controller
│ ├── __init__.py
│ ├── geometry.py # field/pen constants, in_pen(), wall helpers (single source of truth)
│ ├── flocking_sim.py # vectorised numpy port of flocking.py for fast batched sheep
│ ├── diffdrive.py # diff-drive integrator matching the proto specs
│ └── obs.py # observation builder shared by env and Webots controller
├── training/ # NEW
│ ├── herding_env.py # gymnasium.Env, single-agent (the dog)
│ ├── parity_test.py # asserts env trajectory ≈ Webots trajectory for fixed seeds
│ ├── train_ppo.py # SB3 PPO entry point
│ ├── eval.py # rollout + metrics (success rate, time-to-pen)
│ ├── configs/
│ │ ├── ppo_default.yaml
│ │ └── curriculum.yaml
│ ├── runs/ # tensorboard + checkpoints (.gitignored)
│ └── requirements.txt
├── docs/
│ └── project.md # unchanged
├── plan.md # this file
└── ...
```
`herding/` becomes the **single source of truth** for geometry and dynamics.
The Webots controllers and the training env both import from it, so when a
constant changes in one place it changes everywhere — eliminating the
sim/Webots-drift class of bugs.
This means the existing `controllers/sheep/flocking.py` and
`controllers/shepherd_dog/strombom.py` become thin shims that re-export
from `herding/`. Webots controllers can import `herding/` because Webots
adds the project root to `sys.path` at controller startup; we'll verify.
---
## 4. The Gymnasium environment — `training/herding_env.py`
```python
class HerdingEnv(gymnasium.Env):
metadata = {"render_modes": ["rgb_array", "human"]}
def __init__(self, n_sheep=3, max_steps=3000, dt=0.016, seed=None):
self.action_space = Box(low=-1, high=1, shape=(2,), dtype=np.float32)
self.observation_space = Box(low=-1, high=1, shape=(28,), dtype=np.float32)
...
def reset(self, *, seed=None, options=None):
# Random sheep positions in field \ pen corridor, dog near origin.
# Optional curriculum: options["n_sheep"] overrides.
...
def step(self, action):
vx, vy = action # high-level velocity intent
# Convert to wheel speeds via the same drive_action inverse used in Webots
wL, wR = self._diffdrive_inverse(vx, vy, self.dog_state)
self.dog_state = self._integrate_diffdrive(self.dog_state, wL, wR, self.dt)
# Step every sheep one boid step (vectorized in flocking_sim.py)
self.sheep_state = self._step_sheep(self.sheep_state, self.dog_state)
# Update penned set, compute reward, observation, done flags
...
```
Key points:
- **Vectorised sheep update**: re-implements `flocking.py` in numpy so 100
parallel envs with 5 sheep each take ms, not seconds. Numerical parity
with the scalar version is asserted in `parity_test.py`.
- **Same diff-drive integrator** for the dog as Webots will see at
inference. Wall + pen-fence collisions clamp position (a Webots-realistic
no-pass-through approximation).
- **Domain randomization** in reset: sheep count, spawn positions, sheep
flock-parameter jitter (±10 % on FLEE_DIST, COHESION_DIST, etc.) for
robustness.
---
## 5. Training pipeline — `training/train_ppo.py`
- **Algorithm**: SB3 `PPO` with `MlpPolicy`, `n_steps=2048`, `batch_size=256`,
`n_epochs=10`, `gamma=0.995`, `gae_lambda=0.95`, `clip_range=0.2`,
`ent_coef=0.005`, `vf_coef=0.5`, `learning_rate=3e-4`.
- **Vec envs**: `SubprocVecEnv` × 16 parallel envs (the env is pure numpy
so subprocs are CPU-cheap).
- **Normalization**: `VecNormalize(norm_obs=True, norm_reward=True,
clip_obs=10.0)`. Pickled alongside the policy zip — both required at
inference.
- **Callbacks**:
- `CheckpointCallback` every 100 k steps.
- `EvalCallback` on a separate eval env (no normalization-update) every
50 k steps; logs success rate and time-to-pen to TensorBoard.
- Custom `CurriculumCallback`: bumps `n_sheep` when eval success rate
crosses the stage threshold for 3 consecutive evals.
- **Determinism for debugging**: seed-pinned eval env so regressions are
catchable.
---
## 6. Webots integration — RL inference path
`controllers/shepherd_dog/shepherd_dog.py` becomes a thin wrapper:
```python
MODE = os.environ.get("HERDING_MODE", "rl") # "rl" | "strombom"
if MODE == "rl":
policy = policy_loader.load("training/runs/best/policy.zip",
"training/runs/best/vecnormalize.pkl")
obs_fn = build_obs # from herding/obs.py
else:
obs_fn = None # strombom path uses sheep_positions directly
while robot.step(timestep) != -1:
receive_messages()
if MODE == "rl":
obs = obs_fn(dog_xy, dog_heading, sheep_positions, ...)
action, _ = policy.predict(obs, deterministic=True)
vx, vy = action.tolist()
else:
vx, vy, mode, dbg = compute_action_debug(dog_xy, sheep_positions, PEN_ENTRY)
# plus existing rescue/cooldown/EMA layer
drive_action(vx, vy, ...)
```
A **safety supervisor** wraps the RL output: if `obs` indicates the dog is
< 0.6 m from a wall, override with the existing wall-escape behavior
(reverse + turn). This is a hard guarantee diff-drive needs because PPO
may not discover wall-escape reliably from on-policy data.
`policy_loader.py` handles the SB3 import lazily so the controller still
works with `MODE=strombom` even if SB3 is not installed in the Webots
Python environment.
---
## 7. Optional extensions (post-baseline)
- **External pen** (Section 2.1 option B): edit `field.wbt` to extend the
south wall hole into an external L-shaped pen with its own walls; update
`herding/geometry.py`; retrain stage 3 only.
- **Lidar observation**: replace symbolic obs with 36-bin downsampled
lidar + ego state; train end-to-end. Useful as the "extra merit"
dimension in the project doc.
- **Two-dog mode**: make env multi-agent, train with `MAPPO`-style shared
critic or independent PPO. The proto already supports multiple dog
instances; world only needs a second `ShepherdDog` node.
- **Mecanum comparison**: swap the dog proto for a mecanum variant; same
policy, different `_integrate_diffdrive` (becomes holonomic).
- **Sheep flock size scaling**: 5, 10, 20 — the obs is order-invariant so
the same policy generalises; just curriculum further.
---
## 8. Risks & mitigations
| Risk | Mitigation |
|---|---|
| Sim-to-Webots gap (sheep dynamics, wall friction) | `parity_test.py` asserts trajectory match within tolerance for fixed seeds; if it fails, fix the env, not the policy |
| Dog learns to wall-pin sheep against fence | Add `w_collide` penalty + min-sheep-to-wall term in obs; curriculum from 1 sheep first |
| PPO oscillation collapses into spinning | Action smoothing in env step (EMA on `(vx, vy)`, mirroring `ACTION_SMOOTH=0.35` from Strömbom controller); reward small `‖a_t - a_{t-1}‖` penalty |
| Pen approach failures (sheep refuse gate) | Reuse the existing `deadzone_rescue` as a *scripted fallback* triggered when a sheep has been deadzoned > 200 steps — RL handles the common case, scripted handles the corner |
| Gym version mismatch (gymnasium vs gym) | Lock to `gymnasium>=0.29`, `stable-baselines3>=2.3` in requirements |
---
## 9. Milestones (suggested order of implementation)
1. **M0 — Refactor** (no behavior change): create `herding/` package, move
constants out of `flocking.py`/`strombom.py`, leave shims; verify
Webots still runs Strömbom unchanged. Add `dog_behavior_log.csv` to
`.gitignore`.
2. **M1 — Env & parity**: `herding_env.py`, `parity_test.py`. Asserts
sheep + dog trajectories match Webots within tolerance for 5 fixed
seeds. *Done when parity test green.*
3. **M2 — PPO baseline**: train Stage 0 (1 sheep) for 0.5 M steps; eval
in env at ≥ 90 % success.
4. **M3 — Webots inference**: load Stage 0 policy in `shepherd_dog.py`
with `HERDING_MODE=rl`; verify the dog herds 1 sheep into the pen in
the actual Webots world. *This is the sim-to-sim transfer gate.*
5. **M4 — Curriculum**: stages 1-3, ~5 M steps total, with checkpoints
and eval logs.
6. **M5 — Strömbom comparison**: run both controllers on a fixed eval
suite (same seeds, 1/2/3 sheep), log success rate and time-to-pen.
This is a deliverable for the project's "quantitative evaluation"
goal.
7. **M6 — Documentation**: a short README in `training/` showing how to
train, evaluate, and switch modes in Webots.
Each milestone is independently demoable. M0-M3 is the critical path to
"RL works in Webots"; M4-M6 polishes it for the project deliverable.
---
## 10. Decisions (locked in by implementation)
- **Pen layout**: option B (external pen). The pen sits south of the
field at x ∈ [10, 13], y ∈ [-22, -15] and is reached through the
existing 3 m gap in the south stone wall. The old in-field
quarantine fence is gone and the wooden gate is modeled as
swung-open and parked on the west gate post so the corridor is
unobstructed. This kills the deadzone class entirely.
- **Flock size**: 1..10 sheep, sampled uniformly each reset. The order-
invariant observation (CoM, dispersion, polar histogram) lets a
single policy generalise across the whole range. A curriculum widens
``max_n_sheep`` from 1 to 10 over training to keep early exploration
tractable.
- **Single-sheep mode**: handled by the same policy (n_sheep=1 is the
first stage of the curriculum and stays in the training distribution
throughout). No separate model.
- **Hardware**: GPU for training. SubprocVecEnv × 16 on CPU feeds an
MlpPolicy on GPU; ~2-3 h for the full curriculum.
## 11. What was built
```
herding/ # single source of truth, importable from both
geometry.py # field/pen constants, latch helpers, robot specs
flocking_sim.py # Reynolds boid step (matches Webots controller)
diffdrive.py # diff-drive kinematics + velocity↔wheels
obs.py # 28-D order-invariant observation builder
strombom.py # collect/drive heuristic (baseline + fallback)
worlds/field.wbt # external pen south of field, 10 sheep slots,
# gate parked open, in-field fence removed
controllers/sheep/sheep.py # imports from herding/, latches on
# is_penned_position
controllers/shepherd_dog/
shepherd_dog.py # mode switch (HERDING_MODE=rl|strombom),
# safety supervisor for DOG_SOUTH_LIMIT
policy_loader.py # lazy SB3 zip + VecNormalize loader
strombom.py # shim re-exporting herding.strombom
training/
herding_env.py # gymnasium.Env, action smoothing, reward shaping
train_ppo.py # SB3 PPO with VecNormalize, eval, checkpoints,
# curriculum callback
eval.py # success-rate / time-to-pen across n_sheep
parity_test.py # shape, determinism, baseline-rollout smoke test
configs/ppo_default.yaml
requirements.txt
README.md # how to train, evaluate, switch modes in Webots
```
## 12. To run
```bash
# 1. Install deps (CUDA-enabled torch wheel for GPU)
pip install -r training/requirements.txt
# 2. Smoke test
python -m training.parity_test
# 3. Train (5 M steps, ~2-3 h on a single GPU)
python -m training.train_ppo --out-dir training/runs/baseline
# 4. Evaluate vs Strömbom
python -m training.eval --policy training/runs/baseline/best
python -m training.eval --policy strombom
# 5. Run in Webots
export HERDING_MODE=rl
export HERDING_POLICY_DIR=$PWD/training/runs/baseline/best
webots worlds/field.wbt
```
+117
View File
@@ -0,0 +1,117 @@
"""Collect (obs, action) demonstrations from the sequential teacher.
Runs the sequential algorithm across a grid of (n_sheep, seed) combos
at full difficulty, logs the (observation, action) pair every Nth step,
and saves successful trajectories to a numpy ``.npz`` for behavior
cloning. Failed trajectories are dropped by default — we only want to
teach the policy from good examples.
Usage::
python -m tools.collect_demos --out training/demos.npz
"""
from __future__ import annotations
import argparse
import os
import sys
import time
from pathlib import Path
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
import numpy as np
from herding.geometry import PEN_ENTRY
from herding.sequential import compute_action
from training.herding_env import HerdingEnv
def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int):
    """Roll out the sequential teacher once and record its demonstrations.

    A fresh ``HerdingEnv`` is built at ``difficulty=1.0`` and reset with
    ``seed``. At every step the teacher action is computed from the dog
    position and the sheep not yet flagged in ``env.sheep_penned``; every
    ``subsample``-th step the (observation, action) pair seen *before*
    stepping is recorded.

    The rollout ends when no unpenned sheep remain, when the env reports
    termination or truncation, or after ``max_steps`` iterations.

    Returns ``(obs, actions, success, steps)``: two float32 arrays of the
    recorded pairs, whether every sheep ended up penned, and ``env.steps``.
    """
    env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
                     difficulty=1.0, seed=seed)
    obs, _ = env.reset(seed=seed)

    recorded_obs = []
    recorded_actions = []
    step = 0
    while step < max_steps:
        # Teacher input: only the sheep the env has not yet latched as penned.
        unpenned = {}
        for idx in range(env.n_sheep):
            if env.sheep_penned[idx]:
                continue
            unpenned[f"s{idx}"] = (float(env.sheep_x[idx]),
                                   float(env.sheep_y[idx]))
        if not unpenned:
            break

        vx, vy, _mode = compute_action(
            (env.dog_x, env.dog_y), unpenned, PEN_ENTRY,
        )
        action = np.array([vx, vy], dtype=np.float32)

        keep_sample = step % subsample == 0
        if keep_sample:
            recorded_obs.append(obs.copy())
            recorded_actions.append(action.copy())

        obs, _r, term, trunc, _info = env.step(action)
        if term or trunc:
            break
        step += 1

    obs_arr = np.asarray(recorded_obs, dtype=np.float32)
    action_arr = np.asarray(recorded_actions, dtype=np.float32)
    success = bool(env.sheep_penned.all())
    return obs_arr, action_arr, success, env.steps
def main():
    """Collect teacher demonstrations over a (n_sheep, seed) grid and save them.

    For each flock size in ``--n-sheep-list`` and each seed in
    ``range(--seeds-per-n)`` one rollout is run via ``collect_one``.
    Successful rollouts (or all non-empty ones with ``--keep-failures``)
    are concatenated and written to ``--out`` as an ``.npz`` with arrays
    ``obs``, ``actions`` and ``meta``. Each ``meta`` row is
    ``(n_sheep, seed, n_logged, success, total_steps)``.

    Raises:
        RuntimeError: if no trajectory was kept.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", default="training/demos.npz")
    parser.add_argument("--n-sheep-list", default="1,2,3,5,8,10")
    parser.add_argument("--seeds-per-n", type=int, default=15)
    parser.add_argument("--max-steps", type=int, default=30000)
    parser.add_argument("--subsample", type=int, default=5,
                        help="Keep every Nth (obs, action) pair.")
    parser.add_argument("--keep-failures", action="store_true",
                        help="Include partial-success trajectories. Default off.")
    args = parser.parse_args()

    # collect_one evaluates ``step % subsample``; reject values that would
    # crash deep inside the first rollout with a clear usage error instead.
    if args.subsample < 1:
        parser.error("--subsample must be >= 1")

    # Tolerate stray commas/whitespace, e.g. "1,2,3,".
    n_sheep_list = [int(x) for x in args.n_sheep_list.split(",") if x.strip()]
    print(f"[demos] grid: n_sheep={n_sheep_list}, seeds={args.seeds_per_n}, "
          f"max_steps={args.max_steps}, subsample={args.subsample}")

    all_obs, all_actions, all_meta = [], [], []
    t_start = time.time()
    n_success = 0
    n_total = 0
    for n in n_sheep_list:
        for seed in range(args.seeds_per_n):
            obs, actions, success, total_steps = collect_one(
                n, seed, args.max_steps, args.subsample,
            )
            n_total += 1
            if success:
                n_success += 1
            keep = success or args.keep_failures
            if keep and len(obs) > 0:
                all_obs.append(obs)
                all_actions.append(actions)
                all_meta.append((n, seed, len(obs), int(success), total_steps))
            # ASCII markers: both branches used to be the same empty string,
            # so the log could not distinguish success from failure.
            tag = "ok" if success else "FAIL"
            print(f"  [{tag}] n={n:>2d} seed={seed:>2d} steps={total_steps:>6d} "
                  f"logged={len(obs):>5d}")

    if not all_obs:
        raise RuntimeError("No trajectories kept — try --keep-failures.")

    obs = np.concatenate(all_obs, axis=0)
    actions = np.concatenate(all_actions, axis=0)
    meta = np.array(all_meta, dtype=np.int32)

    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    np.savez(args.out, obs=obs, actions=actions, meta=meta)

    elapsed = time.time() - t_start
    # n_total > 0 here: at least one trajectory was kept above.
    print(f"\n=== {n_success}/{n_total} trajectories successful ({100*n_success/n_total:.0f}%) ===")
    print(f"=== {len(obs)} transitions saved to {args.out} ===")
    print(f"=== obs={obs.shape}, actions={actions.shape}, elapsed={elapsed:.0f}s ===")


if __name__ == "__main__":
    main()
+63
View File
@@ -0,0 +1,63 @@
#!/bin/bash
# Launch Webots with N sheep enabled and the chosen controller mode.
# Generates a temporary world file in worlds/field_test.wbt with sheep
# beyond N commented out, sets the env vars the dog controller reads,
# then execs Webots on it.
#
# Usage:
# tools/run_webots.sh [N] [MODE]
# N : number of active sheep (1..10), default 10
# MODE : "rl" | "strombom" | "sequential", default "rl"
#
# Examples:
# tools/run_webots.sh 10 rl # BC-trained RL policy, 10 sheep
# tools/run_webots.sh 5 sequential # the analytic teacher, 5 sheep
# tools/run_webots.sh 3 strombom # canonical baseline, 3 sheep
#
# Notes:
# * The RL mode loads training/runs/bc_pretrained/policy.zip by default.
# Override via HERDING_POLICY_DIR=/path/to/run env var.
# * Conda env "tir" must be active (provides stable-baselines3 + torch).
set -euo pipefail

MAX_SHEEP=10  # sheep slots ("sheep1".."sheep10") this script toggles

N=${1:-$MAX_SHEEP}
MODE=${2:-rl}

# Validate N as a plain decimal integer *before* doing arithmetic on it:
# (( ... )) evaluates arbitrary expressions, and on a malformed value it
# fails with status 1, which a bare range check would read as "in range".
if ! [[ "$N" =~ ^[0-9]+$ ]] || (( 10#$N < 1 || 10#$N > MAX_SHEEP )); then
    echo "N must be 1..10, got $N" >&2; exit 1
fi
N=$((10#$N))  # normalise leading zeros ("08" would otherwise parse as octal)

case "$MODE" in
    rl|strombom|sequential) ;;
    *) echo "MODE must be rl|strombom|sequential, got '$MODE'" >&2; exit 1 ;;
esac

ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
SRC="$ROOT/worlds/field.wbt"
DST="$ROOT/worlds/field_test.wbt"
# Resolve the policy dir once so the banner, the config file and the
# exported variable cannot disagree.
RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_pretrained}"

cp "$SRC" "$DST"

# Comment out sheep N+1..10 by prefixing the matching Sheep { ... } line.
# A C-style loop replaces `seq`: with N=10, BSD/macOS `seq 11 10` counts
# *down* and would disable sheep 10. `-i.bak` plus cleanup is the form of
# in-place editing accepted by both GNU and BSD sed.
for (( i = N + 1; i <= MAX_SHEEP; i++ )); do
    sed -i.bak "s|^Sheep .* \"sheep${i}\".*|# &|" "$DST"
    rm -f "$DST.bak"
done

# grep -c exits 1 when the count is 0; keep `set -e` from aborting there.
active=$(grep -c '^Sheep' "$DST" || true)
if (( active != N )); then
    echo "warning: requested $N sheep but $active 'Sheep' lines are active in $DST" >&2
fi

echo "------------------------------------------------------------"
echo "World      : $DST"
echo "Mode       : $MODE"
echo "Sheep      : $active active"
echo "Policy dir : $RESOLVED_POLICY_DIR"
echo "------------------------------------------------------------"

# Webots strips HERDING_* env vars from controller subprocesses in some
# setups, so we also write a runtime config file the controller reads.
cat > "$ROOT/herding_runtime.cfg" <<EOF
HERDING_MODE=$MODE
HERDING_POLICY_DIR=$RESOLVED_POLICY_DIR
EOF

export HERDING_MODE="$MODE"
export HERDING_POLICY_DIR="$RESOLVED_POLICY_DIR"

exec webots "$DST"
+115
View File
@@ -0,0 +1,115 @@
# Shepherd Herding — Training & Inference
This directory holds the Gymnasium environment, PPO training script, and
evaluation harness for the RL shepherd-dog policy. The Webots controller
in `controllers/shepherd_dog/` loads the resulting policy at inference
time when launched with `HERDING_MODE=rl`.
## Layout
```
training/
├── herding_env.py # gymnasium.Env — the dog is the agent
├── train_ppo.py # SB3 PPO entry point (vec envs, eval, curriculum)
├── eval.py # rollout success-rate / time-to-pen across flock sizes
├── parity_test.py # smoke test: shapes, determinism, baseline rollout
├── configs/ppo_default.yaml
├── runs/ # tensorboard + checkpoints (gitignored)
└── requirements.txt
```
## Setup
```bash
python -m venv .venv && source .venv/bin/activate
pip install -r training/requirements.txt
```
CPU is the default and also the recommended device — SB3's PPO with an
MLP policy of this size runs faster on CPU than on GPU because the
bottleneck is rollout collection, not gradient compute. The 16 SubprocVecEnv
workers saturate ~16 CPU cores. To force CUDA anyway, pass `--device cuda`.
## Train
```bash
# Full curriculum (1 → 10 sheep). The default config sets `total_timesteps`
# to 10M steps (the schedule reaches 10 sheep at 9M); ~23h on a single GPU.
python -m training.train_ppo \
--config training/configs/ppo_default.yaml \
--out-dir training/runs/baseline
```
Outputs:
- `training/runs/baseline/best/best_model.zip` — best eval checkpoint
- `training/runs/baseline/best/vecnormalize.pkl` — observation stats
- `training/runs/baseline/checkpoints/ppo_*.zip` — periodic checkpoints
- `training/runs/baseline/tb/` — TensorBoard logs (`tensorboard --logdir`)
To resume:
```bash
python -m training.train_ppo --resume training/runs/baseline/checkpoints/ppo_500000_steps.zip
```
## Evaluate
```bash
# RL policy
python -m training.eval --policy training/runs/baseline/best
# Strömbom baseline
python -m training.eval --policy strombom
```
Prints success rate, mean steps, and mean penned-count per flock size.
Use the same `--n-seeds` for both to get a fair RL-vs-Strömbom A/B.
## Parity / smoke test
```bash
python -m training.parity_test
```
Checks observation/action shapes, deterministic seeding, the curriculum
sampler, and a 400-step Strömbom rollout. Run this before every long
training job — catches the boring class of bugs in seconds.
## Run the policy in Webots
1. Train (above) — produces `training/runs/<name>/best/`.
2. Export the dog controller's environment variables in the shell, then launch Webots from that same shell:
```bash
export HERDING_MODE=rl
export HERDING_POLICY_DIR=$(pwd)/training/runs/baseline/best
webots worlds/field.wbt
```
Or set them via Webots' controller args / a `.wbproj` if you prefer.
3. To force the Strömbom baseline (same world, same controller):
```bash
export HERDING_MODE=strombom
webots worlds/field.wbt
```
If `HERDING_MODE=rl` but the policy can't be loaded (SB3 not installed,
zip missing, etc.), the controller logs the error and falls back to
Strömbom automatically.
## Curriculum knobs
The default schedule in `configs/ppo_default.yaml` widens
`max_n_sheep` (and the spawn `difficulty`) over training. Each reset
samples `n_sheep ~ U[1, max_n_sheep]`, so by the end of training the
policy has seen every flock size from 1 to 10, with smaller flocks seen
more often because they are sampled from the first stage onwards. To pin
a specific size, instantiate the env with `HerdingEnv(n_sheep=N)` (see
`eval.py`).
## Reward shaping
Weights live in class attributes on `HerdingEnv`. Tune from the 1-sheep
curriculum first — if the dog can't herd a single sheep cleanly, raising
`W_PROGRESS` or lowering `W_TIME` is usually the fix. For multi-sheep
collapse modes (dog spins between sheep), increase `W_COMPACT` so
tightening the flock pays.
View File
+218
View File
@@ -0,0 +1,218 @@
"""Behavior cloning of the sequential teacher into an SB3-compatible policy.
Trains the policy network (mean-action head) of an SB3 ``MlpPolicy`` to
mimic the demonstrations collected by ``tools.collect_demos``. The
saved zip is loadable via ``PPO.load(...)`` and can be passed to
``train_ppo.py --resume`` for fine-tuning.
Why this works: the teacher (sequential single-target driving) solves
n=10 at 80%+ in our env. BC gives the RL a competent starting policy,
so PPO doesn't have to discover behavior from scratch — it only has to
*refine* the teacher's strategy via the sparse pen reward.
Usage::
python -m training.bc_pretrain \\
--demos training/demos.npz \\
--out training/runs/bc_pretrained
"""
from __future__ import annotations
import argparse
import os
import sys
import time
from pathlib import Path
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from training.herding_env import HerdingEnv
def build_model(net_arch_pi, net_arch_vf, log_std_init: float):
    """Construct an untrained SB3 PPO to receive the cloned weights.

    The network layout is meant to match the one train_ppo uses. Only the
    policy object matters for BC; PPO's own training loop is never run.

    Args:
        net_arch_pi: Hidden layer widths of the actor network.
        net_arch_vf: Hidden layer widths of the critic network.
        log_std_init: Initial log standard deviation of the action noise.

    Returns:
        ``(model, env)`` — the PPO instance and the single-env
        ``DummyVecEnv`` it was built against.
    """
    def _make_env():
        return HerdingEnv()

    vec_env = DummyVecEnv([_make_env])
    policy_kwargs = {
        "net_arch": {"pi": net_arch_pi, "vf": net_arch_vf},
        "log_std_init": log_std_init,
    }
    ppo = PPO("MlpPolicy", vec_env, policy_kwargs=policy_kwargs, verbose=0)
    return ppo, vec_env
def policy_forward_mean(policy, obs_batch):
    """Compute the policy's deterministic (mean) action for a batch.

    The mean is not exposed directly by the policy object (it normally sits
    behind a Distribution wrapper), so the forward path is replayed by hand:
    ``extract_features`` → ``mlp_extractor`` → ``action_net``.

    Args:
        policy: Object providing ``extract_features``, ``mlp_extractor`` and
            ``action_net``.
        obs_batch: Batch of observations accepted by ``extract_features``.

    Returns:
        Whatever ``policy.action_net`` returns for the actor latent.
    """
    extracted = policy.extract_features(obs_batch)
    # extract_features may hand back a (pi_features, vf_features) pair;
    # only the actor half is needed here.
    actor_features = extracted[0] if isinstance(extracted, tuple) else extracted
    actor_latent, _critic_latent = policy.mlp_extractor(actor_features)
    return policy.action_net(actor_latent)
def main():
    """Clone the teacher demos into an SB3 policy and save it as ``policy.zip``.

    Pipeline: parse CLI args, load ``obs`` / ``actions`` / ``meta`` from the
    demo ``.npz``, split into train/validation, fit the policy's mean-action
    head with an MSE + cosine loss, then write ``<out>/policy.zip``.

    The saved weights are those of the epoch with the lowest validation MSE
    (the same figure reported as ``best_val_mse``). When there is no
    validation data the final-epoch weights are saved instead.

    Raises:
        RuntimeError: If the demo file contains no observations.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--demos", default="training/demos.npz")
    parser.add_argument("--out", default="training/runs/bc_pretrained")
    parser.add_argument("--epochs", type=int, default=60)
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--val-split", type=float, default=0.1)
    parser.add_argument("--net-arch", default="256,256",
                        help="Comma-separated hidden layer widths.")
    parser.add_argument("--log-std-init", type=float, default=0.5)
    parser.add_argument("--cos-weight", type=float, default=1.0,
                        help="Weight on (1 - cosine similarity) loss term. "
                             "MSE alone shrinks policy output toward zero "
                             "(zero-magnitude action minimises mean squared "
                             "error against ±1 targets); cos loss keeps "
                             "the action pointed correctly even at small "
                             "magnitudes.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--device", default="cpu")
    args = parser.parse_args()

    # A split of 1.0 would leave the training set empty, which the shuffling
    # DataLoader rejects with an unhelpful error.
    if not 0.0 <= args.val_split < 1.0:
        parser.error("--val-split must be in [0, 1)")

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # --- Load demos ---
    print(f"[bc] loading demos from {args.demos}")
    # The npz archive keeps a file handle open; the context manager releases
    # it as soon as the arrays have been read out.
    with np.load(args.demos) as data:
        obs = data["obs"].astype(np.float32)
        actions = data["actions"].astype(np.float32)
        meta = data["meta"]
    print(f"[bc] obs={obs.shape} actions={actions.shape} trajectories={len(meta)}")
    if obs.size == 0:
        raise RuntimeError("Empty demo file.")

    # Action sanity check — sequential outputs unit vectors.
    a_norms = np.linalg.norm(actions, axis=1)
    print(f"[bc] action L2 norm: mean={a_norms.mean():.3f} "
          f"min={a_norms.min():.3f} max={a_norms.max():.3f}")

    # --- Train/val split ---
    n = len(obs)
    perm = np.random.permutation(n)
    n_val = int(n * args.val_split)
    val_idx, train_idx = perm[:n_val], perm[n_val:]
    print(f"[bc] train={len(train_idx)} val={len(val_idx)}")

    obs_t = torch.from_numpy(obs)
    act_t = torch.from_numpy(actions)
    train_loader = DataLoader(
        TensorDataset(obs_t[train_idx], act_t[train_idx]),
        batch_size=args.batch_size, shuffle=True,
    )
    val_loader = DataLoader(
        TensorDataset(obs_t[val_idx], act_t[val_idx]),
        batch_size=args.batch_size, shuffle=False,
    )

    # --- Build model ---
    net_arch_pi = [int(x) for x in args.net_arch.split(",")]
    net_arch_vf = net_arch_pi[:]
    model, env = build_model(net_arch_pi, net_arch_vf, args.log_std_init)
    policy = model.policy.to(args.device)
    optimizer = optim.Adam(policy.parameters(), lr=args.lr)

    # --- Train ---
    print(f"[bc] training: epochs={args.epochs} batch={args.batch_size} "
          f"lr={args.lr} device={args.device}")
    t_start = time.time()
    best_val = float("inf")
    best_state = None  # snapshot of the weights at the best validation epoch

    def combined_loss(pred, target):
        """Return (MSE + weighted cosine loss, MSE value, mean cosine sim)."""
        mse = nn.functional.mse_loss(pred, target)
        p_norm = pred.norm(dim=1).clamp_min(1e-6)
        t_norm = target.norm(dim=1).clamp_min(1e-6)
        cos_sim = (pred * target).sum(dim=1) / (p_norm * t_norm)
        cos_loss = (1.0 - cos_sim).mean()
        return mse + args.cos_weight * cos_loss, mse.item(), cos_sim.mean().item()

    for epoch in range(args.epochs):
        policy.train()
        train_mse_total, train_cos_total, train_count = 0.0, 0.0, 0
        for ob_batch, act_batch in train_loader:
            ob_batch = ob_batch.to(args.device)
            act_batch = act_batch.to(args.device)
            optimizer.zero_grad()
            mean_action = policy_forward_mean(policy, ob_batch)
            loss, mse_val, cos_val = combined_loss(mean_action, act_batch)
            loss.backward()
            optimizer.step()
            bs = ob_batch.size(0)
            train_mse_total += mse_val * bs
            train_cos_total += cos_val * bs
            train_count += bs
        train_mse = train_mse_total / max(1, train_count)
        train_cos = train_cos_total / max(1, train_count)

        policy.eval()
        val_total, val_count = 0.0, 0
        cos_sim_total = 0.0
        with torch.no_grad():
            for ob_batch, act_batch in val_loader:
                ob_batch = ob_batch.to(args.device)
                act_batch = act_batch.to(args.device)
                mean_action = policy_forward_mean(policy, ob_batch)
                bs = ob_batch.size(0)
                val_total += nn.functional.mse_loss(
                    mean_action, act_batch, reduction="sum",
                ).item()
                # Cosine similarity in action space — useful sanity for
                # "is the policy pointing the same way as the teacher?".
                m_norm = mean_action.norm(dim=1).clamp_min(1e-6)
                a_norm = act_batch.norm(dim=1).clamp_min(1e-6)
                cos = (mean_action * act_batch).sum(dim=1) / (m_norm * a_norm)
                cos_sim_total += cos.sum().item()
                val_count += bs
        # Summed squared error → per-element mean (samples × action dims).
        val_mse = val_total / max(1, val_count) / actions.shape[1]
        cos_sim = cos_sim_total / max(1, val_count)
        print(f" epoch {epoch+1:>2d}/{args.epochs} "
              f"train_mse={train_mse:.4f} train_cos={train_cos:+.3f} "
              f"val_mse={val_mse:.4f} val_cos={cos_sim:+.3f}")
        if val_mse < best_val:
            best_val = val_mse
            # Only snapshot when validation data actually exists; with an
            # empty split val_mse is a meaningless 0.0 on the first epoch.
            if val_count > 0:
                best_state = {
                    key: tensor.detach().clone()
                    for key, tensor in policy.state_dict().items()
                }

    elapsed = time.time() - t_start
    print(f"[bc] done in {elapsed:.0f}s best_val_mse={best_val:.4f}")

    # Save the weights the reported best_val_mse refers to, not whatever the
    # last epoch happened to leave behind.
    if best_state is not None:
        policy.load_state_dict(best_state)

    # --- Save ---
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    model.save(out_dir / "policy.zip")
    env.close()
    print(f"[bc] saved policy to {out_dir / 'policy.zip'}")
    print(f"\n[bc] verify with: "
          f"python -m training.eval --policy {out_dir}")
if __name__ == "__main__":
main()
-14
View File
@@ -1,14 +0,0 @@
{
"W_PER_SHEEP": 2.0,
"W_ALIGN": 0.05,
"W_PEN_BONUS": 10.0,
"W_COMPLETE": 100.0,
"W_STEP_COST": 0.02,
"W_COMPACT": 0.0,
"W_WALL_TOUCH": 0.0,
"WALL_TOUCH_BUFFER": 0.4,
"ALIGN_SHAPE": "standoff",
"ALIGN_GATED": true,
"ENTRY_AWARE": true,
"ent_coef": 0.02
}
View File
+52
View File
@@ -0,0 +1,52 @@
# PPO hyperparameters for the herding env. Tuned for a 28-D obs / 2-D
# continuous action space with 16 parallel envs on GPU. These are SB3
# defaults nudged toward longer credit assignment (gamma=0.995) and a
# slightly higher entropy bonus to keep exploration alive while curriculum
# expands the flock size.
# --- PPO ---
learning_rate: 3.0e-4
n_steps: 2048 # rollout length per env before each update
batch_size: 256
n_epochs: 10
gamma: 0.995
gae_lambda: 0.95
clip_range: 0.2
ent_coef: 0.05 # was 0.01 — earlier runs collapsed to ~0 actions
vf_coef: 0.5
max_grad_norm: 0.5
target_kl: null # disable early-stop on KL
# --- Network ---
policy: MlpPolicy
net_arch_pi: [128, 128]
net_arch_vf: [128, 128]
log_std_init: 0.5 # std≈1.6 instead of default 1.0 — more exploration
# --- Training schedule ---
total_timesteps: 10_000_000
n_envs: 16
checkpoint_freq: 500_000 # in env steps
eval_freq: 100_000 # in env steps
n_eval_episodes: 20
# --- Curriculum (max-n_sheep schedule, in env steps) ---
# Flock-size curriculum: each entry raises the env's max_n_sheep to the
# given value once training reaches `step`. The env samples n_sheep
# uniformly from [1, max_n_sheep] on each reset, so this widens the
# distribution gradually rather than swapping fixed sizes.
#
# State-space curriculum: `difficulty` controls the sheep spawn area
# (0 = sheep spawn just north of the gate, 1 = sheep spawn anywhere in
# the field). It is scheduled in the same entries as the flock size.
#
# The two together let the policy first learn "what penning looks like"
# in a regime where random exploration reliably triggers it, then
# gradually generalise to the deployment distribution.
curriculum:
- { step: 0, max_n_sheep: 1, difficulty: 0.0 }
- { step: 1_000_000, max_n_sheep: 1, difficulty: 0.3 }
- { step: 2_000_000, max_n_sheep: 2, difficulty: 0.5 }
- { step: 4_000_000, max_n_sheep: 3, difficulty: 0.8 }
- { step: 6_000_000, max_n_sheep: 5, difficulty: 1.0 }
- { step: 8_000_000, max_n_sheep: 8, difficulty: 1.0 }
- { step: 9_000_000, max_n_sheep: 10, difficulty: 1.0 }
Binary file not shown.
+136
View File
@@ -0,0 +1,136 @@
"""Evaluate a trained PPO policy (or the Strömbom baseline) on the env.
Reports success rate and time-to-pen across a fixed seed grid for each
flock size 1..MAX_SHEEP. Used to produce the M5 quantitative comparison
table mentioned in plan.md.
Usage::
python -m training.eval --policy training/runs/latest/best
python -m training.eval --policy strombom
"""
from __future__ import annotations
import argparse
import os
import sys
from pathlib import Path
from statistics import mean, stdev
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
import numpy as np
from herding.geometry import MAX_SHEEP, PEN_ENTRY
from herding.strombom import compute_action as strombom_action
from herding.sequential import compute_action as sequential_action
from training.herding_env import HerdingEnv
def rollout(env: HerdingEnv, predict_fn, max_steps: int) -> dict:
    """Play a single episode and summarise how it ended.

    Args:
        env: Environment to reset and step.
        predict_fn: Callable ``(env, obs) -> action``.
        max_steps: Hard cap on the number of ``env.step`` calls.

    Returns:
        Dict with ``success``, ``steps`` and ``n_penned``. If the episode
        ends (terminated or truncated) the values come from the final
        ``info`` dict, with fallbacks; if the cap is hit first, ``success``
        is False and ``n_penned`` is counted from ``env.sheep_penned``.
    """
    obs, _ = env.reset()
    steps_taken = 0
    while steps_taken < max_steps:
        chosen_action = predict_fn(env, obs)
        obs, _reward, terminated, truncated, info = env.step(chosen_action)
        steps_taken += 1
        if not (terminated or truncated):
            continue
        return {
            "success": bool(info.get("is_success", False)),
            "steps": info.get("steps", steps_taken),
            "n_penned": info.get("n_penned", 0),
        }
    # Step budget exhausted without the env signalling an end.
    return {
        "success": False,
        "steps": max_steps,
        "n_penned": int(env.sheep_penned.sum()),
    }
def make_analytic_predictor(action_fn):
    """Adapt an analytic controller to the ``predict(env, obs)`` interface.

    Args:
        action_fn: Callable ``(dog_xy, positions, PEN_ENTRY)`` returning a
            ``(vx, vy, mode)`` triple.

    Returns:
        A predictor that ignores the observation, reads the dog position and
        the not-yet-penned sheep positions straight from the env, and returns
        the controller's velocity as a float32 array ``[vx, vy]``.
    """
    def _predict(env, _obs):
        unpenned = {}
        for idx in range(env.n_sheep):
            if env.sheep_penned[idx]:
                continue
            unpenned[f"s{idx}"] = (float(env.sheep_x[idx]), float(env.sheep_y[idx]))
        dog_xy = (env.dog_x, env.dog_y)
        vel_x, vel_y, _mode = action_fn(dog_xy, unpenned, PEN_ENTRY)
        return np.array([vel_x, vel_y], dtype=np.float32)

    return _predict
# Backwards-compat alias.
def make_strombom_predictor():
    """Return the analytic predictor bound to the Strömbom controller."""
    predictor = make_analytic_predictor(strombom_action)
    return predictor
def make_policy_predictor(model, vecnorm):
    """Wrap a trained model as a ``predict(env, obs)`` callable.

    Args:
        model: Object whose ``predict(obs_batch, deterministic=True)``
            returns an ``(actions, state)`` pair.
        vecnorm: Optional normaliser exposing ``normalize_obs``; ``None``
            feeds observations through unchanged.

    Returns:
        A predictor that ignores the env, turns the observation into a
        float32 batch of one, normalises it if requested, and returns the
        first action of the model's deterministic prediction.
    """
    def _predict(_env, obs):
        batch = np.asarray(obs, dtype=np.float32).reshape(1, -1)
        if vecnorm is not None:
            batch = vecnorm.normalize_obs(batch)
        actions, _state = model.predict(batch, deterministic=True)
        return actions[0]

    return _predict
def main():
    """Evaluate one controller across flock sizes and print a results table.

    ``--policy`` selects the controller: ``strombom`` or ``sequential`` for
    the analytic baselines, otherwise a path to an SB3 run directory (searched
    for ``best_model.zip``, ``policy.zip``, ``final.zip``) or directly to a
    checkpoint zip. If a ``vecnormalize.pkl`` is found next to the checkpoint
    it is loaded and used to normalise observations.

    For each flock size 1..``--max-flock`` the env is rolled out once per seed
    in ``range(--n-seeds)``; success rate, mean steps and mean penned count
    are printed.

    Raises:
        FileNotFoundError: If a run directory contains none of the known
            checkpoint names.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--policy", required=True,
                        help="'strombom', 'sequential', or path to an SB3 "
                             "run directory or checkpoint zip.")
    parser.add_argument("--n-seeds", type=int, default=10)
    parser.add_argument("--max-steps", type=int, default=5000)
    parser.add_argument("--max-flock", type=int, default=MAX_SHEEP)
    # 1.0 = deployment distribution (sheep anywhere in field).
    # Lower values use the training-curriculum spawn band (sheep near gate).
    parser.add_argument("--difficulty", type=float, default=1.0)
    args = parser.parse_args()

    # statistics.mean() raises on an empty list; fail with a usage error.
    if args.n_seeds < 1:
        parser.error("--n-seeds must be >= 1")

    if args.policy == "strombom":
        predict = make_analytic_predictor(strombom_action)
    elif args.policy == "sequential":
        predict = make_analytic_predictor(sequential_action)
    else:
        from stable_baselines3 import PPO

        run = Path(args.policy)
        # Resolve to a zip: directory of checkpoints, or a direct zip path.
        if run.is_file():
            zip_path = run
            # The stats file sits beside the checkpoint. Always look there,
            # including when the zip lives in a "best" directory; otherwise
            # the policy would silently run on un-normalised observations.
            vn_candidates = [run.parent / "vecnormalize.pkl"]
        else:
            for name in ("best_model.zip", "policy.zip", "final.zip"):
                if (run / name).exists():
                    zip_path = run / name
                    break
            else:
                raise FileNotFoundError(
                    f"No checkpoint found in {run} (tried best_model.zip, "
                    f"policy.zip, final.zip)"
                )
            vn_candidates = [run / "vecnormalize.pkl"]
            if run.parent.name != "best":
                vn_candidates.append(run.parent / "vecnormalize.pkl")
        model = PPO.load(str(zip_path), device="auto")

        vecnorm = None
        vn_path = next((p for p in vn_candidates if p.exists()), None)
        if vn_path is not None:
            import pickle

            # NOTE(security): pickle.load executes arbitrary code from the
            # file. Only point --policy at run directories you trust.
            with open(vn_path, "rb") as f:
                vecnorm = pickle.load(f)
            vecnorm.training = False
            vecnorm.norm_reward = False
        predict = make_policy_predictor(model, vecnorm)

    print(f"{'n_sheep':>8} {'success%':>10} {'mean_steps':>12} {'mean_penned':>12}")
    print("-" * 46)
    for n in range(1, args.max_flock + 1):
        successes, steps, penned = [], [], []
        for seed in range(args.n_seeds):
            env = HerdingEnv(n_sheep=n, max_steps=args.max_steps,
                             difficulty=args.difficulty, seed=seed)
            try:
                r = rollout(env, predict, args.max_steps)
            finally:
                # One env per seed: release it before building the next.
                env.close()
            successes.append(int(r["success"]))
            steps.append(r["steps"])
            penned.append(r["n_penned"])
        sr = 100.0 * mean(successes)
        ms = mean(steps)
        mp = mean(penned)
        print(f"{n:>8d} {sr:>9.1f}% {ms:>12.0f} {mp:>12.2f}")
if __name__ == "__main__":
main()
+354 -706
View File
File diff suppressed because it is too large Load Diff
+75 -297
View File
@@ -1,318 +1,96 @@
""" """Parity smoke-test for the herding env.
Parity test: verify 2D training env matches Webots controller implementations.
Tests: Verifies (a) all imports resolve, (b) the env's reset/step contract is
1. Observation building: HerdingEnv._obs() vs shepherd_dog_rl.build_obs() correct, (c) deterministic seeds give deterministic trajectories, and
2. Dog drive: HerdingEnv._step_dog_substep() vs shepherd_dog_rl.drive() math (d) the Strömbom baseline can drive the env without crashing.
3. Sheep drive: HerdingEnv._sheep_drive() vs sheep.py drive() math
Run::
python -m training.parity_test
""" """
import sys from __future__ import annotations
import os import os
import math import sys
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
import numpy as np import numpy as np
# Make imports work from project root from herding.geometry import MAX_SHEEP, PEN_ENTRY
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from herding.obs import OBS_DIM
sys.path.insert(0, os.path.join(os.path.dirname(__file__))) from herding.strombom import compute_action
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "controllers", "shepherd_dog_rl")) from training.herding_env import HerdingEnv
from herding_env import HerdingEnv
# Re-implement the Webots functions standalone (no Webots dependency)
FIELD = 15.0
PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
PEN_ENTRY = np.array([11.5, -8.0], dtype=np.float32)
PEN_X = (10.0, 13.0)
PEN_Y = (-15.0, -8.0)
ENTRY_AWARE = True
def webots_build_obs(dog_pos, sheep_positions, n_sheep, dog_heading): def test_obs_action_shapes():
"""Standalone version of shepherd_dog_rl.py build_obs().""" env = HerdingEnv(n_sheep=3, seed=0)
D = 2 * FIELD obs, info = env.reset()
active_pos = np.array( assert obs.shape == (OBS_DIM,), obs.shape
[p for p in sheep_positions assert obs.dtype == np.float32
if not (PEN_X[0] < p[0] < PEN_X[1] and PEN_Y[0] < p[1] < PEN_Y[1])], obs2, r, term, trunc, info = env.step(np.array([0.5, 0.0], dtype=np.float32))
dtype=np.float32 assert obs2.shape == (OBS_DIM,)
) assert isinstance(r, float)
n_active = len(active_pos) assert isinstance(term, bool) and isinstance(trunc, bool)
if n_active > 0: print("[ok] shapes")
com = active_pos.mean(axis=0)
d_from_com = np.linalg.norm(active_pos - com, axis=1)
sorted_idx = np.argsort(d_from_com)[::-1]
radius = float(d_from_com[sorted_idx[0]])
def nth(n):
return active_pos[sorted_idx[n]] if len(sorted_idx) > n else com
far1, far2, far3 = nth(0), nth(1), nth(2)
else:
com = PEN_CENTER.copy()
radius = 0.0
far1 = far2 = far3 = PEN_CENTER.copy()
frac_active = n_active / max(n_sheep, 1)
pen_ref = PEN_ENTRY if ENTRY_AWARE else PEN_CENTER
return np.array([
dog_pos[0] / FIELD, dog_pos[1] / FIELD,
(com[0] - dog_pos[0]) / D, (com[1] - dog_pos[1]) / D,
(far1[0] - com[0]) / D, (far1[1] - com[1]) / D,
(far2[0] - com[0]) / D, (far2[1] - com[1]) / D,
(far3[0] - com[0]) / D, (far3[1] - com[1]) / D,
(pen_ref[0] - com[0]) / D, (pen_ref[1] - com[1]) / D,
(pen_ref[0] - far1[0]) / D, (pen_ref[1] - far1[1]) / D,
radius / D,
frac_active,
math.cos(dog_heading), math.sin(dog_heading),
], dtype=np.float32)
def webots_dog_drive(heading, speed_ms, wheel_r=0.038, k_turn=4.0, def test_reset_determinism():
motor_max=70.0, axle_track=0.28): """Reset with the same seed should give the same initial observation.
"""Standalone version of shepherd_dog_rl.py drive() kinematics.
Returns (v_linear, omega, left_w, right_w). We don't require step-determinism — PPO doesn't need it, and chasing
bit-exactness through the flocking jitter isn't worth the complexity.
""" """
err = math.atan2(math.sin(heading), math.cos(heading)) env_a = HerdingEnv(n_sheep=3, seed=42)
fwd_ms = speed_ms * max(0.0, math.cos(err)) env_b = HerdingEnv(n_sheep=3, seed=42)
fwd_rad = fwd_ms / wheel_r obs_a, _ = env_a.reset(seed=42)
turn = k_turn * err obs_b, _ = env_b.reset(seed=42)
l = max(-motor_max, min(motor_max, fwd_rad - turn)) assert np.allclose(obs_a, obs_b), "Reset is non-deterministic for same seed"
r = max(-motor_max, min(motor_max, fwd_rad + turn)) print("[ok] reset determinism")
v = wheel_r * 0.5 * (r + l)
w = (wheel_r / axle_track) * (r - l)
return v, w, l, r
def webots_sheep_drive(heading, speed_rad, wheel_r=0.031, k_turn=4.0, def test_curriculum_n_sheep_varies():
motor_max=22.0, axle_track=0.20): env = HerdingEnv(seed=0)
"""Standalone version of sheep.py drive() kinematics.""" sizes = set()
err = math.atan2(math.sin(heading), math.cos(heading)) for _ in range(40):
fwd = speed_rad * max(0.0, math.cos(err)) _, info = env.reset()
k = 4.0 sizes.add(info["n_sheep"])
l = max(-motor_max, min(motor_max, fwd - k * err)) assert 1 in sizes
r = max(-motor_max, min(motor_max, fwd + k * err)) assert max(sizes) <= MAX_SHEEP
v = wheel_r * 0.5 * (r + l) print(f"[ok] curriculum sampling — saw n_sheep in {sorted(sizes)}")
w = (wheel_r / axle_track) * (r - l)
return v, w, l, r
def test_obs_parity(): def test_strombom_drives_env():
"""Test that build_obs matches between 2D env and Webots controller.""" """Quick functional check that the analytic baseline can play the env
print("=== Test 1: Observation Parity ===") without exploding. Not a success-rate test — just no errors / NaNs."""
env = HerdingEnv(n_sheep=3) env = HerdingEnv(n_sheep=2, max_steps=400, seed=1)
# Set ENTRY_AWARE to match our webots constant obs, _ = env.reset()
env.ENTRY_AWARE = ENTRY_AWARE for t in range(400):
env.reset(seed=42) positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
for i in range(env.n_sheep)
# Manually set positions for a controlled test if not env.sheep_penned[i]}
env.dog_pos = np.array([5.0, 3.0], dtype=np.float32) if not positions:
env.dog_heading = 1.2 break
env.sheep_pos[0] = np.array([0.0, 0.0], dtype=np.float32) vx, vy, _mode = compute_action((env.dog_x, env.dog_y), positions, PEN_ENTRY)
env.sheep_pos[1] = np.array([2.0, -1.0], dtype=np.float32) obs, r, term, trunc, info = env.step(np.array([vx, vy], dtype=np.float32))
env.sheep_pos[2] = np.array([11.5, -11.5], dtype=np.float32) # penned assert np.isfinite(obs).all(), f"NaN/Inf in obs at step {t}"
env.penned[0] = False assert np.isfinite(r), f"NaN reward at step {t}"
env.penned[1] = False if term or trunc:
env.penned[2] = True break
print(f"[ok] strombom rollout — final n_penned={int(env.sheep_penned.sum())}/{env.n_sheep} after {env.steps} steps")
obs_2d = env._obs()
# Build equivalent Webots observation
sheep_positions = [
env.sheep_pos[0].tolist(),
env.sheep_pos[1].tolist(),
env.sheep_pos[2].tolist(),
]
obs_webots = webots_build_obs(
env.dog_pos, sheep_positions, 3, env.dog_heading
)
max_diff = float(np.max(np.abs(obs_2d - obs_webots)))
print(f" Max element-wise diff: {max_diff:.2e}")
if max_diff < 1e-6:
print(" PASS: Observations match")
else:
print(" FAIL: Observations differ!")
for i in range(18):
if abs(obs_2d[i] - obs_webots[i]) > 1e-6:
print(f" dim {i}: 2d={obs_2d[i]:.6f} webots={obs_webots[i]:.6f}")
return max_diff < 1e-6
def test_dog_drive_parity(): def main():
"""Test that dog diff-drive matches Webots controller.""" test_obs_action_shapes()
print("\n=== Test 2: Dog Drive Parity ===") test_reset_determinism()
env = HerdingEnv(n_sheep=1) test_curriculum_n_sheep_varies()
env.reset(seed=42) test_strombom_drives_env()
print("\nAll parity checks passed.")
all_pass = True
test_cases = [
# (heading_error, speed_ms) — target_heading relative to current heading
(0.0, 2.5), # aligned, full speed
(0.5, 2.5), # 30deg error
(1.5, 2.5), # ~86deg error
(3.14, 2.5), # ~180deg error — should spin in place
(0.0, 0.5), # aligned, slow
(0.3, 1.0), # small error, medium speed
]
for heading_err, speed_ms in test_cases:
env.dog_heading = 0.0
target_heading = heading_err
action = np.array([
math.cos(target_heading), math.sin(target_heading)
], dtype=np.float32) * (speed_ms / env.DOG_SPEED)
# 2D env step
dbg = env._step_dog_substep(action, 0.016)
v_2d = dbg["v"]
w_2d = dbg["w"]
l_2d = dbg["left_w"]
r_2d = dbg["right_w"]
# Webots equivalent
v_w, w_w, l_w, r_w = webots_dog_drive(heading_err, speed_ms)
diffs = {
"v": abs(v_2d - v_w),
"w": abs(w_2d - w_w),
"left": abs(l_2d - l_w),
"right": abs(r_2d - r_w),
}
max_diff = max(diffs.values())
ok = max_diff < 1e-6
status = "PASS" if ok else "FAIL"
print(f" err={heading_err:.2f} spd={speed_ms:.1f}: {status} (max_diff={max_diff:.2e})")
if not ok:
for k, d in diffs.items():
if d > 1e-6:
print(f" {k}: 2d={eval(k+'_2d'):.6f} webots={eval(k+'_w'):.6f}")
all_pass = False
return all_pass
def test_sheep_drive_parity():
    """Check the sheep diff-drive wheel commands against the Webots formula.

    For each (heading error, speed) case the left/right wheel speeds are
    derived with the 2D-env formula and compared to ``webots_sheep_drive``.

    NOTE(review): the wheel speeds on the "2D" side are re-derived here
    rather than read back from ``env._sheep_drive``, so the env call below
    only proves that the step runs without raising. Confirm whether the env
    should expose its wheel speeds so the comparison covers it.

    Returns:
        True if every case matches to within 1e-6, else False.
    """
    print("\n=== Test 3: Sheep Drive Parity ===")
    env = HerdingEnv(n_sheep=1)
    env.reset(seed=42)
    all_pass = True
    test_cases = [
        # (heading_error, speed_rad)
        (0.0, 20.0),   # aligned, flee speed
        (0.0, 3.0),    # aligned, wander speed
        (0.5, 15.0),   # moderate error
        (1.57, 10.0),  # 90deg — should spin in place
        (3.14, 20.0),  # 180deg — should spin in place fast
        (0.2, 8.0),    # small error, medium speed
    ]
    for heading_err, speed_rad in test_cases:
        env.sheep_heading[0] = 0.0
        env.sheep_pos[0] = np.array([0.0, 0.0], dtype=np.float32)
        target_heading = heading_err
        # 2D env: exercise the drive step (result intentionally unused).
        env._sheep_drive(0, target_heading, speed_rad, 0.016)
        # Webots equivalent; only the wheel speeds are compared.
        _v_w, _w_w, l_w, r_w = webots_sheep_drive(heading_err, speed_rad)
        # For 2D, compute the same intermediate values
        err_2d = (target_heading - 0.0 + np.pi) % (2 * np.pi) - np.pi
        fwd_2d = speed_rad * max(0.0, math.cos(err_2d))
        turn_2d = 4.0 * err_2d
        l_2d = max(-22.0, min(22.0, fwd_2d - turn_2d))
        r_2d = max(-22.0, min(22.0, fwd_2d + turn_2d))
        diffs = {
            "left": abs(l_2d - l_w),
            "right": abs(r_2d - r_w),
        }
        max_diff = max(diffs.values())
        ok = max_diff < 1e-6
        status = "PASS" if ok else "FAIL"
        print(f" err={heading_err:.2f} spd={speed_rad:.1f}: {status} (max_diff={max_diff:.2e})")
        if not ok:
            for k, d in diffs.items():
                if d > 1e-6:
                    print(f" {k}: 2d={l_2d if k=='left' else r_2d:.6f} webots={l_w if k=='left' else r_w:.6f}")
            all_pass = False
    return all_pass
def test_full_trajectory_parity():
    """Compare 50 dog sub-steps of the 2D env with hand-integrated kinematics.

    The same fixed action is applied on both sides. After every sub-step the
    env's dog heading and position are compared with a reference integrated
    here from the differential-drive equations.

    Returns:
        True if the largest position difference stays below 1e-4 m.
    """
    print("\n=== Test 4: Full Trajectory Parity (dog only) ===")
    env = HerdingEnv(n_sheep=1)
    env.reset(seed=42)
    env.dog_pos = np.array([0.0, 0.0], dtype=np.float32)
    env.dog_heading = 0.0
    env.ENTRY_AWARE = ENTRY_AWARE
    action = np.array([0.8, -0.6], dtype=np.float32)  # magnitude 1.0
    dt = 0.016667  # sub_dt

    # Reference-side constants (do not change between sub-steps).
    wheel_radius = 0.038
    axle_track = 0.28
    motor_limit = 70.0
    speed_ms = 1.0 * 2.5
    target_heading = math.atan2(-0.6, 0.8)

    # Reference-side state.
    ref_heading = 0.0
    ref_x, ref_y = 0.0, 0.0
    worst_heading_diff = 0.0
    worst_pos_diff = 0.0

    for _ in range(50):
        # 2D env sub-step
        env._step_dog_substep(action, dt)

        # Reference computation
        delta = target_heading - ref_heading
        err = math.atan2(math.sin(delta), math.cos(delta))
        fwd_ms = speed_ms * max(0.0, math.cos(err))
        fwd_rad = fwd_ms / wheel_radius
        turn = 4.0 * err
        left = max(-motor_limit, min(motor_limit, fwd_rad - turn))
        right = max(-motor_limit, min(motor_limit, fwd_rad + turn))
        v = wheel_radius * 0.5 * (right + left)
        w = (wheel_radius / axle_track) * (right - left)
        advanced = ref_heading + w * dt
        ref_heading = math.atan2(math.sin(advanced), math.cos(advanced))
        ref_x += math.cos(ref_heading) * v * dt
        ref_y += math.sin(ref_heading) * v * dt

        heading_diff = abs(env.dog_heading - ref_heading)
        pos_diff = math.hypot(env.dog_pos[0] - ref_x, env.dog_pos[1] - ref_y)
        worst_heading_diff = max(worst_heading_diff, heading_diff)
        worst_pos_diff = max(worst_pos_diff, pos_diff)

    print(f" Max heading diff over 50 steps: {worst_heading_diff:.2e} rad")
    print(f" Max position diff over 50 steps: {worst_pos_diff:.2e} m")
    ok = worst_pos_diff < 1e-4
    print(f" {'PASS' if ok else 'FAIL'}: Trajectories match")
    return ok
if __name__ == "__main__": if __name__ == "__main__":
results = [] main()
results.append(("Obs parity", test_obs_parity()))
results.append(("Dog drive parity", test_dog_drive_parity()))
results.append(("Sheep drive parity", test_sheep_drive_parity()))
results.append(("Trajectory parity", test_full_trajectory_parity()))
print("\n" + "=" * 50)
print("RESULTS")
print("=" * 50)
all_pass = True
for name, passed in results:
print(f" {name}: {'PASS' if passed else 'FAIL'}")
if not passed:
all_pass = False
print(f"\nOverall: {'ALL PASS' if all_pass else 'SOME FAILURES'}")
env.close()
+8 -6
View File
@@ -1,6 +1,8 @@
gymnasium>=0.29 # Pin major versions; SB3 2.x requires gymnasium and torch >= 1.13.
stable-baselines3>=2.3 gymnasium>=0.29,<2.0
torch>=2.2 stable-baselines3[extra]>=2.3,<3.0
numpy>=1.26 torch>=2.1
matplotlib>=3.8 numpy>=1.24
tensorboard>=2.16 pyyaml>=6.0
tensorboard>=2.14
tqdm>=4.66
-1
View File
@@ -1 +0,0 @@
-392
View File
@@ -1,392 +0,0 @@
"""
PPO training for the herding task with curriculum learning.
Trains from scratch through a 1→max_sheep curriculum, evaluates after each
stage, and auto-generates trajectory/timeseries plots plus a summary chart.
Usage
-----
python train.py # defaults from config.json
python train.py --config my_config.json --max-sheep 5
python train.py --max-sheep 3 --steps-per-stage 1000000
Outputs (in runs/<timestamp>/):
config.json resolved config
final_model.zip trained PPO model
vecnorm.pkl VecNormalize statistics
stage_results.json per-stage evaluation metrics
success_rate.png summary bar chart
eval/ trajectory & timeseries plots per sheep count
"""
import argparse
import json
import os
import time
from copy import deepcopy
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import (
DummyVecEnv,
SubprocVecEnv,
VecNormalize,
)
from herding_env import HerdingEnv
from viz import (
run_and_record,
plot_trajectory,
plot_timeseries,
plot_success_rate,
save_episode_gif,
)
# ── Callbacks ────────────────────────────────────────────────────────────────
class ProgressCallback(BaseCallback):
    """Print a one-line training summary roughly every ``freq`` env steps.

    Tracks per-env running returns and, for finished episodes, a sliding
    window (last 50) of returns and success flags plus cumulative counts.
    An episode counts as a success when its final ``info`` reports
    ``n_penned`` equal to ``n_sheep``.
    """

    def __init__(self, stage_label: str, freq: int = 100_000):
        super().__init__()
        self.stage_label = stage_label
        self.freq = freq
        self._last = 0            # num_timesteps at the previous report
        self._ep_returns = []     # windowed episode returns
        self._ep_success = []     # windowed success flags (0/1)
        self._total_eps = 0
        self._total_success = 0
        self._cur_ret = None      # running return per env, allocated lazily

    def _on_step(self) -> bool:
        """Accumulate rewards, record finished episodes, maybe report."""
        step_rewards = self.locals.get("rewards")
        step_dones = self.locals.get("dones")
        step_infos = self.locals.get("infos", [])
        if step_rewards is None or step_dones is None:
            return True

        n_envs = len(step_rewards)
        if self._cur_ret is None or len(self._cur_ret) != n_envs:
            self._cur_ret = np.zeros(n_envs, dtype=np.float64)
        self._cur_ret += np.asarray(step_rewards, dtype=np.float64)

        for env_idx, finished in enumerate(step_dones):
            if finished:
                self._ep_returns.append(float(self._cur_ret[env_idx]))
                final_info = step_infos[env_idx] if env_idx < len(step_infos) else {}
                all_penned = final_info.get("n_penned", 0) == final_info.get("n_sheep", -1)
                flag = int(all_penned)
                self._ep_success.append(flag)
                self._total_eps += 1
                self._total_success += flag
                self._cur_ret[env_idx] = 0.0
                # Keep only the most recent 50 episodes in the window.
                if len(self._ep_returns) > 50:
                    del self._ep_returns[0]
                    del self._ep_success[0]

        if self.num_timesteps - self._last < self.freq:
            return True

        self._last = self.num_timesteps
        n = len(self._ep_returns)
        nan = float("nan")
        mean_r = float(np.mean(self._ep_returns)) if n else nan
        win_sr = float(np.mean(self._ep_success)) if n else nan
        cum_sr = self._total_success / self._total_eps if self._total_eps else nan
        print(f" ... [{self.stage_label} | "
              f"{self.num_timesteps:>7,} steps | "
              f"ret(last {n})={mean_r:+.2f} "
              f"win_sr={win_sr*100:.0f}% cum_sr={cum_sr*100:.0f}%]",
              flush=True)
        return True
# ── Environment factory ──────────────────────────────────────────────────────
def make_env(n_sheep, seed, max_steps, reward_cfg=None):
    """Return a zero-argument factory that builds one seeded ``HerdingEnv``.

    The factory form is what ``DummyVecEnv`` / ``SubprocVecEnv`` expect:
    construction is deferred until the vec-env calls it.
    """
    def _build():
        herding_env = HerdingEnv(
            n_sheep=n_sheep,
            max_steps=max_steps,
            reward_cfg=reward_cfg,
        )
        # Reset once with the requested seed before handing the env over.
        herding_env.reset(seed=seed)
        return herding_env

    return _build
# ── Failure-mode classification ──────────────────────────────────────────────
COMPACT_RADIUS = 5.0
def _classify(ep_radii, ep_com_dists, n_penned, n_sheep):
if n_penned == n_sheep:
return "SUCCESS"
if min(ep_radii) > COMPACT_RADIUS:
return "NEVER_COMPACT"
first = next(i for i, r in enumerate(ep_radii) if r <= COMPACT_RADIUS)
if min(ep_com_dists[first:]) > 3.0:
return "COMPACT_CANT_DRIVE"
if n_penned == 0:
return "DROVE_NO_SHEEP"
return f"PARTIAL_{n_penned}of{n_sheep}"
# ── Evaluation ───────────────────────────────────────────────────────────────
def evaluate(model, vn_template, n_sheep, n_episodes, max_steps,
             reward_cfg=None):
    """Evaluate ``model`` deterministically at a fixed sheep count.

    A fresh single-env ``DummyVecEnv`` is wrapped in a non-training
    ``VecNormalize`` whose running statistics are deep-copied from
    ``vn_template``.

    Flock statistics are sampled *before* each step. ``DummyVecEnv`` resets
    the wrapped env as soon as an episode ends, so reading the env after the
    terminal step would return the state of the *next* episode.

    Args:
        model: Policy exposing ``predict(obs, deterministic=True)``.
        vn_template: ``VecNormalize`` whose ``obs_rms`` / ``ret_rms`` are copied.
        n_sheep: Sheep count for the evaluation env.
        n_episodes: Number of episodes to run; must be positive.
        max_steps: Episode step limit passed to the env.
        reward_cfg: Optional reward configuration passed to the env.

    Returns:
        Dict with ``sr``, ``mean_len``, ``mean_min_pen``, ``mean_act`` and
        ``failure_modes``; plus ``reward_per_step`` when the env reports
        ``rcomps`` in its info dict.

    Raises:
        ValueError: If ``n_episodes`` is not positive.
    """
    if n_episodes <= 0:
        raise ValueError(f"n_episodes must be positive, got {n_episodes}")

    raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, reward_cfg)])
    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
    vn.obs_rms = deepcopy(vn_template.obs_rms)
    vn.ret_rms = deepcopy(vn_template.ret_rms)

    successes = 0
    ep_lens = []
    min_pen_list = []
    action_mags = []
    failure_counts = {}
    rc_sums = {}
    rc_n = 0

    try:
        inner = vn.envs[0]
        for _ in range(n_episodes):
            obs = vn.reset()
            done = False
            steps = 0
            min_pen = float("inf")
            mags = []
            ep_radii = []
            ep_com_dists = []
            while not done:
                # Sample the current episode's state before stepping (see
                # docstring: the env is auto-reset on the terminal step).
                com, radius, _ = inner._flock_stats()
                pen_dist = float(np.linalg.norm(com - inner.PEN_CENTER))
                min_pen = min(min_pen, pen_dist)
                ep_radii.append(radius)
                ep_com_dists.append(pen_dist)

                action, _ = model.predict(obs, deterministic=True)
                obs, _, dones, infos = vn.step(action)
                done = dones[0]
                mags.append(float(np.linalg.norm(action[0])))
                steps += 1

                rc = infos[0].get("rcomps")
                if rc:
                    for k, v in rc.items():
                        rc_sums[k] = rc_sums.get(k, 0.0) + v
                    rc_n += 1

            # The info dict of the terminal step still describes the episode
            # that just ended.
            n_penned = infos[0].get("n_penned", 0)
            successes += int(n_penned == n_sheep)
            ep_lens.append(steps)
            min_pen_list.append(min_pen)
            action_mags.extend(mags)
            mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep)
            failure_counts[mode] = failure_counts.get(mode, 0) + 1
    finally:
        # Release the env even if the policy or env raised mid-evaluation.
        vn.close()

    result = {
        "sr": successes / n_episodes,
        "mean_len": float(np.mean(ep_lens)),
        "mean_min_pen": float(np.mean(min_pen_list)),
        "mean_act": float(np.mean(action_mags)) if action_mags else 0.0,
        "failure_modes": failure_counts,
    }
    if rc_n > 0:
        result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()}
    return result
# ── CLI ──────────────────────────────────────────────────────────────────────
# Baseline configuration. main() copies this dict, overlays any JSON config
# on top, forwards the keys that exist as attributes on HerdingEnv as the
# env's reward_cfg, and reads "ent_coef" for the PPO entropy coefficient.
DEFAULT_CONFIG = {
    "W_PER_SHEEP": 2.0,
    "W_ALIGN": 0.05,
    "W_PEN_BONUS": 10.0,
    "W_COMPLETE": 100.0,
    "W_STEP_COST": 0.02,
    "W_SOUTH": 0.01,
    "W_COMPACT": 0.0,
    "W_WALL_TOUCH": 0.04,
    "WALL_TOUCH_BUFFER": 0.3,
    "ALIGN_SHAPE": "standoff",
    "ALIGN_GATED": True,
    "ENTRY_AWARE": True,
    "ent_coef": 0.02,
}
def parse_args(argv=None):
    """Parse command-line options for the curriculum training script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is the previous
            behaviour; passing a list allows programmatic use.

    Returns:
        The populated ``argparse.Namespace``.
    """
    p = argparse.ArgumentParser(
        description="PPO training for herding task with curriculum learning")
    p.add_argument("--config", type=str, default=None,
                   help="JSON config file (reward weights + ent_coef)")
    p.add_argument("--max-sheep", type=int, default=10)
    p.add_argument("--steps-per-stage", type=int, default=1_500_000)
    p.add_argument("--n-envs", type=int, default=8)
    p.add_argument("--max-steps", type=int, default=2500)
    p.add_argument("--eval-episodes", type=int, default=30)
    p.add_argument("--run-dir", type=str, default=None)
    p.add_argument("--no-gif", action="store_true",
                   help="Skip per-stage GIF rendering (PNGs still produced).")
    p.add_argument("--gif-fps", type=int, default=20)
    p.add_argument("--gif-skip", type=int, default=3,
                   help="Keep every Nth frame (smaller GIF; default 3).")
    return p.parse_args(argv)
# ── Main ─────────────────────────────────────────────────────────────────────
def main():
    """Train PPO through the 1 → ``--max-sheep`` curriculum and report results.

    For every stage the policy is trained, evaluated, and one episode is
    rendered to trajectory/timeseries plots (plus a GIF unless ``--no-gif``).
    The model, VecNormalize statistics and ``stage_results.json`` are written
    only after the last stage completes; the training env is closed even if
    a stage raises.
    """
    args = parse_args()
    # Load config: --config overrides, else auto-load config.json if present
    cfg = dict(DEFAULT_CONFIG)
    config_path = args.config
    if config_path is None and os.path.exists("config.json"):
        config_path = "config.json"
    if config_path:
        with open(config_path) as f:
            cfg.update(json.load(f))
        print(f"Config loaded from {config_path}")
    # Only keys that exist as attributes on HerdingEnv are passed to the env;
    # everything else (e.g. ent_coef) stays training-side.
    rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}
    # Run directory
    run_dir = args.run_dir or os.path.join(
        "runs", time.strftime("%Y%m%d_%H%M%S"))
    eval_dir = os.path.join(run_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    # Persist the resolved config next to the run outputs.
    with open(os.path.join(run_dir, "config.json"), "w") as f:
        json.dump(cfg, f, indent=2)
    print(f"Config: {cfg}")
    print(f"Run dir: {run_dir}")
    print(f"Curriculum: 1 → {args.max_sheep} sheep, "
          f"{args.steps_per_stage:,} steps/stage\n")
    # Training envs
    train_env = SubprocVecEnv([
        make_env(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg)
        for i in range(args.n_envs)
    ])
    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True,
                      clip_obs=10.0)
    # Model — force CPU (PPO with MLP runs faster on CPU than GPU; SB3 warns
    # about this otherwise).
    model = PPO(
        "MlpPolicy", vn,
        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
        ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5,
        policy_kwargs=dict(net_arch=[256, 256]),
        device="cpu",
        verbose=0,
    )
    # Curriculum training
    stage_results = []
    t0 = time.time()
    try:
        for n in range(1, args.max_sheep + 1):
            if n == 1:
                print(f"\n[Stage n_sheep=1] training {args.steps_per_stage:,} steps")
                model.learn(
                    total_timesteps=args.steps_per_stage,
                    reset_num_timesteps=True,
                    callback=ProgressCallback("1 sheep", freq=100_000),
                )
            else:
                # Mixed transition: half envs stay at n-1, half advance to n,
                # for the first half of the stage budget. This prevents the
                # n+1 task's noisy early gradients from destroying the n policy
                # (catastrophic forgetting) before it has a chance to adapt.
                half = max(1, args.n_envs // 2)
                for i in range(half):
                    vn.env_method("set_n_sheep", n - 1, indices=[i])
                for i in range(half, args.n_envs):
                    vn.env_method("set_n_sheep", n, indices=[i])
                # Split the stage budget between the mixed and the full phase.
                mix_steps = args.steps_per_stage // 2
                full_steps = args.steps_per_stage - mix_steps
                print(f"\n[Stage n_sheep={n}] mixed ({n-1}/{n} sheep) "
                      f"{mix_steps:,} steps")
                model.learn(
                    total_timesteps=mix_steps,
                    reset_num_timesteps=False,
                    callback=ProgressCallback(f"{n-1}{n} mix", freq=100_000),
                )
                # Second phase: every env runs with n sheep.
                vn.env_method("set_n_sheep", n)
                print(f"[Stage n_sheep={n}] full ({n} sheep) {full_steps:,} steps")
                model.learn(
                    total_timesteps=full_steps,
                    reset_num_timesteps=False,
                    callback=ProgressCallback(f"{n} sheep", freq=100_000),
                )
            # Evaluate
            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
            r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% "
                  f"mean_len={r['mean_len']:.0f} "
                  f"mean_min_pen={r['mean_min_pen']:.1f}m "
                  f"mean_act={r['mean_act']:.2f}")
            # Failure-mode breakdown
            if r["failure_modes"]:
                # Most frequent mode first.
                modes = " ".join(
                    f"{k}={v}" for k, v in sorted(
                        r["failure_modes"].items(), key=lambda x: -x[1]))
                print(f" failure modes: {modes}")
            # Reward breakdown
            if "reward_per_step" in r:
                rps = r["reward_per_step"]
                print(f" reward/step: " + " ".join(
                    f"{k}={v:+.4f}" for k, v in rps.items()))
            # Episode visualisation: trajectory + timeseries + animated GIF
            hist = run_and_record(model, vn, n, args.max_steps, rcfg,
                                  seed=1000 + n)
            tag = "success" if hist["success"] else "fail"
            plot_trajectory(
                hist,
                os.path.join(eval_dir, f"traj_{n}s_{tag}.png"))
            plot_timeseries(
                hist,
                os.path.join(eval_dir, f"ts_{n}s_{tag}.png"))
            if not args.no_gif:
                save_episode_gif(
                    hist,
                    os.path.join(eval_dir, f"ep_{n}s_{tag}.gif"),
                    fps=args.gif_fps, skip=args.gif_skip)
            r["n_sheep"] = n
            stage_results.append(r)
        # Save artefacts
        model.save(os.path.join(run_dir, "final_model"))
        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
        with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
            json.dump(stage_results, f, indent=2)
    finally:
        # Best-effort shutdown of the worker processes; errors while closing
        # are deliberately ignored so they do not mask a training failure.
        try:
            vn.close()
        except Exception:
            pass
    # Summary
    elapsed = (time.time() - t0) / 60
    print("\n" + "=" * 70)
    print(" TRAINING SUMMARY")
    print("=" * 70)
    for r in stage_results:
        print(f" n_sheep={r['n_sheep']} sr={r['sr']*100:>3.0f}% "
              f"len={r['mean_len']:>5.0f} min_pen={r['mean_min_pen']:>5.1f}m "
              f"act={r['mean_act']:.2f}")
    print(f"\n Total time: {elapsed:.1f} min")
    print(f" Artefacts: {run_dir}/")
    plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png"))
    print(f" Plots: {run_dir}/success_rate.png, {eval_dir}/")
if __name__ == "__main__":
main()
-412
View File
@@ -1,412 +0,0 @@
"""
PPO training with attention-based policy (train_at.py).
Key difference from train.py
-----------------------------
- Observation exposes ALL sheep as individual per-sheep tokens rather than
only the top-3 farthest. The policy therefore has complete flock visibility
at any sheep count — no hidden sheep even at n=10.
- A TransformerFeaturesExtractor processes the sheep tokens with multi-head
self-attention (permutation-invariant), then mean-pools over valid tokens
and concatenates the result with global dog/pen features.
- Curriculum transition uses the same mixed-env approach as train.py: half
the envs stay at n-1 for the first half of each new stage to suppress
catastrophic forgetting.
Observation layout (7 + MAX_SHEEP*6 = 67 dims, fixed)
-------------------------------------------------------
Global (7):
dog_x / FIELD, dog_y / FIELD,
cos(heading), sin(heading),
(pen_x - dog_x) / D, (pen_y - dog_y) / D,
n_active / n_sheep
Per sheep i (6):
(sheep_x - dog_x) / D, (sheep_y - dog_y) / D, ← pos rel to dog
(pen_x - sheep_x) / D, (pen_y - sheep_y) / D, ← sheep-to-pen
is_active 1.0 if not penned, else 0.0
is_valid 1.0 if i < n_sheep, else 0.0 (padding sentinel)
After VecNormalize, is_valid for real sheep normalises > 0 and for
padding tokens < 0 (because mean ∈ (0,1)), so a threshold of 0 cleanly
separates real from padded without any extra bookkeeping.
Usage
-----
python train_at.py # defaults from config.json
python train_at.py --max-sheep 10 --steps-per-stage 2000000
python train_at.py --embed-dim 128 --n-heads 4 --n-layers 3
"""
import argparse
import json
import os
import time
from copy import deepcopy
import numpy as np
import torch
import torch.nn as nn
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize
from herding_env import HerdingEnv
from train import ProgressCallback, _classify, COMPACT_RADIUS, DEFAULT_CONFIG
from viz import (
run_and_record, plot_trajectory, plot_timeseries,
plot_success_rate, save_episode_gif,
)
# ── Per-sheep token observation environment ───────────────────────────────────
class HerdingEnvAt(HerdingEnv):
    """``HerdingEnv`` variant exposing one observation token per sheep.

    Only the observation space and ``_obs`` are overridden; everything else
    is inherited from ``HerdingEnv``. The flat observation is
    ``OBS_GLOBAL`` global features followed by ``MAX_SHEEP`` tokens of
    ``OBS_SHEEP`` values each; slots beyond ``n_sheep`` stay all-zero.
    """

    OBS_GLOBAL = 7
    OBS_SHEEP = 6

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        flat_dim = self.OBS_GLOBAL + self.MAX_SHEEP * self.OBS_SHEEP
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(flat_dim,), dtype=np.float32
        )

    def _obs(self) -> np.ndarray:
        """Build the flat float32 observation (global features + tokens)."""
        field = self.FIELD
        span = 2.0 * self.FIELD
        target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER
        dog_x = self.dog_pos[0]
        dog_y = self.dog_pos[1]

        # Number of real sheep that are not penned yet.
        unpenned = ~self.penned[:self.n_sheep]
        n_active = int(unpenned.sum())

        head = np.array([
            dog_x / field,
            dog_y / field,
            float(np.cos(self.dog_heading)),
            float(np.sin(self.dog_heading)),
            (target[0] - dog_x) / span,
            (target[1] - dog_y) / span,
            n_active / max(self.n_sheep, 1),
        ], dtype=np.float32)

        tokens = np.zeros((self.MAX_SHEEP, self.OBS_SHEEP), dtype=np.float32)
        for idx in range(self.n_sheep):
            sx, sy = self.sheep_pos[idx][0], self.sheep_pos[idx][1]
            is_active = 0.0 if self.penned[idx] else 1.0
            tokens[idx] = [
                (sx - dog_x) / span,       # sheep position relative to dog
                (sy - dog_y) / span,
                (target[0] - sx) / span,   # sheep-to-pen offset
                (target[1] - sy) / span,
                is_active,
                1.0,  # is_valid: this sheep exists
            ]
        # Rows idx >= n_sheep remain zero, i.e. is_valid == 0 (padding).
        return np.concatenate([head, tokens.reshape(-1)])
# ── Attention features extractor ──────────────────────────────────────────────
class ShepherdAttentionExtractor(BaseFeaturesExtractor):
    """Self-attention over per-sheep tokens, pooled and joined with globals.

    The flat observation is split into ``GLOBAL_DIM`` global features and
    ``MAX_SHEEP`` tokens of ``SHEEP_DIM`` values. Tokens are embedded, passed
    through a transformer encoder with padding tokens masked out, mean-pooled
    over valid tokens and concatenated with the global features, giving
    ``GLOBAL_DIM + embed_dim`` output features.

    A token is treated as valid when its ``is_valid`` entry is > 0. The
    file's stated assumption is that, after VecNormalize, real sheep
    normalise to > 0 and padding to <= 0.
    """

    GLOBAL_DIM = HerdingEnvAt.OBS_GLOBAL  # 7
    SHEEP_DIM = HerdingEnvAt.OBS_SHEEP  # 6
    MAX_SHEEP = HerdingEnv.MAX_SHEEP  # 10
    VALID_IDX = 5  # index of is_valid within each token

    def __init__(self, observation_space, embed_dim: int = 64,
                 n_heads: int = 4, n_layers: int = 2, ff_dim: int = 128):
        super().__init__(observation_space,
                         features_dim=self.GLOBAL_DIM + embed_dim)
        self.sheep_embed = nn.Linear(self.SHEEP_DIM, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=n_heads, dim_feedforward=ff_dim,
            dropout=0.0, batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer,
                                                 num_layers=n_layers,
                                                 enable_nested_tensor=False)

    def forward(self, obs: torch.Tensor) -> torch.Tensor:
        """Map a ``(B, obs_dim)`` batch to ``(B, GLOBAL_DIM + embed_dim)``."""
        B = obs.shape[0]
        global_feats = obs[:, :self.GLOBAL_DIM]
        # reshape (not view): the slice above is not contiguous in memory.
        tokens = obs[:, self.GLOBAL_DIM:].reshape(
            B, self.MAX_SHEEP, self.SHEEP_DIM)

        valid = tokens[:, :, self.VALID_IDX] > 0.0          # (B, MAX_SHEEP)
        key_padding_mask = ~valid                           # True → ignore
        # If every token of a row is masked, attention's softmax yields NaN.
        # Un-mask slot 0 for such rows; its output is still excluded from
        # the pooled mean below because valid_w is zero there.
        key_padding_mask[:, 0] = key_padding_mask[:, 0] & valid.any(dim=1)

        x = self.sheep_embed(tokens)                        # (B, MAX_SHEEP, E)
        x = self.transformer(x, src_key_padding_mask=key_padding_mask)

        valid_w = valid.float().unsqueeze(-1)               # (B, MAX_SHEEP, 1)
        # clamp avoids 0/0 when a row has no valid token (pooled → zeros).
        pooled = (x * valid_w).sum(1) / valid_w.sum(1).clamp(min=1.0)
        return torch.cat([global_feats, pooled], dim=1)
# ── Environment factory ───────────────────────────────────────────────────────
def make_env_at(n_sheep, seed, max_steps, reward_cfg=None):
    """Return a zero-argument factory that builds one seeded ``HerdingEnvAt``.

    Construction is deferred until the vec-env calls the returned factory.
    """
    def _build():
        token_env = HerdingEnvAt(
            n_sheep=n_sheep,
            max_steps=max_steps,
            reward_cfg=reward_cfg,
        )
        # Reset once with the requested seed before handing the env over.
        token_env.reset(seed=seed)
        return token_env

    return _build
# ── Evaluation ────────────────────────────────────────────────────────────────
def evaluate_at(model, vn_template, n_sheep, n_episodes, max_steps,
                reward_cfg=None):
    """Evaluate the attention policy deterministically at a fixed sheep count.

    A fresh single-env ``DummyVecEnv`` of ``HerdingEnvAt`` is wrapped in a
    non-training ``VecNormalize`` whose running statistics are deep-copied
    from ``vn_template``.

    Flock statistics are sampled *before* each step. ``DummyVecEnv`` resets
    the wrapped env as soon as an episode ends, so reading the env after the
    terminal step would return the state of the *next* episode.

    Args:
        model: Policy exposing ``predict(obs, deterministic=True)``.
        vn_template: ``VecNormalize`` whose ``obs_rms`` / ``ret_rms`` are copied.
        n_sheep: Sheep count for the evaluation env.
        n_episodes: Number of episodes to run; must be positive.
        max_steps: Episode step limit passed to the env.
        reward_cfg: Optional reward configuration passed to the env.

    Returns:
        Dict with ``sr``, ``mean_len``, ``mean_min_pen``, ``mean_act`` and
        ``failure_modes``; plus ``reward_per_step`` when the env reports
        ``rcomps`` in its info dict.

    Raises:
        ValueError: If ``n_episodes`` is not positive.
    """
    if n_episodes <= 0:
        raise ValueError(f"n_episodes must be positive, got {n_episodes}")

    raw = DummyVecEnv([make_env_at(n_sheep, 9999, max_steps, reward_cfg)])
    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
    vn.obs_rms = deepcopy(vn_template.obs_rms)
    vn.ret_rms = deepcopy(vn_template.ret_rms)

    successes = 0
    ep_lens, min_pen_list, action_mags = [], [], []
    failure_counts, rc_sums = {}, {}
    rc_n = 0

    try:
        inner = vn.envs[0]
        for _ in range(n_episodes):
            obs = vn.reset()
            done = False
            steps, min_pen = 0, float("inf")
            mags, ep_radii, ep_com_dists = [], [], []
            while not done:
                # Sample the current episode's state before stepping (see
                # docstring: the env is auto-reset on the terminal step).
                com, radius, _ = inner._flock_stats()
                pen_dist = float(np.linalg.norm(com - inner.PEN_CENTER))
                min_pen = min(min_pen, pen_dist)
                ep_radii.append(radius)
                ep_com_dists.append(pen_dist)

                action, _ = model.predict(obs, deterministic=True)
                obs, _, dones, infos = vn.step(action)
                done = dones[0]
                mags.append(float(np.linalg.norm(action[0])))
                steps += 1

                rc = infos[0].get("rcomps")
                if rc:
                    for k, v in rc.items():
                        rc_sums[k] = rc_sums.get(k, 0.0) + v
                    rc_n += 1

            # The info dict of the terminal step still describes the episode
            # that just ended.
            n_penned = infos[0].get("n_penned", 0)
            successes += int(n_penned == n_sheep)
            ep_lens.append(steps)
            min_pen_list.append(min_pen)
            action_mags.extend(mags)
            mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep)
            failure_counts[mode] = failure_counts.get(mode, 0) + 1
    finally:
        # Release the env even if the policy or env raised mid-evaluation.
        vn.close()

    result = {
        "sr": successes / n_episodes,
        "mean_len": float(np.mean(ep_lens)),
        "mean_min_pen": float(np.mean(min_pen_list)),
        "mean_act": float(np.mean(action_mags)) if action_mags else 0.0,
        "failure_modes": failure_counts,
    }
    if rc_n > 0:
        result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()}
    return result
# ── CLI ───────────────────────────────────────────────────────────────────────
def parse_args(argv=None):
    """Parse command-line options for the attention-policy training script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is the previous
            behaviour; passing a list allows programmatic use.

    Returns:
        The populated ``argparse.Namespace``.
    """
    p = argparse.ArgumentParser(
        description="PPO + attention training for herding task")
    p.add_argument("--config", type=str, default=None)
    p.add_argument("--max-sheep", type=int, default=10)
    p.add_argument("--steps-per-stage", type=int, default=1_500_000)
    p.add_argument("--n-envs", type=int, default=8)
    p.add_argument("--max-steps", type=int, default=2500)
    p.add_argument("--eval-episodes", type=int, default=30)
    p.add_argument("--run-dir", type=str, default=None)
    p.add_argument("--no-gif", action="store_true")
    p.add_argument("--gif-fps", type=int, default=20)
    p.add_argument("--gif-skip", type=int, default=3)
    # Attention architecture
    p.add_argument("--embed-dim", type=int, default=64,
                   help="Transformer embedding dimension (default 64)")
    p.add_argument("--n-heads", type=int, default=4,
                   help="Number of attention heads (default 4)")
    p.add_argument("--n-layers", type=int, default=2,
                   help="Number of transformer encoder layers (default 2)")
    p.add_argument("--ff-dim", type=int, default=128,
                   help="Transformer feed-forward dim (default 128)")
    return p.parse_args(argv)
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Train the attention policy through the 1 → ``--max-sheep`` curriculum.

    For every stage the policy is trained, evaluated with ``evaluate_at``,
    and one episode is rendered to plots (plus a GIF unless ``--no-gif``).
    The model, VecNormalize statistics and ``stage_results.json`` are written
    only after the last stage completes; the training env is closed even if
    a stage raises.
    """
    args = parse_args()
    # Start from the defaults, then overlay --config or an auto-detected
    # config.json in the working directory.
    cfg = dict(DEFAULT_CONFIG)
    config_path = args.config
    if config_path is None and os.path.exists("config.json"):
        config_path = "config.json"
    if config_path:
        with open(config_path) as f:
            cfg.update(json.load(f))
        print(f"Config loaded from {config_path}")
    # Only keys that exist as attributes on HerdingEnv are passed to the env.
    rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}
    # Run directories are prefixed with "at_" under runs/.
    run_dir = args.run_dir or os.path.join(
        "runs", "at_" + time.strftime("%Y%m%d_%H%M%S"))
    eval_dir = os.path.join(run_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    # Persist the resolved config next to the run outputs.
    with open(os.path.join(run_dir, "config.json"), "w") as f:
        json.dump(cfg, f, indent=2)
    print(f"Config: {cfg}")
    print(f"Run dir: {run_dir}")
    print(f"Curriculum: 1 → {args.max_sheep} sheep, "
          f"{args.steps_per_stage:,} steps/stage")
    print(f"Transformer: embed={args.embed_dim} heads={args.n_heads} "
          f"layers={args.n_layers} ff={args.ff_dim}\n")
    # All training envs start with a single sheep.
    train_env = SubprocVecEnv([
        make_env_at(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg)
        for i in range(args.n_envs)
    ])
    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
    # MlpPolicy with the attention extractor plugged in as features extractor.
    model = PPO(
        "MlpPolicy", vn,
        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
        ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5,
        policy_kwargs=dict(
            features_extractor_class=ShepherdAttentionExtractor,
            features_extractor_kwargs=dict(
                embed_dim=args.embed_dim,
                n_heads=args.n_heads,
                n_layers=args.n_layers,
                ff_dim=args.ff_dim,
            ),
            net_arch=[256, 256],
        ),
        device="cpu",
        verbose=0,
    )
    stage_results = []
    t0 = time.time()
    try:
        for n in range(1, args.max_sheep + 1):
            if n == 1:
                print(f"\n[Stage n_sheep=1] training {args.steps_per_stage:,} steps")
                model.learn(
                    total_timesteps=args.steps_per_stage,
                    reset_num_timesteps=True,
                    callback=ProgressCallback("1 sheep", freq=100_000),
                )
            else:
                # Mixed phase: the first `half` envs stay at n-1 sheep, the
                # rest move to n, for the first half of the stage budget.
                half = max(1, args.n_envs // 2)
                mix_steps = args.steps_per_stage // 2
                full_steps = args.steps_per_stage - mix_steps
                for i in range(half):
                    vn.env_method("set_n_sheep", n - 1, indices=[i])
                for i in range(half, args.n_envs):
                    vn.env_method("set_n_sheep", n, indices=[i])
                print(f"\n[Stage n_sheep={n}] mixed ({n-1}/{n} sheep) "
                      f"{mix_steps:,} steps")
                model.learn(
                    total_timesteps=mix_steps,
                    reset_num_timesteps=False,
                    callback=ProgressCallback(f"{n-1}{n} mix", freq=100_000),
                )
                # Full phase: every env runs with n sheep.
                vn.env_method("set_n_sheep", n)
                print(f"[Stage n_sheep={n}] full ({n} sheep) {full_steps:,} steps")
                model.learn(
                    total_timesteps=full_steps,
                    reset_num_timesteps=False,
                    callback=ProgressCallback(f"{n} sheep", freq=100_000),
                )
            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
            r = evaluate_at(model, vn, n, args.eval_episodes,
                            args.max_steps, rcfg)
            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% "
                  f"mean_len={r['mean_len']:.0f} "
                  f"mean_min_pen={r['mean_min_pen']:.1f}m "
                  f"mean_act={r['mean_act']:.2f}")
            if r["failure_modes"]:
                # Most frequent mode first.
                modes = " ".join(
                    f"{k}={v}" for k, v in sorted(
                        r["failure_modes"].items(), key=lambda x: -x[1]))
                print(f" failure modes: {modes}")
            if "reward_per_step" in r:
                rps = r["reward_per_step"]
                print(" reward/step: " + " ".join(
                    f"{k}={v:+.4f}" for k, v in rps.items()))
            # Record one episode using the token-observation env factory.
            hist = run_and_record(
                model, vn, n, args.max_steps, rcfg,
                seed=1000 + n, make_env_fn=make_env_at,
            )
            tag = "success" if hist["success"] else "fail"
            plot_trajectory(hist, os.path.join(eval_dir, f"traj_{n}s_{tag}.png"))
            plot_timeseries(hist, os.path.join(eval_dir, f"ts_{n}s_{tag}.png"))
            if not args.no_gif:
                save_episode_gif(
                    hist,
                    os.path.join(eval_dir, f"ep_{n}s_{tag}.gif"),
                    fps=args.gif_fps, skip=args.gif_skip)
            r["n_sheep"] = n
            stage_results.append(r)
        # Artefacts are written once, after the final stage.
        model.save(os.path.join(run_dir, "final_model"))
        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
        with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
            json.dump(stage_results, f, indent=2)
    finally:
        # Best-effort shutdown of the worker processes; errors while closing
        # are deliberately ignored so they do not mask a training failure.
        try:
            vn.close()
        except Exception:
            pass
    elapsed = (time.time() - t0) / 60
    print("\n" + "=" * 70)
    print(" TRAINING SUMMARY (attention policy)")
    print("=" * 70)
    for r in stage_results:
        print(f" n_sheep={r['n_sheep']} sr={r['sr']*100:>3.0f}% "
              f"len={r['mean_len']:>5.0f} "
              f"min_pen={r['mean_min_pen']:>5.1f}m "
              f"act={r['mean_act']:.2f}")
    print(f"\n Total time: {elapsed:.1f} min")
    print(f" Artefacts: {run_dir}/")
    plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png"))
    print(f" Plots: {run_dir}/success_rate.png, {eval_dir}/")
if __name__ == "__main__":
main()
+267
View File
@@ -0,0 +1,267 @@
"""Train a PPO shepherd-dog policy on ``HerdingEnv`` with curriculum.
Defaults to 16 parallel ``SubprocVecEnv`` workers feeding a CPU policy (pass ``--device cuda`` to train on a GPU).
Saves checkpoints, the best-eval model, and the VecNormalize stats —
all three are needed at inference time by the Webots controller.
Usage::
python -m training.train_ppo \
--config training/configs/ppo_default.yaml \
--out-dir training/runs/baseline
To resume from a checkpoint::
python -m training.train_ppo --resume training/runs/baseline/checkpoints/ppo_500000_steps.zip
"""
from __future__ import annotations
import argparse
import os
import sys
from pathlib import Path
import yaml
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
import numpy as np
import torch as th
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import (
BaseCallback, CheckpointCallback, EvalCallback,
)
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import (
DummyVecEnv, SubprocVecEnv, VecNormalize,
)
from training.herding_env import HerdingEnv
# --------------------------------------------------------------------------
# Env factories
# --------------------------------------------------------------------------
def _make_env(rank: int, seed: int = 0):
    """Return a factory that builds a ``Monitor``-wrapped ``HerdingEnv``.

    Each worker is seeded with ``seed + rank``. The monitor additionally
    records the ``is_success``, ``n_sheep`` and ``n_penned`` info keys.
    """
    tracked_keys = ("is_success", "n_sheep", "n_penned")

    def _thunk():
        return Monitor(HerdingEnv(seed=seed + rank), info_keywords=tracked_keys)

    return _thunk
# --------------------------------------------------------------------------
# Curriculum callback
# --------------------------------------------------------------------------
class CurriculumCallback(BaseCallback):
    """Drive the env's flock-size + state-space difficulty curriculum.

    Schedule entries are dicts ``{step, max_n_sheep, difficulty}``
    (``difficulty`` defaults to 1.0). The entry with the largest ``step``
    that is <= ``num_timesteps`` is active; before the first entry's step is
    reached, the first entry is used. Changes are pushed to every env in
    ``vec_envs`` via ``env_method``.

    Raises:
        ValueError: If ``schedule`` is empty.
    """

    def __init__(self, schedule, vec_envs, verbose: int = 0):
        super().__init__(verbose)
        self.schedule = sorted(schedule, key=lambda d: d["step"])
        if not self.schedule:
            # Fail at construction instead of with an IndexError mid-training.
            raise ValueError("curriculum schedule must contain at least one entry")
        # Accept a list of envs so the eval env tracks training difficulty.
        self.vec_envs = vec_envs if isinstance(vec_envs, (list, tuple)) else [vec_envs]
        self._last_n = None
        self._last_d = None

    def _call(self, method, value):
        """Invoke ``method(value)`` on every sub-env of every tracked vec env."""
        for v in self.vec_envs:
            try:
                v.env_method(method, value)
            except AttributeError:
                # Fall back to the wrapped vec env if the wrapper lacks env_method.
                v.venv.env_method(method, value)

    def _on_step(self) -> bool:
        t = self.num_timesteps
        active = self.schedule[0]
        for entry in self.schedule:
            if t < entry["step"]:
                break  # schedule is sorted: no later entry can apply
            active = entry
        n = active["max_n_sheep"]
        d = active.get("difficulty", 1.0)

        changed = False
        if n != self._last_n:
            self._call("set_max_n_sheep", n)
            self._last_n = n
            changed = True
        if d != self._last_d:
            self._call("set_difficulty", d)
            self._last_d = d
            changed = True
        # Log whenever either knob moved, not only on difficulty changes.
        if changed and self.verbose:
            print(f"[curriculum] t={t} → max_n_sheep={n} difficulty={d}")
        return True
# --------------------------------------------------------------------------
# Main
# --------------------------------------------------------------------------
def main():
    """Train (or resume) a PPO policy on ``HerdingEnv`` and save the results.

    Reads hyper-parameters from the YAML config, builds the training and
    evaluation vec envs (optionally wrapped in ``VecNormalize``), trains with
    checkpoint / eval / curriculum callbacks, then writes ``final.zip`` and,
    when normalisation is enabled, the VecNormalize statistics.

    ``--learning-rate`` and ``--log-std`` apply both when resuming and when
    training from scratch (for a fresh model ``--log-std`` is used as
    ``log_std_init``). Both vec envs are closed on exit, including on error.
    """
    import math  # local import: only needed to report std = exp(log_std)

    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default=os.path.join(_HERE, "configs", "ppo_default.yaml"))
    parser.add_argument("--out-dir", default=os.path.join(_HERE, "runs", "latest"))
    parser.add_argument("--n-envs", type=int, default=None,
                        help="Override config n_envs.")
    parser.add_argument("--total-timesteps", type=int, default=None,
                        help="Override config total_timesteps.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--resume", type=str, default=None,
                        help="Path to a SB3 zip to resume from.")
    # SB3 recommends CPU for MlpPolicy — GPU helps CNN policies, not MLPs
    # of this size. Override with --device cuda if you really want it.
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--no-vecnorm", action="store_true",
                        help="Disable VecNormalize wrapper. Required when "
                             "resuming from a BC-pretrained policy that "
                             "wasn't trained under it.")
    parser.add_argument("--no-curriculum", action="store_true",
                        help="Skip curriculum callback (resumed policy is "
                             "already competent across the distribution).")
    parser.add_argument("--imitate-weight", type=float, default=None,
                        help="Override env W_IMITATE. Set to 0 to disable "
                             "Strömbom imitation reward.")
    parser.add_argument("--difficulty", type=float, default=None,
                        help="Override env difficulty (0=easy, 1=hard). "
                             "Used in BC fine-tune to skip easy curriculum.")
    parser.add_argument("--log-std", type=float, default=None,
                        help="Override the policy's log_std after load. "
                             "BC trained with std≈1.6 (log_std=0.5) which "
                             "is too noisy for fine-tune. Use -1.5 (std≈0.22) "
                             "to keep PPO close to the BC mean while still "
                             "exploring locally.")
    parser.add_argument("--learning-rate", type=float, default=None,
                        help="Override config learning rate. For BC "
                             "fine-tune, 5e-5 is much safer than the 3e-4 "
                             "default.")
    args = parser.parse_args()

    with open(args.config) as f:
        cfg = yaml.safe_load(f)
    n_envs = args.n_envs or cfg["n_envs"]
    total_timesteps = args.total_timesteps or cfg["total_timesteps"]

    out = Path(args.out_dir)
    out.mkdir(parents=True, exist_ok=True)
    (out / "checkpoints").mkdir(exist_ok=True)
    (out / "best").mkdir(exist_ok=True)
    (out / "evals").mkdir(exist_ok=True)
    print(f"[train] out={out} n_envs={n_envs} total={total_timesteps} device={args.device}")

    # --- Train env (vectorised, optionally normalised) ---
    env_fns = [_make_env(i, seed=args.seed) for i in range(n_envs)]
    venv = SubprocVecEnv(env_fns) if n_envs > 1 else DummyVecEnv(env_fns)
    eval_venv = DummyVecEnv([_make_env(99, seed=args.seed + 999)])

    try:
        if not args.no_vecnorm:
            venv = VecNormalize(venv, norm_obs=True, norm_reward=False, clip_obs=10.0)
            eval_venv = VecNormalize(eval_venv, norm_obs=True, norm_reward=False,
                                     clip_obs=10.0, training=False)
            # Share (not copy) the running stats so evaluation always uses the
            # training env's current normalisation.
            eval_venv.obs_rms = venv.obs_rms
            # NOTE(review): when --resume is used, these statistics start
            # fresh; no saved vecnormalize.pkl is loaded here. Confirm this
            # is intended.
        else:
            print("[train] VecNormalize disabled (resumed policy was trained without it).")

        # Apply env-level overrides (used by BC fine-tune to disable Strömbom
        # imitation and start at full deployment difficulty).
        def _env_call(method, value):
            for v in (venv, eval_venv):
                try:
                    v.env_method(method, value)
                except AttributeError:
                    v.venv.env_method(method, value)

        if args.imitate_weight is not None:
            _env_call("set_imitate_weight", args.imitate_weight)
            print(f"[train] W_IMITATE overridden to {args.imitate_weight}")
        if args.difficulty is not None:
            _env_call("set_difficulty", args.difficulty)
            print(f"[train] difficulty pinned to {args.difficulty}")

        # --- Model ---
        if args.resume:
            print(f"[train] resuming from {args.resume}")
            custom_objects = {}
            if args.learning_rate is not None:
                custom_objects["learning_rate"] = args.learning_rate
            model = PPO.load(args.resume, env=venv, device=args.device,
                             tensorboard_log=str(out / "tb"),
                             custom_objects=custom_objects or None)
            if args.log_std is not None:
                # Overwrite the learned log_std in place, outside autograd.
                with th.no_grad():
                    model.policy.log_std.fill_(args.log_std)
        else:
            # Honour the CLI overrides for a fresh model too; previously they
            # were silently ignored unless --resume was given.
            learning_rate = (args.learning_rate
                             if args.learning_rate is not None
                             else cfg["learning_rate"])
            log_std_init = (args.log_std if args.log_std is not None
                            else cfg.get("log_std_init", 0.0))
            policy_kwargs = dict(
                net_arch=dict(pi=cfg["net_arch_pi"], vf=cfg["net_arch_vf"]),
                log_std_init=log_std_init,
            )
            model = PPO(
                cfg["policy"], venv,
                learning_rate=learning_rate,
                n_steps=cfg["n_steps"],
                batch_size=cfg["batch_size"],
                n_epochs=cfg["n_epochs"],
                gamma=cfg["gamma"],
                gae_lambda=cfg["gae_lambda"],
                clip_range=cfg["clip_range"],
                ent_coef=cfg["ent_coef"],
                vf_coef=cfg["vf_coef"],
                max_grad_norm=cfg["max_grad_norm"],
                target_kl=cfg.get("target_kl"),
                policy_kwargs=policy_kwargs,
                tensorboard_log=str(out / "tb"),
                seed=args.seed,
                device=args.device,
                verbose=1,
            )
        if args.log_std is not None:
            print(f"[train] log_std overridden to {args.log_std} "
                  f"(std≈{math.exp(args.log_std):.2f})")
        if args.learning_rate is not None:
            print(f"[train] learning_rate overridden to {args.learning_rate}")

        # --- Callbacks ---
        # Callback frequencies count vec-env steps, so divide the configured
        # timestep frequencies by the number of parallel envs.
        ckpt_cb = CheckpointCallback(
            save_freq=max(1, cfg["checkpoint_freq"] // n_envs),
            save_path=str(out / "checkpoints"), name_prefix="ppo",
            save_vecnormalize=True,
        )
        eval_cb = EvalCallback(
            eval_venv,
            best_model_save_path=str(out / "best"),
            log_path=str(out / "evals"),
            eval_freq=max(1, cfg["eval_freq"] // n_envs),
            n_eval_episodes=cfg["n_eval_episodes"],
            deterministic=True,
        )
        callbacks = [ckpt_cb, eval_cb]
        if not args.no_curriculum and "curriculum" in cfg and cfg["curriculum"]:
            callbacks.append(CurriculumCallback(
                cfg["curriculum"], [venv, eval_venv], verbose=1,
            ))
        elif args.no_curriculum:
            print("[train] curriculum disabled — env knobs left at their current values.")

        # --- Train ---
        model.learn(total_timesteps=total_timesteps, callback=callbacks,
                    progress_bar=True)

        # --- Save final model + VecNormalize stats ---
        model.save(out / "final.zip")
        # Only a VecNormalize wrapper has statistics to save; with
        # --no-vecnorm the plain vec env has no save() and this used to crash
        # after the whole training run.
        if isinstance(venv, VecNormalize):
            venv.save(str(out / "vecnormalize.pkl"))
            # The EvalCallback already wrote best_model.zip into out/best/ —
            # drop the VecNormalize stats next to it for the controller to
            # pick up.
            venv.save(str(out / "best" / "vecnormalize.pkl"))
        print(f"[train] done. saved to {out}")
    finally:
        # Always shut the environments (and SubprocVecEnv workers) down.
        try:
            venv.close()
        finally:
            eval_venv.close()
if __name__ == "__main__":
main()
-342
View File
@@ -1,342 +0,0 @@
"""
All visualization for the herding policy: trajectory plots, timeseries plots,
success-rate bar chart, and animated GIFs.
Used both by train.py (auto-rendered after each curriculum stage) and as a CLI
to render a fresh episode against a saved model.
CLI usage:
python viz.py --run-dir runs/v1 --n-sheep 5
python viz.py --run-dir runs/v1 --n-sheep 10 --no-gif
python viz.py --model runs/v1/final_model.zip --vecnorm runs/v1/vecnorm.pkl \\
--n-sheep 3 --out-dir vis_v1_3sheep
"""
import argparse
import os
import json
from copy import deepcopy
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.animation as animation
from matplotlib.collections import LineCollection
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from herding_env import HerdingEnv
# ── Palette ──────────────────────────────────────────────────────────────────
# Per-sheep colours for paths and markers. The plotting helpers index this
# list modulo its length, so flocks larger than ten sheep reuse colours.
SHEEP_COLORS = [
"#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00",
"#a65628", "#f781bf", "#999999", "#66c2a5", "#fc8d62",
]
# Colour used for the dog's path, trail and markers.
DOG_COLOR = "#4e342e"
# ── Common drawing primitives ────────────────────────────────────────────────
def draw_field(ax):
    """Paint the static arena backdrop onto a Matplotlib axes.

    Sets a square view window of -16..16 on both axes with equal aspect and
    a pale-green background, then draws the unfilled 30 x 30 field outline
    anchored at (-15, -15) and a filled 3 x 7 pen rectangle anchored at
    (10, -15), captioned "pen".

    Args:
        ax: Matplotlib axes to draw on; modified in place.
    """
    fence_brown = "#795548"

    ax.set_xlim(-16, 16)
    ax.set_ylim(-16, 16)
    ax.set_aspect("equal")
    ax.set_facecolor("#dcedc8")

    boundary = mpatches.Rectangle(
        (-15, -15), 30, 30, fill=False, edgecolor=fence_brown, lw=2)
    ax.add_patch(boundary)

    pen = mpatches.Rectangle(
        (10, -15), 3, 7, facecolor="#ffe082", edgecolor=fence_brown, lw=2)
    ax.add_patch(pen)

    ax.text(11.5, -11.5, "pen", ha="center", va="center",
            fontsize=8, color=fence_brown)
def faded_path(ax, xs, ys, color, lw=1.5, label=None):
    """Draw a polyline whose opacity ramps up along its length.

    The path is split into one segment per consecutive pair of points; the
    first segment is drawn at alpha 0.15 and the last at alpha 1.0. Paths
    with fewer than two points draw nothing (and add no legend entry).

    Args:
        ax: Matplotlib axes to draw on.
        xs: Sequence of x coordinates.
        ys: Sequence of y coordinates, paired element-wise with ``xs``.
        color: Any Matplotlib colour specification for the path.
        lw: Line width of the segments.
        label: Optional legend label for the path.
    """
    if len(xs) < 2:
        return

    vertices = np.array([xs, ys]).T
    # Shape (n - 1, 2, 2): each entry is a [start, end] pair of points.
    segments = np.stack([vertices[:-1], vertices[1:]], axis=1)

    base_rgb = matplotlib.colors.to_rgb(color)
    rgba = [(*base_rgb, alpha)
            for alpha in np.linspace(0.15, 1.0, len(segments))]
    ax.add_collection(LineCollection(segments, colors=rgba, linewidth=lw))

    if label:
        # An empty proxy line gives the path a legend entry.
        ax.plot([], [], color=color, lw=lw, label=label)
# ── Episode rollout ──────────────────────────────────────────────────────────
def make_eval_env(n_sheep, seed, max_steps, reward_cfg=None):
    """Return a zero-argument factory that builds a seeded ``HerdingEnv``.

    The environment is only constructed when the returned callable is
    invoked, which is the form ``DummyVecEnv`` expects.

    Args:
        n_sheep: Number of sheep passed to ``HerdingEnv``.
        seed: Seed passed to the environment's initial ``reset``.
        max_steps: Episode step limit passed to ``HerdingEnv``.
        reward_cfg: Optional reward configuration passed to ``HerdingEnv``.

    Returns:
        A callable that creates the environment, resets it with ``seed``
        and returns it.
    """
    env_kwargs = dict(n_sheep=n_sheep, max_steps=max_steps,
                      reward_cfg=reward_cfg)

    def _build():
        herding_env = HerdingEnv(**env_kwargs)
        herding_env.reset(seed=seed)
        return herding_env

    return _build
def run_and_record(model, vn_template, n_sheep, max_steps,
                   reward_cfg=None, seed=42, make_env_fn=None):
    """Roll out one deterministic episode and collect its full trajectory.

    A fresh single-environment ``DummyVecEnv`` is wrapped in a non-training
    ``VecNormalize`` whose running statistics are deep-copied from
    ``vn_template``, so the template itself is not modified. The wrapper is
    closed before returning.

    Args:
        model: Policy exposing ``predict(obs, deterministic=True)``.
        vn_template: ``VecNormalize`` supplying ``obs_rms`` / ``ret_rms``.
        n_sheep: Number of sheep in the episode.
        max_steps: Episode step limit handed to the env factory.
        reward_cfg: Optional reward configuration handed to the env factory.
        seed: Seed handed to the env factory.
        make_env_fn: Optional replacement for ``make_eval_env`` with the
            same ``(n_sheep, seed, max_steps, reward_cfg)`` signature.

    Returns:
        dict with per-step lists ``dog_xs``, ``dog_ys``, ``radii``,
        ``action_mags``, ``rewards``; per-sheep lists of per-step values
        ``sheep_xs``, ``sheep_ys``, ``sheep_penned``, ``pen_dists``;
        ``penned_at`` (1-based step at which each sheep was first seen
        penned, or ``None``); and the scalars ``n_penned``, ``n_sheep``,
        ``success`` and ``steps``.
    """
    env_factory = make_env_fn if make_env_fn else make_eval_env
    venv = VecNormalize(
        DummyVecEnv([env_factory(n_sheep, seed, max_steps, reward_cfg)]),
        norm_obs=True, norm_reward=False, training=False,
    )
    venv.obs_rms = deepcopy(vn_template.obs_rms)
    venv.ret_rms = deepcopy(vn_template.ret_rms)

    obs = venv.reset()
    env = venv.envs[0]

    dog_xs, dog_ys = [], []
    radii, action_mags, rewards = [], [], []
    sheep_xs = [[] for _ in range(n_sheep)]
    sheep_ys = [[] for _ in range(n_sheep)]
    sheep_penned = [[] for _ in range(n_sheep)]
    pen_dists = [[] for _ in range(n_sheep)]
    penned_at = [None] * n_sheep

    # NOTE(review): Stable-Baselines3 vectorised envs appear to reset the
    # wrapped env automatically on the terminal step, so the values read from
    # `env` on the last iteration may describe the next episode — confirm.
    steps_taken = 0
    finished = False
    while not finished:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, dones, infos = venv.step(action)
        finished = dones[0]
        steps_taken += 1

        dog_xs.append(float(env.dog_pos[0]))
        dog_ys.append(float(env.dog_pos[1]))
        _, flock_radius, _ = env._flock_stats()
        radii.append(flock_radius)
        rewards.append(float(reward[0]))
        action_mags.append(float(np.linalg.norm(action[0])))

        for idx in range(n_sheep):
            position = env.sheep_pos[idx]
            sheep_xs[idx].append(float(position[0]))
            sheep_ys[idx].append(float(position[1]))
            sheep_penned[idx].append(bool(env.penned[idx]))
            pen_dists[idx].append(
                float(np.linalg.norm(env.sheep_pos[idx] - env.PEN_CENTER)))
            if env.penned[idx] and penned_at[idx] is None:
                penned_at[idx] = steps_taken

    # Final penned count as reported by the environment's last info dict.
    n_penned = infos[0].get("n_penned", 0)
    venv.close()

    return {
        "dog_xs": dog_xs,
        "dog_ys": dog_ys,
        "sheep_xs": sheep_xs,
        "sheep_ys": sheep_ys,
        "sheep_penned": sheep_penned,
        "radii": radii,
        "pen_dists": pen_dists,
        "action_mags": action_mags,
        "rewards": rewards,
        "penned_at": penned_at,
        "n_penned": n_penned,
        "n_sheep": n_sheep,
        "success": n_penned == n_sheep,
        "steps": steps_taken,
    }
# ── Static plots ─────────────────────────────────────────────────────────────
def plot_trajectory(hist, out_path):
    """Save a top-down trajectory plot of one recorded episode.

    Each sheep gets a faded path, a circle at its start and a star at the
    position where it was first recorded as penned (or at its final
    position if it never was). The dog gets a faded path, a square at its
    start and a diamond at its end.

    Args:
        hist: Episode history as returned by ``run_and_record``.
        out_path: Destination image path.
    """
    fig, ax = plt.subplots(figsize=(7, 7))
    draw_field(ax)

    for i in range(hist["n_sheep"]):
        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
        xs, ys = hist["sheep_xs"][i], hist["sheep_ys"][i]
        faded_path(ax, xs, ys, c, lw=1.2, label=f"sheep {i+1}")
        if not xs:
            # Nothing recorded for this sheep; there is no point to mark.
            continue
        ax.plot(xs[0], ys[0], "o", color=c, ms=7, zorder=4)

        penned_step = hist["penned_at"][i]
        if penned_step is None:
            end = -1
        else:
            # `penned_at` holds a 1-based step count while the position
            # lists are 0-based, so the penning frame is at step - 1. The
            # clamp keeps the index valid for a sheep penned on the very
            # last step.
            end = min(max(penned_step - 1, 0), len(xs) - 1)
        ax.plot(xs[end], ys[end], "*", color=c, ms=11, zorder=5)

    faded_path(ax, hist["dog_xs"], hist["dog_ys"], DOG_COLOR, lw=2.0,
               label="dog")
    if hist["dog_xs"]:
        ax.plot(hist["dog_xs"][0], hist["dog_ys"][0], "s", color=DOG_COLOR,
                ms=10, zorder=5)
        ax.plot(hist["dog_xs"][-1], hist["dog_ys"][-1], "D", color=DOG_COLOR,
                ms=10, zorder=5)

    result = ("SUCCESS" if hist["success"]
              else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})")
    ax.set_title(f"n={hist['n_sheep']} {result} {hist['steps']} steps",
                 fontsize=12)
    ax.legend(loc="upper left", fontsize=8)
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
def plot_timeseries(hist, out_path):
    """Save a four-panel time-series figure for one recorded episode.

    Panels, top to bottom: flock radius, per-sheep distance to the pen
    (with a dotted vertical line where each sheep was first recorded as
    penned), dog action magnitude, and per-step reward.

    Args:
        hist: Episode history as returned by ``run_and_record``.
        out_path: Destination image path.
    """
    # Sample k of every per-step list is plotted at x == k (0-based).
    t = np.arange(hist["steps"])
    fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)

    axes[0].plot(t, hist["radii"], color="steelblue")
    axes[0].axhline(5.0, color="orange", ls="--", lw=1, label="compact (5m)")
    axes[0].set_ylabel("flock radius (m)")
    axes[0].legend(fontsize=8)
    axes[0].set_title("Flock radius")

    for i in range(hist["n_sheep"]):
        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
        axes[1].plot(t, hist["pen_dists"][i], color=c, lw=1,
                     label=f"sheep {i+1}")
        penned_step = hist["penned_at"][i]
        if penned_step is not None:
            # `penned_at` is a 1-based step count; subtract one so the line
            # sits on the sample at which the sheep was recorded as penned
            # rather than one sample later.
            axes[1].axvline(max(penned_step - 1, 0), color=c, ls=":", lw=1)
    axes[1].set_ylabel("dist to pen (m)")
    axes[1].legend(fontsize=7, ncol=min(hist["n_sheep"], 5))
    axes[1].set_title("Per-sheep distance to pen")

    axes[2].plot(t, hist["action_mags"], color="tomato", lw=1)
    axes[2].axhline(1.0, color="gray", ls="--", lw=1, label="max")
    axes[2].set_ylabel("action ||(vx,vy)||")
    axes[2].set_ylim(0, 1.5)
    axes[2].set_title("Dog action magnitude")
    axes[2].legend(fontsize=8)

    axes[3].plot(t, hist["rewards"], color="purple", lw=1, alpha=0.7)
    axes[3].axhline(0, color="black", lw=0.5)
    axes[3].set_ylabel("reward")
    axes[3].set_xlabel("step")
    axes[3].set_title("Reward per step")

    result = ("SUCCESS" if hist["success"]
              else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})")
    fig.suptitle(f"n_sheep={hist['n_sheep']} {result} {hist['steps']} steps",
                 fontsize=13)
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
def plot_success_rate(stage_results, out_path):
    """Save a bar chart of evaluation success rate per sheep count.

    Args:
        stage_results: Iterable of mappings, each with an ``"n_sheep"``
            entry and an ``"sr"`` entry (success rate as a fraction, which
            is multiplied by 100 for display).
        out_path: Destination image path.
    """
    fig, ax = plt.subplots(figsize=(8, 4))

    sheep_counts = [entry["n_sheep"] for entry in stage_results]
    percents = [entry["sr"] * 100 for entry in stage_results]

    bars = ax.bar(sheep_counts, percents, color="steelblue",
                  edgecolor="white")
    ax.set_xlabel("Sheep count")
    ax.set_ylabel("Success rate (%)")
    ax.set_ylim(0, 105)
    ax.axhline(90, color="orange", ls="--", lw=1, label="90% target")

    # Write the percentage just above each bar, centred horizontally.
    for rect, pct in zip(bars, percents):
        x_mid = rect.get_x() + rect.get_width() / 2
        y_top = rect.get_height() + 1
        ax.text(x_mid, y_top, f"{pct:.0f}%", ha="center", fontsize=9)

    ax.legend()
    ax.set_title("Evaluation success rate per sheep count")
    plt.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
# ── Animated GIF ─────────────────────────────────────────────────────────────
def save_episode_gif(hist, out_path, fps=20, skip=3):
    """Render a recorded episode as an animated GIF.

    Args:
        hist: Episode history as returned by ``run_and_record``.
        out_path: Destination GIF path.
        fps: Playback frame rate, used for both the animation interval and
            the Pillow writer.
        skip: Keep every ``skip``-th step as a frame (values below 1 are
            treated as 1). The final step is always included.
    """
    n_sheep = hist["n_sheep"]
    total_steps = hist["steps"]
    last_index = total_steps - 1

    frame_ids = list(range(0, total_steps, max(1, skip)))
    if frame_ids[-1] != last_index:
        frame_ids.append(last_index)

    fig, ax = plt.subplots(figsize=(6, 6))
    draw_field(ax)
    title = ax.text(0, 16.5, "", ha="center", fontsize=11)
    dog_marker, = ax.plot([], [], "s", color=DOG_COLOR, ms=12,
                          markeredgecolor="black", markeredgewidth=1.5,
                          zorder=5)

    sheep_markers = []
    for idx in range(n_sheep):
        marker, = ax.plot([], [], "o",
                          color=SHEEP_COLORS[idx % len(SHEEP_COLORS)], ms=10,
                          markeredgecolor="#333", markeredgewidth=1, zorder=4)
        sheep_markers.append(marker)

    dog_trail, = ax.plot([], [], color=DOG_COLOR, lw=1.0, alpha=0.5)

    def update(k):
        # Penned flag of every sheep at frame k.
        penned_now = [hist["sheep_penned"][i][k] for i in range(n_sheep)]
        title.set_text(
            f"n={n_sheep} step {k+1}/{total_steps} "
            f"penned {sum(penned_now)}/{n_sheep}")

        dog_marker.set_data([hist["dog_xs"][k]], [hist["dog_ys"][k]])
        dog_trail.set_data(hist["dog_xs"][:k+1], hist["dog_ys"][:k+1])

        for i, marker in enumerate(sheep_markers):
            marker.set_data([hist["sheep_xs"][i][k]],
                            [hist["sheep_ys"][i][k]])
            if penned_now[i]:
                marker.set_color("deeppink")
            else:
                marker.set_color(SHEEP_COLORS[i % len(SHEEP_COLORS)])
        return [title, dog_marker, dog_trail, *sheep_markers]

    anim = animation.FuncAnimation(
        fig, update, frames=frame_ids, interval=1000 / fps, blit=False)
    anim.save(out_path, writer=animation.PillowWriter(fps=fps), dpi=80)
    plt.close(fig)
# ── CLI ──────────────────────────────────────────────────────────────────────
def _resolve_paths(args):
    """Work out the model, VecNormalize and config paths from CLI arguments.

    When ``args.run_dir`` is set, the three conventional file names inside
    that directory are used; otherwise the explicit ``args.model``,
    ``args.vecnorm`` and ``args.config`` values are returned unchanged.

    Args:
        args: Parsed arguments with ``run_dir``, ``model``, ``vecnorm`` and
            ``config`` attributes.

    Returns:
        Tuple ``(model_path, vn_path, cfg_path)``.
    """
    if not args.run_dir:
        return args.model, args.vecnorm, args.config

    return tuple(
        os.path.join(args.run_dir, filename)
        for filename in ("final_model.zip", "vecnorm.pkl", "config.json")
    )
def main():
    """CLI entry point: roll out a saved policy once and render the results.

    Loads a PPO model and its VecNormalize statistics (from ``--run-dir`` or
    from explicit ``--model`` / ``--vecnorm`` paths), runs one deterministic
    episode, and writes ``trajectory.png``, ``timeseries.png`` and, unless
    ``--no-gif`` is given, ``episode.gif`` into the output directory.
    """
    p = argparse.ArgumentParser(
        description="Render trajectory + timeseries + GIF for a saved policy.")
    p.add_argument("--run-dir", type=str, default=None,
                   help="Run directory containing final_model.zip + vecnorm.pkl + config.json")
    p.add_argument("--model", type=str, default=None)
    p.add_argument("--vecnorm", type=str, default=None)
    p.add_argument("--config", type=str, default=None)
    p.add_argument("--n-sheep", type=int, default=3)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--max-steps", type=int, default=2500)
    p.add_argument("--out-dir", type=str, default=None)
    p.add_argument("--no-gif", action="store_true",
                   help="Skip the animated GIF (PNG-only is faster).")
    p.add_argument("--gif-fps", type=int, default=20)
    p.add_argument("--gif-skip", type=int, default=3)
    args = p.parse_args()

    model_path, vn_path, cfg_path = _resolve_paths(args)
    if not (model_path and vn_path):
        p.error("either --run-dir or both --model and --vecnorm are required")

    # Optional reward configuration: keep only the config keys that match an
    # attribute of HerdingEnv.
    rcfg = None
    if cfg_path and os.path.exists(cfg_path):
        # Explicit UTF-8 so parsing does not depend on the platform locale.
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)
        rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}

    out_dir = args.out_dir or os.path.join(
        os.path.dirname(os.path.abspath(model_path)),
        f"vis_{args.n_sheep}s")
    os.makedirs(out_dir, exist_ok=True)

    print(f"Loading model: {model_path}")
    print(f"Loading vecnorm: {vn_path}")
    model = PPO.load(model_path, device="cpu")
    raw = DummyVecEnv([make_eval_env(args.n_sheep, args.seed, args.max_steps, rcfg)])
    vn = VecNormalize.load(vn_path, raw)

    print(f"Rolling out n_sheep={args.n_sheep} (seed={args.seed})...")
    try:
        hist = run_and_record(model, vn, args.n_sheep, args.max_steps,
                              reward_cfg=rcfg, seed=args.seed)
    finally:
        # This env only serves as a carrier for the loaded normalisation
        # statistics (run_and_record copies them); close it so it is not
        # leaked.
        vn.close()

    result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})"
    print(f" {result} in {hist['steps']} steps")

    plot_trajectory(hist, os.path.join(out_dir, "trajectory.png"))
    plot_timeseries(hist, os.path.join(out_dir, "timeseries.png"))
    print(f" saved trajectory.png + timeseries.png to {out_dir}/")

    if not args.no_gif:
        gif_path = os.path.join(out_dir, "episode.gif")
        print(f" rendering GIF (fps={args.gif_fps}, skip={args.gif_skip})...")
        save_episode_gif(hist, gif_path, fps=args.gif_fps, skip=args.gif_skip)
        print(f" saved {gif_path}")


if __name__ == "__main__":
    main()
+1 -1
View File
@@ -1,5 +1,5 @@
Webots Project File version R2025a Webots Project File version R2025a
perspectives: 000000ff00000000fd00000002000000010000011c00000298fc0200000001fb0000001400540065007800740045006400690074006f00720100000000000002980000003f00ffffff000000030000084300000238fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000008430000006900ffffff000007250000029800000001000000020000000100000008fc00000000 perspectives: 000000ff00000000fd00000002000000010000011c000001bcfc0200000001fb0000001400540065007800740045006400690074006f00720100000000000001bc0000003f00ffffff00000003000005c600000220fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000005c60000006900ffffff000004a8000001bc00000001000000020000000100000008fc00000000
simulationViewPerspectives: 000000ff000000010000000200000100000006250100000002010000000100 simulationViewPerspectives: 000000ff000000010000000200000100000006250100000002010000000100
sceneTreePerspectives: 000000ff00000001000000030000001f000000c0000000000100000002010000000200 sceneTreePerspectives: 000000ff00000001000000030000001f000000c0000000000100000002010000000200
maximizedDockId: -1 maximizedDockId: -1
+69 -63
View File
@@ -10,7 +10,7 @@ EXTERNPROTO "../protos/Sheep.proto"
# World # World
WorldInfo { WorldInfo {
info [ info [
"RL-Based Autonomous Shepherd Robot" "Autonomous Shepherd Robot (Strömbom)"
"Group G25" "Group G25"
] ]
title "Shepherd Herding" title "Shepherd Herding"
@@ -106,19 +106,26 @@ Solid { translation -2.5 -15 0.84 children [ Shape { appearance USE CAP geometry
Solid { translation 14 -15 0.40 children [ Shape { appearance USE STONE_A geometry Box { size 2.0 0.16 0.80 } } ] boundingObject Box { size 2.0 0.16 0.80 } } Solid { translation 14 -15 0.40 children [ Shape { appearance USE STONE_A geometry Box { size 2.0 0.16 0.80 } } ] boundingObject Box { size 2.0 0.16 0.80 } }
Solid { translation 14 -15 0.84 children [ Shape { appearance USE CAP geometry Box { size 2.1 0.26 0.07 } } ] boundingObject Box { size 2.1 0.26 0.07 } } Solid { translation 14 -15 0.84 children [ Shape { appearance USE CAP geometry Box { size 2.1 0.26 0.07 } } ] boundingObject Box { size 2.1 0.26 0.07 } }
# Gate posts # Gate posts
Solid { translation 10 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } Solid { translation 10 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
Solid { translation 13 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } Solid { translation 13 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
# Outer gate (wooden, slightly ajar, Z-brace) # Outer gate — fully open, hinged on the west gate post. Modeled as a swung-back
Solid { translation 11.5 -15.08 0.55 rotation 0 0 1 0.25 children [ # wooden gate parallel to the south wall, on the west side, so the 3m corridor
# between gate posts (x=10..13, y=-15) is unobstructed.
Solid { translation 8.6 -15.05 0.55 rotation 0 0 1 0 children [
Shape { appearance USE WOOD geometry Box { size 2.80 0.05 1.00 } } Shape { appearance USE WOOD geometry Box { size 2.80 0.05 1.00 } }
Transform { translation 0 0.02 0 rotation 0 1 0 0.34 children [ Shape { appearance DEF FPOST PBRAppearance { baseColor 0.35 0.22 0.10 roughness 0.90 } geometry Box { size 2.97 0.04 0.06 } } ] } # FPOST appearance DEF lives here so the external pen below can USE it.
Transform { translation 0 0.02 0 rotation 0 1 0 0.34 children [
Shape { appearance DEF FPOST PBRAppearance { baseColor 0.35 0.22 0.10 roughness 0.90 } geometry Box { size 2.97 0.04 0.06 } }
] }
] boundingObject Box { size 2.80 0.08 1.00 } } ] boundingObject Box { size 2.80 0.08 1.00 } }
# ==================== QUARANTINE PEN (wooden post-and-rail fence, inside field) ==================== # ==================== EXTERNAL PEN (south of field, accessed through south-wall gate) ====================
# Flow: main field → inner gate → quarantine areaouter gate → outside # Flow: main field → south-wall gate (x ∈ [10, 13], y = -15)external pen
# The pen is a wooden post-and-rail rectangle south of the field, x ∈ [10, 13],
# y ∈ [-22, -15], open on the north side (the gate hole is the entrance).
# West wall (x=10, ~7m along Y) # Pen west wall (x=10, y from -22 to -15, length 7m)
Solid { translation 10 -11.46 0.55 children [ Solid { translation 10 -18.5 0.55 children [
Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
@@ -130,8 +137,8 @@ Solid { translation 10 -11.46 0.55 children [
Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] } Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] }
] boundingObject Box { size 0.14 6.92 1.10 } } ] boundingObject Box { size 0.14 6.92 1.10 } }
# East wall (x=13) # Pen east wall (x=13, y from -22 to -15, length 7m)
Solid { translation 13 -11.46 0.55 children [ Solid { translation 13 -18.5 0.55 children [
Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 -3.46 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 -1.73 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] } Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
@@ -143,39 +150,50 @@ Solid { translation 13 -11.46 0.55 children [
Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] } Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 0.14 6.92 0.04 } } ] }
] boundingObject Box { size 0.14 6.92 1.10 } } ] boundingObject Box { size 0.14 6.92 1.10 } }
# North wall - open entrance (no wall, just corner posts) # Pen south wall (y=-22, x from 10 to 13, length 3m, closes the back of the pen)
Solid { translation 10 -8 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] boundingObject Box { size 0.12 0.12 1.10 } } Solid { translation 11.5 -22 0.55 children [
Solid { translation 13 -8 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] boundingObject Box { size 0.12 0.12 1.10 } } Transform { translation -1.5 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
Transform { translation 0 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
Transform { translation 1.5 0 0 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
Transform { translation 0 0 -0.38 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] }
Transform { translation 0 0 -0.05 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] }
Transform { translation 0 0 0.30 children [ Shape { appearance USE WOOD geometry Box { size 2.92 0.06 0.08 } } ] }
Transform { translation 0 0 0.53 children [ Shape { appearance USE FPOST geometry Box { size 2.92 0.14 0.04 } } ] }
] boundingObject Box { size 2.92 0.14 1.10 } }
# Pen north corner posts at the gate opening (no wall — sheep enter here from the field)
Solid { translation 10 -15.0 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
Solid { translation 13 -15.0 0.55 children [ Shape { appearance USE FPOST geometry Box { size 0.12 0.12 1.10 } } ] }
# Corner pillars # Corner pillars
Solid { translation 15 15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } Solid { translation 15 15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
Solid { translation 15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } Solid { translation 15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
Solid { translation -15 15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } Solid { translation -15 15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
Solid { translation -15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] boundingObject Box { size 0.44 0.44 1.12 } } Solid { translation -15 -15 0.56 children [ Shape { appearance USE STONE_B geometry Box { size 0.44 0.44 1.12 } } Shape { appearance USE CAP geometry Box { size 0.54 0.54 0.08 } } ] }
# Mid-pillars every 5 m — East # Mid-pillars every 5 m — East
Solid { translation 15 10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 15 10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation 15 5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 15 5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation 15 0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 15 0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation 15 -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 15 -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation 15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
# West # West
Solid { translation -15 10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation -15 10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation -15 5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation -15 5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation -15 0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation -15 0 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation -15 -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation -15 -5 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation -15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation -15 -10 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
# North # North
Solid { translation 10 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 10 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation 5 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 5 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation 0 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 0 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation -5 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation -5 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation -10 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation -10 15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
# South # South
Solid { translation 5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation 0 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation 0 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation -5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation -5 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
Solid { translation -10 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] boundingObject Box { size 0.34 0.34 1.06 } } Solid { translation -10 -15 0.53 children [ Shape { appearance USE STONE_B geometry Box { size 0.34 0.34 1.06 } } Shape { appearance USE CAP geometry Box { size 0.44 0.44 0.07 } } ] }
# ==================== BARN 1 — Gambrel/Dutch style (NE, outside fence) ==================== # ==================== BARN 1 — Gambrel/Dutch style (NE, outside fence) ====================
# Body 10×7×4, weathered gray-brown wood, gambrel roof, large double doors # Body 10×7×4, weathered gray-brown wood, gambrel roof, large double doors
@@ -503,28 +521,16 @@ ShepherdDog {
} }
# ==================== SHEEP ==================== # ==================== SHEEP ====================
Sheep { # Up to 10 sheep, scattered through the field's central/north zone. Comment
translation 3 2 0.5 # out trailing slots to test smaller flock sizes; the dog policy is trained
name "sheep1" # to handle 1..10 sheep so any prefix works.
controller "sheep" Sheep { translation 3.0 2.0 0.5 name "sheep1" controller "sheep" }
} Sheep { translation 3.0 -2.0 0.5 name "sheep2" controller "sheep" }
Sheep { Sheep { translation 4.0 0.0 0.5 name "sheep3" controller "sheep" }
translation 3 -2 0.5 Sheep { translation -3.0 4.0 0.5 name "sheep4" controller "sheep" }
name "sheep2" Sheep { translation -5.0 -2.0 0.5 name "sheep5" controller "sheep" }
controller "sheep" Sheep { translation 6.0 5.0 0.5 name "sheep6" controller "sheep" }
} Sheep { translation -6.0 6.0 0.5 name "sheep7" controller "sheep" }
Sheep { Sheep { translation 0.0 8.0 0.5 name "sheep8" controller "sheep" }
translation 4 0 0.5 Sheep { translation -8.0 0.0 0.5 name "sheep9" controller "sheep" }
name "sheep3" Sheep { translation 7.0 -4.0 0.5 name "sheep10" controller "sheep" }
controller "sheep"
}
# Sheep {
# translation 3.5 1 0.5
# name "sheep4"
# controller "sheep"
# }
# Sheep {
# translation 3.5 -1 0.5
# name "sheep5"
# controller "sheep"
# }