Checkpoint 6

This commit is contained in:
Johnny Fernandes
2026-05-11 10:35:48 +01:00
parent b457155538
commit fce0e0c786
27 changed files with 194 additions and 704 deletions
+38 -38
View File
@@ -12,7 +12,7 @@ gate into an external pen. The dog has three deployable modes:
| `rl` | KL-regularised PPO fine-tune of `bc` | Reward-driven refinement | | `rl` | KL-regularised PPO fine-tune of `bc` | Reward-driven refinement |
`sequential` (single-target pin-and-push) is kept as an alternative `sequential` (single-target pin-and-push) is kept as an alternative
analytic baseline. `dagger` is a data-collection mode, not deployment. analytic baseline.
## Perception ## Perception
@@ -28,13 +28,13 @@ control step:
(`herding/sheep_tracker.py`). (`herding/sheep_tracker.py`).
**LiDAR validation** (intermediate-goal item v from `docs/project.md`): **LiDAR validation** (intermediate-goal item v from `docs/project.md`):
run the dog controller in `HERDING_MODE=diag` mode to capture 80 during development a diagnostic-dump controller captured 80 real
real Webots scans plus the ground-truth sheep positions in Webots scans plus the ground-truth sheep positions. Comparing
`training/dagger/diag_<ts>.npz`. Comparing detections against GT in detections against GT showed clustered centroids match GT positions
that file showed clustered centroids match GT positions within 0.15 m within 0.15 m after the +SHEEP_RADIUS surface-to-centre correction —
after the +SHEEP_RADIUS surface-to-centre correction — i.e. the i.e. the LiDAR pipeline produces correct sheep-position estimates
LiDAR pipeline produces correct sheep-position estimates from the from the real Webots scan, validating the sensor for the herding
real Webots scan, validating the sensor for the herding task. task.
The tracker outputs a `{name: (x, y)}` dict shaped exactly like the The tracker outputs a `{name: (x, y)}` dict shaped exactly like the
prior receiver-based one, so Strömbom, Sequential, and the BC obs prior receiver-based one, so Strömbom, Sequential, and the BC obs
@@ -53,7 +53,7 @@ Privileged ground-truth perception is available for ablation —
pip install -r training/requirements.txt pip install -r training/requirements.txt
# 2. Smoke test # 2. Smoke test
python -m training.parity_test python -m tests.parity_test
# 3. Reproduce the BC policy (~10 min on CPU: ~5 min demos + ~3 min BC) # 3. Reproduce the BC policy (~10 min on CPU: ~5 min demos + ~3 min BC)
python -m tools.collect_demos --teacher strombom \ python -m tools.collect_demos --teacher strombom \
@@ -61,21 +61,17 @@ python -m tools.collect_demos --teacher strombom \
python -m training.bc_pretrain --demos training/demos.npz \ python -m training.bc_pretrain --demos training/demos.npz \
--out training/runs/bc --epochs 60 --net-arch 512,512 --out training/runs/bc --epochs 60 --net-arch 512,512
# 4. Optional: DAgger from inside Webots if sim-trained doesn't transfer # 4. KL-PPO fine-tune of the BC policy (~30 min on CPU, 1 M steps)
tools/auto_dagger.sh 3 60
python -m tools.dagger_merge_train --out training/runs/bc_dagger
# 5. Evaluate (env)
python -m training.eval --policy training/runs/bc \
--max-flock 10 --max-steps 8000 --n-seeds 5
# 6. Optional RL fine-tune of the BC policy (~40 min on CPU, 1 M steps)
python -m training.train_ppo \ python -m training.train_ppo \
--bc training/runs/bc \ --bc training/runs/bc \
--out training/runs/rl \ --out training/runs/rl \
--total-timesteps 1000000 --total-timesteps 1000000
# 7. Run in Webots # 5. Evaluate (env)
python -m training.eval --policy training/runs/rl \
--max-flock 10 --max-steps 15000 --n-seeds 10
# 6. Run in Webots
tools/run_webots.sh 10 bc # behaviour-cloned MLP tools/run_webots.sh 10 bc # behaviour-cloned MLP
tools/run_webots.sh 10 rl # KL-PPO fine-tune tools/run_webots.sh 10 rl # KL-PPO fine-tune
tools/run_webots.sh 10 strombom # analytic baseline tools/run_webots.sh 10 strombom # analytic baseline
@@ -84,22 +80,25 @@ tools/run_webots.sh 10 strombom # analytic baseline
## Layout ## Layout
``` ```
herding/ — single source of truth (env + Webots both import) herding/ — perception / control / world primitives
geometry.py — field/pen constants, robot specs
flocking_sim.py — Reynolds-style sheep dynamics
diffdrive.py — differential-drive kinematics
control.py — shared near-sheep speed-modulation helper
obs.py — 32-D order-invariant observation builder obs.py — 32-D order-invariant observation builder
strombom.py — canonical CoM-drive teacher world/ — environment-side physics & geometry
sequential.py — single-target "pin-and-push" teacher geometry.py — field/pen constants, robot specs
active_scan.py — wraps a base teacher with opening rotation + diffdrive.py — differential-drive kinematics
walk-to-centre + speed modulation flocking_sim.py — Reynolds + Strömbom 2014 sheep dynamics
lidar_sim.py — fast 2D raycast for the env (sheep + walls + posts) perception/ — LiDAR → tracked-sheep pipeline
lidar_perception.py — scan → world-frame cluster centroids + filters lidar_sim.py — fast 2D raycast for the env
sheep_tracker.py — multi-target NN tracker with FOV memory lidar_perception.py — scan → world-frame cluster centroids + filters
sheep_tracker.py — multi-target NN tracker with FOV memory
control/ — every dog mode's action source
strombom.py — canonical CoM collect/drive heuristic
sequential.py — single-target "pin-and-push" alternative
active_scan.py — wraps a base teacher with opening rotation +
walk-to-centre fallback
modulation.py — shared near-sheep speed-modulation helper
controllers/ controllers/
sheep/sheep.py — Webots sheep controller (uses herding.flocking_sim) sheep/sheep.py — Webots sheep controller (uses herding.world.flocking_sim)
shepherd_dog/ shepherd_dog/
shepherd_dog.py — Webots dog controller, mode-switched shepherd_dog.py — Webots dog controller, mode-switched
policy_loader.py — lazy SB3 policy loader (auto-detects frame stack) policy_loader.py — lazy SB3 policy loader (auto-detects frame stack)
@@ -107,16 +106,17 @@ controllers/
training/ training/
herding_env.py — Gymnasium env (LiDAR + tracker by default) herding_env.py — Gymnasium env (LiDAR + tracker by default)
bc_pretrain.py — supervised BC of (obs, action) demos into MLP bc_pretrain.py — supervised BC of (obs, action) demos into MLP
eval.py — analytic + BC policy comparison harness train_ppo.py — KL-regularised PPO fine-tune of BC
parity_test.py — shape / determinism smoke test eval.py — analytic + learned policy comparison harness
runs/ — checkpoints (whitelisted in .gitignore) runs/ — checkpoints (whitelisted in .gitignore)
requirements.txt requirements.txt
tests/
parity_test.py — shape / determinism / baseline smoke test
tools/ tools/
collect_demos.py — sim demos via the active-scan teacher collect_demos.py — sim demos via the active-scan teacher
dagger_merge_train.py — merge Webots-collected DAgger demos and retrain
run_webots.sh — launch Webots with N sheep + chosen mode run_webots.sh — launch Webots with N sheep + chosen mode
auto_dagger.sh — headless DAgger collection across many runs
worlds/ worlds/
field.wbt — main world (3 m gate, external pen) field.wbt — main world (3 m gate, external pen)
@@ -127,8 +127,8 @@ docs/project.md — original project goals
## Shared low-level control ## Shared low-level control
Every dog mode (RL, Strömbom, Sequential, the DAgger teacher) routes Every dog mode (Strömbom, Sequential, BC, RL) routes its action
its action through `herding/control.py:modulate_speed_near_sheep`, through `herding/control/modulation.py:modulate_speed_near_sheep`,
which scales action magnitude down when within ~2.5 m of the nearest which scales action magnitude down when within ~2.5 m of the nearest
tracked sheep. This stops the dog from charging in at full speed and tracked sheep. This stops the dog from charging in at full speed and
scattering the flock. Direction (intent) is preserved. scattering the flock. Direction (intent) is preserved.
+2 -2
View File
@@ -11,14 +11,14 @@ _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
if _PROJECT_ROOT not in sys.path: if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT) sys.path.insert(0, _PROJECT_ROOT)
from herding.flocking_sim import ( # noqa: F401 from herding.world.flocking_sim import ( # noqa: F401
MAX_SPEED, FLEE_SPEED, WANDER_SPEED, MAX_SPEED, FLEE_SPEED, WANDER_SPEED,
WALL_MARGIN, WALL_HARD_MARGIN, WALL_HARD_GAIN, WALL_MARGIN, WALL_HARD_MARGIN, WALL_HARD_GAIN,
FLEE_DIST, SEPARATION_DIST, COHESION_DIST, FLEE_DIST, SEPARATION_DIST, COHESION_DIST,
PEN_MARGIN, PEN_MARGIN,
compute_heading_speed, compute_heading_speed,
) )
from herding.geometry import ( # noqa: F401 from herding.world.geometry import ( # noqa: F401
FIELD_X, FIELD_Y, PEN_X, PEN_Y, FIELD_X, FIELD_Y, PEN_X, PEN_Y,
in_pen, in_pen,
) )
+3 -3
View File
@@ -24,9 +24,9 @@ if _PROJECT_ROOT not in sys.path:
from controller import Supervisor from controller import Supervisor
from herding.diffdrive import heading_speed_to_wheels from herding.world.diffdrive import heading_speed_to_wheels
from herding.flocking_sim import MAX_SPEED, compute_heading_speed from herding.world.flocking_sim import MAX_SPEED, compute_heading_speed
from herding.geometry import ( from herding.world.geometry import (
SHEEP_MAX_WHEEL_OMEGA, SHEEP_MAX_WHEEL_OMEGA,
is_penned_position, is_penned_position,
) )
+77 -263
View File
@@ -4,52 +4,39 @@ Mode is selected by ``HERDING_MODE`` (env var, or via the
``herding_runtime.cfg`` file the launcher writes since Webots strips ``herding_runtime.cfg`` file the launcher writes since Webots strips
env vars on some setups): env vars on some setups):
strombom → canonical Strömbom collect/drive heuristic. strombom → canonical Strömbom (2014) collect/drive heuristic
sequential → single-target "pin and push" — drives the sheep wrapped in ActiveScanTeacher (opening rotation +
closest to the pen. walk-to-centre when the tracker briefly empties).
bc → behaviour-cloned MLP, trained on Strömbom demos via sequential → single-target "pin-and-push", same wrapper.
sim. Default policy directory: training/runs/bc. bc → behaviour-cloned MLP, trained on Strömbom demos.
rl → KL-regularised PPO fine-tune of the BC policy. Same Default policy: training/runs/bc/policy.zip.
obs/action space as bc; refines time-to-pen via rl → KL-regularised PPO fine-tune of bc. Same obs/action
environment reward while staying anchored to bc. space as bc; refines time-to-pen via reward while
Default policy directory: training/runs/rl. staying anchored to bc.
dagger → DAgger data collection. Reads sheep ground-truth Default policy: training/runs/rl/policy.zip.
via the receiver, computes the active-scan teacher's
recommended action at every step, drives with either
the teacher (HERDING_DAGGER_DRIVER=teacher, default)
or the loaded student (=student), and logs each
(lidar_stacked_obs, teacher_action) pair. On exit
dumps to ``training/dagger/dagger_<ts>.npz`` for
``tools.dagger_merge_train`` to consume.
Sheep perception Sheep perception
---------------- ----------------
The dog now perceives sheep through its **front-mounted 140° LiDAR** The dog perceives sheep through its **front-mounted 140° LiDAR**
(``protos/ShepherdDog.proto``: 180 rays, 12 m max range). Each step (``protos/ShepherdDog.proto``: 180 rays, 12 m max range). Each step:
the controller:
1. Reads ``lidar.getRangeImage()``. 1. Reads ``lidar.getRangeImage()``.
2. Runs ``herding.lidar_perception.detections_from_scan`` to cluster 2. Runs ``herding.perception.lidar_perception.detections_from_scan``
returns into world-frame ``(x, y)`` sheep estimates. to cluster returns into world-frame ``(x, y)`` sheep estimates.
3. Folds those into a ``herding.sheep_tracker.SheepTracker`` which 3. Folds those into a ``SheepTracker`` which maintains last-seen
maintains last-seen positions for sheep currently out of the positions for sheep currently out of FOV and latches "penned"
FOV and latches "penned" once a track disappears near the gate. once a track crosses the gate plane south.
The output of step 3 is a ``{name: (x, y)}`` dict shaped exactly like Sheep ``emitter`` messages are read **for diagnostic logging only**
the receiver-based one we used to consume — so Strömbom, Sequential (GT_penned counter + auto-finish sentinel); they are never used to
and the BC obs builder run unchanged. The sheep→dog Emitter/Receiver drive the policy. Perception for control comes entirely from LiDAR.
link is still up (kept passively for compatibility) but its messages
are *not* used for control.
All modes share the same low-level differential-drive controller Auto-finish
(``herding.diffdrive.velocity_to_wheels`` with cos(err)-clamped forward -----------
speed), so switching modes does not retune actuation. When the dog observes (via GT, read off the receiver) that all sheep
are penned, it writes ``training/.run_done`` and the launcher
A safety supervisor enforces the "dog stays out of the pen" invariant: (``tools/run_webots.sh``) detects it and closes Webots. This keeps
if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is batch evaluation runs bounded.
overridden with a north-driving correction. RL fallback: if the policy
zip can't be loaded (SB3 missing, file missing), the controller drops
to strombom mode automatically.
""" """
import math import math
@@ -62,26 +49,27 @@ _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
if _PROJECT_ROOT not in sys.path: if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT) sys.path.insert(0, _PROJECT_ROOT)
import numpy as np
from controller import Robot from controller import Robot
from herding.active_scan import ActiveScanTeacher from herding.control.active_scan import ActiveScanTeacher
from herding.control import modulate_speed_near_sheep from herding.control.modulation import modulate_speed_near_sheep
from herding.diffdrive import velocity_to_wheels from herding.control.sequential import compute_action as sequential_action
from herding.geometry import ( from herding.control.strombom import compute_action as strombom_action
from herding.obs import build_obs
from herding.perception.lidar_perception import detections_from_scan
from herding.perception.sheep_tracker import SheepTracker
from herding.world.diffdrive import velocity_to_wheels
from herding.world.geometry import (
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA,
DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS, DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS,
PEN_ENTRY, is_penned_position, PEN_ENTRY, is_penned_position,
) )
from herding.lidar_perception import detections_from_scan
from herding.obs import OBS_DIM, build_obs
from herding.sequential import compute_action_debug as sequential_action_debug
from herding.sheep_tracker import SheepTracker
from herding.strombom import compute_action as strombom_action
from herding.strombom import compute_action_debug as strombom_action_debug
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Mode selection # Mode + policy resolution
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _load_runtime_config(): def _load_runtime_config():
@@ -135,60 +123,41 @@ def _resolve_policy_dir(mode: str) -> str:
mode_default = { mode_default = {
"bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"), "bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
"rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl"), "rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl"),
"dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
} }
primary = mode_default.get(mode, mode_default["bc"]) primary = mode_default.get(mode, mode_default["bc"])
if os.path.isdir(primary): if os.path.isdir(primary):
return primary return primary
# Fall back to BC if the requested checkpoint isn't there yet
# (e.g., user asked for `rl` before training the fine-tune).
fallback = mode_default["bc"] fallback = mode_default["bc"]
if os.path.isdir(fallback): if os.path.isdir(fallback):
return fallback return fallback
return env_dir or primary return env_dir or primary
_VALID_MODES = ("bc", "rl", "strombom", "sequential", "dagger", "diag") _VALID_MODES = ("bc", "rl", "strombom", "sequential")
# Back-compat: an old config saying HERDING_MODE=rl meant "the BC policy".
# We now use `rl` strictly for the KL-PPO fine-tune. If the rl
# directory isn't present, _resolve_policy_dir below silently falls
# back to bc, preserving the old behaviour.
if MODE not in _VALID_MODES: if MODE not in _VALID_MODES:
print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.") print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
MODE = "strombom" MODE = "strombom"
DAGGER_DRIVER = (os.environ.get("HERDING_DAGGER_DRIVER")
or _runtime_cfg.get("HERDING_DAGGER_DRIVER")
or "teacher").lower()
if DAGGER_DRIVER not in ("teacher", "student"):
DAGGER_DRIVER = "teacher"
POLICY_DIR = _resolve_policy_dir(MODE) POLICY_DIR = _resolve_policy_dir(MODE)
policy_handle = None policy_handle = None
if MODE in ("bc", "rl", "dagger"): if MODE in ("bc", "rl"):
print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists={os.path.isdir(POLICY_DIR)}") print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists={os.path.isdir(POLICY_DIR)}")
try: try:
from policy_loader import load as _load_policy from policy_loader import load as _load_policy
policy_handle = _load_policy(POLICY_DIR) policy_handle = _load_policy(POLICY_DIR)
print(f"[dog] policy loaded from {POLICY_DIR}") print(f"[dog] policy loaded from {POLICY_DIR}")
except Exception as exc: except Exception as exc:
if MODE in ("bc", "rl"):
print(f"[dog] policy load failed ({exc!r}); falling back to strombom.") print(f"[dog] policy load failed ({exc!r}); falling back to strombom.")
MODE = "strombom" MODE = "strombom"
else: print(f"[dog] running in mode={MODE}")
# In dagger mode, no policy is fine if driver=teacher.
print(f"[dog] policy load failed ({exc!r}); dagger driver forced to teacher.")
policy_handle = None
print(f"[dog] running in mode={MODE}"
+ (f" driver={DAGGER_DRIVER}" if MODE == "dagger" else ""))
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Action smoothing + safety supervisor # Control parameters
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
ACTION_SMOOTH = 0.55 # was 0.35; bumped for less frame-to-frame action jitter ACTION_SMOOTH = 0.55 # EMA on (vx, vy) — kills frame-to-frame jitter
prev_action = (0.0, 0.0) RUN_DONE_FILE = os.path.join(_PROJECT_ROOT, "training", ".run_done")
def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple: def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
@@ -202,10 +171,6 @@ def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
return (vx, vy) return (vx, vy)
# ---------------------------------------------------------------------------
# Driving
# ---------------------------------------------------------------------------
def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float): def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float):
if math.hypot(vx, vy) < 1e-3: if math.hypot(vx, vy) < 1e-3:
left_motor.setVelocity(0.0) left_motor.setVelocity(0.0)
@@ -245,12 +210,9 @@ receiver = robot.getDevice("receiver"); receiver.enable(timestep)
emitter = robot.getDevice("emitter") emitter = robot.getDevice("emitter")
lidar = robot.getDevice("lidar"); lidar.enable(timestep) lidar = robot.getDevice("lidar"); lidar.enable(timestep)
# The receiver channel from sheep is no longer consumed for perception
# (kept enabled in case any peripheral tooling reads it). Sheep
# positions come exclusively from the LiDAR + tracker pipeline below.
tracker = SheepTracker() tracker = SheepTracker()
# Cosmetic ear motors — ignored by control logic but keep them animated. # Cosmetic ear motors — animated; not used by control.
left_ear = robot.getDevice("left ear motor") left_ear = robot.getDevice("left ear motor")
right_ear = robot.getDevice("right ear motor") right_ear = robot.getDevice("right ear motor")
left_ear.setPosition(float("inf")) left_ear.setPosition(float("inf"))
@@ -266,75 +228,26 @@ EAR_RATE = 8.0
# Main loop # Main loop
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Active sheep positions come from the LiDAR-fed tracker each step; # Analytic-teacher wrapper (instantiated lazily so RL/BC modes don't pay
# penned_set is the tracker's ``get_penned_set()`` call. We drain the # the import-time cost). Each gets the same ActiveScanTeacher treatment:
# receiver queue without consuming it, so the small backlog of sheep # rotate-on-empty, walk-to-centre, near-sheep speed modulation.
# pings can't grow unbounded. analytic_teacher = None
step_count = 0 if MODE in ("strombom", "sequential"):
base_fn = strombom_action if MODE == "strombom" else sequential_action
analytic_teacher = ActiveScanTeacher(base_fn)
import atexit # GT positions from sheep emitters — used **only** for the auto-finish
import time # sentinel and the GT_penned diagnostic line. Never fed into control.
import numpy as _np
# DAgger state ----------------------------------------------------------
# Logged each step in dagger mode: (stacked_lidar_obs, teacher_action).
DAGGER_LOG_OBS: list = []
DAGGER_LOG_ACT: list = []
# Diagnostic mode buffer (one dict per step).
DIAG_BUF: list = []
# Frame stack buffer the controller maintains itself when dagger mode is
# active — the stacked obs we log must match what the policy sees so the
# downstream BC consumes (stacked_obs, teacher_action) pairs cleanly.
_FRAME_STACK = (policy_handle.frame_stack if policy_handle is not None else 4)
_dagger_buffer: list = []
# Active-scan teacher operates on GT (read from receiver).
_dagger_teacher = ActiveScanTeacher(strombom_action) if MODE == "dagger" else None
# GT positions accumulated from the receiver (sheep emit their xy each step).
_gt_sheep: dict = {} _gt_sheep: dict = {}
_run_done = False
prev_action = (0.0, 0.0)
_DAGGER_RUN_TS = int(time.time()) # one file per controller run step_count = 0
_DAGGER_DUMPED = False
# Sentinel that the auto-collection script polls — empty file written
# when this controller decides the run is "done" (all sheep penned, by
# GT). The launcher then kills Webots and moves on without waiting out
# its timeout. Honoured only in dagger mode.
_DAGGER_DONE_FILE = os.path.join(_PROJECT_ROOT, "training", "dagger", ".DONE")
def _dump_dagger_log():
    """Save the accumulated (obs, teacher_action) pairs to disk.

    Does nothing unless ``MODE`` is ``"dagger"`` and at least one pair has
    been logged. Otherwise the pairs are stacked into two float32 arrays
    and stored under the keys ``obs`` and ``actions`` in
    ``<project root>/training/dagger/dagger_<run timestamp>.npz``.

    Webots may SIGKILL the controller, so this is meant to be called
    repeatedly during a run as well as on exit. Each call rewrites the same
    file with the full buffer accumulated so far, which makes it idempotent.
    Because a kill can land in the middle of a write, the archive is first
    written to a temporary sibling file and then swapped into place with
    ``os.replace``: a reader sees either the previous complete archive or
    the new complete one, never a truncated file.

    The confirmation line is printed only for the first successful dump of
    the run (tracked by the module-level ``_DAGGER_DUMPED`` flag).
    """
    global _DAGGER_DUMPED
    if MODE != "dagger" or not DAGGER_LOG_OBS:
        return
    out_dir = os.path.join(_PROJECT_ROOT, "training", "dagger")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"dagger_{_DAGGER_RUN_TS}.npz")
    obs_arr = _np.stack(DAGGER_LOG_OBS).astype(_np.float32)
    act_arr = _np.stack(DAGGER_LOG_ACT).astype(_np.float32)
    # The ".tmp" suffix keeps the partial file out of any "*.npz" glob.
    # Passing an open file object (rather than a path) stops numpy from
    # appending its own ".npz" extension to the temporary name.
    tmp_path = out_path + ".tmp"
    with open(tmp_path, "wb") as tmp_file:
        _np.savez(tmp_file, obs=obs_arr, actions=act_arr)
    # Atomic swap: the previous dump stays intact until the new one is whole.
    os.replace(tmp_path, out_path)
    if not _DAGGER_DUMPED:
        print(f"[dog dagger] wrote {len(DAGGER_LOG_OBS)} pairs → {out_path}")
        _DAGGER_DUMPED = True
DAGGER_FLUSH_STEPS = 500
atexit.register(_dump_dagger_log)
while robot.step(timestep) != -1: while robot.step(timestep) != -1:
step_count += 1 step_count += 1
# Drain receiver. In every mode we capture GT for the diagnostic # Drain sheep emitter messages → GT (diagnostic only).
# log line — perception still comes from LiDAR, the GT is read-only.
while receiver.getQueueLength() > 0: while receiver.getQueueLength() > 0:
msg = receiver.getString() msg = receiver.getString()
receiver.nextPacket() receiver.nextPacket()
@@ -350,115 +263,28 @@ while robot.step(timestep) != -1:
n = compass.getValues() n = compass.getValues()
dog_heading = math.atan2(n[0], n[1]) dog_heading = math.atan2(n[0], n[1])
# ---- LiDAR perception → tracker → sheep_positions dict ---- # ---- LiDAR perception → tracker → active sheep positions ----
ranges = _np.asarray(lidar.getRangeImage(), dtype=_np.float32) ranges = np.asarray(lidar.getRangeImage(), dtype=np.float32)
detections = detections_from_scan(ranges, dog_xy[0], dog_xy[1], dog_heading) detections = detections_from_scan(ranges, dog_xy[0], dog_xy[1], dog_heading)
sheep_positions = tracker.update(detections) sheep_positions = tracker.update(detections)
penned_set = tracker.get_penned_set()
# ---- Diagnostic mode: dump the first DIAG_STEPS scans + GT to disk.
if MODE == "diag":
DIAG_STEPS = 80
if step_count <= DIAG_STEPS:
DIAG_BUF.append(dict(
step=step_count,
ranges=ranges.copy(),
dog_x=dog_xy[0], dog_y=dog_xy[1], dog_h=dog_heading,
gt_sheep=dict(_gt_sheep),
detections=list(detections),
))
if step_count == DIAG_STEPS:
_diag_path = os.path.join(_PROJECT_ROOT, "training", "dagger",
f"diag_{int(time.time())}.npz")
os.makedirs(os.path.dirname(_diag_path), exist_ok=True)
_np.savez(
_diag_path,
ranges=_np.stack([d["ranges"] for d in DIAG_BUF]),
dog_xy=_np.array([[d["dog_x"], d["dog_y"]] for d in DIAG_BUF],
dtype=_np.float32),
dog_h=_np.array([d["dog_h"] for d in DIAG_BUF], dtype=_np.float32),
# Per-step GT serialised: max-pad to 10 sheep.
gt_xy=_np.array([
[list(d["gt_sheep"].get(f"sheep{i}", (1e9, 1e9)))
for i in range(1, 11)]
for d in DIAG_BUF
], dtype=_np.float32),
detections=_np.array([
len(d["detections"]) for d in DIAG_BUF
], dtype=_np.int32),
)
print(f"[dog diag] wrote {DIAG_STEPS} scans → {_diag_path}")
# Build the single-frame LiDAR obs (matches what the env produces).
sheep_xy_list = list(sheep_positions.values()) sheep_xy_list = list(sheep_positions.values())
sheep_penned_list = [False] * len(sheep_xy_list) sheep_penned_list = [False] * len(sheep_xy_list)
single_obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list) single_obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list)
# Maintain our own frame stack so logged obs == what policy sees.
if not _dagger_buffer:
_dagger_buffer = [single_obs.copy() for _ in range(_FRAME_STACK)]
else:
_dagger_buffer.append(single_obs)
if len(_dagger_buffer) > _FRAME_STACK:
_dagger_buffer = _dagger_buffer[-_FRAME_STACK:]
stacked_obs = _np.concatenate(_dagger_buffer, axis=0).astype(_np.float32)
# ---- Action selection ---- # ---- Action selection ----
if MODE == "diag": if MODE in ("bc", "rl") and policy_handle is not None:
# Diagnostic mode: rotate in place so the captured scans cover
# all 360° of view from one position. Target = heading + π →
# cos(err) clamps forward to ~0, the dog spins.
_t = dog_heading + math.pi
vx, vy = math.cos(_t), math.sin(_t)
elif MODE == "dagger":
# Teacher: active-scan + Strömbom on GT (active sheep only).
gt_active = {name: xy for name, xy in _gt_sheep.items()
if not is_penned_position(xy[0], xy[1])}
t_vx, t_vy, _mode_str = _dagger_teacher(
dog_xy, dog_heading, gt_active, PEN_ENTRY,
)
# Student (if a policy is loaded).
s_vx, s_vy = None, None
if policy_handle is not None:
action = policy_handle.predict(stacked_obs)
s_vx, s_vy = float(action[0]), float(action[1])
# Drive selection.
if DAGGER_DRIVER == "student" and policy_handle is not None:
vx, vy = s_vx, s_vy
else:
vx, vy = t_vx, t_vy
# Always log the teacher action (this is the supervision signal).
DAGGER_LOG_OBS.append(stacked_obs.copy())
DAGGER_LOG_ACT.append(_np.array([t_vx, t_vy], dtype=_np.float32))
elif MODE in ("bc", "rl") and policy_handle is not None:
# Pass the single-frame obs; the policy_loader maintains its own
# frame stack internally. Both bc and rl use the same control
# interface — the only difference is which checkpoint loaded.
action = policy_handle.predict(single_obs) action = policy_handle.predict(single_obs)
vx, vy = float(action[0]), float(action[1]) vx, vy = float(action[0]), float(action[1])
elif MODE in ("strombom", "sequential"): else:
# Wrap the analytic teacher in ActiveScanTeacher so the dog vx, vy, _mode_str = analytic_teacher(
# rotates / walks-to-centre when the tracker briefly empties,
# instead of going idle. Without this wrapper, the first 2 s
# of LiDAR-blind operation kills the run because Strömbom and
# Sequential both return (0, 0) when there are no positions.
if "_analytic_teacher" not in globals():
from herding.sequential import compute_action as sequential_action
_analytic_teacher = ActiveScanTeacher(
strombom_action if MODE == "strombom" else sequential_action
)
vx, vy, _mode_str = _analytic_teacher(
dog_xy, dog_heading, sheep_positions, PEN_ENTRY, dog_xy, dog_heading, sheep_positions, PEN_ENTRY,
) )
# Shared post-process: speed modulation near sheep. Applies to bc, # Near-sheep speed modulation (shared by every mode).
# rl, strombom, sequential — every mode where the action source is
# nominally unit-magnitude. In dagger mode the active-scan teacher
# has already modulated, and the diag mode action is hand-built for
# rotation; both skip.
if MODE not in ("dagger", "diag"):
vx, vy = modulate_speed_near_sheep(vx, vy, dog_xy, sheep_positions) vx, vy = modulate_speed_near_sheep(vx, vy, dog_xy, sheep_positions)
# EMA smoothing — reduces oscillation from policy or Strömbom flips. # EMA smoothing — kills frame-to-frame action jitter.
vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx
vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy
@@ -469,7 +295,7 @@ while robot.step(timestep) != -1:
drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX) drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX)
emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}") emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}")
# Cosmetic ear wiggle — purely visual. # Cosmetic ear wiggle.
ear_phase += 0.12 ear_phase += 0.12
ear_pos = EAR_AMPLITUDE * math.sin(ear_phase) ear_pos = EAR_AMPLITUDE * math.sin(ear_phase)
left_ear.setVelocity(EAR_RATE) left_ear.setVelocity(EAR_RATE)
@@ -477,38 +303,26 @@ while robot.step(timestep) != -1:
left_ear.setPosition(ear_pos) left_ear.setPosition(ear_pos)
right_ear.setPosition(-ear_pos) right_ear.setPosition(-ear_pos)
# --- Early-stop when all GT sheep are penned (all modes) --- # Auto-finish: when all GT sheep are penned, write the sentinel.
# The dog isn't a Supervisor so it can't call simulationQuit() — # The launcher polls for it and closes Webots so batch evals don't
# instead we write a sentinel file the launcher polls for and uses # hang after the task is done. Bounded by `_gt_sheep` so we don't
# to kill the Webots process. Bounded by `_gt_sheep` so we don't
# fire during the first few steps while the receiver fills. # fire during the first few steps while the receiver fills.
if _gt_sheep and not os.path.exists(_DAGGER_DONE_FILE): if _gt_sheep and not _run_done:
gt_active_count = sum(1 for x, y in _gt_sheep.values() gt_active = sum(1 for x, y in _gt_sheep.values()
if not is_penned_position(x, y)) if not is_penned_position(x, y))
if gt_active_count == 0: if gt_active == 0:
if MODE == "dagger": os.makedirs(os.path.dirname(RUN_DONE_FILE), exist_ok=True)
_dump_dagger_log() open(RUN_DONE_FILE, "w").close()
os.makedirs(os.path.dirname(_DAGGER_DONE_FILE), exist_ok=True) _run_done = True
open(_DAGGER_DONE_FILE, "w").close()
print(f"[dog] all {len(_gt_sheep)} sheep penned at step " print(f"[dog] all {len(_gt_sheep)} sheep penned at step "
f"{step_count} — wrote {_DAGGER_DONE_FILE}, " f"{step_count} — wrote sentinel, launcher will close Webots")
f"launcher will close Webots")
if MODE == "dagger" and step_count % DAGGER_FLUSH_STEPS == 0 and DAGGER_LOG_OBS:
_dump_dagger_log()
if step_count % 200 == 0: if step_count % 200 == 0:
gt_penned = sum(1 for x, y in _gt_sheep.values() gt_penned = sum(1 for x, y in _gt_sheep.values()
if is_penned_position(x, y)) if is_penned_position(x, y))
gt_total = len(_gt_sheep) gt_total = len(_gt_sheep)
extra = ""
if MODE == "dagger":
extra = f" logged={len(DAGGER_LOG_OBS)}"
print(f"[dog mode={MODE}] step={step_count} " print(f"[dog mode={MODE}] step={step_count} "
f"GT_penned={gt_penned}/{gt_total} " f"GT_penned={gt_penned}/{gt_total} "
f"tracks_active={tracker.n_active()} " f"tracks_active={tracker.n_active()} "
f"tracks_penned={tracker.n_penned()} " f"tracks_penned={tracker.n_penned()} "
f"detections={len(detections)} action=({vx:+.2f}, {vy:+.2f}){extra}") f"detections={len(detections)} action=({vx:+.2f}, {vy:+.2f})")
# Loop ended (Webots told us to quit). Flush any remaining DAgger log.
_dump_dagger_log()
View File
@@ -24,7 +24,7 @@ from __future__ import annotations
import math import math
from herding.control import modulate_speed_near_sheep from herding.control.modulation import modulate_speed_near_sheep
INITIAL_SCAN_STEPS = 80 # ≈1.3 s at dt=16 ms — full rotation at the +π turn target. INITIAL_SCAN_STEPS = 80 # ≈1.3 s at dt=16 ms — full rotation at the +π turn target.
@@ -24,7 +24,7 @@ flock size and works up to at least n=10 within a 15 000-step budget.
import math import math
from herding.geometry import GATE_Y, PEN_ENTRY, in_pen from herding.world.geometry import GATE_Y, PEN_ENTRY, in_pen
DELTA_DRIVE = 1.5 # standoff behind the target sheep DELTA_DRIVE = 1.5 # standoff behind the target sheep
@@ -9,7 +9,7 @@ Reference: Strömbom et al. 2014, "Solving the shepherding problem".
import math import math
from herding.geometry import PEN_ENTRY, GATE_Y, in_pen from herding.world.geometry import PEN_ENTRY, GATE_Y, in_pen
# Algorithm parameters. DELTA_DRIVE / DELTA_COLLECT were tightened from # Algorithm parameters. DELTA_DRIVE / DELTA_COLLECT were tightened from
# the original (4.0 / 2.5) because the new external pen sits ~26 m from # the original (4.0 / 2.5) because the new external pen sits ~26 m from
+1 -1
View File
@@ -31,7 +31,7 @@ Layout (all components normalised so values stay roughly in [-1, 1]):
import math import math
import numpy as np import numpy as np
from herding.geometry import ( from herding.world.geometry import (
FIELD_X, FIELD_Y, PEN_ENTRY, MAX_SHEEP, FIELD_X, FIELD_Y, PEN_ENTRY, MAX_SHEEP,
) )
View File
@@ -29,8 +29,8 @@ import math
import numpy as np import numpy as np
from herding.geometry import FIELD_X, FIELD_Y, GATE_Y, PEN_X, PEN_Y from herding.world.geometry import FIELD_X, FIELD_Y, GATE_Y, PEN_X, PEN_Y
from herding.lidar_sim import ( from herding.perception.lidar_sim import (
LIDAR_FOV, LIDAR_MAX_RANGE, LIDAR_N_RAYS, SHEEP_RADIUS, ray_angles, LIDAR_FOV, LIDAR_MAX_RANGE, LIDAR_N_RAYS, SHEEP_RADIUS, ray_angles,
) )
@@ -26,7 +26,7 @@ from __future__ import annotations
import math import math
from herding.geometry import MAX_SHEEP, in_pen, is_penned_position from herding.world.geometry import MAX_SHEEP, in_pen, is_penned_position
GATE_M = 2.5 # m — primary NN gate (recent tracks) GATE_M = 2.5 # m — primary NN gate (recent tracks)
View File
@@ -51,7 +51,7 @@ is a defensible engineering adaptation of Strömbom's qualitative
import math import math
import random import random
from herding.geometry import ( from herding.world.geometry import (
FIELD_X, FIELD_Y, FIELD_X, FIELD_Y,
PEN_X, PEN_Y, PEN_X, PEN_Y,
GATE_X, GATE_X,
View File
@@ -21,9 +21,9 @@ if _PROJECT_ROOT not in sys.path:
import numpy as np import numpy as np
from herding.geometry import MAX_SHEEP, PEN_ENTRY from herding.world.geometry import MAX_SHEEP, PEN_ENTRY
from herding.obs import OBS_DIM from herding.obs import OBS_DIM
from herding.strombom import compute_action from herding.control.strombom import compute_action
from training.herding_env import HerdingEnv from training.herding_env import HerdingEnv
-166
View File
@@ -1,166 +0,0 @@
#!/bin/bash
# tools/auto_dagger.sh — automated DAgger collection across many headless
# Webots runs.
#
# For each (flock_size, run_index) combination, generates a world with N
# active sheep at randomised positions, launches Webots in fast/headless
# mode, lets the controller log (lidar_obs, teacher_action) pairs for up
# to RUN_SEC seconds, kills the run, and moves on. The dog controller's
# 500-step periodic flush means each run produces a complete .npz even
# when killed by timeout.
#
# Usage:
# tools/auto_dagger.sh [RUNS_PER_FLOCK] [SECONDS_PER_RUN]
# RUNS_PER_FLOCK : how many randomised runs per flock size (default 3)
# SECONDS_PER_RUN: wall-clock cap per Webots run (default 60)
#
# Env-var overrides:
# HERDING_POLICY_DIR : policy the controller loads (only used when
# HERDING_DAGGER_DRIVER=student). Default bc.
# HERDING_DAGGER_DRIVER : "teacher" (default) or "student".
# HEADLESS=1 : force --no-rendering (default on).
# FLOCKS="1 3 5 8 10" : space-separated flock sizes to iterate over.
#
# Output:
# training/dagger/dagger_<ts>.npz — one per Webots run.
#
# After collection, run:
# python -m tools.dagger_merge_train --out training/runs/bc_dagger
set -e
RUNS_PER_FLOCK=${1:-3}
RUN_SEC=${2:-60}
FLOCKS=${FLOCKS:-"1 3 5 8 10"}
HEADLESS=${HEADLESS:-1}
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
SRC="$ROOT/worlds/field.wbt"
DST="$ROOT/worlds/field_test.wbt"
POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc}"
DRIVER="${HERDING_DAGGER_DRIVER:-teacher}"
DONE_FILE="$ROOT/training/dagger/.DONE"
WEBOTS_PID=""
# Signal handler (installed for INT/TERM): announce the interrupt, stop
# the Webots process launched by the current run if there is one, reap
# it, and abort the whole collection with a non-zero status.
cleanup() {
    local pid="$WEBOTS_PID"
    echo "Caught interrupt — killing Webots (pid=$pid) and exiting."
    # WEBOTS_PID is empty between runs; only signal a real pid.
    if [[ -n "$pid" ]]; then
        kill "$pid" 2>/dev/null
    fi
    # Reap the child; tolerate it being already gone (or never started).
    wait "$pid" 2>/dev/null || true
    exit 1
}
trap cleanup INT TERM
webots_args=(--mode=fast --batch --minimize)
if [[ "$HEADLESS" == "1" ]]; then
webots_args+=(--no-rendering)
fi
echo "Auto-dagger collection"
echo " flock sizes : $FLOCKS"
echo " runs per size : $RUNS_PER_FLOCK"
echo " seconds per run : $RUN_SEC"
echo " policy dir : $POLICY_DIR (used only when driver=student)"
echo " driver : $DRIVER"
echo " webots flags : ${webots_args[*]}"
echo
# Runtime config — re-written before each run anyway, but written once
# here so a manual webots launch at the same time would also pick it up.
cat > "$ROOT/herding_runtime.cfg" <<EOF
HERDING_MODE=dagger
HERDING_POLICY_DIR=$POLICY_DIR
HERDING_DAGGER_DRIVER=$DRIVER
EOF
# Count files before, so we can summarise what was added.
mkdir -p "$ROOT/training/dagger"
before_count=$(ls -1 "$ROOT/training/dagger"/dagger_*.npz 2>/dev/null | wc -l || echo 0)
run_idx=0
total_runs=0
for f in $FLOCKS; do total_runs=$((total_runs + RUNS_PER_FLOCK)); done
for flock in $FLOCKS; do
for run in $(seq 1 "$RUNS_PER_FLOCK"); do
run_idx=$((run_idx + 1))
seed=$((1000 * flock + run))
echo "=== [$run_idx/$total_runs] flock=$flock run=$run seed=$seed ==="
# Generate randomised world.
cp "$SRC" "$DST"
for i in $(seq $((flock + 1)) 10); do
sed -i "s|^Sheep .* \"sheep${i}\".*|# &|" "$DST"
done
# Inline Python: jitter sheep1..flock translations.
python3 - "$DST" "$flock" "$seed" <<'PYEOF'
import re, random, sys
path, n_str, seed = sys.argv[1], sys.argv[2], sys.argv[3]
n = int(n_str); random.seed(int(seed))
with open(path) as f:
txt = f.read()
def rand_pos():
while True:
x = random.uniform(-12.0, 12.0)
y = random.uniform(-10.0, 12.0) # avoid the gate strip
if x * x + y * y > 9.0: # at least 3 m from dog spawn
return x, y
for i in range(1, n + 1):
x, y = rand_pos()
pat = re.compile(
r'Sheep \{ translation\s+\S+\s+\S+\s+(\S+)\s+name "sheep' + str(i) + r'"'
)
txt = pat.sub(rf'Sheep {{ translation {x:.2f} {y:.2f} \g<1> name "sheep{i}"', txt, count=1)
with open(path, "w") as f:
f.write(txt)
PYEOF
# Run Webots in the background; poll for the .DONE sentinel or
# the wall-clock timeout, whichever comes first.
rm -f "$DONE_FILE"
webots "${webots_args[@]}" "$DST" \
> /tmp/webots_dagger_run.log 2>&1 &
WEBOTS_PID=$!
# Give the controller 10 s to start before polling the sentinel,
# otherwise a sheep that spawns already penned triggers an instant
# false-positive kill.
elapsed=0
grace=10
while kill -0 "$WEBOTS_PID" 2>/dev/null; do
if (( elapsed >= grace )) && [[ -f "$DONE_FILE" ]]; then
echo " sentinel .DONE detected — killing Webots early"
kill "$WEBOTS_PID" 2>/dev/null
wait "$WEBOTS_PID" 2>/dev/null || true
break
fi
if (( elapsed >= RUN_SEC )); then
echo " timeout ($RUN_SEC s) — killing Webots"
kill "$WEBOTS_PID" 2>/dev/null
wait "$WEBOTS_PID" 2>/dev/null || true
break
fi
sleep 2
elapsed=$((elapsed + 2))
done
WEBOTS_PID=""
# Quick sanity from the log: did the controller actually run?
if grep -q "running in mode=dagger" /tmp/webots_dagger_run.log; then
new_pairs=$(tail -50 /tmp/webots_dagger_run.log | grep -oE 'logged=[0-9]+' | tail -1)
echo " controller ran ($new_pairs)"
else
echo " WARNING: controller may not have started (see /tmp/webots_dagger_run.log)"
fi
done
done
after_count=$(ls -1 "$ROOT/training/dagger"/dagger_*.npz 2>/dev/null | wc -l || echo 0)
new_files=$((after_count - before_count))
echo
echo "Done."
echo " new dagger files : $new_files"
echo " total in dir : $after_count"
echo
echo "Next:"
echo " python -m tools.dagger_merge_train --out training/runs/bc_dagger"
+4 -4
View File
@@ -26,10 +26,10 @@ if _PROJECT_ROOT not in sys.path:
import numpy as np import numpy as np
from herding.active_scan import ActiveScanTeacher from herding.control.active_scan import ActiveScanTeacher
from herding.geometry import PEN_ENTRY from herding.world.geometry import PEN_ENTRY
from herding.sequential import compute_action as sequential_action from herding.control.sequential import compute_action as sequential_action
from herding.strombom import compute_action as strombom_action from herding.control.strombom import compute_action as strombom_action
from training.herding_env import HerdingEnv from training.herding_env import HerdingEnv
-135
View File
@@ -1,135 +0,0 @@
"""Merge Webots DAgger demos with sim demos and retrain the BC policy.
The dog controller in ``HERDING_MODE=dagger`` writes per-run files to
``training/dagger/dagger_<ts>.npz`` containing ``(obs, actions)`` pairs
where:
* ``obs`` is the **stacked LiDAR observation** as built by the live
Webots tracker — exactly the input distribution the deployed
controller sees.
* ``actions`` is the **active-scan-teacher action computed from
ground-truth sheep positions** (read off the sheep emitter).
Combined with the existing sim demos (``training/demos.npz`` by
default), this gives the BC student a training set that includes the
real Webots false-positive distribution — closing the sim-to-real
perception gap that the all-sim pipeline couldn't bridge.
Usage::
# Iteration 1 — merge all dagger files with sim demos, retrain
python -m tools.dagger_merge_train \\
--sim training/demos.npz \\
--out training/runs/bc_dagger1
# Iteration 2 — drop the sim baseline, train only on Webots data
python -m tools.dagger_merge_train --no-sim --out training/runs/bc_dagger2
The new policy is saved as ``<out>/policy.zip`` and is auto-loaded by
the controller's resolution priority on the next Webots run.
"""
from __future__ import annotations
import argparse
import glob
import os
import subprocess
import sys
from pathlib import Path
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
import numpy as np
def _load_pairs(path: str) -> tuple[np.ndarray, np.ndarray]:
    """Load the ``obs`` / ``actions`` arrays of one demo ``.npz`` as float32.

    Exits with a readable message when ``obs`` is not 2-D or when the two
    arrays disagree on the number of samples.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the archive
    # open; the context manager closes it once the arrays are materialised.
    with np.load(path) as data:
        obs = data["obs"].astype(np.float32)
        act = data["actions"].astype(np.float32)
    if obs.ndim != 2:
        raise SystemExit(
            f"{path}: expected a 2-D obs array, got shape {obs.shape}."
        )
    if len(obs) != len(act):
        # Concatenating mismatched files would silently misalign every
        # (obs, action) pair that follows in the merged set.
        raise SystemExit(
            f"{path}: {len(obs)} observations but {len(act)} actions — "
            "file is truncated or corrupt."
        )
    return obs, act


def main() -> None:
    """Merge DAgger demo files (optionally plus sim demos) and retrain BC.

    Reads every file matching ``--dagger-glob``, optionally appends the
    ``--sim`` demo file, writes the concatenation to ``--merged-out`` and
    then runs ``training.bc_pretrain`` on it as a subprocess.

    Raises:
        SystemExit: if no dagger file matches the glob, if a file is
            malformed, or if observation dimensions are inconsistent
            between files.
        subprocess.CalledProcessError: if the training subprocess fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--sim", default="training/demos.npz",
                        help="Sim demo file to mix with the Webots data. "
                             "Pass --no-sim to train only on dagger data.")
    parser.add_argument("--no-sim", action="store_true",
                        help="Skip the sim demos entirely.")
    parser.add_argument("--dagger-glob", default="training/dagger/dagger_*.npz",
                        help="Glob for Webots-collected dagger files.")
    parser.add_argument("--merged-out", default="training/demos_dagger.npz",
                        help="Where to write the merged demo file.")
    parser.add_argument("--out", default="training/runs/bc_dagger",
                        help="Where to write the BC policy.")
    parser.add_argument("--epochs", type=int, default=60)
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--net-arch", default="512,512")
    parser.add_argument("--cos-weight", type=float, default=1.0)
    args = parser.parse_args()

    # --- Gather Webots files ---
    dagger_paths = sorted(glob.glob(args.dagger_glob))
    if not dagger_paths:
        # The two literals are concatenated; the separator keeps the glob
        # from running straight into the hint.
        raise SystemExit(f"No dagger files found at {args.dagger_glob} — "
                         "run Webots in HERDING_MODE=dagger first.")

    chunks_obs: list[np.ndarray] = []
    chunks_act: list[np.ndarray] = []
    total_dagger = 0
    for p in dagger_paths:
        obs, act = _load_pairs(p)
        chunks_obs.append(obs)
        chunks_act.append(act)
        total_dagger += len(obs)
        print(f"  + {p}: {obs.shape[0]} pairs (obs dim {obs.shape[1]})")
    print(f"[merge] total dagger pairs: {total_dagger}")

    obs_dim = chunks_obs[0].shape[1]
    if any(c.shape[1] != obs_dim for c in chunks_obs):
        raise SystemExit(
            "Dagger files have inconsistent obs dims — they were collected "
            "with different frame_stack settings.  Either rerun with a "
            "consistent setting or filter the glob."
        )

    # --- Optionally include sim demos ---
    if not args.no_sim:
        sim_obs, sim_act = _load_pairs(args.sim)
        if sim_obs.shape[1] != obs_dim:
            raise SystemExit(
                f"Sim demos have obs dim {sim_obs.shape[1]} but dagger demos "
                f"have {obs_dim}.  Recollect sim demos at the same frame_stack."
            )
        chunks_obs.append(sim_obs)
        chunks_act.append(sim_act)
        print(f"[merge] + sim demos: {sim_obs.shape[0]} pairs from {args.sim}")

    obs_all = np.concatenate(chunks_obs, axis=0)
    act_all = np.concatenate(chunks_act, axis=0)
    # Empty meta — bc_pretrain doesn't actually use it but the file format
    # has it.
    meta = np.zeros((0, 5), dtype=np.int32)

    Path(args.merged_out).parent.mkdir(parents=True, exist_ok=True)
    np.savez(args.merged_out, obs=obs_all, actions=act_all, meta=meta)
    print(f"[merge] wrote {len(obs_all)} pairs → {args.merged_out}")
    print(f"[merge] obs shape {obs_all.shape}, action shape {act_all.shape}")

    # --- Run BC training ---
    cmd = [
        sys.executable, "-m", "training.bc_pretrain",
        "--demos", args.merged_out,
        "--out", args.out,
        "--epochs", str(args.epochs),
        "--batch-size", str(args.batch_size),
        "--net-arch", args.net_arch,
        "--cos-weight", str(args.cos_weight),
    ]
    print(f"\n[merge] launching: {' '.join(cmd)}")
    # check=True surfaces a failed training run as CalledProcessError.
    subprocess.run(cmd, check=True, cwd=_PROJECT_ROOT)
if __name__ == "__main__":
main()
+9 -13
View File
@@ -7,19 +7,17 @@
# Usage: # Usage:
# tools/run_webots.sh [N] [MODE] # tools/run_webots.sh [N] [MODE]
# N : number of active sheep (1..10), default 10 # N : number of active sheep (1..10), default 10
# MODE : "bc" | "rl" | "strombom" | "sequential" | "dagger", default "bc" # MODE : "bc" | "rl" | "strombom" | "sequential", default "bc"
# #
# Examples: # Examples:
# tools/run_webots.sh 10 bc # BC-trained policy, 10 sheep # tools/run_webots.sh 10 bc # behaviour-cloned MLP, 10 sheep
# tools/run_webots.sh 10 rl # KL-PPO fine-tune of bc, 10 sheep # tools/run_webots.sh 10 rl # KL-PPO fine-tune of bc, 10 sheep
# tools/run_webots.sh 5 sequential # the analytic teacher, 5 sheep # tools/run_webots.sh 5 sequential # single-target analytic baseline
# tools/run_webots.sh 3 strombom # canonical baseline, 3 sheep # tools/run_webots.sh 3 strombom # canonical Strömbom analytic
# #
# Notes: # Notes:
# * The RL mode loads the latest BC policy by default — priority # * bc loads training/runs/bc/policy.zip, rl loads training/runs/rl.
# the BC policy (bc/policy.zip) (the controller resolves it). # Override via HERDING_POLICY_DIR=/path/to/run env var.
# (LiDAR-perception, frame-stack K=4). Override via
# HERDING_POLICY_DIR=/path/to/run env var.
# * Conda env "tir" must be active (provides stable-baselines3 + torch). # * Conda env "tir" must be active (provides stable-baselines3 + torch).
set -e set -e
@@ -30,10 +28,9 @@ if (( N < 1 || N > 10 )); then
echo "N must be 1..10, got $N" >&2; exit 1 echo "N must be 1..10, got $N" >&2; exit 1
fi fi
case "$MODE" in case "$MODE" in
bc|rl|strombom|sequential|dagger) ;; bc|rl|strombom|sequential) ;;
*) echo "MODE must be bc|rl|strombom|sequential|dagger, got '$MODE'" >&2; exit 1 ;; *) echo "MODE must be bc|rl|strombom|sequential, got '$MODE'" >&2; exit 1 ;;
esac esac
DAGGER_DRIVER=${HERDING_DAGGER_DRIVER:-teacher}
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )" ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
SRC="$ROOT/worlds/field.wbt" SRC="$ROOT/worlds/field.wbt"
@@ -59,7 +56,6 @@ RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc}"
cat > "$ROOT/herding_runtime.cfg" <<EOF cat > "$ROOT/herding_runtime.cfg" <<EOF
HERDING_MODE=$MODE HERDING_MODE=$MODE
HERDING_POLICY_DIR=$RESOLVED_POLICY_DIR HERDING_POLICY_DIR=$RESOLVED_POLICY_DIR
HERDING_DAGGER_DRIVER=$DAGGER_DRIVER
EOF EOF
export HERDING_MODE="$MODE" export HERDING_MODE="$MODE"
@@ -68,7 +64,7 @@ export HERDING_POLICY_DIR="$RESOLVED_POLICY_DIR"
# The controller writes this sentinel when all GT sheep are penned. We # The controller writes this sentinel when all GT sheep are penned. We
# poll for it and kill Webots so the run finishes cleanly instead of # poll for it and kill Webots so the run finishes cleanly instead of
# idling for minutes after the task is done. # idling for minutes after the task is done.
DONE_FILE="$ROOT/training/dagger/.DONE" DONE_FILE="$ROOT/training/.run_done"
mkdir -p "$(dirname "$DONE_FILE")" mkdir -p "$(dirname "$DONE_FILE")"
rm -f "$DONE_FILE" rm -f "$DONE_FILE"
+35 -54
View File
@@ -1,21 +1,16 @@
# Training pipeline # Training pipeline
Behavior cloning of analytic herding teachers into a neural-network Two stages, strictly sequential:
policy that runs under LiDAR perception in Webots.
``` ```
sim demos (active-scan teacher on tracker output, K=4 frame stack) sim demos (Strömbom on tracker output, K=4 frame stack)
bc_pretrain.py ──► runs/bc (BC baseline) bc_pretrain.py ──► runs/bc (Strömbom-imitated MLP)
▼ KL-regularised PPO fine-tune (training/train_ppo.py) ▼ KL-regularised PPO fine-tune
runs/rl (deployed `rl` mode) runs/rl (deployed `rl` mode — beats BC and Strömbom)
# optional branch — kept for reference, not deployed:
runs/bc_dagger (Webots-grounded DAgger refinement, useful if a
modified world breaks sim-to-real transfer)
``` ```
## Files ## Files
@@ -23,10 +18,9 @@ runs/bc_dagger (Webots-grounded DAgger refinement, useful if a
``` ```
herding_env.py — Gymnasium env (LiDAR raycast + tracker by default) herding_env.py — Gymnasium env (LiDAR raycast + tracker by default)
bc_pretrain.py — MSE + cosine BC of (obs, action) demos into MlpPolicy bc_pretrain.py — MSE + cosine BC of (obs, action) demos into MlpPolicy
eval.py — analytic teachers + BC policies, full n=1..10 grid train_ppo.py — KL-regularised PPO fine-tune of a BC checkpoint
parity_test.py — shape / determinism / baseline smoke test eval.py — multi-seed analytic / learned policy comparison
runs/ — checkpoints (most are .gitignored; the deployed runs/ — checkpoints (whitelisted entries in top-level .gitignore)
ones are whitelisted in the top-level .gitignore)
``` ```
## Setup ## Setup
@@ -39,75 +33,62 @@ CPU is the default and recommended device — SB3 PPO with an MLP policy
of this size runs faster on CPU than GPU because the bottleneck is of this size runs faster on CPU than GPU because the bottleneck is
rollout collection, not gradient compute. rollout collection, not gradient compute.
## The BC pipeline ## End-to-end pipeline
``` ```bash
# 1. Sim demos with the active-scan + Strömbom teacher under LiDAR # 1. Sim demos with the active-scan + Strömbom teacher under LiDAR
# perception. K=4 frame stack so the MLP has temporal context. # perception. K=4 frame stack so the MLP has temporal context.
python -m tools.collect_demos --teacher strombom \ python -m tools.collect_demos --teacher strombom \
--out demos.npz --seeds-per-n 15 --subsample 3 --frame-stack 4 --out training/demos.npz --seeds-per-n 15 --subsample 3 --frame-stack 4
# 2. Behavior-clone. # 2. Behaviour-clone.
python -m training.bc_pretrain --demos demos.npz \ python -m training.bc_pretrain --demos training/demos.npz \
--out runs/bc --epochs 60 --net-arch 512,512 --out training/runs/bc --epochs 60 --net-arch 512,512
# 3. Evaluate. # 3. KL-regularised PPO fine-tune of bc.
python -m training.eval --policy runs/bc \ python -m training.train_ppo \
--max-flock 10 --max-steps 8000 --n-seeds 5 --bc training/runs/bc --out training/runs/rl \
--total-timesteps 1000000
# 4. Multi-seed eval (env-side, fast).
python -m training.eval --policy training/runs/rl \
--max-flock 10 --max-steps 15000 --n-seeds 10
``` ```
`bc_pretrain.py` saves the **best-val_cos** snapshot, not the final `bc_pretrain.py` saves the **best-val_cos** snapshot, not the final
epoch — multi-modal teachers make training noisy and the last epoch is epoch — multi-modal teachers make training noisy and the last epoch is
often worse than an earlier one. often worse than an earlier one.
## DAgger from Webots `train_ppo.py` loads BC weights into both a trainable policy and a
frozen reference, fixes `log_std` small, and adds `β · KL(π‖π_ref)` to
Sim-only BC plateaus because the env's 2D raycast can't reproduce all the loss so the policy can only move within a trust region around BC.
the false-positive clusters Webots generates from real geometry. The See the file header for hyperparameter rationale.
fix is to collect (obs, teacher_action) pairs from inside Webots:
```
# Headless DAgger collection: 5 flock sizes × 3 runs each.
tools/auto_dagger.sh 3 60
# Merge with the sim baseline + retrain.
python -m tools.dagger_merge_train --out runs/bc_dagger
```
Iterate by re-running collection with the new student in the driver's
seat:
```
HERDING_POLICY_DIR=$PWD/training/runs/bc_dagger \
HERDING_DAGGER_DRIVER=student \
tools/auto_dagger.sh 3 60
python -m tools.dagger_merge_train --out runs/bc_dagger
```
## Available analytic teachers ## Available analytic teachers
| Name | What it does | Notes | | Name | What it does | Notes |
|---|---|---| |---|---|---|
| `strombom` | Canonical Strömbom — collect when flock is scattered, drive CoM otherwise | Default; works well for n=1–10 under tight cohesion | | `strombom` | Strömbom 2014 — collect when flock is scattered, drive CoM otherwise | Default; works for n=1–10 under tight cohesion |
| `sequential` | Pick the sheep closest to the pen and drive only it | Alternative; needs loose-cohesion regime | | `sequential` | Pick the sheep closest to the pen and drive only it | Alternative; needs loose-cohesion regime |
Both are wrapped at demo-collection time in Both are wrapped at demo-collection time in
`herding/active_scan.py:ActiveScanTeacher`, which adds an opening `herding/control/active_scan.py:ActiveScanTeacher`, which adds an
in-place rotation, walk-to-centre when the LiDAR sees nothing, and opening in-place rotation, walk-to-centre when the LiDAR sees
near-sheep speed modulation (the same modulation `herding/control.py` nothing, and near-sheep speed modulation (same modulation
applies to every dog mode at inference). `herding/control/modulation.py` applies to every dog mode at
inference).
## Evaluating analytic teachers directly ## Evaluating analytic teachers directly
``` ```
python -m training.eval --policy strombom --max-flock 10 --max-steps 8000 --n-seeds 5 python -m training.eval --policy strombom --max-flock 10 --max-steps 15000 --n-seeds 10
python -m training.eval --policy sequential --max-flock 10 --max-steps 8000 --n-seeds 5 python -m training.eval --policy sequential --max-flock 10 --max-steps 15000 --n-seeds 10
``` ```
## Webots inference ## Webots inference
``` ```
tools/run_webots.sh 10 rl tools/run_webots.sh 10 bc # or rl, strombom, sequential
``` ```
The dog controller loads `runs/bc` for `bc` mode and `runs/rl` for The dog controller loads `runs/bc` for `bc` mode and `runs/rl` for
+3 -3
View File
@@ -25,9 +25,9 @@ if _PROJECT_ROOT not in sys.path:
import numpy as np import numpy as np
from herding.geometry import MAX_SHEEP, PEN_ENTRY from herding.world.geometry import MAX_SHEEP, PEN_ENTRY
from herding.sequential import compute_action as sequential_action from herding.control.sequential import compute_action as sequential_action
from herding.strombom import compute_action as strombom_action from herding.control.strombom import compute_action as strombom_action
from training.herding_env import HerdingEnv from training.herding_env import HerdingEnv
+7 -7
View File
@@ -56,24 +56,24 @@ _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
if _PROJECT_ROOT not in sys.path: if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT) sys.path.insert(0, _PROJECT_ROOT)
from herding.diffdrive import ( from herding.world.diffdrive import (
heading_speed_to_wheels, kinematics_step, velocity_to_wheels, heading_speed_to_wheels, kinematics_step, velocity_to_wheels,
) )
from herding.flocking_sim import ( from herding.world.flocking_sim import (
FLEE_SPEED, MAX_SPEED, WANDER_SPEED, compute_heading_speed, FLEE_SPEED, MAX_SPEED, WANDER_SPEED, compute_heading_speed,
) )
from herding.geometry import ( from herding.world.geometry import (
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_BASE, DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_BASE,
DOG_WHEEL_RADIUS, FIELD_X, FIELD_Y, GATE_X, MAX_SHEEP, DOG_WHEEL_RADIUS, FIELD_X, FIELD_Y, GATE_X, MAX_SHEEP,
PEN_ENTRY, PEN_X, PEN_Y, PEN_ENTRY, PEN_X, PEN_Y,
SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS, SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS,
WEBOTS_DT, is_penned_position, WEBOTS_DT, is_penned_position,
) )
from herding.lidar_perception import detections_from_scan from herding.perception.lidar_perception import detections_from_scan
from herding.lidar_sim import simulate_scan from herding.perception.lidar_sim import simulate_scan
from herding.obs import OBS_DIM, build_obs from herding.obs import OBS_DIM, build_obs
from herding.sheep_tracker import SheepTracker from herding.perception.sheep_tracker import SheepTracker
from herding.strombom import compute_action as strombom_action from herding.control.strombom import compute_action as strombom_action
class HerdingEnv(gym.Env): class HerdingEnv(gym.Env):