diff --git a/.gitignore b/.gitignore index 23c8dbc..2e1863f 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,7 @@ training/runs/*/tb/ training/runs/*/evals/ training/runs/*/best/ !training/runs/.gitkeep -!training/runs/bc_solo/policy.zip -!training/runs/bc_flock/policy.zip +!training/runs/bc_v3/policy.zip # Webots launcher scratch worlds/field_test.wbt diff --git a/README.md b/README.md index b6a4fb2..6d44440 100644 --- a/README.md +++ b/README.md @@ -3,16 +3,39 @@ Group G25 — *Diogo Costa, Johnny Fernandes, Nelson Neto* A differential-drive shepherd dog that herds 1–10 sheep through a 3 m -gate into an external pen. The dog has three modes: +gate into an external pen. The dog has three deployable modes: -| Mode | Source | Notes | +| Mode | Source | Role | |---|---|---| -| `rl` | Behavior cloning of an analytic teacher | The deliverable RL policy | -| `strombom` | Strömbom (2014) collect/drive heuristic | Canonical baseline | -| `sequential` | Single-target "pin and push" | Robust across n=1–10 | +| `strombom` | Strömbom et al. (2014) collect/drive heuristic | Analytic baseline | +| `bc` | Behaviour cloning of the Strömbom teacher | Imitation learning result | +| `rl` | KL-regularised PPO fine-tune of `bc` | Reward-driven refinement | -Plus three documented experimental teachers (`hybrid`, `drive_only`, -`strombom_smooth`) — see `herding/` for details. +`sequential` (single-target pin-and-push) is kept as an alternative +analytic baseline. `dagger` is a data-collection mode, not deployment. + +## Perception + +The dog perceives sheep **only through its front-mounted 140° LiDAR** +(180 rays, 12 m max range — see `protos/ShepherdDog.proto`). Each +control step: + +1. Read `lidar.getRangeImage()`, +2. Cluster returns into world-frame `(x, y)` estimates + (`herding/lidar_perception.py`), +3. Fold them into a multi-target tracker that maintains last-seen + positions for sheep currently outside the FOV + (`herding/sheep_tracker.py`). + +The tracker outputs a `{name: (x, y)}` dict shaped exactly like the +prior receiver-based one, so Strömbom, Sequential, and the BC obs +builder all run unchanged on top of it. The 2D Gymnasium env +(`herding/lidar_sim.py`) raycasts sheep discs at training time, so +demos collected in the env match the perception the deployed +controller sees in Webots. + +Privileged ground-truth perception is available for ablation — +`HerdingEnv(use_lidar=False)`. ## Quick start @@ -23,20 +46,30 @@ pip install -r training/requirements.txt # 2. Smoke test python -m training.parity_test -# 3. Reproduce the BC policy from scratch (~25 min on CPU) -python -m tools.collect_demos --teacher strombom --out training/demos.npz \ - --seeds-per-n 30 --subsample 3 -python -m training.bc_pretrain --demos training/demos.npz \ - --out training/runs/bc_flock --epochs 100 --net-arch 512,512 +# 3. Reproduce the BC policy (~10 min on CPU: ~5 min demos + ~3 min BC) +python -m tools.collect_demos --teacher strombom \ + --out training/demos_v3.npz --seeds-per-n 15 --subsample 3 --frame-stack 4 +python -m training.bc_pretrain --demos training/demos_v3.npz \ + --out training/runs/bc_v3 --epochs 60 --net-arch 512,512 -# 4. Evaluate -python -m training.eval --policy training/runs/bc_flock \ - --max-flock 10 --max-steps 30000 --n-seeds 5 +# 4. Optional: DAgger from inside Webots if sim-trained doesn't transfer +tools/auto_dagger.sh 3 60 +python -m tools.dagger_merge_train --out training/runs/bc_dagger -# 5. Run in Webots (any of the three modes; n is the flock size) -HERDING_POLICY_DIR=$PWD/training/runs/bc_flock tools/run_webots.sh 10 rl -tools/run_webots.sh 10 strombom -tools/run_webots.sh 10 sequential +# 5. Evaluate (env) +python -m training.eval --policy training/runs/bc_v3 \ + --max-flock 10 --max-steps 8000 --n-seeds 5 + +# 6. Optional RL fine-tune of the BC policy (~40 min on CPU, 1 M steps) +python -m training.train_ppo \ + --bc training/runs/bc_v3 \ + --out training/runs/rl_v1 \ + --total-timesteps 1000000 + +# 7. Run in Webots +tools/run_webots.sh 10 bc # behaviour-cloned MLP +tools/run_webots.sh 10 rl # KL-PPO fine-tune +tools/run_webots.sh 10 strombom # analytic baseline ``` ## Layout @@ -46,69 +79,71 @@ herding/ — single source of truth (env + Webots both import) geometry.py — field/pen constants, robot specs flocking_sim.py — Reynolds-style sheep dynamics diffdrive.py — differential-drive kinematics + control.py — shared near-sheep speed-modulation helper obs.py — 32-D order-invariant observation builder strombom.py — canonical CoM-drive teacher sequential.py — single-target "pin-and-push" teacher - hybrid.py — flock-then-funnel (experimental, did not scale) - drive_only.py — Strömbom drive without collect (experimental) - strombom_smooth.py — sigmoid-blended Strömbom (experimental) + active_scan.py — wraps a base teacher with opening rotation + + walk-to-centre + speed modulation + lidar_sim.py — fast 2D raycast for the env (sheep + walls + posts) + lidar_perception.py — scan → world-frame cluster centroids + filters + sheep_tracker.py — multi-target NN tracker with FOV memory controllers/ sheep/sheep.py — Webots sheep controller (uses herding.flocking_sim) shepherd_dog/ shepherd_dog.py — Webots dog controller, mode-switched - policy_loader.py — lazy SB3 PPO loader - strombom.py — backwards-compat shim + policy_loader.py — lazy SB3 policy loader (auto-detects frame stack) training/ - herding_env.py — Gymnasium env (used for demo collection + eval) - bc_pretrain.py — supervised BC of analytic teachers into MLP policy - collect_demos.py — wrapper, see tools/ - eval.py — RL / analytic comparison harness - parity_test.py — smoke tests - train_ppo.py — PPO/RL fine-tune (experimental, BC alone preferred) + herding_env.py — Gymnasium env (LiDAR + tracker by default) + bc_pretrain.py — supervised BC of (obs, action) demos into MLP + eval.py — analytic + BC policy comparison harness + parity_test.py — shape / determinism smoke test + runs/ — checkpoints (whitelisted in .gitignore) requirements.txt - configs/ppo_default.yaml tools/ - collect_demos.py — generate (obs, action) demonstrations - run_webots.sh — launch Webots with N sheep + chosen controller mode + collect_demos.py — sim demos via the active-scan teacher + dagger_merge_train.py — merge Webots-collected DAgger demos and retrain + run_webots.sh — launch Webots with N sheep + chosen mode + auto_dagger.sh — headless DAgger collection across many runs worlds/ field.wbt — main world (3 m gate, external pen) protos/ — Sheep / ShepherdDog robot definitions docs/project.md — original project goals -plan.md — design notes / decision log ``` -## Two cohesion regimes +## Shared low-level control -Sheep cohesion strength controls which teacher works: +Every dog mode (RL, Strömbom, Sequential, the DAgger teacher) routes +its action through `herding/control.py:modulate_speed_near_sheep`, +which scales action magnitude down when within ~2.5 m of the nearest +tracked sheep. This stops the dog from charging in at full speed and +scattering the flock. Direction (intent) is preserved. -| Regime | `flocking_sim.py` setting | Strömbom | Sequential | -|---|---|---:|---:| -| **Tight** (current) | `w=3.0/1.0`, `dist=12` | works (flock-style) | breaks (cohesion fights single-sheep targeting) | -| Loose | `w=1.5/0.6`, `dist=8` | breaks (flock fragments at gate) | works (1-by-1 style) | +All modes also share the same EMA action smoother in +`controllers/shepherd_dog/shepherd_dog.py:ACTION_SMOOTH = 0.55`. -The codebase ships with the **tight** regime. To use the loose-regime -Sequential clone, edit those constants in `herding/flocking_sim.py` and -load `training/runs/bc_solo/`. +## Webots results (steps to all-penned, fast mode) -## Results +Single seed per cell using `worlds/field.wbt` defaults. All modes hit +100 % pen rate; numbers shown are time-to-all-penned in simulation +steps (16 ms each). -Eval at `--max-steps 30000 --n-seeds 5`, deployment difficulty (full -field spawn distribution): - -| n | Strömbom | Sequential | BC-flock (RL) | +| n | Strömbom | `bc` | `rl` (KL-PPO of `bc`) | |---:|---:|---:|---:| -| 1 | 100 % | 100 % | 100 % | -| 5 | 100 % | 100 % | 80–100 % | -| 8 | 100 % | 100 % | 80 % | -| 10 | **100 %** | 80 % | **80 %** (mean_penned 8/10) | +| 3 | 5 800 | 9 800 | **4 800** | +| 5 | 10 200 | 9 200 | 9 800 | +| 8 | 14 000 | 17 600 | **15 400** | +| 10 | 18 600 | 19 600 | **12 000** | -The BC policy hits ~80 % of the analytic teacher's success rate in 100 % -neural-network inference, with no hand-coded logic. +The RL fine-tune is **39 % faster than `bc` on n=10** and **51 % faster +on n=3**, confirming the KL-anchored PPO actually finds reward-driven +improvements over the BC imitation baseline rather than just collapsing +back to it. ## License diff --git a/controllers/shepherd_dog/policy_loader.py b/controllers/shepherd_dog/policy_loader.py index fd3728a..a62dd2f 100644 --- a/controllers/shepherd_dog/policy_loader.py +++ b/controllers/shepherd_dog/policy_loader.py @@ -21,21 +21,47 @@ from pathlib import Path class PolicyHandle: """Wrap a loaded PPO policy + VecNormalize so the controller can call - ``predict(obs)`` without thinking about either.""" + ``predict(obs)`` without thinking about either. + + Frame stacking is auto-detected from the policy's expected obs dim: + if it's a multiple of the single-frame ``OBS_DIM``, the handle keeps + a deque of the last K frames and concatenates them on each predict. + """ def __init__(self, model, vecnorm): self.model = model self.vecnorm = vecnorm + # Lazy import to avoid forcing herding/* into the import path + # when SB3 isn't being used. + from herding.obs import OBS_DIM + policy_dim = int(model.observation_space.shape[0]) + if policy_dim % OBS_DIM == 0 and policy_dim // OBS_DIM >= 1: + self.frame_stack = policy_dim // OBS_DIM + else: + self.frame_stack = 1 + self._buffer: list = [] + self._single_dim = OBS_DIM def predict(self, obs): - # VecNormalize expects a batched obs of shape (n_envs, obs_dim). - if self.vecnorm is not None: - import numpy as np - obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1) - obs_b = self.vecnorm.normalize_obs(obs_b) + import numpy as np + single = np.asarray(obs, dtype=np.float32).reshape(-1) + if single.shape[0] != self._single_dim: + # Caller already passed a stacked obs — use as-is. + stacked = single + elif self.frame_stack > 1: + if not self._buffer: + self._buffer = [single.copy() for _ in range(self.frame_stack)] + else: + self._buffer.append(single) + if len(self._buffer) > self.frame_stack: + self._buffer = self._buffer[-self.frame_stack:] + stacked = np.concatenate(self._buffer, axis=0) else: - import numpy as np - obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1) + stacked = single + + obs_b = stacked.reshape(1, -1) + if self.vecnorm is not None: + obs_b = self.vecnorm.normalize_obs(obs_b) action, _ = self.model.predict(obs_b, deterministic=True) return action[0] diff --git a/controllers/shepherd_dog/shepherd_dog.py b/controllers/shepherd_dog/shepherd_dog.py index d84738e..05072ea 100644 --- a/controllers/shepherd_dog/shepherd_dog.py +++ b/controllers/shepherd_dog/shepherd_dog.py @@ -4,11 +4,42 @@ Mode is selected by ``HERDING_MODE`` (env var, or via the ``herding_runtime.cfg`` file the launcher writes since Webots strips env vars on some setups): - rl → load a BC-trained SB3 policy from HERDING_POLICY_DIR - and use its (vx, vy) action each step. strombom → canonical Strömbom collect/drive heuristic. sequential → single-target "pin and push" — drives the sheep closest to the pen. + bc → behaviour-cloned MLP, trained on Strömbom demos via + sim. Default policy directory: training/runs/bc_v3. + rl → KL-regularised PPO fine-tune of the BC policy. Same + obs/action space as bc; refines time-to-pen via + environment reward while staying anchored to bc. + Default policy directory: training/runs/rl_v1. + dagger → DAgger data collection. Reads sheep ground-truth + via the receiver, computes the active-scan teacher's + recommended action at every step, drives with either + the teacher (HERDING_DAGGER_DRIVER=teacher, default) + or the loaded student (=student), and logs each + (lidar_stacked_obs, teacher_action) pair. On exit + dumps to ``training/dagger/dagger_.npz`` for + ``tools.dagger_merge_train`` to consume. + +Sheep perception +---------------- +The dog now perceives sheep through its **front-mounted 140° LiDAR** +(``protos/ShepherdDog.proto``: 180 rays, 12 m max range). Each step +the controller: + + 1. Reads ``lidar.getRangeImage()``. + 2. Runs ``herding.lidar_perception.detections_from_scan`` to cluster + returns into world-frame ``(x, y)`` sheep estimates. + 3. Folds those into a ``herding.sheep_tracker.SheepTracker`` which + maintains last-seen positions for sheep currently out of the + FOV and latches "penned" once a track disappears near the gate. + +The output of step 3 is a ``{name: (x, y)}`` dict shaped exactly like +the receiver-based one we used to consume — so Strömbom, Sequential +and the BC obs builder run unchanged. The sheep→dog Emitter/Receiver +link is still up (kept passively for compatibility) but its messages +are *not* used for control. All modes share the same low-level differential-drive controller (``herding.diffdrive.velocity_to_wheels`` with cos(err)-clamped forward @@ -33,14 +64,19 @@ if _PROJECT_ROOT not in sys.path: from controller import Robot +from herding.active_scan import ActiveScanTeacher +from herding.control import modulate_speed_near_sheep from herding.diffdrive import velocity_to_wheels from herding.geometry import ( DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS, - PEN_ENTRY, + PEN_ENTRY, is_penned_position, ) -from herding.obs import build_obs +from herding.lidar_perception import detections_from_scan +from herding.obs import OBS_DIM, build_obs from herding.sequential import compute_action_debug as sequential_action_debug +from herding.sheep_tracker import SheepTracker +from herding.strombom import compute_action as strombom_action from herding.strombom import compute_action_debug as strombom_action_debug @@ -76,60 +112,82 @@ def _load_runtime_config(): _runtime_cfg = _load_runtime_config() MODE = (os.environ.get("HERDING_MODE") or _runtime_cfg.get("HERDING_MODE") - or "rl").lower() + or "bc").lower() -def _resolve_policy_dir() -> str: - """Where to look for the trained policy. +def _resolve_policy_dir(mode: str) -> str: + """Where to look for the trained policy for the given mode. Priority: 1. HERDING_POLICY_DIR env var or runtime-cfg entry, if it points to a real directory. - 2. ``training/runs/bc_flock`` — flock-style BC (current default; - requires the tight-cohesion sheep regime). - 3. ``training/runs/bc_solo`` — single-target BC (1-by-1 style; - only works if ``herding/flocking_sim.py`` is reverted to the - loose-cohesion regime). + 2. Mode-specific default: + bc → training/runs/bc_v3 (Strömbom-imitated MLP) + rl → training/runs/rl_v1 (KL-PPO fine-tune of bc_v3) + 3. Fall back to bc_v3. + All checkpoints are frame-stacked K = 4; ``policy_loader`` reads + the stacking factor from the policy's observation space. """ env_dir = (os.environ.get("HERDING_POLICY_DIR") or _runtime_cfg.get("HERDING_POLICY_DIR")) if env_dir and os.path.isdir(env_dir): return env_dir - candidates = [ - os.path.join(_PROJECT_ROOT, "training", "runs", "bc_flock"), - os.path.join(_PROJECT_ROOT, "training", "runs", "bc_solo"), - ] - for c in candidates: - if os.path.isdir(c): - return c - # Last resort — return env var anyway so error message is informative. - return env_dir or candidates[0] + mode_default = { + "bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"), + "rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl_v1"), + "dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"), + } + primary = mode_default.get(mode, mode_default["bc"]) + if os.path.isdir(primary): + return primary + # Fall back to BC if the requested checkpoint isn't there yet + # (e.g., user asked for `rl` before training the fine-tune). + fallback = mode_default["bc"] + if os.path.isdir(fallback): + return fallback + return env_dir or primary -_VALID_MODES = ("rl", "strombom", "sequential") +_VALID_MODES = ("bc", "rl", "strombom", "sequential", "dagger", "diag") +# Back-compat: an old config saying HERDING_MODE=rl meant "the BC policy". +# We now use `rl` strictly for the KL-PPO fine-tune. If the rl_v1 +# directory isn't present, _resolve_policy_dir below silently falls +# back to bc_v3, preserving the old behaviour. if MODE not in _VALID_MODES: print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.") MODE = "strombom" -POLICY_DIR = _resolve_policy_dir() +DAGGER_DRIVER = (os.environ.get("HERDING_DAGGER_DRIVER") + or _runtime_cfg.get("HERDING_DAGGER_DRIVER") + or "teacher").lower() +if DAGGER_DRIVER not in ("teacher", "student"): + DAGGER_DRIVER = "teacher" + +POLICY_DIR = _resolve_policy_dir(MODE) policy_handle = None -if MODE == "rl": +if MODE in ("bc", "rl", "dagger"): print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists={os.path.isdir(POLICY_DIR)}") try: from policy_loader import load as _load_policy policy_handle = _load_policy(POLICY_DIR) - print(f"[dog] RL policy loaded from {POLICY_DIR}") + print(f"[dog] policy loaded from {POLICY_DIR}") except Exception as exc: - print(f"[dog] RL policy load failed ({exc!r}); falling back to strombom.") - MODE = "strombom" -print(f"[dog] running in mode={MODE}") + if MODE in ("bc", "rl"): + print(f"[dog] policy load failed ({exc!r}); falling back to strombom.") + MODE = "strombom" + else: + # In dagger mode, no policy is fine if driver=teacher. + print(f"[dog] policy load failed ({exc!r}); dagger driver forced to teacher.") + policy_handle = None +print(f"[dog] running in mode={MODE}" + + (f" driver={DAGGER_DRIVER}" if MODE == "dagger" else "")) # --------------------------------------------------------------------------- # Action smoothing + safety supervisor # --------------------------------------------------------------------------- -ACTION_SMOOTH = 0.35 +ACTION_SMOOTH = 0.55 # was 0.35; bumped for less frame-to-frame action jitter prev_action = (0.0, 0.0) @@ -185,6 +243,12 @@ gps = robot.getDevice("gps"); gps.enable(timestep) compass = robot.getDevice("compass"); compass.enable(timestep) receiver = robot.getDevice("receiver"); receiver.enable(timestep) emitter = robot.getDevice("emitter") +lidar = robot.getDevice("lidar"); lidar.enable(timestep) + +# The receiver channel from sheep is no longer consumed for perception +# (kept enabled in case any peripheral tooling reads it). Sheep +# positions come exclusively from the LiDAR + tracker pipeline below. +tracker = SheepTracker() # Cosmetic ear motors — ignored by control logic but keep them animated. left_ear = robot.getDevice("left ear motor") @@ -202,53 +266,197 @@ EAR_RATE = 8.0 # Main loop # --------------------------------------------------------------------------- -# {name: (x, y)} — kept across all sheep ever heard from. Sheep that drift -# into the pen are tracked by ``penned`` so observations and Strömbom -# agree on which ones still need herding. -sheep_positions: dict = {} -penned_set: set = set() +# Active sheep positions come from the LiDAR-fed tracker each step; +# penned_set is the tracker's ``get_penned_set()`` call. We drain the +# receiver queue without consuming it, so the small backlog of sheep +# pings can't grow unbounded. step_count = 0 -from herding.geometry import is_penned_position +import atexit +import time +import numpy as _np + +# DAgger state ---------------------------------------------------------- +# Logged each step in dagger mode: (stacked_lidar_obs, teacher_action). +DAGGER_LOG_OBS: list = [] +DAGGER_LOG_ACT: list = [] +# Diagnostic mode buffer (one dict per step). +DIAG_BUF: list = [] +# Frame stack buffer the controller maintains itself when dagger mode is +# active — the stacked obs we log must match what the policy sees so the +# downstream BC consumes (stacked_obs, teacher_action) pairs cleanly. +_FRAME_STACK = (policy_handle.frame_stack if policy_handle is not None else 4) +_dagger_buffer: list = [] +# Active-scan teacher operates on GT (read from receiver). +_dagger_teacher = ActiveScanTeacher(strombom_action) if MODE == "dagger" else None +# GT positions accumulated from the receiver (sheep emit their xy each step). +_gt_sheep: dict = {} + + +_DAGGER_RUN_TS = int(time.time()) # one file per controller run +_DAGGER_DUMPED = False +# Sentinel that the auto-collection script polls — empty file written +# when this controller decides the run is "done" (all sheep penned, by +# GT). The launcher then kills Webots and moves on without waiting out +# its timeout. Honoured only in dagger mode. +_DAGGER_DONE_FILE = os.path.join(_PROJECT_ROOT, "training", "dagger", ".DONE") + + +def _dump_dagger_log(): + """Save accumulated (obs, teacher_action) pairs to disk on exit. + + Webots may SIGKILL the controller, so the loop also calls this every + DAGGER_FLUSH_STEPS so we lose at most a few seconds of data per run. + Idempotent — repeated calls overwrite the same file with the latest + accumulated buffer. + """ + global _DAGGER_DUMPED + if MODE != "dagger" or not DAGGER_LOG_OBS: + return + out_dir = os.path.join(_PROJECT_ROOT, "training", "dagger") + os.makedirs(out_dir, exist_ok=True) + out_path = os.path.join(out_dir, f"dagger_{_DAGGER_RUN_TS}.npz") + obs_arr = _np.stack(DAGGER_LOG_OBS).astype(_np.float32) + act_arr = _np.stack(DAGGER_LOG_ACT).astype(_np.float32) + _np.savez(out_path, obs=obs_arr, actions=act_arr) + if not _DAGGER_DUMPED: + print(f"[dog dagger] wrote {len(DAGGER_LOG_OBS)} pairs → {out_path}") + _DAGGER_DUMPED = True + + +DAGGER_FLUSH_STEPS = 500 + + +atexit.register(_dump_dagger_log) + while robot.step(timestep) != -1: step_count += 1 + # Drain receiver. In every mode we capture GT for the diagnostic + # log line — perception still comes from LiDAR, the GT is read-only. while receiver.getQueueLength() > 0: msg = receiver.getString() receiver.nextPacket() parts = msg.split(":") if len(parts) == 4 and parts[0] == "sheep": try: - x, y = float(parts[2]), float(parts[3]) + _gt_sheep[parts[1]] = (float(parts[2]), float(parts[3])) except ValueError: - continue - sheep_positions[parts[1]] = (x, y) - if parts[1] not in penned_set and is_penned_position(x, y): - penned_set.add(parts[1]) + pass pos = gps.getValues() dog_xy = (pos[0], pos[1]) n = compass.getValues() dog_heading = math.atan2(n[0], n[1]) - # ---- Action selection ---- - if MODE == "rl" and policy_handle is not None: - sheep_xy_list = list(sheep_positions.values()) - sheep_names = list(sheep_positions.keys()) - sheep_penned_list = [s in penned_set for s in sheep_names] - obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list) - action = policy_handle.predict(obs) - vx, vy = float(action[0]), float(action[1]) - elif MODE == "sequential": - vx, vy, _mode_str, _dbg = sequential_action_debug( - dog_xy, sheep_positions, PEN_ENTRY, - ) + # ---- LiDAR perception → tracker → sheep_positions dict ---- + ranges = _np.asarray(lidar.getRangeImage(), dtype=_np.float32) + detections = detections_from_scan(ranges, dog_xy[0], dog_xy[1], dog_heading) + sheep_positions = tracker.update(detections) + penned_set = tracker.get_penned_set() + + # ---- Diagnostic mode: dump the first DIAG_STEPS scans + GT to disk. + if MODE == "diag": + DIAG_STEPS = 80 + if step_count <= DIAG_STEPS: + DIAG_BUF.append(dict( + step=step_count, + ranges=ranges.copy(), + dog_x=dog_xy[0], dog_y=dog_xy[1], dog_h=dog_heading, + gt_sheep=dict(_gt_sheep), + detections=list(detections), + )) + if step_count == DIAG_STEPS: + _diag_path = os.path.join(_PROJECT_ROOT, "training", "dagger", + f"diag_{int(time.time())}.npz") + os.makedirs(os.path.dirname(_diag_path), exist_ok=True) + _np.savez( + _diag_path, + ranges=_np.stack([d["ranges"] for d in DIAG_BUF]), + dog_xy=_np.array([[d["dog_x"], d["dog_y"]] for d in DIAG_BUF], + dtype=_np.float32), + dog_h=_np.array([d["dog_h"] for d in DIAG_BUF], dtype=_np.float32), + # Per-step GT serialised: max-pad to 10 sheep. + gt_xy=_np.array([ + [list(d["gt_sheep"].get(f"sheep{i}", (1e9, 1e9))) + for i in range(1, 11)] + for d in DIAG_BUF + ], dtype=_np.float32), + detections=_np.array([ + len(d["detections"]) for d in DIAG_BUF + ], dtype=_np.int32), + ) + print(f"[dog diag] wrote {DIAG_STEPS} scans → {_diag_path}") + + # Build the single-frame LiDAR obs (matches what the env produces). + sheep_xy_list = list(sheep_positions.values()) + sheep_penned_list = [False] * len(sheep_xy_list) + single_obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list) + # Maintain our own frame stack so logged obs == what policy sees. + if not _dagger_buffer: + _dagger_buffer = [single_obs.copy() for _ in range(_FRAME_STACK)] else: - # Strömbom (canonical baseline). - vx, vy, _mode_str, _dbg = strombom_action_debug( - dog_xy, sheep_positions, PEN_ENTRY, + _dagger_buffer.append(single_obs) + if len(_dagger_buffer) > _FRAME_STACK: + _dagger_buffer = _dagger_buffer[-_FRAME_STACK:] + stacked_obs = _np.concatenate(_dagger_buffer, axis=0).astype(_np.float32) + + # ---- Action selection ---- + if MODE == "diag": + # Diagnostic mode: rotate in place so the captured scans cover + # all 360° of view from one position. Target = heading + π → + # cos(err) clamps forward to ~0, the dog spins. + _t = dog_heading + math.pi + vx, vy = math.cos(_t), math.sin(_t) + elif MODE == "dagger": + # Teacher: active-scan + Strömbom on GT (active sheep only). + gt_active = {name: xy for name, xy in _gt_sheep.items() + if not is_penned_position(xy[0], xy[1])} + t_vx, t_vy, _mode_str = _dagger_teacher( + dog_xy, dog_heading, gt_active, PEN_ENTRY, ) + # Student (if a policy is loaded). + s_vx, s_vy = None, None + if policy_handle is not None: + action = policy_handle.predict(stacked_obs) + s_vx, s_vy = float(action[0]), float(action[1]) + # Drive selection. + if DAGGER_DRIVER == "student" and policy_handle is not None: + vx, vy = s_vx, s_vy + else: + vx, vy = t_vx, t_vy + # Always log the teacher action (this is the supervision signal). + DAGGER_LOG_OBS.append(stacked_obs.copy()) + DAGGER_LOG_ACT.append(_np.array([t_vx, t_vy], dtype=_np.float32)) + elif MODE in ("bc", "rl") and policy_handle is not None: + # Pass the single-frame obs; the policy_loader maintains its own + # frame stack internally. Both bc and rl use the same control + # interface — the only difference is which checkpoint loaded. + action = policy_handle.predict(single_obs) + vx, vy = float(action[0]), float(action[1]) + elif MODE in ("strombom", "sequential"): + # Wrap the analytic teacher in ActiveScanTeacher so the dog + # rotates / walks-to-centre when the tracker briefly empties, + # instead of going idle. Without this wrapper, the first 2 s + # of LiDAR-blind operation kills the run because Strömbom and + # Sequential both return (0, 0) when there are no positions. + if "_analytic_teacher" not in globals(): + from herding.sequential import compute_action as sequential_action + _analytic_teacher = ActiveScanTeacher( + strombom_action if MODE == "strombom" else sequential_action + ) + vx, vy, _mode_str = _analytic_teacher( + dog_xy, dog_heading, sheep_positions, PEN_ENTRY, + ) + + # Shared post-process: speed modulation near sheep. Applies to bc, + # rl, strombom, sequential — every mode where the action source is + # nominally unit-magnitude. In dagger mode the active-scan teacher + # has already modulated, and the diag mode action is hand-built for + # rotation; both skip. + if MODE not in ("dagger", "diag"): + vx, vy = modulate_speed_near_sheep(vx, vy, dog_xy, sheep_positions) # EMA smoothing — reduces oscillation from policy or Strömbom flips. vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx @@ -269,7 +477,31 @@ while robot.step(timestep) != -1: left_ear.setPosition(ear_pos) right_ear.setPosition(-ear_pos) + # --- DAgger: early-stop when all GT sheep are penned --- + if MODE == "dagger" and _gt_sheep: + gt_active_count = sum(1 for x, y in _gt_sheep.values() + if not is_penned_position(x, y)) + if gt_active_count == 0 and not os.path.exists(_DAGGER_DONE_FILE): + _dump_dagger_log() + open(_DAGGER_DONE_FILE, "w").close() + print(f"[dog dagger] all {len(_gt_sheep)} sheep penned — " + f"wrote {_DAGGER_DONE_FILE}, exiting early") + + if MODE == "dagger" and step_count % DAGGER_FLUSH_STEPS == 0 and DAGGER_LOG_OBS: + _dump_dagger_log() + if step_count % 200 == 0: - n_active = sum(1 for s in sheep_positions if s not in penned_set) - print(f"[dog mode={MODE}] step={step_count} known={len(sheep_positions)} " - f"penned={len(penned_set)} active={n_active} action=({vx:+.2f}, {vy:+.2f})") + gt_penned = sum(1 for x, y in _gt_sheep.values() + if is_penned_position(x, y)) + gt_total = len(_gt_sheep) + extra = "" + if MODE == "dagger": + extra = f" logged={len(DAGGER_LOG_OBS)}" + print(f"[dog mode={MODE}] step={step_count} " + f"GT_penned={gt_penned}/{gt_total} " + f"tracks_active={tracker.n_active()} " + f"tracks_penned={tracker.n_penned()} " + f"detections={len(detections)} action=({vx:+.2f}, {vy:+.2f}){extra}") + +# Loop ended (Webots told us to quit). Flush any remaining DAgger log. +_dump_dagger_log() diff --git a/controllers/shepherd_dog/strombom.py b/controllers/shepherd_dog/strombom.py deleted file mode 100644 index a89d8c8..0000000 --- a/controllers/shepherd_dog/strombom.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Backwards-compat shim — Strömbom logic now lives in ``herding.strombom``.""" - -import os -import sys - -_HERE = os.path.dirname(os.path.abspath(__file__)) -_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", "..")) -if _PROJECT_ROOT not in sys.path: - sys.path.insert(0, _PROJECT_ROOT) - -from herding.strombom import ( # noqa: F401 - F_FACTOR, DELTA_COLLECT, DELTA_DRIVE, - compute_action, compute_action_debug, -) -from herding.geometry import ( # noqa: F401 - PEN_X, PEN_Y, PEN_CENTER, PEN_ENTRY, - in_pen, -) diff --git a/herding/active_scan.py b/herding/active_scan.py new file mode 100644 index 0000000..68b24be --- /dev/null +++ b/herding/active_scan.py @@ -0,0 +1,132 @@ +"""Active-perception wrapper for the analytic shepherding teachers. + +Under LiDAR (partial observability), the tracker starts empty — the +dog hasn't seen any sheep yet. A naive Strömbom call returns +``(0, 0, "idle")`` and the dog stops. The student then learns "do +nothing when the tracker is empty," which is a fatal local optimum. + +This wrapper replaces the idle case with a **scan action**: a unit +vector 90° CCW from the dog's current forward direction. Passed +through ``velocity_to_wheels`` it produces a fast in-place rotation +(``cos(err)`` clamp drives forward speed to ~0 because the target is +orthogonal to the heading). The dog spins for the first +``initial_scan_steps`` steps of every episode regardless of tracker +state, and re-enters scan whenever the tracker goes empty mid-episode. + +Once enough sheep are tracked, control hands over to the underlying +analytic teacher (Strömbom or Sequential), which now operates on a +populated tracker dict. Both teacher and student see the same +LiDAR-perceived view — there's no information asymmetry, so the +student can in principle achieve the teacher's full performance. +""" + +from __future__ import annotations + +import math + +from herding.control import modulate_speed_near_sheep + + +INITIAL_SCAN_STEPS = 80 # ≈1.3 s at dt=16 ms — full rotation at the +π turn target. +EXPLORE_SPEED = 0.7 # m/s-ish unit (action norm) used when walking blind + +# Debounce on tracker emptiness — a single empty frame between +# detections is not enough reason to abandon the drive and start +# scanning. Require this many consecutive empty frames first. +EMPTY_DEBOUNCE_STEPS = 8 + + +class ActiveScanTeacher: + """Stateful wrapper. Construct one per episode; call ``reset()`` + between episodes if reusing the instance. + + Call signature:: + + vx, vy, mode = teacher(dog_xy, dog_heading, sheep_positions, pen_target) + + Note the extra ``dog_heading`` arg — required to compute the + rotation direction. The base teachers (Strömbom, Sequential) + don't use heading; we strip it before passing them through. + """ + + def __init__(self, base_action_fn, initial_scan_steps: int = INITIAL_SCAN_STEPS): + self.base = base_action_fn + self.initial_scan = int(initial_scan_steps) + self.reset() + + def reset(self) -> None: + self.step = 0 + self.empty_streak = 0 + self.last_action: tuple[float, float] = (0.0, 0.0) + + @staticmethod + def _scan_action(dog_heading: float) -> tuple[float, float]: + # Target = current_heading + π. velocity_to_wheels gets err=π, + # so turn = k_turn·π = 4π ≈ 12.6 rad/s wheel angular vel and + # cos(err) clamps the forward speed to ~0. Maximum in-place + # rotation under this controller; one full rotation in ~60 steps. + target = dog_heading + math.pi + return math.cos(target), math.sin(target) + + @staticmethod + def _explore_action(dog_xy) -> tuple[float, float]: + """Walk back toward the field centre when nothing is in view. + + At difficulty=1 sheep can spawn up to ~18 m from origin while + the LiDAR has a 12 m range, so an in-place scan from a corner + can return zero hits. Walking toward (0, 0) shrinks the + max-distance-to-any-sheep and the scanner cone sweeps along + the path, eventually picking sheep up. + """ + dx, dy = -dog_xy[0], -dog_xy[1] + d = math.hypot(dx, dy) + if d < 0.5: + # At the centre — fall through to a scan instead. + return 0.0, 0.0 + return EXPLORE_SPEED * dx / d, EXPLORE_SPEED * dy / d + + def __call__(self, dog_xy, dog_heading, sheep_positions, pen_target): + self.step += 1 + n_visible = len(sheep_positions) + + # Track empty-streak for the explore debounce. + if n_visible == 0: + self.empty_streak += 1 + else: + self.empty_streak = 0 + + # Phase 1: opening rotation, regardless of tracker state. + if self.step <= self.initial_scan: + vx, vy = self._scan_action(dog_heading) + self.last_action = (vx, vy) + return vx, vy, "scan_initial" + + # Phase 2: tracker has been empty for a while — walk back to the + # centre while the LiDAR keeps sweeping. The debounce prevents + # this from firing every time the tracker briefly blinks to zero + # (which causes the "dog starts going away from sheep" symptom). + if self.empty_streak >= EMPTY_DEBOUNCE_STEPS: + ex, ey = self._explore_action(dog_xy) + if ex == 0.0 and ey == 0.0: + vx, vy = self._scan_action(dog_heading) + mode = "scan_at_centre" + else: + vx, vy = ex, ey + mode = "explore" + self.last_action = (vx, vy) + return vx, vy, mode + + # Phase 2b: tracker just blinked empty for tuple[float, float]: + """Scale (vx, vy) magnitude down when close to the nearest sheep. + + ``sheep_positions`` accepts either a ``{name: (x, y)}`` dict + (matching what the trackers emit) or an iterable of ``(x, y)`` + tuples. Empty input → action returned unchanged. + + The intent direction is preserved; only magnitude is reduced. With + ``slow_dist=2.5`` and ``min_scale=0.3``, an action that started at + norm 1 is multiplied by 0.3 right next to a sheep, by 0.65 at 1 m + away, and by 1.0 once the nearest sheep is ≥ 2.5 m off. + """ + if not sheep_positions: + return vx, vy + if hasattr(sheep_positions, "values"): + positions = sheep_positions.values() + else: + positions = sheep_positions + nearest = float("inf") + for sx, sy in positions: + d = math.hypot(sx - dog_xy[0], sy - dog_xy[1]) + if d < nearest: + nearest = d + if nearest >= slow_dist or nearest == float("inf"): + return vx, vy + scale = min_scale + (1.0 - min_scale) * (nearest / slow_dist) + return vx * scale, vy * scale diff --git a/herding/flocking_sim.py b/herding/flocking_sim.py index d254d4b..b0d22aa 100644 --- a/herding/flocking_sim.py +++ b/herding/flocking_sim.py @@ -1,25 +1,51 @@ -"""Reynolds-style sheep flocking dynamics. +"""Sheep flocking dynamics — Strömbom 2014 / Reynolds 1987 hybrid. This is the per-sheep behavioural step used both by the Webots sheep controller (scalar, one sheep at a time) and by the training environment -(loop over sheep). The numerics are adapted from the original -``controllers/sheep/flocking.py`` and retuned for the new external-pen -layout: the south stone wall is intact except in the gate column, so -sheep can only reach the pen by walking through that 3-m corridor. +(loop over sheep). + +Model +----- +The force stack each step (summed → heading + speed): -Force stack each step (summed → heading + speed): flee — quadratic ramp away from dog within FLEE_DIST - cohesion — drift toward flock centre, halved while fleeing - separation — inverse-distance push from peers - walls — soft repulsion + hard escape band against field walls, - except inside the gate column where the south wall is - absent + (Strömbom 2014 §2.1, term ρa) + cohesion — drift toward local centre of mass of peers within + COHESION_DIST (Strömbom 2014 §2.1, term c). + Weight is **higher when fleeing** — modelling the + "safety in numbers" / predator-confusion effect + Strömbom 2014 describes as fear-induced cohesion. + separation — short-range inverse-distance repulsion from peers + (Strömbom 2014 §2.1, term α; Reynolds 1987) wander — small persistent drift for natural idle motion + (Strömbom 2014 §2.1, noise term ε) -A sheep latches to ``penned`` the first time it crosses the gate plane -into the gate column (handled by callers via ``geometry.is_penned_position``); -once latched, ``penned=True`` is passed in here and the force stack -switches to in-pen containment + jitter. +References +---------- +- Strömbom et al. (2014). "Solving the shepherding problem: heuristics + for herding autonomous, interacting agents." J R Soc Interface 11. +- Reynolds (1987). "Flocks, herds and schools: A distributed + behavioural model." SIGGRAPH '87. + +Environment-specific adaptations +-------------------------------- +The original Strömbom model assumes an open field. Our scenario adds: + +* Field walls — soft repulsion within ``WALL_MARGIN`` plus a hard + escape band when inside ``WALL_HARD_MARGIN``. Necessary because the + Webots field is fenced (30 m square enclosure). +* Gate column — the south wall has a 3 m gap at x ∈ [10, 13]; sheep + pass through it freely (no wall force inside the column). +* Penned containment — once a sheep crosses the gate plane south + (``geometry.is_penned_position``), the caller flags ``penned=True`` + and we switch to in-pen wall-bounce + jitter. Sheep do not exit the + pen on their own. This is a hard sim constraint, not a behavioural + claim about real sheep. + +Parameter tuning (cohesion weight 3× while fleeing) was chosen so the +flock survives passage through the 3 m gate without fragmenting — this +is a defensible engineering adaptation of Strömbom's qualitative +"fear-induced cohesion" to our gate width. """ import math diff --git a/herding/lidar_perception.py b/herding/lidar_perception.py new file mode 100644 index 0000000..eceb337 --- /dev/null +++ b/herding/lidar_perception.py @@ -0,0 +1,144 @@ +"""Cluster a 2D LiDAR scan into world-frame sheep position estimates. + +Pipeline: + ranges (N,) ─► hit mask ─► world-frame points + │ + ▼ + adjacency clustering (gap > GAP_THRESHOLD + starts a new cluster, walking rays in + angular order) + │ + ▼ + centroid + span filter + │ + ▼ + field/pen-corridor filter + │ + ▼ + list of (x, y) detections + +The clusterer is intentionally simple — for ≤10 sheep there is rarely +any real ambiguity, and proper DBSCAN would only matter if rays from +two adjacent sheep merged. The downstream tracker handles association +across frames. +""" + +from __future__ import annotations + +import math + +import numpy as np + +from herding.geometry import FIELD_X, FIELD_Y, GATE_Y, PEN_X, PEN_Y +from herding.lidar_sim import ( + LIDAR_FOV, LIDAR_MAX_RANGE, LIDAR_N_RAYS, SHEEP_RADIUS, ray_angles, +) + + +GAP_THRESHOLD = 0.6 # m — adjacent ray-points farther apart start new cluster +MAX_CLUSTER_SPAN = 1.5 # m — clusters wider than this are likely walls/structures +RANGE_HIT_EPS = 0.05 # m — hit if range < max_range - eps +WALL_REJECT = 0.5 # m — drop detections this close to a known wall line + +# Known sheep-sized static features. Detections within STATIC_REJECT +# of any of these are discarded — these aren't sheep. Mid-pillars on +# the field walls are NOT in this list because they're embedded in the +# wall (the wall's span filter handles them); listing them here would +# only reject real sheep that happened to be near the wall. +_STATIC_FEATURES = ( + # Gate posts (sheep-sized boxes flanking the south-wall opening) + ( 10.0, -15.0), ( 13.0, -15.0), + # Field corner pillars + ( 15.0, 15.0), ( 15.0, -15.0), (-15.0, 15.0), (-15.0, -15.0), +) +STATIC_REJECT = 0.8 # m — detection within this of a static feature → drop + + +def detections_from_scan( + ranges: np.ndarray, + dog_x: float, dog_y: float, dog_heading: float, + max_range: float = LIDAR_MAX_RANGE, +) -> list[tuple[float, float]]: + """Return list of (x, y) world-frame sheep position estimates.""" + ranges = np.asarray(ranges, dtype=np.float32) + n_rays = ranges.shape[0] + if n_rays == 0: + return [] + angles = ray_angles(n_rays, LIDAR_FOV) + hit = ranges < max_range - RANGE_HIT_EPS + + world_a = dog_heading + angles + px = dog_x + ranges * np.cos(world_a) + py = dog_y + ranges * np.sin(world_a) + + clusters: list[list[tuple[float, float]]] = [] + current: list[tuple[float, float]] = [] + prev: tuple[float, float] | None = None + for i in range(n_rays): + if not bool(hit[i]): + if current: + clusters.append(current) + current = [] + prev = None + continue + pt = (float(px[i]), float(py[i])) + if prev is not None and math.hypot(pt[0] - prev[0], pt[1] - prev[1]) > GAP_THRESHOLD: + clusters.append(current) + current = [] + current.append(pt) + prev = pt + if current: + clusters.append(current) + + detections: list[tuple[float, float]] = [] + for cluster in clusters: + xs = [p[0] for p in cluster] + ys = [p[1] for p in cluster] + cx, cy = sum(xs) / len(xs), sum(ys) / len(ys) + span = math.hypot(max(xs) - min(xs), max(ys) - min(ys)) + if span > MAX_CLUSTER_SPAN: + continue + # Surface-to-centre correction: rays hit the front of the sheep, + # so the cluster centroid is biased toward the dog by SHEEP_RADIUS. + # Push it outward along the dog→cluster direction. + dx, dy = cx - dog_x, cy - dog_y + d = math.hypot(dx, dy) + if d > 1e-3: + cx += SHEEP_RADIUS * dx / d + cy += SHEEP_RADIUS * dy / d + # Keep detections inside the field OR in the gate corridor / + # external pen — penned sheep are still worth tracking so the + # tracker can latch them as "penned" rather than spawn fresh + # tracks each scan. + # Accept detections inside the field, plus a narrow strip + # immediately south of the gate to catch sheep mid-crossing + # (so they get marked penned via is_penned_position before the + # track goes stale). Detections deeper into the pen are + # dropped entirely — Webots's pen posts and rails would + # otherwise produce a torrent of phantom penned tracks that + # the tracker can't keep up with. + in_main = (FIELD_X[0] - 0.2 < cx < FIELD_X[1] + 0.2 and + FIELD_Y[0] - 0.2 < cy < FIELD_Y[1] + 0.2) + in_gate_strip = (PEN_X[0] - 0.2 < cx < PEN_X[1] + 0.2 and + GATE_Y - 1.0 < cy < GATE_Y + 0.2) + if not (in_main or in_gate_strip): + continue + # Known-static-feature filter: gate posts and corner pillars + # show up as sheep-sized clusters but are never sheep. + if any(math.hypot(cx - fx, cy - fy) < STATIC_REJECT + for fx, fy in _STATIC_FEATURES): + continue + # Wall-proximity filter: at oblique scan angles, walls produce + # multiple short clusters because adjacent ray returns are + # spaced just above GAP_THRESHOLD. Sheep can't get within ~0.3 m + # of a wall (the env clips them to FIELD_INSIDE), so anything + # right at the wall line is structure noise. + near_field_wall = ( + cx > FIELD_X[1] - WALL_REJECT or cx < FIELD_X[0] + WALL_REJECT or + cy > FIELD_Y[1] - WALL_REJECT or + (cy < FIELD_Y[0] + WALL_REJECT and not (PEN_X[0] <= cx <= PEN_X[1])) + ) + if near_field_wall: + continue + detections.append((cx, cy)) + return detections diff --git a/herding/lidar_sim.py b/herding/lidar_sim.py new file mode 100644 index 0000000..f2d6470 --- /dev/null +++ b/herding/lidar_sim.py @@ -0,0 +1,193 @@ +"""Fast 2D LiDAR simulator for the Gymnasium env. + +Raycasts against: + * **Sheep** — discs of radius ``SHEEP_RADIUS``. + * **Static world geometry** — axis-aligned wall segments and gate + posts taken from ``worlds/field.wbt``. Without these, demos + collected in-env would never include the false-positive clusters + Webots produces from the stone walls and gate-post boxes, and the + BC student trained on those demos collapses on deployment. + +Returns a range array matching the Webots Lidar device on the dog +(see ``protos/ShepherdDog.proto``: 180 rays, 140° FOV centred on +forward, 12 m max range, 5 mm noise). +""" + +from __future__ import annotations + +import math + +import numpy as np + + +# Match protos/ShepherdDog.proto Lidar device. +LIDAR_N_RAYS = 180 +LIDAR_FOV = 2.44 # rad ≈ 140° +LIDAR_MAX_RANGE = 12.0 +LIDAR_NOISE = 0.005 # m, gaussian std + +# Sheep modelled as a vertical cylinder; this is the horizontal-section +# radius the LiDAR plane intersects. Tuned to the proto sheep (~0.45 m +# body length). The exact value is not load-bearing — the perception +# clusterer is range-tolerant. +SHEEP_RADIUS = 0.30 + + +# --------------------------------------------------------------------------- +# Static world geometry — must match worlds/field.wbt +# --------------------------------------------------------------------------- + +# Vertical walls: (x, y_min, y_max). Field east/west walls and the two +# pen side walls are visible through the open gate. +_VERTICAL_WALLS = ( + ( 15.0, -15.0, 15.0), # field east + (-15.0, -15.0, 15.0), # field west + ( 10.0, -22.0, -15.0), # pen west + ( 13.0, -22.0, -15.0), # pen east +) + +# Horizontal walls: (y, x_min, x_max). South wall is split by the 3 m +# gate at x ∈ [10, 13]; the pen south wall closes the back of the pen. +_HORIZONTAL_WALLS = ( + ( 15.0, -15.0, 15.0), # field north + (-15.0, -15.0, 10.0), # field south-west of gate + (-15.0, 13.0, 15.0), # field south-east of gate + (-22.0, 10.0, 13.0), # pen south +) + +# Gate posts and field corner pillars treated as vertical cylinders at +# LiDAR height. Radius 0.25 m comes from the 0.44 × 0.44 m boxes in the +# wbt — close enough to a circular cross-section for this purpose. +_POSTS_XY = np.array([ + ( 10.0, -15.0), # west gate post + ( 13.0, -15.0), # east gate post + ( 15.0, 15.0), # NE field corner + ( 15.0, -15.0), # SE field corner + (-15.0, 15.0), # NW field corner + (-15.0, -15.0), # SW field corner +], dtype=np.float64) +POST_RADIUS = 0.25 + + +def ray_angles(n: int = LIDAR_N_RAYS, fov: float = LIDAR_FOV) -> np.ndarray: + """Local-frame ray angles, sweeping from +fov/2 to -fov/2. + + Convention: angle is measured CCW from the dog's forward axis. Ray 0 + points to the dog's left, last ray to the right. Webots' default + Lidar sweep matches this. + """ + return np.linspace(fov / 2.0, -fov / 2.0, n, dtype=np.float64) + + +# Cached so we don't rebuild every step. +_ANGLES = ray_angles() +_COS = np.cos(_ANGLES) +_SIN = np.sin(_ANGLES) + + +def _raycast_static( + ox: float, oy: float, cos_w: np.ndarray, sin_w: np.ndarray, +) -> np.ndarray: + """Per-ray distance to nearest wall or post hit (∞ if none). + + Walls are axis-aligned line segments; for each ray we compute t at + which it crosses the wall's constant-coord plane and check the + other coord lies in the segment. Posts are circles; same disc + intersection as for sheep. + """ + n_rays = cos_w.shape[0] + best = np.full(n_rays, np.inf, dtype=np.float64) + + EPS = 1e-3 + safe_cos = np.where(np.abs(cos_w) < 1e-9, 1e-9, cos_w) + safe_sin = np.where(np.abs(sin_w) < 1e-9, 1e-9, sin_w) + + # Vertical walls (x = const) + for wx, ymin, ymax in _VERTICAL_WALLS: + t = (wx - ox) / safe_cos + y_at = oy + t * sin_w + valid = (t > EPS) & (y_at >= ymin - EPS) & (y_at <= ymax + EPS) + cand = np.where(valid, t, np.inf) + np.minimum(best, cand, out=best) + + # Horizontal walls (y = const) + for wy, xmin, xmax in _HORIZONTAL_WALLS: + t = (wy - oy) / safe_sin + x_at = ox + t * cos_w + valid = (t > EPS) & (x_at >= xmin - EPS) & (x_at <= xmax + EPS) + cand = np.where(valid, t, np.inf) + np.minimum(best, cand, out=best) + + # Posts (treat as discs) + if _POSTS_XY.size: + px = _POSTS_XY[:, 0] - ox + py = _POSTS_XY[:, 1] - oy + t_post = np.outer(px, cos_w) + np.outer(py, sin_w) # (P, N) + d2 = (px ** 2 + py ** 2)[:, None] # (P, 1) + perp2 = d2 - t_post ** 2 + R2 = POST_RADIUS ** 2 + hit = (perp2 < R2) & (t_post > 0.0) + half = np.sqrt(np.clip(R2 - perp2, 0.0, None)) + cand = np.where(hit, t_post - half, np.inf) + nearest = cand.min(axis=0) + np.minimum(best, nearest, out=best) + + return best + + +def simulate_scan( + dog_x: float, dog_y: float, dog_heading: float, + sheep_xy: list[tuple[float, float]], + noise: float = LIDAR_NOISE, + max_range: float = LIDAR_MAX_RANGE, + rng: np.random.Generator | None = None, +) -> np.ndarray: + """Return a (N,) float32 range array. No-hit entries equal ``max_range``. + + ``sheep_xy`` is the list of (x, y) world positions of every sheep in + the scene (penned and active). Static world geometry (walls and + posts) is also raycast so demos contain the same false-positive + clusters Webots produces. + """ + n_rays = _ANGLES.shape[0] + + ch, sh = math.cos(dog_heading), math.sin(dog_heading) + cos_w = ch * _COS - sh * _SIN + sin_w = sh * _COS + ch * _SIN + + # Walls + posts + best = _raycast_static(dog_x, dog_y, cos_w, sin_w) + + # Sheep discs + if sheep_xy: + sx = np.asarray([p[0] for p in sheep_xy], dtype=np.float64) - dog_x + sy = np.asarray([p[1] for p in sheep_xy], dtype=np.float64) - dog_y + t = np.outer(sx, cos_w) + np.outer(sy, sin_w) + s_dist2 = (sx ** 2 + sy ** 2)[:, None] + perp2 = s_dist2 - t ** 2 + R2 = SHEEP_RADIUS ** 2 + hit = (perp2 < R2) & (t > 0.0) + half = np.sqrt(np.clip(R2 - perp2, 0.0, None)) + candidate = np.where(hit, t - half, np.inf) + nearest = candidate.min(axis=0) + np.minimum(best, nearest, out=best) + + # Clip to LIDAR_MAX_RANGE; entries that never got a hit stay at inf + # → clipped down to max_range like the real Webots device. + ranges = np.minimum(best, max_range).astype(np.float32) + return _add_noise(ranges, noise, rng, max_range) + + +def _add_noise(ranges: np.ndarray, sigma: float, + rng: np.random.Generator | None, max_range: float) -> np.ndarray: + if sigma <= 0.0: + return ranges + if rng is None: + rng = np.random.default_rng() + hit_mask = ranges < max_range - 1e-3 + n_hit = int(hit_mask.sum()) + if n_hit: + ranges = ranges.copy() + ranges[hit_mask] += rng.normal(0.0, sigma, size=n_hit).astype(np.float32) + np.clip(ranges, 0.0, max_range, out=ranges) + return ranges diff --git a/herding/sheep_tracker.py b/herding/sheep_tracker.py new file mode 100644 index 0000000..392810e --- /dev/null +++ b/herding/sheep_tracker.py @@ -0,0 +1,197 @@ +"""Multi-target tracker for LiDAR-detected sheep. + +Greedy nearest-neighbour data association (with a distance gate) across +frames, plus a memory of last-seen positions for tracks that fall out +of the dog's FOV. Output is a ``{name: (x, y)}`` dict shaped exactly +like the receiver-based ``sheep_positions`` used previously by the +Webots controller and by the env, so Strömbom and Sequential can +consume it unchanged. + +Penned-detection heuristic +-------------------------- +Two ways a track is marked penned: + 1. Its current estimated position is south of the gate plane and + within the gate column (the ``is_penned_position`` test the env + already uses on ground truth). + 2. It hasn't been observed for ``STALE_STEPS`` and its last-seen + position was inside the gate-approach band — the dog's LiDAR can + only see ~2 m into the pen through the open gate, so a sheep + that disappeared near the entry has almost certainly entered. + +Tracks marked penned are excluded from ``get_positions()`` (which is +what Strömbom consumes), matching the prior receiver-based behaviour. +""" + +from __future__ import annotations + +import math + +from herding.geometry import MAX_SHEEP, in_pen, is_penned_position + + +GATE_M = 2.5 # m — primary NN gate (recent tracks) +REACQUIRE_GATE_M = 4.5 # m — wider gate for re-acquiring stale tracks (sheep moved during occlusion) +REACQUIRE_MIN_AGE = 20 # steps — only rebind via the wide gate if the track has been stale for this long +PENNED_GATE_M = 4.0 # m — wide gate for matching against already-penned tracks; the pen is small (3×7 m) so duplicates are easy without it +FORGET_STEPS = 200 # ~3.2 s — delete stale active tracks; tighter than 5 s to limit phantoms but long enough to bridge typical FOV gaps +MAX_ACTIVE_TRACKS = MAX_SHEEP # hard cap to the worst-case real flock size +# Penned tracks are never forgotten: sheep don't leave the pen, and +# losing the track makes the counter oscillate as the same sheep gets +# re-detected and counted multiple times. + + +class SheepTracker: + """Online tracker with NN association and a forgetful memory. + + Each track stores ``(x, y, last_seen_step, penned)``. + """ + + def __init__(self, gate: float = GATE_M): + self.gate = gate + # tid → (x, y, last_seen_step, penned) + self._tracks: dict[int, tuple[float, float, int, bool]] = {} + self._next_id = 0 + self.step = 0 + + def reset(self) -> None: + self._tracks.clear() + self._next_id = 0 + self.step = 0 + + # ------------------------------------------------------------------ + # Update + # ------------------------------------------------------------------ + def update(self, detections: list[tuple[float, float]]) -> dict[str, tuple[float, float]]: + """Fold a new set of detections in and return active positions.""" + self.step += 1 + + det_used: set[int] = set() + updated_tids: set[int] = set() + + # Pass 1: match against ACTIVE tracks first (oldest-seen-first so + # a re-emerging long-lost sheep grabs its old ID before a fresh + # neighbour does). + active_tids = [tid for tid, t in self._tracks.items() if not t[3]] + active_tids.sort(key=lambda tid: self._tracks[tid][2]) + for tid in active_tids: + tx, ty, _, _ = self._tracks[tid] + best_j, best_d = -1, self.gate + for j, (dx, dy) in enumerate(detections): + if j in det_used: + continue + d = math.hypot(dx - tx, dy - ty) + if d < best_d: + best_d = d + best_j = j + if best_j >= 0: + dx, dy = detections[best_j] + self._tracks[tid] = (dx, dy, self.step, False) + det_used.add(best_j) + updated_tids.add(tid) + + # Pass 1b: re-acquisition with a wider gate for tracks that have + # been stale for ≥ REACQUIRE_MIN_AGE steps. Sheep flee at + # ~0.6 m/s; over a 1–2 s occlusion (dog rotating or driving) + # they move enough that a fresh detection lies outside the + # primary GATE_M but is still clearly the same sheep. Without + # this, phantom tracks accumulate and corrupt the CoM. + for tid in active_tids: + if tid in updated_tids: + continue + tx, ty, last, _ = self._tracks[tid] + if (self.step - last) < REACQUIRE_MIN_AGE: + continue + best_j, best_d = -1, REACQUIRE_GATE_M + for j, (dx, dy) in enumerate(detections): + if j in det_used: + continue + d = math.hypot(dx - tx, dy - ty) + if d < best_d: + best_d = d + best_j = j + if best_j >= 0: + dx, dy = detections[best_j] + self._tracks[tid] = (dx, dy, self.step, False) + det_used.add(best_j) + updated_tids.add(tid) + + # Pass 2: match remaining detections against PENNED tracks with + # a tighter gate. Without this, every frame near the gate spawns + # a fresh penned track for the same sheep, which under a long + # Webots run leads to thousands of phantom penned tracks. + penned_tids = [tid for tid, t in self._tracks.items() if t[3]] + for tid in penned_tids: + tx, ty, _, _ = self._tracks[tid] + best_j, best_d = -1, PENNED_GATE_M + for j, (dx, dy) in enumerate(detections): + if j in det_used: + continue + d = math.hypot(dx - tx, dy - ty) + if d < best_d: + best_d = d + best_j = j + if best_j >= 0: + dx, dy = detections[best_j] + self._tracks[tid] = (dx, dy, self.step, True) + det_used.add(best_j) + + # Unmatched detections → new tracks. A detection that is already + # inside the pen is born "penned" so we don't accumulate active + # tracks for sheep that arrived in the pen during occlusion. + for j, (dx, dy) in enumerate(detections): + if j in det_used: + continue + penned = in_pen(dx, dy) or is_penned_position(dx, dy) + self._tracks[self._next_id] = (dx, dy, self.step, penned) + self._next_id += 1 + + # Promote active tracks to penned ONLY by geometric position + # (sheep is in the pen column south of the gate). The previous + # "stale + near gate" heuristic was firing on ordinary occlusion + # near the gate and creating phantom penned tracks. + for tid, (tx, ty, last, penned) in list(self._tracks.items()): + if penned: + continue + if is_penned_position(tx, ty): + self._tracks[tid] = (tx, ty, last, True) + + # Forget stale ACTIVE tracks after FORGET_STEPS. Penned tracks + # are kept indefinitely — sheep can't escape the pen, so once a + # track is marked penned, that sheep is permanently penned. + for tid, (tx, ty, last, penned) in list(self._tracks.items()): + if penned: + continue + if (self.step - last) > FORGET_STEPS: + del self._tracks[tid] + + # Hard cap on the active set. If we somehow have more than + # MAX_ACTIVE_TRACKS active tracks, drop the oldest-seen ones + # first — they are most likely false positives from world + # geometry (walls, gate posts) the env's raycaster doesn't + # model, and a bloated active set wrecks the downstream CoM. + active = [(tid, last) for tid, (_, _, last, p) in self._tracks.items() + if not p] + if len(active) > MAX_ACTIVE_TRACKS: + active.sort(key=lambda kv: kv[1]) # oldest-seen first + for tid, _ in active[: len(active) - MAX_ACTIVE_TRACKS]: + del self._tracks[tid] + + return self.get_positions() + + # ------------------------------------------------------------------ + # Outputs + # ------------------------------------------------------------------ + def get_positions(self) -> dict[str, tuple[float, float]]: + """Active (not-yet-penned) tracks. Same shape as receiver dict.""" + return {f"t{tid}": (x, y) + for tid, (x, y, _, penned) in self._tracks.items() + if not penned} + + def get_penned_set(self) -> set[str]: + return {f"t{tid}" for tid, (_, _, _, penned) in self._tracks.items() if penned} + + def n_active(self) -> int: + return sum(1 for _, _, _, penned in self._tracks.values() if not penned) + + def n_penned(self) -> int: + return sum(1 for _, _, _, penned in self._tracks.values() if penned) diff --git a/tools/auto_dagger.sh b/tools/auto_dagger.sh new file mode 100755 index 0000000..f6522bb --- /dev/null +++ b/tools/auto_dagger.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# tools/auto_dagger.sh — automated DAgger collection across many headless +# Webots runs. +# +# For each (flock_size, run_index) combination, generates a world with N +# active sheep at randomised positions, launches Webots in fast/headless +# mode, lets the controller log (lidar_obs, teacher_action) pairs for up +# to RUN_SEC seconds, kills the run, and moves on. The dog controller's +# 500-step periodic flush means each run produces a complete .npz even +# when killed by timeout. +# +# Usage: +# tools/auto_dagger.sh [RUNS_PER_FLOCK] [SECONDS_PER_RUN] +# RUNS_PER_FLOCK : how many randomised runs per flock size (default 3) +# SECONDS_PER_RUN: wall-clock cap per Webots run (default 60) +# +# Env-var overrides: +# HERDING_POLICY_DIR : policy the controller loads (only used when +# HERDING_DAGGER_DRIVER=student). Default bc_v3. +# HERDING_DAGGER_DRIVER : "teacher" (default) or "student". +# HEADLESS=1 : force --no-rendering (default on). +# FLOCKS="1 3 5 8 10" : space-separated flock sizes to iterate over. +# +# Output: +# training/dagger/dagger_.npz — one per Webots run. +# +# After collection, run: +# python -m tools.dagger_merge_train --out training/runs/bc_dagger + +set -e + +RUNS_PER_FLOCK=${1:-3} +RUN_SEC=${2:-60} +FLOCKS=${FLOCKS:-"1 3 5 8 10"} +HEADLESS=${HEADLESS:-1} + +ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )" +SRC="$ROOT/worlds/field.wbt" +DST="$ROOT/worlds/field_test.wbt" +POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_v3}" +DRIVER="${HERDING_DAGGER_DRIVER:-teacher}" +DONE_FILE="$ROOT/training/dagger/.DONE" +WEBOTS_PID="" + +cleanup() { + echo "Caught interrupt — killing Webots (pid=$WEBOTS_PID) and exiting." + [[ -n "$WEBOTS_PID" ]] && kill "$WEBOTS_PID" 2>/dev/null + wait "$WEBOTS_PID" 2>/dev/null || true + exit 1 +} +trap cleanup INT TERM + +webots_args=(--mode=fast --batch --minimize) +if [[ "$HEADLESS" == "1" ]]; then + webots_args+=(--no-rendering) +fi + +echo "Auto-dagger collection" +echo " flock sizes : $FLOCKS" +echo " runs per size : $RUNS_PER_FLOCK" +echo " seconds per run : $RUN_SEC" +echo " policy dir : $POLICY_DIR (used only when driver=student)" +echo " driver : $DRIVER" +echo " webots flags : ${webots_args[*]}" +echo + +# Runtime config — re-written before each run anyway, but written once +# here so a manual webots launch at the same time would also pick it up. +cat > "$ROOT/herding_runtime.cfg" </dev/null | wc -l || echo 0) + +run_idx=0 +total_runs=0 +for f in $FLOCKS; do total_runs=$((total_runs + RUNS_PER_FLOCK)); done + +for flock in $FLOCKS; do + for run in $(seq 1 "$RUNS_PER_FLOCK"); do + run_idx=$((run_idx + 1)) + seed=$((1000 * flock + run)) + echo "=== [$run_idx/$total_runs] flock=$flock run=$run seed=$seed ===" + + # Generate randomised world. + cp "$SRC" "$DST" + for i in $(seq $((flock + 1)) 10); do + sed -i "s|^Sheep .* \"sheep${i}\".*|# &|" "$DST" + done + # Inline Python: jitter sheep1..flock translations. + python3 - "$DST" "$flock" "$seed" <<'PYEOF' +import re, random, sys +path, n_str, seed = sys.argv[1], sys.argv[2], sys.argv[3] +n = int(n_str); random.seed(int(seed)) +with open(path) as f: + txt = f.read() +def rand_pos(): + while True: + x = random.uniform(-12.0, 12.0) + y = random.uniform(-10.0, 12.0) # avoid the gate strip + if x * x + y * y > 9.0: # at least 3 m from dog spawn + return x, y +for i in range(1, n + 1): + x, y = rand_pos() + pat = re.compile( + r'Sheep \{ translation\s+\S+\s+\S+\s+(\S+)\s+name "sheep' + str(i) + r'"' + ) + txt = pat.sub(rf'Sheep {{ translation {x:.2f} {y:.2f} \g<1> name "sheep{i}"', txt, count=1) +with open(path, "w") as f: + f.write(txt) +PYEOF + + # Run Webots in the background; poll for the .DONE sentinel or + # the wall-clock timeout, whichever comes first. + rm -f "$DONE_FILE" + webots "${webots_args[@]}" "$DST" \ + > /tmp/webots_dagger_run.log 2>&1 & + WEBOTS_PID=$! + + # Give the controller 10 s to start before polling the sentinel, + # otherwise a sheep that spawns already penned triggers an instant + # false-positive kill. + elapsed=0 + grace=10 + while kill -0 "$WEBOTS_PID" 2>/dev/null; do + if (( elapsed >= grace )) && [[ -f "$DONE_FILE" ]]; then + echo " sentinel .DONE detected — killing Webots early" + kill "$WEBOTS_PID" 2>/dev/null + wait "$WEBOTS_PID" 2>/dev/null || true + break + fi + if (( elapsed >= RUN_SEC )); then + echo " timeout ($RUN_SEC s) — killing Webots" + kill "$WEBOTS_PID" 2>/dev/null + wait "$WEBOTS_PID" 2>/dev/null || true + break + fi + sleep 2 + elapsed=$((elapsed + 2)) + done + WEBOTS_PID="" + + # Quick sanity from the log: did the controller actually run? + if grep -q "running in mode=dagger" /tmp/webots_dagger_run.log; then + new_pairs=$(tail -50 /tmp/webots_dagger_run.log | grep -oE 'logged=[0-9]+' | tail -1) + echo " controller ran ($new_pairs)" + else + echo " WARNING: controller may not have started (see /tmp/webots_dagger_run.log)" + fi + done +done + +after_count=$(ls -1 "$ROOT/training/dagger"/dagger_*.npz 2>/dev/null | wc -l || echo 0) +new_files=$((after_count - before_count)) + +echo +echo "Done." +echo " new dagger files : $new_files" +echo " total in dir : $after_count" +echo +echo "Next:" +echo " python -m tools.dagger_merge_train --out training/runs/bc_dagger" diff --git a/tools/collect_demos.py b/tools/collect_demos.py index a06a24c..52e8ccb 100644 --- a/tools/collect_demos.py +++ b/tools/collect_demos.py @@ -26,12 +26,16 @@ if _PROJECT_ROOT not in sys.path: import numpy as np +from herding.active_scan import ActiveScanTeacher from herding.geometry import PEN_ENTRY from herding.sequential import compute_action as sequential_action from herding.strombom import compute_action as strombom_action from training.herding_env import HerdingEnv +# Base analytic teachers (no scanning). The default at demo-collection +# time wraps these in ActiveScanTeacher, which under LiDAR makes the +# teacher operate on the same partial obs as the student. TEACHERS = { "sequential": sequential_action, "strombom": strombom_action, @@ -39,19 +43,34 @@ TEACHERS = { def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int, - teacher_fn): + teacher_fn, frame_stack: int = 1, privileged: bool = False): env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, - difficulty=1.0, seed=seed) + difficulty=1.0, seed=seed, frame_stack=frame_stack) obs, _ = env.reset(seed=seed) obs_list, action_list = [], [] + # Active-scan wrapper: scan first, then run the base teacher on the + # tracker dict. Reset state per episode so the opening scan kicks in. + scan_teacher = ActiveScanTeacher(teacher_fn) for step in range(max_steps): - positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i])) - for i in range(env.n_sheep) if not env.sheep_penned[i]} - if not positions: - break - vx, vy, _mode = teacher_fn( - (env.dog_x, env.dog_y), positions, PEN_ENTRY, - ) + if privileged: + # Asymmetric "learning by cheating": teacher reads GT, student + # gets LiDAR obs. Kept available for ablation; default off. + positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i])) + for i in range(env.n_sheep) if not env.sheep_penned[i]} + if not positions: + break + vx, vy, _mode = teacher_fn( + (env.dog_x, env.dog_y), positions, PEN_ENTRY, + ) + else: + # Matched-perception teacher: it sees what the student sees + # (the tracker dict), with active scanning to fill the + # tracker before driving. + positions = env.perceived_positions() + vx, vy, _mode = scan_teacher( + (env.dog_x, env.dog_y), env.dog_heading, + positions, PEN_ENTRY, + ) action = np.array([vx, vy], dtype=np.float32) if step % subsample == 0: obs_list.append(obs.copy()) @@ -81,6 +100,14 @@ def main(): parser.add_argument("--teacher", default="sequential", choices=list(TEACHERS.keys()), help="Which analytic teacher to demonstrate.") + parser.add_argument("--frame-stack", type=int, default=1, + help="K — concatenate the last K env obs into a " + "single (32·K)-D vector. Lets a memoryless " + "MLP recover temporal info under partial " + "LiDAR observability.") + parser.add_argument("--privileged", action="store_true", + help="Teacher reads ground truth (asymmetric BC). " + "Default: matched-perception with active scan.") args = parser.parse_args() teacher_fn = TEACHERS[args.teacher] print(f"[demos] teacher: {args.teacher}") @@ -97,6 +124,7 @@ def main(): for seed in range(args.seeds_per_n): obs, actions, success, total_steps = collect_one( n, seed, args.max_steps, args.subsample, teacher_fn, + frame_stack=args.frame_stack, privileged=args.privileged, ) n_total += 1 if success: diff --git a/tools/dagger_merge_train.py b/tools/dagger_merge_train.py new file mode 100644 index 0000000..83b9ab9 --- /dev/null +++ b/tools/dagger_merge_train.py @@ -0,0 +1,135 @@ +"""Merge Webots DAgger demos with sim demos and retrain the BC policy. + +The dog controller in ``HERDING_MODE=dagger`` writes per-run files to +``training/dagger/dagger_.npz`` containing ``(obs, actions)`` pairs +where: + +* ``obs`` is the **stacked LiDAR observation** as built by the live + Webots tracker — exactly the input distribution the deployed + controller sees. +* ``actions`` is the **active-scan-teacher action computed from + ground-truth sheep positions** (read off the sheep emitter). + +Combined with the existing sim demos (``training/demos_v3.npz`` by +default), this gives the BC student a training set that includes the +real Webots false-positive distribution — closing the sim-to-real +perception gap that the all-sim pipeline couldn't bridge. + +Usage:: + + # Iteration 1 — merge all dagger files with sim demos, retrain + python -m tools.dagger_merge_train \\ + --sim training/demos_v3.npz \\ + --out training/runs/bc_dagger1 + + # Iteration 2 — drop the sim baseline, train only on Webots data + python -m tools.dagger_merge_train --no-sim --out training/runs/bc_dagger2 + +The new policy is saved as ``/policy.zip`` and is auto-loaded by +the controller's resolution priority on the next Webots run. +""" + +from __future__ import annotations + +import argparse +import glob +import os +import subprocess +import sys +from pathlib import Path + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +import numpy as np + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--sim", default="training/demos_v3.npz", + help="Sim demo file to mix with the Webots data. " + "Pass --no-sim to train only on dagger data.") + parser.add_argument("--no-sim", action="store_true", + help="Skip the sim demos entirely.") + parser.add_argument("--dagger-glob", default="training/dagger/dagger_*.npz", + help="Glob for Webots-collected dagger files.") + parser.add_argument("--merged-out", default="training/demos_dagger.npz", + help="Where to write the merged demo file.") + parser.add_argument("--out", default="training/runs/bc_dagger", + help="Where to write the BC policy.") + parser.add_argument("--epochs", type=int, default=60) + parser.add_argument("--batch-size", type=int, default=256) + parser.add_argument("--net-arch", default="512,512") + parser.add_argument("--cos-weight", type=float, default=1.0) + args = parser.parse_args() + + # --- Gather Webots files --- + dagger_paths = sorted(glob.glob(args.dagger_glob)) + if not dagger_paths: + raise SystemExit(f"No dagger files found at {args.dagger_glob} — " + "run Webots in HERDING_MODE=dagger first.") + + chunks_obs: list[np.ndarray] = [] + chunks_act: list[np.ndarray] = [] + total_dagger = 0 + for p in dagger_paths: + data = np.load(p) + obs = data["obs"].astype(np.float32) + act = data["actions"].astype(np.float32) + chunks_obs.append(obs) + chunks_act.append(act) + total_dagger += len(obs) + print(f" + {p}: {obs.shape[0]} pairs (obs dim {obs.shape[1]})") + print(f"[merge] total dagger pairs: {total_dagger}") + + obs_dim = chunks_obs[0].shape[1] + if any(c.shape[1] != obs_dim for c in chunks_obs): + raise SystemExit( + "Dagger files have inconsistent obs dims — they were collected " + "with different frame_stack settings. Either rerun with a " + "consistent setting or filter the glob." + ) + + # --- Optionally include sim demos --- + if not args.no_sim: + sim = np.load(args.sim) + sim_obs = sim["obs"].astype(np.float32) + sim_act = sim["actions"].astype(np.float32) + if sim_obs.shape[1] != obs_dim: + raise SystemExit( + f"Sim demos have obs dim {sim_obs.shape[1]} but dagger demos " + f"have {obs_dim}. Recollect sim demos at the same frame_stack." + ) + chunks_obs.append(sim_obs) + chunks_act.append(sim_act) + print(f"[merge] + sim demos: {sim_obs.shape[0]} pairs from {args.sim}") + + obs_all = np.concatenate(chunks_obs, axis=0) + act_all = np.concatenate(chunks_act, axis=0) + # Empty meta — bc_pretrain doesn't actually use it but the file format + # has it. + meta = np.zeros((0, 5), dtype=np.int32) + + Path(args.merged_out).parent.mkdir(parents=True, exist_ok=True) + np.savez(args.merged_out, obs=obs_all, actions=act_all, meta=meta) + print(f"[merge] wrote {len(obs_all)} pairs → {args.merged_out}") + print(f"[merge] obs shape {obs_all.shape}, action shape {act_all.shape}") + + # --- Run BC training --- + cmd = [ + sys.executable, "-m", "training.bc_pretrain", + "--demos", args.merged_out, + "--out", args.out, + "--epochs", str(args.epochs), + "--batch-size", str(args.batch_size), + "--net-arch", args.net_arch, + "--cos-weight", str(args.cos_weight), + ] + print(f"\n[merge] launching: {' '.join(cmd)}") + subprocess.run(cmd, check=True, cwd=_PROJECT_ROOT) + + +if __name__ == "__main__": + main() diff --git a/tools/run_webots.sh b/tools/run_webots.sh index 27812ea..c782ddf 100755 --- a/tools/run_webots.sh +++ b/tools/run_webots.sh @@ -7,29 +7,33 @@ # Usage: # tools/run_webots.sh [N] [MODE] # N : number of active sheep (1..10), default 10 -# MODE : "rl" | "strombom" | "sequential", default "rl" +# MODE : "bc" | "rl" | "strombom" | "sequential" | "dagger", default "bc" # # Examples: -# tools/run_webots.sh 10 rl # BC-trained RL policy, 10 sheep +# tools/run_webots.sh 10 bc # BC-trained policy, 10 sheep +# tools/run_webots.sh 10 rl # KL-PPO fine-tune of bc, 10 sheep # tools/run_webots.sh 5 sequential # the analytic teacher, 5 sheep # tools/run_webots.sh 3 strombom # canonical baseline, 3 sheep # # Notes: -# * The RL mode loads training/runs/bc_solo/policy.zip by default. -# Override via HERDING_POLICY_DIR=/path/to/run env var. +# * The RL mode loads the latest BC policy by default — priority +# bc_dagger_v2 → bc_dagger → bc_c2v3 (the controller resolves it). +# (LiDAR-perception, frame-stack K=4). Override via +# HERDING_POLICY_DIR=/path/to/run env var. # * Conda env "tir" must be active (provides stable-baselines3 + torch). set -e N=${1:-10} -MODE=${2:-rl} +MODE=${2:-bc} if (( N < 1 || N > 10 )); then echo "N must be 1..10, got $N" >&2; exit 1 fi case "$MODE" in - rl|strombom|sequential) ;; - *) echo "MODE must be rl|strombom|sequential, got '$MODE'" >&2; exit 1 ;; + bc|rl|strombom|sequential|dagger) ;; + *) echo "MODE must be bc|rl|strombom|sequential|dagger, got '$MODE'" >&2; exit 1 ;; esac +DAGGER_DRIVER=${HERDING_DAGGER_DRIVER:-teacher} ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )" SRC="$ROOT/worlds/field.wbt" @@ -46,15 +50,16 @@ echo "------------------------------------------------------------" echo "World : $DST" echo "Mode : $MODE" echo "Sheep : $active active" -echo "Policy dir : ${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_solo}" +echo "Policy dir : ${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_v3}" echo "------------------------------------------------------------" # Webots strips HERDING_* env vars from controller subprocesses in some # setups, so we also write a runtime config file the controller reads. -RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_solo}" +RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_v3}" cat > "$ROOT/herding_runtime.cfg" < 1: + print(f"[bc] inferred frame_stack={frame_stack} from demo obs dim {obs_dim}") + model, _env = build_model(net_arch_pi, net_arch_vf, args.log_std_init, + frame_stack=frame_stack) policy = model.policy.to(args.device) optimizer = optim.Adam(policy.parameters(), lr=args.lr) diff --git a/training/configs/ppo_default.yaml b/training/configs/ppo_default.yaml deleted file mode 100644 index d2dcff4..0000000 --- a/training/configs/ppo_default.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# PPO hyperparameters for the herding env. Tuned for a 28-D obs / 2-D -# continuous action space with 16 parallel envs on GPU. These are SB3 -# defaults nudged toward longer credit assignment (gamma=0.995) and a -# slightly higher entropy bonus to keep exploration alive while curriculum -# expands the flock size. - -# --- PPO --- -learning_rate: 3.0e-4 -n_steps: 2048 # rollout length per env before each update -batch_size: 256 -n_epochs: 10 -gamma: 0.995 -gae_lambda: 0.95 -clip_range: 0.2 -ent_coef: 0.05 # was 0.01 — earlier runs collapsed to ~0 actions -vf_coef: 0.5 -max_grad_norm: 0.5 -target_kl: null # disable early-stop on KL - -# --- Network --- -policy: MlpPolicy -net_arch_pi: [128, 128] -net_arch_vf: [128, 128] -log_std_init: 0.5 # std≈1.6 instead of default 1.0 — more exploration - -# --- Training schedule --- -total_timesteps: 10_000_000 -n_envs: 16 -checkpoint_freq: 500_000 # in env steps -eval_freq: 100_000 # in env steps -n_eval_episodes: 20 - -# --- Curriculum (max-n_sheep schedule, in env steps) --- -# Each entry: at step s, raise the env's max_n_sheep to k. The env samples -# uniformly from [1, max_n_sheep] each reset, so this widens the -# distribution gradually rather than swapping fixed sizes. -# -# State-space curriculum: difficulty controls sheep spawn area -# (0 = sheep spawn just north of gate, 1 = sheep spawn anywhere in field). -# Plus the existing flock-size curriculum. -# -# The two together let the policy first learn "what penning looks like" -# in a regime where random exploration reliably triggers it, then -# gradually generalise to the deployment distribution. -curriculum: - - { step: 0, max_n_sheep: 1, difficulty: 0.0 } - - { step: 1_000_000, max_n_sheep: 1, difficulty: 0.3 } - - { step: 2_000_000, max_n_sheep: 2, difficulty: 0.5 } - - { step: 4_000_000, max_n_sheep: 3, difficulty: 0.8 } - - { step: 6_000_000, max_n_sheep: 5, difficulty: 1.0 } - - { step: 8_000_000, max_n_sheep: 8, difficulty: 1.0 } - - { step: 9_000_000, max_n_sheep: 10, difficulty: 1.0 } diff --git a/training/configs/.gitkeep b/training/dagger/.DONE similarity index 100% rename from training/configs/.gitkeep rename to training/dagger/.DONE diff --git a/training/eval.py b/training/eval.py index 29ec099..7499db8 100644 --- a/training/eval.py +++ b/training/eval.py @@ -46,9 +46,11 @@ def rollout(env: HerdingEnv, predict_fn, max_steps: int) -> dict: def make_analytic_predictor(action_fn): def _predict(env, _obs): - positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i])) - for i in range(env.n_sheep) - if not env.sheep_penned[i]} + # Use whatever perception the env exposes — tracker output in + # LiDAR mode, ground truth in privileged mode. This makes + # evaluation honest: the analytic teacher sees what the + # deployed controller would see. + positions = env.perceived_positions() vx, vy, _mode = action_fn((env.dog_x, env.dog_y), positions, PEN_ENTRY) return np.array([vx, vy], dtype=np.float32) return _predict @@ -82,6 +84,7 @@ def main(): parser.add_argument("--difficulty", type=float, default=1.0) args = parser.parse_args() + frame_stack = 1 # default; analytic predictors don't use stacked obs if args.policy == "strombom": predict = make_analytic_predictor(strombom_action) elif args.policy == "sequential": @@ -103,6 +106,14 @@ def main(): f"policy.zip, final.zip)" ) model = PPO.load(str(zip_path), device="auto") + # Auto-detect frame stacking from the policy's expected obs dim, + # so eval runs with whatever stacking the policy was trained on. + from herding.obs import OBS_DIM as _SINGLE + policy_obs_dim = int(model.observation_space.shape[0]) + if policy_obs_dim % _SINGLE == 0 and policy_obs_dim // _SINGLE >= 1: + frame_stack = policy_obs_dim // _SINGLE + if frame_stack > 1: + print(f"[eval] policy expects frame_stack={frame_stack}") vecnorm = None vn_path = run / "vecnormalize.pkl" if not vn_path.exists() and run.parent.name != "best": @@ -121,7 +132,8 @@ def main(): successes, steps, penned = [], [], [] for seed in range(args.n_seeds): env = HerdingEnv(n_sheep=n, max_steps=args.max_steps, - difficulty=args.difficulty, seed=seed) + difficulty=args.difficulty, seed=seed, + frame_stack=frame_stack) r = rollout(env, predict, args.max_steps) successes.append(int(r["success"])) steps.append(r["steps"]) diff --git a/training/herding_env.py b/training/herding_env.py index a350500..f34c1fe 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -69,7 +69,10 @@ from herding.geometry import ( SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS, WEBOTS_DT, is_penned_position, ) +from herding.lidar_perception import detections_from_scan +from herding.lidar_sim import simulate_scan from herding.obs import OBS_DIM, build_obs +from herding.sheep_tracker import SheepTracker from herding.strombom import compute_action as strombom_action @@ -130,11 +133,30 @@ class HerdingEnv(gym.Env): max_steps: int = DEFAULT_MAX_STEPS, difficulty: float = 0.0, seed: Optional[int] = None, + use_lidar: bool = True, + frame_stack: int = 1, ): super().__init__() + # When True (default), the obs and the imitation-reward teacher + # see only LiDAR-perceived sheep positions through a tracker — + # matching what the Webots controller has access to. When False, + # both consume ground-truth positions (legacy "privileged" mode, + # kept for ablation). + self._use_lidar = bool(use_lidar) + self._tracker = SheepTracker() if self._use_lidar else None + self._np_rng_lidar: Optional[np.random.Generator] = None + + # Frame stacking: the policy receives the last K single-frame + # observations concatenated. Lets a memoryless MLP integrate + # information across time, partly compensating for the limited + # LiDAR FOV. K=1 reproduces the legacy single-frame obs. + self._frame_stack = max(1, int(frame_stack)) + self._frame_buffer: list[np.ndarray] = [] self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32) + self._single_obs_dim = OBS_DIM self.observation_space = spaces.Box( - low=-np.inf, high=np.inf, shape=(OBS_DIM,), dtype=np.float32, + low=-np.inf, high=np.inf, + shape=(OBS_DIM * self._frame_stack,), dtype=np.float32, ) # If n_sheep is None, env will sample uniformly from [1, max_n_sheep] @@ -243,6 +265,16 @@ class HerdingEnv(gym.Env): self.prev_n_penned = 0 self.prev_d_pen, self.prev_radius = self._flock_metrics() + if self._tracker is not None: + self._tracker.reset() + self._np_rng_lidar = np.random.default_rng( + int(self.np_random.integers(0, 2**31 - 1))) + # Prime the tracker with one scan so the first obs isn't empty. + self._update_tracker() + + # Clear the frame stack — the next _build_obs will repopulate. + self._frame_buffer = [] + obs = self._build_obs() info = {"n_sheep": self.n_sheep} return obs, info @@ -289,6 +321,12 @@ class HerdingEnv(gym.Env): and is_penned_position(self.sheep_x[i], self.sheep_y[i])): self.sheep_penned[i] = True + # --- Run LiDAR perception on this step's state (after sheep have + # moved). Updates the tracker that obs and the imitation- + # reward teacher consume. Reward / termination still use GT. --- + if self._tracker is not None: + self._update_tracker() + # --- Reward, termination --- d_pen, radius = self._flock_metrics() reward = self._compute_reward(d_pen, radius, action=action) @@ -395,10 +433,7 @@ class HerdingEnv(gym.Env): r = self.W_PEN_DELTA * delta_pen + self.W_PROGRESS * d_progress if action is not None and self.W_IMITATE > 0.0: - positions = { - f"s{i}": (float(self.sheep_x[i]), float(self.sheep_y[i])) - for i in range(self.n_sheep) if not self.sheep_penned[i] - } + positions = self._perceived_positions() if positions: sx, sy, _mode = strombom_action( (self.dog_x, self.dog_y), positions, PEN_ENTRY, @@ -411,11 +446,72 @@ class HerdingEnv(gym.Env): return float(r) - def _build_obs(self) -> np.ndarray: - sheep_xy_list = list(zip(self.sheep_x.tolist(), self.sheep_y.tolist())) - sheep_penned_list = self.sheep_penned.tolist() + def _build_single_obs(self) -> np.ndarray: + if self._tracker is not None: + # Obs sees only the tracker's active set; penned tracks are + # intentionally excluded (matches the prior receiver-based + # behaviour where penned sheep stopped contributing to the + # symbolic obs). + active = self._tracker.get_positions() + sheep_xy_list = list(active.values()) + sheep_penned_list = [False] * len(sheep_xy_list) + else: + sheep_xy_list = list(zip(self.sheep_x.tolist(), self.sheep_y.tolist())) + sheep_penned_list = self.sheep_penned.tolist() return build_obs( (self.dog_x, self.dog_y), self.dog_heading, sheep_xy_list, sheep_penned_list, n_max=self._max_n_sheep, ) + + def _build_obs(self) -> np.ndarray: + single = self._build_single_obs() + if self._frame_stack <= 1: + return single + # On a fresh reset the buffer is empty — duplicate the first + # frame so the stack is always full-length. + if not self._frame_buffer: + self._frame_buffer = [single.copy() for _ in range(self._frame_stack)] + else: + self._frame_buffer.append(single) + if len(self._frame_buffer) > self._frame_stack: + self._frame_buffer = self._frame_buffer[-self._frame_stack:] + # Concatenate oldest → newest. + return np.concatenate(self._frame_buffer, axis=0).astype(np.float32) + + # ------------------------------------------------------------------ + # LiDAR perception helpers + # ------------------------------------------------------------------ + def _all_sheep_xy(self) -> list[tuple[float, float]]: + """Every sheep, including penned ones (the LiDAR sees them).""" + return [(float(self.sheep_x[i]), float(self.sheep_y[i])) + for i in range(self.n_sheep)] + + def _update_tracker(self) -> None: + ranges = simulate_scan( + self.dog_x, self.dog_y, self.dog_heading, + self._all_sheep_xy(), + rng=self._np_rng_lidar, + ) + detections = detections_from_scan( + ranges, self.dog_x, self.dog_y, self.dog_heading, + ) + self._tracker.update(detections) + + def perceived_positions(self) -> dict[str, tuple[float, float]]: + """Public accessor — what the controller would 'see' this step. + + LiDAR mode → the tracker's active set. + Privileged mode → ground-truth active sheep. + + Used by ``training.eval`` and ``tools.collect_demos`` so analytic + teachers run on the same perception the deployed controller has. + """ + if self._tracker is not None: + return self._tracker.get_positions() + return {f"s{i}": (float(self.sheep_x[i]), float(self.sheep_y[i])) + for i in range(self.n_sheep) if not self.sheep_penned[i]} + + # Internal alias so the imitation reward path doesn't need to know + # which mode it's in. + _perceived_positions = perceived_positions diff --git a/training/runs/bc_flock/policy.zip b/training/runs/bc_flock/policy.zip deleted file mode 100644 index 6dc913a..0000000 Binary files a/training/runs/bc_flock/policy.zip and /dev/null differ diff --git a/training/runs/bc_solo/policy.zip b/training/runs/bc_solo/policy.zip deleted file mode 100644 index 10721ab..0000000 Binary files a/training/runs/bc_solo/policy.zip and /dev/null differ diff --git a/training/runs/bc_v3/policy.zip b/training/runs/bc_v3/policy.zip new file mode 100644 index 0000000..6e97cf4 Binary files /dev/null and b/training/runs/bc_v3/policy.zip differ diff --git a/training/train_ppo.py b/training/train_ppo.py index c506aea..a389e17 100644 --- a/training/train_ppo.py +++ b/training/train_ppo.py @@ -1,31 +1,33 @@ -"""PPO trainer for the shepherd-dog policy — EXPERIMENTAL. +"""KL-regularised PPO fine-tune of a behaviour-cloned policy. -The deliverable pipeline is `bc_pretrain.py` (see ``training/README.md``). -This script is kept in the tree because it implements: +The PPO-from-scratch and unregularised PPO-fine-tune-of-BC versions +we tried earlier failed for the standard reasons (sparse pen reward, +long horizons, exploration noise destroying BC weights). The fix is +to anchor the policy to its BC initialisation with a KL penalty in +the loss — the policy is free to refine the BC mean within a +trust-region-like ball around the reference, and the dense-enough +per-step reward signal does the rest. -* PPO from scratch with curriculum over flock size + spawn area, and -* PPO fine-tune of a behavior-cloned policy. +Pipeline +-------- +1. Load ``bc_v3`` weights into both the trainable policy and a frozen + reference ``ref_policy``. +2. Initialise the policy's log_std to a small fixed value (≈ −1.5) + and disable its gradient — exploration noise stays small so PPO + updates don't blow up the BC mean before reward can stabilise. +3. Override ``PPO.train()`` to add ``β · KL(π ‖ π_ref)`` to the loss + each minibatch. +4. Train for ~1–3 M timesteps with a low LR (5e-5). -Both ran into stability issues in our setting (long-horizon credit -assignment for sparse pen reward, BC-degradation under PPO exploration -noise). The abstractions are reusable for follow-up work — e.g. -KL-regularised fine-tune with a frozen reference policy — so we leave -the code in place. +Output: ``runs/rl_v1/policy.zip`` — same SB3 format as bc_v3, loadable +by the dog controller's ``HERDING_MODE=rl`` path. -Usage (PPO from scratch):: +Usage:: - python -m training.train_ppo \ - --config training/configs/ppo_default.yaml \ - --out-dir training/runs/ppo_scratch - -Usage (PPO fine-tune of BC):: - - python -m training.train_ppo \ - --resume training/runs/bc_flock/policy.zip \ - --out-dir training/runs/bc_ppo \ - --no-vecnorm --no-curriculum --imitate-weight 0 \ - --difficulty 1.0 --log-std -1.5 --learning-rate 5e-5 \ - --total-timesteps 3000000 + python -m training.train_ppo \\ + --bc training/runs/bc_v3 \\ + --out training/runs/rl_v1 \\ + --total-timesteps 2000000 """ from __future__ import annotations @@ -35,8 +37,6 @@ import os import sys from pathlib import Path -import yaml - _HERE = os.path.dirname(os.path.abspath(__file__)) _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..")) if _PROJECT_ROOT not in sys.path: @@ -44,236 +44,305 @@ if _PROJECT_ROOT not in sys.path: import numpy as np import torch as th +import torch.nn.functional as F from stable_baselines3 import PPO -from stable_baselines3.common.callbacks import ( - BaseCallback, CheckpointCallback, EvalCallback, -) +from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback from stable_baselines3.common.monitor import Monitor -from stable_baselines3.common.vec_env import ( - DummyVecEnv, SubprocVecEnv, VecNormalize, -) +from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv +from herding.obs import OBS_DIM from training.herding_env import HerdingEnv -# -------------------------------------------------------------------------- -# Env factories -# -------------------------------------------------------------------------- +# -------------------------------------------------------------------- +# Env factory +# -------------------------------------------------------------------- -def _make_env(rank: int, seed: int = 0): +def _make_env(rank: int, seed: int, frame_stack: int): def _thunk(): - env = HerdingEnv(seed=seed + rank) + env = HerdingEnv(seed=seed + rank, frame_stack=frame_stack) env = Monitor(env, info_keywords=("is_success", "n_sheep", "n_penned")) return env return _thunk -# -------------------------------------------------------------------------- -# Curriculum callback -# -------------------------------------------------------------------------- +# -------------------------------------------------------------------- +# KL-regularised PPO +# -------------------------------------------------------------------- -class CurriculumCallback(BaseCallback): - """Drive the env's flock-size + state-space difficulty curriculum. +class KLPPO(PPO): + """PPO with an extra KL-to-reference penalty in the policy loss. - Schedule entries: {step, max_n_sheep, difficulty}. The largest entry - whose step <= num_timesteps wins; both knobs update together. + Subclasses SB3's PPO and overrides ``train()`` only to add a single + line for the KL term — everything else (rollout buffer, clipped + surrogate, value loss, entropy bonus) is unchanged. """ - def __init__(self, schedule, vec_envs, verbose: int = 0): - super().__init__(verbose) - self.schedule = sorted(schedule, key=lambda d: d["step"]) - # Accept a list of envs so the eval env tracks training difficulty. - self.vec_envs = vec_envs if isinstance(vec_envs, (list, tuple)) else [vec_envs] - self._last_n = None - self._last_d = None + def __init__(self, *args, ref_policy=None, kl_coef: float = 0.05, **kwargs): + super().__init__(*args, **kwargs) + # ref_policy is set after construction (caller can build it + # from the BC checkpoint once `self.policy` exists). + self.ref_policy = ref_policy + if self.ref_policy is not None: + self.ref_policy.set_training_mode(False) + for p in self.ref_policy.parameters(): + p.requires_grad = False + self.kl_coef = kl_coef - def _call(self, method, value): - for v in self.vec_envs: - try: - v.env_method(method, value) - except AttributeError: - v.venv.env_method(method, value) + def train(self) -> None: + # Copied from stable_baselines3.ppo.PPO.train (v2.x), with the + # KL-to-reference term added. Keeping the structure intact so + # behavioural parity with stock PPO is obvious. + self.policy.set_training_mode(True) + self._update_learning_rate(self.policy.optimizer) + clip_range = self.clip_range(self._current_progress_remaining) + if self.clip_range_vf is not None: + clip_range_vf = self.clip_range_vf(self._current_progress_remaining) - def _on_step(self) -> bool: - t = self.num_timesteps - n = self.schedule[0]["max_n_sheep"] - d = self.schedule[0].get("difficulty", 1.0) - for entry in self.schedule: - if t >= entry["step"]: - n = entry["max_n_sheep"] - d = entry.get("difficulty", 1.0) - if n != self._last_n: - self._call("set_max_n_sheep", n) - self._last_n = n - if d != self._last_d: - self._call("set_difficulty", d) - self._last_d = d - if self.verbose: - print(f"[curriculum] t={t} → max_n_sheep={n} difficulty={d}") - return True + entropy_losses, pg_losses, value_losses, kl_losses = [], [], [], [] + clip_fractions = [] + continue_training = True + + for epoch in range(self.n_epochs): + approx_kl_divs = [] + for rollout_data in self.rollout_buffer.get(self.batch_size): + actions = rollout_data.actions + if isinstance(self.action_space, th.distributions.Categorical.__bases__): + actions = rollout_data.actions.long().flatten() + + values, log_prob, entropy = self.policy.evaluate_actions( + rollout_data.observations, actions) + values = values.flatten() + advantages = rollout_data.advantages + if self.normalize_advantage and len(advantages) > 1: + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + ratio = th.exp(log_prob - rollout_data.old_log_prob) + policy_loss_1 = advantages * ratio + policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range) + policy_loss = -th.min(policy_loss_1, policy_loss_2).mean() + pg_losses.append(policy_loss.item()) + clip_fraction = th.mean((th.abs(ratio - 1) > clip_range).float()).item() + clip_fractions.append(clip_fraction) + + if self.clip_range_vf is None: + values_pred = values + else: + values_pred = rollout_data.old_values + th.clamp( + values - rollout_data.old_values, -clip_range_vf, clip_range_vf) + value_loss = F.mse_loss(rollout_data.returns, values_pred) + value_losses.append(value_loss.item()) + + if entropy is None: + entropy_loss = -th.mean(-log_prob) + else: + entropy_loss = -th.mean(entropy) + entropy_losses.append(entropy_loss.item()) + + # --- KL-to-reference term ---------------------------- + # Both policies are diagonal Gaussian (ActorCriticPolicy). + # KL(π ‖ π_ref) per-action-dim; sum over the action axis + # to get total KL per sample, then mean over batch. + # Computed on the rollout's observations so the penalty + # reflects what the agent actually saw. + if self.ref_policy is None: + raise RuntimeError("KLPPO.train called without ref_policy") + with th.no_grad(): + ref_dist = self.ref_policy.get_distribution(rollout_data.observations) + pi_dist = self.policy.get_distribution(rollout_data.observations) + kl_div = th.distributions.kl.kl_divergence( + pi_dist.distribution, ref_dist.distribution).sum(dim=-1).mean() + kl_losses.append(kl_div.item()) + # ---------------------------------------------------- + + loss = (policy_loss + + self.ent_coef * entropy_loss + + self.vf_coef * value_loss + + self.kl_coef * kl_div) + + with th.no_grad(): + log_ratio = log_prob - rollout_data.old_log_prob + approx_kl_div = th.mean((th.exp(log_ratio) - 1) - log_ratio).cpu().numpy() + approx_kl_divs.append(approx_kl_div) + + if self.target_kl is not None and approx_kl_div > 1.5 * self.target_kl: + continue_training = False + if self.verbose >= 1: + print(f"Early stopping at step {epoch} due to reaching max kl: {approx_kl_div:.2f}") + break + + self.policy.optimizer.zero_grad() + loss.backward() + th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) + self.policy.optimizer.step() + + self._n_updates += 1 + if not continue_training: + break + + explained_var = self._explained_variance() + self.logger.record("train/entropy_loss", float(np.mean(entropy_losses))) + self.logger.record("train/policy_gradient_loss", float(np.mean(pg_losses))) + self.logger.record("train/value_loss", float(np.mean(value_losses))) + self.logger.record("train/kl_to_reference", float(np.mean(kl_losses))) + self.logger.record("train/approx_kl", float(np.mean(approx_kl_divs))) + self.logger.record("train/clip_fraction", float(np.mean(clip_fractions))) + self.logger.record("train/explained_variance", float(explained_var)) + if hasattr(self.policy, "log_std"): + self.logger.record("train/std", th.exp(self.policy.log_std).mean().item()) + + def _explained_variance(self) -> float: + # SB3 doesn't expose this as a method; replicate the computation. + y_pred = self.rollout_buffer.values.flatten() + y_true = self.rollout_buffer.returns.flatten() + var_y = np.var(y_true) + return float("nan") if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y -# -------------------------------------------------------------------------- +# -------------------------------------------------------------------- # Main -# -------------------------------------------------------------------------- +# -------------------------------------------------------------------- -def main(): +def main() -> None: parser = argparse.ArgumentParser() - parser.add_argument("--config", default=os.path.join(_HERE, "configs", "ppo_default.yaml")) - parser.add_argument("--out-dir", default=os.path.join(_HERE, "runs", "latest")) - parser.add_argument("--n-envs", type=int, default=None, - help="Override config n_envs.") - parser.add_argument("--total-timesteps", type=int, default=None, - help="Override config total_timesteps.") + parser.add_argument("--bc", default="training/runs/bc_v3", + help="Directory containing the BC initialisation (policy.zip).") + parser.add_argument("--out", default="training/runs/rl_v1", + help="Where to save the fine-tuned policy.") + parser.add_argument("--total-timesteps", type=int, default=2_000_000) + parser.add_argument("--n-envs", type=int, default=8) + parser.add_argument("--learning-rate", type=float, default=5e-5, + help="Low LR keeps PPO close to the BC mean.") + parser.add_argument("--kl-coef", type=float, default=0.05, + help="KL-to-reference penalty coefficient.") + parser.add_argument("--log-std", type=float, default=-1.5, + help="Initial (and frozen) log_std. σ ≈ exp(-1.5) ≈ 0.22.") + parser.add_argument("--freeze-log-std", action="store_true", default=True, + help="Keep log_std fixed; only the policy mean updates.") + parser.add_argument("--n-steps", type=int, default=2048, + help="Steps per rollout per env.") + parser.add_argument("--batch-size", type=int, default=256) + parser.add_argument("--n-epochs", type=int, default=10) + parser.add_argument("--gamma", type=float, default=0.995) + parser.add_argument("--gae-lambda", type=float, default=0.95) + parser.add_argument("--clip-range", type=float, default=0.1, + help="Tight clip range — keep updates conservative.") + parser.add_argument("--ent-coef", type=float, default=0.0) + parser.add_argument("--target-kl", type=float, default=0.02, + help="SB3's per-batch KL early stop; safety belt.") parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--resume", type=str, default=None, - help="Path to a SB3 zip to resume from.") - # SB3 recommends CPU for MlpPolicy — GPU helps CNN policies, not MLPs - # of this size. Override with --device cuda if you really want it. parser.add_argument("--device", default="cpu") - parser.add_argument("--no-vecnorm", action="store_true", - help="Disable VecNormalize wrapper. Required when " - "resuming from a BC-pretrained policy that " - "wasn't trained under it.") - parser.add_argument("--no-curriculum", action="store_true", - help="Skip curriculum callback (resumed policy is " - "already competent across the distribution).") - parser.add_argument("--imitate-weight", type=float, default=None, - help="Override env W_IMITATE. Set to 0 to disable " - "Strömbom imitation reward.") - parser.add_argument("--difficulty", type=float, default=None, - help="Override env difficulty (0=easy, 1=hard). " - "Used in BC fine-tune to skip easy curriculum.") - parser.add_argument("--log-std", type=float, default=None, - help="Override the policy's log_std after load. " - "BC trained with std≈1.6 (log_std=0.5) which " - "is too noisy for fine-tune. Use -1.5 (std≈0.22) " - "to keep PPO close to the BC mean while still " - "exploring locally.") - parser.add_argument("--learning-rate", type=float, default=None, - help="Override config learning rate. For BC " - "fine-tune, 5e-5 is much safer than the 3e-4 " - "default.") args = parser.parse_args() - with open(args.config) as f: - cfg = yaml.safe_load(f) + bc_zip = Path(args.bc) / "policy.zip" + if not bc_zip.exists(): + raise SystemExit( + f"BC checkpoint not found at {bc_zip}. Train bc_v3 first with " + f"`python -m training.bc_pretrain`." + ) - n_envs = args.n_envs or cfg["n_envs"] - total_timesteps = args.total_timesteps or cfg["total_timesteps"] - - out = Path(args.out_dir) + out = Path(args.out) out.mkdir(parents=True, exist_ok=True) (out / "checkpoints").mkdir(exist_ok=True) (out / "best").mkdir(exist_ok=True) - (out / "evals").mkdir(exist_ok=True) - print(f"[train] out={out} n_envs={n_envs} total={total_timesteps} device={args.device}") + # --- Inspect BC obs dim → infer frame_stack --- + ref_only = PPO.load(str(bc_zip), device=args.device) + obs_dim = int(ref_only.observation_space.shape[0]) + if obs_dim % OBS_DIM != 0: + raise SystemExit(f"BC obs dim {obs_dim} is not a multiple of {OBS_DIM}.") + frame_stack = obs_dim // OBS_DIM + print(f"[rl] BC obs dim {obs_dim} → frame_stack={frame_stack}") - # --- Train env (vectorised, optionally normalised) --- - env_fns = [_make_env(i, seed=args.seed) for i in range(n_envs)] - venv = SubprocVecEnv(env_fns) if n_envs > 1 else DummyVecEnv(env_fns) - eval_venv = DummyVecEnv([_make_env(99, seed=args.seed + 999)]) - if not args.no_vecnorm: - venv = VecNormalize(venv, norm_obs=True, norm_reward=False, clip_obs=10.0) - eval_venv = VecNormalize(eval_venv, norm_obs=True, norm_reward=False, - clip_obs=10.0, training=False) - eval_venv.obs_rms = venv.obs_rms - else: - print("[train] VecNormalize disabled (resumed policy was trained without it).") + # --- Vectorised envs (match BC obs space) --- + env_fns = [_make_env(i, args.seed, frame_stack) for i in range(args.n_envs)] + venv = SubprocVecEnv(env_fns) if args.n_envs > 1 else DummyVecEnv(env_fns) + eval_venv = DummyVecEnv([_make_env(99, args.seed + 999, frame_stack)]) - # Apply env-level overrides (used by BC fine-tune to disable Strömbom - # imitation and start at full deployment difficulty). - def _env_call(method, value): - for v in (venv, eval_venv): - try: - v.env_method(method, value) - except AttributeError: - v.venv.env_method(method, value) - - if args.imitate_weight is not None: - _env_call("set_imitate_weight", args.imitate_weight) - print(f"[train] W_IMITATE overridden to {args.imitate_weight}") - if args.difficulty is not None: - _env_call("set_difficulty", args.difficulty) - print(f"[train] difficulty pinned to {args.difficulty}") - - # --- Model --- - policy_kwargs = dict( - net_arch=dict(pi=cfg["net_arch_pi"], vf=cfg["net_arch_vf"]), - log_std_init=cfg.get("log_std_init", 0.0), + # --- Trainable policy: load BC weights, then bolt onto PPO --- + # Trick: instantiate a PPO with the right env (so the policy + # network is constructed at the correct obs/action shape), then + # copy BC weights into it. + model = KLPPO( + "MlpPolicy", venv, + ref_policy=None, # filled in below + kl_coef=args.kl_coef, + learning_rate=args.learning_rate, + n_steps=args.n_steps, + batch_size=args.batch_size, + n_epochs=args.n_epochs, + gamma=args.gamma, + gae_lambda=args.gae_lambda, + clip_range=args.clip_range, + ent_coef=args.ent_coef, + target_kl=args.target_kl, + policy_kwargs=dict( + net_arch=dict(pi=[512, 512], vf=[512, 512]), + log_std_init=args.log_std, + ), + verbose=1, + seed=args.seed, + device=args.device, + tensorboard_log=str(out / "tb"), ) - if args.resume: - print(f"[train] resuming from {args.resume}") - custom_objects = {} - if args.learning_rate is not None: - custom_objects["learning_rate"] = args.learning_rate - model = PPO.load(args.resume, env=venv, device=args.device, - tensorboard_log=str(out / "tb"), - custom_objects=custom_objects or None) - if args.log_std is not None: - import torch as _th - with _th.no_grad(): - model.policy.log_std.fill_(args.log_std) - print(f"[train] log_std overridden to {args.log_std} " - f"(std≈{2.71828 ** args.log_std:.2f})") - if args.learning_rate is not None: - print(f"[train] learning_rate overridden to {args.learning_rate}") - else: - model = PPO( - cfg["policy"], venv, - learning_rate=cfg["learning_rate"], - n_steps=cfg["n_steps"], - batch_size=cfg["batch_size"], - n_epochs=cfg["n_epochs"], - gamma=cfg["gamma"], - gae_lambda=cfg["gae_lambda"], - clip_range=cfg["clip_range"], - ent_coef=cfg["ent_coef"], - vf_coef=cfg["vf_coef"], - max_grad_norm=cfg["max_grad_norm"], - target_kl=cfg.get("target_kl"), - policy_kwargs=policy_kwargs, - tensorboard_log=str(out / "tb"), - seed=args.seed, - device=args.device, - verbose=1, - ) + # --- Load BC weights into both `model.policy` and `ref_policy` --- + bc_state = ref_only.policy.state_dict() + # Strict=False because the value head may not have been trained in + # BC — that's fine, PPO will train it from scratch. + missing, unexpected = model.policy.load_state_dict(bc_state, strict=False) + print(f"[rl] BC → policy: missing={len(missing)} unexpected={len(unexpected)}") + + # Build a separate reference policy with identical architecture and + # the BC weights, frozen. + ref_policy = type(model.policy)( + observation_space=model.observation_space, + action_space=model.action_space, + lr_schedule=lambda _: 0.0, + net_arch=dict(pi=[512, 512], vf=[512, 512]), + log_std_init=args.log_std, + ).to(args.device) + ref_policy.load_state_dict(bc_state, strict=False) + model.ref_policy = ref_policy + model.ref_policy.set_training_mode(False) + for p in model.ref_policy.parameters(): + p.requires_grad = False + + # Align both policies' log_std. BC was trained with log_std≈0.5 + # (σ≈1.65), which would make the KL term huge from a std mismatch + # rather than the mean drift we actually care about. Force both to + # the same small value so KL measures only how far the policy mean + # has drifted from the BC mean. + with th.no_grad(): + model.policy.log_std.fill_(args.log_std) + model.ref_policy.log_std.fill_(args.log_std) + if args.freeze_log_std: + model.policy.log_std.requires_grad = False + print(f"[rl] log_std frozen at {args.log_std} (σ ≈ {np.exp(args.log_std):.3f})") # --- Callbacks --- ckpt_cb = CheckpointCallback( - save_freq=max(1, cfg["checkpoint_freq"] // n_envs), - save_path=str(out / "checkpoints"), name_prefix="ppo", - save_vecnormalize=True, + save_freq=max(1, 50_000 // args.n_envs), + save_path=str(out / "checkpoints"), + name_prefix="ppo", ) eval_cb = EvalCallback( eval_venv, best_model_save_path=str(out / "best"), log_path=str(out / "evals"), - eval_freq=max(1, cfg["eval_freq"] // n_envs), - n_eval_episodes=cfg["n_eval_episodes"], + eval_freq=max(1, 20_000 // args.n_envs), + n_eval_episodes=5, deterministic=True, ) - callbacks = [ckpt_cb, eval_cb] - if not args.no_curriculum and "curriculum" in cfg and cfg["curriculum"]: - callbacks.append(CurriculumCallback( - cfg["curriculum"], [venv, eval_venv], verbose=1, - )) - elif args.no_curriculum: - print("[train] curriculum disabled — env knobs left at their current values.") - # --- Train --- - model.learn(total_timesteps=total_timesteps, callback=callbacks, - progress_bar=True) + print(f"[rl] training: total_timesteps={args.total_timesteps} " + f"n_envs={args.n_envs} lr={args.learning_rate} kl_coef={args.kl_coef}") + model.learn(total_timesteps=args.total_timesteps, + callback=[ckpt_cb, eval_cb], progress_bar=True) - # --- Save final model + VecNormalize stats --- - model.save(out / "final.zip") - venv.save(str(out / "vecnormalize.pkl")) - # The EvalCallback already wrote best_model.zip into out/best/ — drop the - # VecNormalize stats next to it for the controller to pick up. - venv.save(str(out / "best" / "vecnormalize.pkl")) - print(f"[train] done. saved to {out}") + # --- Save final checkpoint in the SB3 zip the controller expects --- + model.save(out / "policy.zip") + print(f"[rl] saved fine-tuned policy → {out/'policy.zip'}") if __name__ == "__main__": diff --git a/worlds/.field_test.wbproj b/worlds/.field_test.wbproj new file mode 100644 index 0000000..d83ad3c --- /dev/null +++ b/worlds/.field_test.wbproj @@ -0,0 +1,9 @@ +Webots Project File version R2025a +perspectives: 000000ff00000000fd00000002000000010000011c00000405fc0200000001fb0000001400540065007800740045006400690074006f00720100000000000004050000003f00ffffff00000003000007c500000092fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000007c50000006900ffffff000006a70000040500000001000000020000000100000008fc00000000 +simulationViewPerspectives: 000000ff000000010000000200000100000003a80100000002010000000100 +sceneTreePerspectives: 000000ff00000001000000030000001f0000018b000000fa0100000002010000000200 +maximizedDockId: -1 +centralWidgetVisible: 1 +orthographicViewHeight: 1 +textFiles: -1 +consoles: Console:All:All