diff --git a/.gitignore b/.gitignore index ed9ece1..23c8dbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,25 +1,28 @@ -# Stuff -#_example/ +# Editor / IDE .claude/ +.venv/ # Python __pycache__/ *.pyc -.venv/ -# Optional env parity debug +# Webots controller scratch / debug +controllers/shepherd_dog/dog_behavior_log.csv dog_debug.csv -# Webots controller scratch -controllers/shepherd_dog/dog_behavior_log.csv - -# Training artefacts -training/runs/* -!training/runs/.gitkeep +# Training artefacts: ignore by default, whitelist the two working BC policies *.zip *.pkl - -# TensorBoard +*.npz events.out.tfevents.* +training/runs/*/checkpoints/ +training/runs/*/tb/ +training/runs/*/evals/ +training/runs/*/best/ +!training/runs/.gitkeep +!training/runs/bc_solo/policy.zip +!training/runs/bc_flock/policy.zip + +# Webots launcher scratch worlds/field_test.wbt herding_runtime.cfg diff --git a/README.md b/README.md new file mode 100644 index 0000000..b6a4fb2 --- /dev/null +++ b/README.md @@ -0,0 +1,115 @@ +# Autonomous Shepherd-Dog Herding (Webots + RL) + +Group G25 — *Diogo Costa, Johnny Fernandes, Nelson Neto* + +A differential-drive shepherd dog that herds 1–10 sheep through a 3 m +gate into an external pen. The dog has three modes: + +| Mode | Source | Notes | +|---|---|---| +| `rl` | Behavior cloning of an analytic teacher | The deliverable RL policy | +| `strombom` | Strömbom (2014) collect/drive heuristic | Canonical baseline | +| `sequential` | Single-target "pin and push" | Robust across n=1–10 | + +Plus three documented experimental teachers (`hybrid`, `drive_only`, +`strombom_smooth`) — see `herding/` for details. + +## Quick start + +```bash +# 1. Set up the Python env (any venv with PyTorch + SB3) +pip install -r training/requirements.txt + +# 2. Smoke test +python -m training.parity_test + +# 3. 
Reproduce the BC policy from scratch (~25 min on CPU) +python -m tools.collect_demos --teacher strombom --out training/demos.npz \ + --seeds-per-n 30 --subsample 3 +python -m training.bc_pretrain --demos training/demos.npz \ + --out training/runs/bc_flock --epochs 100 --net-arch 512,512 + +# 4. Evaluate +python -m training.eval --policy training/runs/bc_flock \ + --max-flock 10 --max-steps 30000 --n-seeds 5 + +# 5. Run in Webots (any of the three modes; n is the flock size; rl sets HERDING_POLICY_DIR because the launcher defaults to bc_solo) +HERDING_POLICY_DIR=$PWD/training/runs/bc_flock tools/run_webots.sh 10 rl +tools/run_webots.sh 10 strombom +tools/run_webots.sh 10 sequential +``` + +## Layout + +``` +herding/ — single source of truth (env + Webots both import) + geometry.py — field/pen constants, robot specs + flocking_sim.py — Reynolds-style sheep dynamics + diffdrive.py — differential-drive kinematics + obs.py — 32-D order-invariant observation builder + strombom.py — canonical CoM-drive teacher + sequential.py — single-target "pin-and-push" teacher + hybrid.py — flock-then-funnel (experimental, did not scale) + drive_only.py — Strömbom drive without collect (experimental) + strombom_smooth.py — sigmoid-blended Strömbom (experimental) + +controllers/ + sheep/sheep.py — Webots sheep controller (uses herding.flocking_sim) + shepherd_dog/ + shepherd_dog.py — Webots dog controller, mode-switched + policy_loader.py — lazy SB3 PPO loader + strombom.py — backwards-compat shim + +training/ + herding_env.py — Gymnasium env (used for demo collection + eval) + bc_pretrain.py — supervised BC of analytic teachers into MLP policy + collect_demos.py — wrapper, see tools/ + eval.py — RL / analytic comparison harness + parity_test.py — smoke tests + train_ppo.py — PPO/RL fine-tune (experimental, BC alone preferred) + requirements.txt + configs/ppo_default.yaml + +tools/ + collect_demos.py — generate (obs, action) demonstrations + run_webots.sh — launch Webots with N sheep + chosen controller mode + +worlds/ + field.wbt — main world (3 m
gate, external pen) + +protos/ — Sheep / ShepherdDog robot definitions +docs/project.md — original project goals +plan.md — design notes / decision log (deleted from the repo in this change; see git history) +``` + +## Two cohesion regimes + +Sheep cohesion strength controls which teacher works: + +| Regime | `flocking_sim.py` setting | Strömbom | Sequential | +|---|---|---:|---:| +| **Tight** (current) | `w=3.0/1.0`, `dist=12` | works (flock-style) | breaks (cohesion fights single-sheep targeting) | +| Loose | `w=1.5/0.6`, `dist=8` | breaks (flock fragments at gate) | works (1-by-1 style) | + +The codebase ships with the **tight** regime. To use the loose-regime +Sequential clone, edit those constants in `herding/flocking_sim.py` and +load `training/runs/bc_solo/`. + +## Results + +Eval at `--max-steps 30000 --n-seeds 5`, deployment difficulty (full +field spawn distribution): + +| n | Strömbom | Sequential | BC-flock (RL) | +|---:|---:|---:|---:| +| 1 | 100 % | 100 % | 100 % | +| 5 | 100 % | 100 % | 80–100 % | +| 8 | 100 % | 100 % | 80 % | +| 10 | **100 %** | 80 % | **80 %** (mean_penned 8/10) | + +At the largest flock sizes (n = 8–10) the BC policy reaches ~80 % of the analytic teacher's success rate +while running purely on neural-network inference, with no hand-coded logic. + +## License + +Educational project for the *Topics in Intelligent Robotics* course. diff --git a/controllers/shepherd_dog/shepherd_dog.py b/controllers/shepherd_dog/shepherd_dog.py index 0830776..d84738e 100644 --- a/controllers/shepherd_dog/shepherd_dog.py +++ b/controllers/shepherd_dog/shepherd_dog.py @@ -1,26 +1,24 @@ """Shepherd Dog controller (Webots). -Runs in one of two modes selected by the ``HERDING_MODE`` environment -variable: +Mode is selected by ``HERDING_MODE`` (env var, or via the +``herding_runtime.cfg`` file the launcher writes since Webots strips +env vars on some setups): - HERDING_MODE=rl → load an SB3 PPO policy from - HERDING_POLICY_DIR (default - training/runs/latest/best) and use its - (vx, vy) action each step.
- HERDING_MODE=strombom → use the analytic Strömbom collect/drive - heuristic. This is the fallback if the RL - policy can't be loaded (e.g. SB3 not - installed in the Webots Python env, or no - checkpoint yet). + rl → load a BC-trained SB3 policy from HERDING_POLICY_DIR + and use its (vx, vy) action each step. + strombom → canonical Strömbom collect/drive heuristic. + sequential → single-target "pin and push" — drives the sheep + closest to the pen. -Both modes share the same low-level differential-drive controller -(``herding.diffdrive.velocity_to_wheels`` + clamped forward speed), so -switching modes does not retune the actuation layer. +All modes share the same low-level differential-drive controller +(``herding.diffdrive.velocity_to_wheels`` with cos(err)-clamped forward +speed), so switching modes does not retune actuation. A safety supervisor enforces the "dog stays out of the pen" invariant: if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is -overridden with a north-driving correction. This is a hard guarantee -the policy cannot escape. +overridden with a north-driving correction. RL fallback: if the policy +zip can't be loaded (SB3 missing, file missing), the controller drops +to strombom mode automatically. """ import math @@ -85,19 +83,21 @@ def _resolve_policy_dir() -> str: """Where to look for the trained policy. Priority: - 1. HERDING_POLICY_DIR env var (if set and points to a real dir) - 2. training/runs/bc_pretrained/ (BC-only checkpoint) - 3. training/runs/bc_ppo/best/ (PPO fine-tuned best) - 4. training/runs/latest/best/ (legacy default) + 1. HERDING_POLICY_DIR env var or runtime-cfg entry, if it points + to a real directory. + 2. ``training/runs/bc_flock`` — flock-style BC (current default; + requires the tight-cohesion sheep regime). + 3. ``training/runs/bc_solo`` — single-target BC (1-by-1 style; + only works if ``herding/flocking_sim.py`` is reverted to the + loose-cohesion regime). 
""" env_dir = (os.environ.get("HERDING_POLICY_DIR") or _runtime_cfg.get("HERDING_POLICY_DIR")) if env_dir and os.path.isdir(env_dir): return env_dir candidates = [ - os.path.join(_PROJECT_ROOT, "training", "runs", "bc_pretrained"), - os.path.join(_PROJECT_ROOT, "training", "runs", "bc_ppo", "best"), - os.path.join(_PROJECT_ROOT, "training", "runs", "latest", "best"), + os.path.join(_PROJECT_ROOT, "training", "runs", "bc_flock"), + os.path.join(_PROJECT_ROOT, "training", "runs", "bc_solo"), ] for c in candidates: if os.path.isdir(c): @@ -106,30 +106,22 @@ def _resolve_policy_dir() -> str: return env_dir or candidates[0] -POLICY_DIR = _resolve_policy_dir() +_VALID_MODES = ("rl", "strombom", "sequential") +if MODE not in _VALID_MODES: + print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.") + MODE = "strombom" +POLICY_DIR = _resolve_policy_dir() policy_handle = None if MODE == "rl": - print(f"[dog] HERDING_MODE={MODE} HERDING_POLICY_DIR(env)=" - f"{os.environ.get('HERDING_POLICY_DIR', '')}") - print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists=" - f"{os.path.isdir(POLICY_DIR)}") - if os.path.isdir(POLICY_DIR): - try: - entries = sorted(os.listdir(POLICY_DIR)) - except OSError: - entries = [] - print(f"[dog] dir contents: {entries}") + print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists={os.path.isdir(POLICY_DIR)}") try: from policy_loader import load as _load_policy policy_handle = _load_policy(POLICY_DIR) print(f"[dog] RL policy loaded from {POLICY_DIR}") except Exception as exc: - print(f"[dog] RL policy load failed ({exc!r}); falling back to Strömbom.") + print(f"[dog] RL policy load failed ({exc!r}); falling back to strombom.") MODE = "strombom" -if MODE not in ("rl", "strombom", "sequential"): - print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.") - MODE = "strombom" print(f"[dog] running in mode={MODE}") diff --git a/herding/flocking_sim.py b/herding/flocking_sim.py index 61dcd52..d254d4b 100644 --- 
a/herding/flocking_sim.py +++ b/herding/flocking_sim.py @@ -44,7 +44,7 @@ WALL_HARD_GAIN = 50.0 FLEE_DIST = 7.0 SEPARATION_DIST = 2.5 -COHESION_DIST = 8.0 +COHESION_DIST = 12.0 # was 8.0 — wider engagement so far-flung sheep are pulled in PEN_MARGIN = 0.8 @@ -125,12 +125,13 @@ def compute_heading_speed(x, y, penned, dog_xy, peers, wander_angle, rng=None): cy += py cn += 1 if cn > 0: - # Cohesion needs to be comparable to flee at close range to keep - # the flock together through narrow obstacles like the 3m gate. - # Flee at 2m has magnitude ~10; cohesion at peer-distance 5m - # with w=1.5 contributes ~7.5 — same order, so the flock - # translates as a unit instead of fragmenting under pressure. - w = 1.5 if fleeing else 0.6 + # Cohesion needs to dominate flee at close range so the flock + # stays glued together when squeezing through the narrow gate. + # Flee at 2 m has magnitude ~10; cohesion of w=3.0 with the + # peer-CoM 4 m away contributes ~12, so the flock prefers + # bunching to dispersing under pressure. This is what makes + # canonical Strömbom drive work in our 3 m gate. + w = 3.0 if fleeing else 1.0 fx += (cx / cn - x) * w fy += (cy / cn - y) * w diff --git a/plan.md b/plan.md deleted file mode 100644 index 8a7ce27..0000000 --- a/plan.md +++ /dev/null @@ -1,458 +0,0 @@ -# RL-Driven Shepherd Herding — Implementation Plan - -This plan turns the existing Strömbom-only Webots project into a dual-mode -shepherd controller (RL primary, Strömbom fallback), with a fast Gymnasium -training environment that mirrors the Webots dynamics tightly enough for -sim-to-sim transfer. Stable-Baselines3 PPO is the learner. - ---- - -## 1. Current state (audit) - -### World geometry — `worlds/field.wbt` -- Field bounded by stone walls at **x,y ∈ [−15, +15]**. Inside-usable area is - ~[−14.5, 14.5] (`X_MIN/MAX` in `flocking.py`). -- **Pen is *inside* the field**: x ∈ [10, 13], y ∈ [−15, −8], with the - opening on its **north** side at y = −8 (post-and-rail fence W/E; open N). 
-- South stone wall has a **gate at x ∈ [10, 13], y = −15** (split wall + - gate posts at x=10 and x=13). So sheep that get penned end up between the - fence (N side at y=−8) and the south stone wall (with the wooden gate at - y=−15 currently slightly ajar). The pen is effectively an L-shape inside - the field, not external. -- Spawns: dog at origin (0, 0), 3 sheep around (3, ±2) and (4, 0). Two more - sheep are commented out. - -### Robots — protos -- **Sheep** (`protos/Sheep.proto`): differential drive, wheel radius 0.031 m, - axle half-width 0.10 m → wheel base 0.20 m. `maxVelocity = 25 rad/s` → - max linear ≈ **0.78 m/s**. Sensors: GPS, Compass, Emitter+Receiver on - channel 1. `supervisor = TRUE` (used to repaint wool pink on pen entry). -- **ShepherdDog** (`protos/ShepherdDog.proto`): differential drive, wheel - radius 0.038 m, axle half-width 0.14 m → wheel base 0.28 m. - `maxVelocity = 70 rad/s` → max linear ≈ **2.66 m/s**. Sensors: GPS, - Compass, Gyro, Accelerometer, **Lidar** (front-only, FOV 2.44 rad ≈ 140°, - 180 rays, range 0.10–12 m, noise 0.005), Emitter+Receiver on channel 1, - cosmetic ear/tail motors. - -### Sheep controller — `controllers/sheep/{sheep.py,flocking.py}` -- Reynolds-style boid stack: flee (quadratic ramp inside FLEE_DIST=7 m), - cohesion (within 8 m), separation (within 2.5 m), wall soft repulsion - (margin 5 m), wall hard escape (margin 1 m, gain 50), wander. -- Pen-aware: sheep below the gate line but outside the gate corridor get a - northward "deadzone" assist; on first entry into the pen rectangle, - sheep latches `penned=True`, repaints pink, and switches to in-pen - containment + jitter. -- Driver: heading-error PD on diff-drive (k=4), forward velocity scaled by - `cos(err)`, MAX_SPEED=22 (motor units, capped by proto's 25 rad/s). -- Stuck detector: if displacement < 0.05 m for 20 steps, drives toward - field origin to escape wall-pin (a known differential-drive failure mode). 
- -### Dog controller — `controllers/shepherd_dog/{shepherd_dog.py,strombom.py}` -- Strömbom collect/drive heuristic. CoM-radius gating - `radius > F·√n` with F=2 selects collect (push furthest sheep inward) vs - drive (push CoM toward the pen entry point at (11.5, −8.0)). -- Deadzone rescue: when a sheep is below the gate line and outside the - pen's x-corridor, the dog repositions to a "behind the sheep, opposite - the pen" stand-off so the sheep's flee vector points back through the - gate. Variants 0/1 alternate lateral offset to break corner cycles. -- Stuck-rescue, EMA action smoothing, target-deadband, RESCUE_SPEED_CAP, - cooldown — all empirical fixes for diff-drive oscillation. -- Logs full per-step debug to `dog_behavior_log.csv` (currently 7 MB — - add to `.gitignore`). - -### Deleted training scaffolding (per `git status`) -- `controllers/shepherd_dog_rl/{shepherd_dog_rl.py, final_model.zip, vecnorm.pkl, plot_debug.py}` -- `training/{config.json, herding_env.py, parity_test.py, requirements.txt, train.py, train_at.py, viz.py, runs/.gitkeep}` - -A previous attempt existed; we'll redesign rather than resurrect, keeping -only the lessons (parity-tested env, VecNormalize wrapper, eval cadence). - ---- - -## 2. Design decisions - -### 2.1 Pen location — keep inside-field with N gate -The user offered moving the pen *external* (through a wall hole). 
Tradeoffs: - -| Option | Pros | Cons | -|---|---|---| -| **(A) Keep inside-field** (current) | World already built; Strömbom logic already tuned; gate corridor is short | Dog must navigate around three pen walls; adds geometric clutter | -| (B) External pen via wall hole | Cleaner field — dog only sees sheep + outer walls; pen as goal region beyond a 3 m hole at y=−15 | Requires editing `field.wbt` (split south wall, add external pen walls beyond y<−15); existing rescue/deadzone logic must be retuned; outside-field flocking constants don't currently apply | - -**Recommendation: keep (A)** for parity with the working Strömbom controller, -but add a **simplification**: widen the pen entrance from 3 m (x ∈ [10, 13]) -to 4 m (x ∈ [9.5, 13.5]) and raise the entrance line from y=−8 to y=−7.5 -to give the dog more turning room. Optional later: gate B as a curriculum -extension (Section 7). - -### 2.2 Where to train - -PPO on Webots directly is too slow (real-time stepping, single env, slow -reset). The previous training scaffolding used a Python 2D sim — that is -the right approach. Constraints for sim-to-sim transfer: - -1. **Use the exact same flocking math**: import `controllers/sheep/flocking.py` - from the env, do not reimplement. -2. **Use the same world constants**: import `controllers/shepherd_dog/strombom.py` - for pen geometry and Strömbom baseline. -3. **Model differential drive faithfully**: match wheel-radius, base, and - max wheel-velocity from the proto files. Heading update from - `(ω_R − ω_L)·r / b`, position from `(ω_R + ω_L)·r / 2`. -4. **Match Webots step**: `basicTimeStep = 16 ms`. The sheep controller runs - at every basic step; the env will use the same `dt = 0.016 s`. -5. **Lidar deferred**: dog policy will use a *symbolic* observation - (positions of dog + sheep, plus pen geometry) — not raw lidar — for the - first iteration. Lidar-from-pixels is a much harder learning problem - and isn't required for the herding task. 
(See Section 7 for an - optional later upgrade.) - -### 2.3 Action space for the dog - -Two viable choices: - -- **(a) High-level velocity vector** `(vx, vy) ∈ [−1, 1]²`. The same - representation Strömbom emits today; the existing - `drive_action(vx, vy, ...)` function in `shepherd_dog.py` converts this - to wheel speeds. Decouples the policy from low-level diff-drive - oscillations and enables direct A/B against Strömbom. -- (b) Direct wheel speeds `(ω_L, ω_R) ∈ [−1, 1]²`. More expressive but the - policy must learn diff-drive control from scratch — which is exactly - the source of the wall-stuck and oscillation pain we're trying to - avoid. - -**Recommendation: (a)** — high-level `(vx, vy)`. Reuses the well-tuned -`drive_action` controller, which already handles `cos(err)` clamping and -turn gain. RL focuses on *strategy*, not actuation. - -### 2.4 Observation space for the dog - -Symbolic, fixed-size, normalized to [−1, 1]: - -| Field | Dim | Notes | -|---|---|---| -| Dog (x, y, cos h, sin h) | 4 | Position normalized by 15 | -| Sheep CoM (x, y) | 2 | Of *active* (not-penned) sheep | -| Sheep dispersion (radius, std-x, std-y) | 3 | Strömbom collect-vs-drive features | -| Vector dog→CoM (dx, dy, dist) | 3 | Helps the value function | -| Vector dog→pen-entry (dx, dy, dist) | 3 | | -| Vector furthest-sheep→CoM (dx, dy) | 2 | Strömbom collect target hint | -| Min sheep-to-wall distance + min dog-to-wall | 2 | Safety signal | -| Active sheep count / N_max | 1 | | -| 8-bin polar histogram of sheep around dog | 8 | Order-invariant flock shape | - -Total: **28 features**. Order-invariant by construction (histogram + summary -stats), so the policy generalizes across flock sizes 1..N_max. - -### 2.5 Reward - -Sparse-only is too hard at flock scale; we shape conservatively. 
- -``` -r_t = w_pen · ΔN_penned # +1 per newly penned sheep - + w_progress· (d_CoM_pen[t-1] − d_CoM_pen[t]) # closer-to-pen progress - + w_compact· (R[t-1] − R[t]) # tighter flock progress - − w_time · 1 # constant time penalty - − w_wall · I(min_wall_dist < 1.0 m) # dog too close to wall - − w_collide· I(dog within 0.3 m of any sheep) # avoid contact - + w_done · I(all sheep penned) # terminal bonus -``` - -Initial weights: `w_pen=2.0, w_progress=0.5, w_compact=0.2, w_time=0.005, -w_wall=0.01, w_collide=0.05, w_done=10.0`. Tune via 1-sheep curriculum -first — if the dog learns 1-sheep cleanly, the weights are sane. - -### 2.6 Episode - -- Max steps: 3000 (≈ 48 s at dt=16 ms — generous). -- Termination: all sheep penned (success), dog/sheep stuck > 600 steps with - no progress (failure), step limit (timeout). -- Reset: domain-randomized — sheep count ∈ {1..N_max}, sheep positions - uniform in field minus pen+gate corridor, dog at origin ± U(−2, 2). - -### 2.7 Curriculum - -| Stage | N_sheep | Duration (steps) | Pass criterion | -|---|---|---|---| -| 0 | 1 | 0.5 M | success ≥ 90 % | -| 1 | 2 | 1.0 M | success ≥ 80 % | -| 2 | 3 | 1.5 M | success ≥ 70 % | -| 3 | 1..3 mixed | 2.0 M | mean reward stable | -| 4 (optional) | 5 | 2.0 M | success ≥ 60 % | - -Implemented by changing only `n_sheep` in the env reset. - ---- - -## 3. Repository layout (new) - -``` -project/ -├── controllers/ -│ ├── sheep/ # unchanged -│ ├── shepherd_dog/ # Strömbom controller (renamed entry) -│ │ ├── shepherd_dog.py # mode-switch wrapper: RL | strombom -│ │ ├── strombom.py # unchanged (canonical Strömbom) -│ │ └── policy_loader.py # NEW: loads SB3 zip + VecNormalize -│ └── ... 
-├── herding/ # NEW: Python package, importable from env + controller -│ ├── __init__.py -│ ├── geometry.py # field/pen constants, in_pen(), wall helpers (single source of truth) -│ ├── flocking_sim.py # vectorised numpy port of flocking.py for fast batched sheep -│ ├── diffdrive.py # diff-drive integrator matching the proto specs -│ └── obs.py # observation builder shared by env and Webots controller -├── training/ # NEW -│ ├── herding_env.py # gymnasium.Env, single-agent (the dog) -│ ├── parity_test.py # asserts env trajectory ≈ Webots trajectory for fixed seeds -│ ├── train_ppo.py # SB3 PPO entry point -│ ├── eval.py # rollout + metrics (success rate, time-to-pen) -│ ├── configs/ -│ │ ├── ppo_default.yaml -│ │ └── curriculum.yaml -│ ├── runs/ # tensorboard + checkpoints (.gitignored) -│ └── requirements.txt -├── docs/ -│ └── project.md # unchanged -├── plan.md # this file -└── ... -``` - -`herding/` becomes the **single source of truth** for geometry and dynamics. -The Webots controllers and the training env both import from it, so when a -constant changes in one place it changes everywhere — eliminating the -sim/Webots-drift class of bugs. - -This means the existing `controllers/sheep/flocking.py` and -`controllers/shepherd_dog/strombom.py` become thin shims that re-export -from `herding/`. Webots controllers can import `herding/` because Webots -adds the project root to `sys.path` at controller startup; we'll verify. - ---- - -## 4. The Gymnasium environment — `training/herding_env.py` - -```python -class HerdingEnv(gymnasium.Env): - metadata = {"render_modes": ["rgb_array", "human"]} - - def __init__(self, n_sheep=3, max_steps=3000, dt=0.016, seed=None): - self.action_space = Box(low=-1, high=1, shape=(2,), dtype=np.float32) - self.observation_space = Box(low=-1, high=1, shape=(28,), dtype=np.float32) - ... - - def reset(self, *, seed=None, options=None): - # Random sheep positions in field \ pen corridor, dog near origin. 
- # Optional curriculum: options["n_sheep"] overrides. - ... - - def step(self, action): - vx, vy = action # high-level velocity intent - # Convert to wheel speeds via the same drive_action inverse used in Webots - wL, wR = self._diffdrive_inverse(vx, vy, self.dog_state) - self.dog_state = self._integrate_diffdrive(self.dog_state, wL, wR, self.dt) - # Step every sheep one boid step (vectorized in flocking_sim.py) - self.sheep_state = self._step_sheep(self.sheep_state, self.dog_state) - # Update penned set, compute reward, observation, done flags - ... -``` - -Key points: -- **Vectorised sheep update**: re-implements `flocking.py` in numpy so 100 - parallel envs with 5 sheep each take ms, not seconds. Numerical parity - with the scalar version is asserted in `parity_test.py`. -- **Same diff-drive integrator** for the dog as Webots will see at - inference. Wall + pen-fence collisions clamp position (a Webots-realistic - no-pass-through approximation). -- **Domain randomization** in reset: sheep count, spawn positions, sheep - flock-parameter jitter (±10 % on FLEE_DIST, COHESION_DIST, etc.) for - robustness. - ---- - -## 5. Training pipeline — `training/train_ppo.py` - -- **Algorithm**: SB3 `PPO` with `MlpPolicy`, `n_steps=2048`, `batch_size=256`, - `n_epochs=10`, `gamma=0.995`, `gae_lambda=0.95`, `clip_range=0.2`, - `ent_coef=0.005`, `vf_coef=0.5`, `learning_rate=3e-4`. -- **Vec envs**: `SubprocVecEnv` × 16 parallel envs (the env is pure numpy - so subprocs are CPU-cheap). -- **Normalization**: `VecNormalize(norm_obs=True, norm_reward=True, - clip_obs=10.0)`. Pickled alongside the policy zip — both required at - inference. -- **Callbacks**: - - `CheckpointCallback` every 100 k steps. - - `EvalCallback` on a separate eval env (no normalization-update) every - 50 k steps; logs success rate and time-to-pen to TensorBoard. - - Custom `CurriculumCallback`: bumps `n_sheep` when eval success rate - crosses the stage threshold for 3 consecutive evals. 
-- **Determinism for debugging**: seed-pinned eval env so regressions are - catchable. - ---- - -## 6. Webots integration — RL inference path - -`controllers/shepherd_dog/shepherd_dog.py` becomes a thin wrapper: - -```python -MODE = os.environ.get("HERDING_MODE", "rl") # "rl" | "strombom" - -if MODE == "rl": - policy = policy_loader.load("training/runs/best/policy.zip", - "training/runs/best/vecnormalize.pkl") - obs_fn = build_obs # from herding/obs.py -else: - obs_fn = None # strombom path uses sheep_positions directly - -while robot.step(timestep) != -1: - receive_messages() - if MODE == "rl": - obs = obs_fn(dog_xy, dog_heading, sheep_positions, ...) - action, _ = policy.predict(obs, deterministic=True) - vx, vy = action.tolist() - else: - vx, vy, mode, dbg = compute_action_debug(dog_xy, sheep_positions, PEN_ENTRY) - # plus existing rescue/cooldown/EMA layer - drive_action(vx, vy, ...) -``` - -A **safety supervisor** wraps the RL output: if `obs` indicates the dog is -< 0.6 m from a wall, override with the existing wall-escape behavior -(reverse + turn). This is a hard guarantee diff-drive needs because PPO -may not discover wall-escape reliably from on-policy data. - -`policy_loader.py` handles the SB3 import lazily so the controller still -works with `MODE=strombom` even if SB3 is not installed in the Webots -Python environment. - ---- - -## 7. Optional extensions (post-baseline) - -- **External pen** (Section 2.1 option B): edit `field.wbt` to extend the - south wall hole into an external L-shaped pen with its own walls; update - `herding/geometry.py`; retrain stage 3 only. -- **Lidar observation**: replace symbolic obs with 36-bin downsampled - lidar + ego state; train end-to-end. Useful as the "extra merit" - dimension in the project doc. -- **Two-dog mode**: make env multi-agent, train with `MAPPO`-style shared - critic or independent PPO. The proto already supports multiple dog - instances; world only needs a second `ShepherdDog` node. 
-- **Mecanum comparison**: swap the dog proto for a mecanum variant; same - policy, different `_integrate_diffdrive` (becomes holonomic). -- **Sheep flock size scaling**: 5, 10, 20 — the obs is order-invariant so - the same policy generalises; just curriculum further. - ---- - -## 8. Risks & mitigations - -| Risk | Mitigation | -|---|---| -| Sim-to-Webots gap (sheep dynamics, wall friction) | `parity_test.py` asserts trajectory match within tolerance for fixed seeds; if it fails, fix the env, not the policy | -| Dog learns to wall-pin sheep against fence | Add `w_collide` penalty + min-sheep-to-wall term in obs; curriculum from 1 sheep first | -| PPO oscillation collapses into spinning | Action smoothing in env step (EMA on `(vx, vy)`, mirroring `ACTION_SMOOTH=0.35` from Strömbom controller); reward small `‖a_t − a_{t-1}‖` penalty | -| Pen approach failures (sheep refuse gate) | Reuse the existing `deadzone_rescue` as a *scripted fallback* triggered when a sheep has been deadzoned > 200 steps — RL handles the common case, scripted handles the corner | -| Gym version mismatch (gymnasium vs gym) | Lock to `gymnasium>=0.29`, `stable-baselines3>=2.3` in requirements | - ---- - -## 9. Milestones (suggested order of implementation) - -1. **M0 — Refactor** (no behavior change): create `herding/` package, move - constants out of `flocking.py`/`strombom.py`, leave shims; verify - Webots still runs Strömbom unchanged. Add `dog_behavior_log.csv` to - `.gitignore`. -2. **M1 — Env & parity**: `herding_env.py`, `parity_test.py`. Asserts - sheep + dog trajectories match Webots within tolerance for 5 fixed - seeds. *Done when parity test green.* -3. **M2 — PPO baseline**: train Stage 0 (1 sheep) for 0.5 M steps; eval - in env at ≥ 90 % success. -4. **M3 — Webots inference**: load Stage 0 policy in `shepherd_dog.py` - with `HERDING_MODE=rl`; verify the dog herds 1 sheep into the pen in - the actual Webots world. *This is the sim-to-sim transfer gate.* -5. 
**M4 — Curriculum**: stages 1–3, ~5 M steps total, with checkpoints - and eval logs. -6. **M5 — Strömbom comparison**: run both controllers on a fixed eval - suite (same seeds, 1/2/3 sheep), log success rate and time-to-pen. - This is a deliverable for the project's "quantitative evaluation" - goal. -7. **M6 — Documentation**: a short README in `training/` showing how to - train, evaluate, and switch modes in Webots. - -Each milestone is independently demoable. M0–M3 is the critical path to -"RL works in Webots"; M4–M6 polishes it for the project deliverable. - ---- - -## 10. Decisions (locked in by implementation) - -- **Pen layout**: option B (external pen). The pen sits south of the - field at x ∈ [10, 13], y ∈ [-22, -15] and is reached through the - existing 3 m gap in the south stone wall. The old in-field - quarantine fence is gone and the wooden gate is modeled as - swung-open and parked on the west gate post so the corridor is - unobstructed. This kills the deadzone class entirely. -- **Flock size**: 1..10 sheep, sampled uniformly each reset. The order- - invariant observation (CoM, dispersion, polar histogram) lets a - single policy generalise across the whole range. A curriculum widens - ``max_n_sheep`` from 1 to 10 over training to keep early exploration - tractable. -- **Single-sheep mode**: handled by the same policy (n_sheep=1 is the - first stage of the curriculum and stays in the training distribution - throughout). No separate model. -- **Hardware**: GPU for training. SubprocVecEnv × 16 on CPU feeds an - MlpPolicy on GPU; ~2–3 h for the full curriculum. - -## 11. 
What was built - -``` -herding/ # single source of truth, importable from both - geometry.py # field/pen constants, latch helpers, robot specs - flocking_sim.py # Reynolds boid step (matches Webots controller) - diffdrive.py # diff-drive kinematics + velocity↔wheels - obs.py # 28-D order-invariant observation builder - strombom.py # collect/drive heuristic (baseline + fallback) - -worlds/field.wbt # external pen south of field, 10 sheep slots, - # gate parked open, in-field fence removed - -controllers/sheep/sheep.py # imports from herding/, latches on - # is_penned_position -controllers/shepherd_dog/ - shepherd_dog.py # mode switch (HERDING_MODE=rl|strombom), - # safety supervisor for DOG_SOUTH_LIMIT - policy_loader.py # lazy SB3 zip + VecNormalize loader - strombom.py # shim re-exporting herding.strombom - -training/ - herding_env.py # gymnasium.Env, action smoothing, reward shaping - train_ppo.py # SB3 PPO with VecNormalize, eval, checkpoints, - # curriculum callback - eval.py # success-rate / time-to-pen across n_sheep - parity_test.py # shape, determinism, baseline-rollout smoke test - configs/ppo_default.yaml - requirements.txt - README.md # how to train, evaluate, switch modes in Webots -``` - -## 12. To run - -```bash -# 1. Install deps (CUDA-enabled torch wheel for GPU) -pip install -r training/requirements.txt - -# 2. Smoke test -python -m training.parity_test - -# 3. Train (5 M steps, ~2–3 h on a single GPU) -python -m training.train_ppo --out-dir training/runs/baseline - -# 4. Evaluate vs Strömbom -python -m training.eval --policy training/runs/baseline/best -python -m training.eval --policy strombom - -# 5. 
Run in Webots -export HERDING_MODE=rl -export HERDING_POLICY_DIR=$PWD/training/runs/baseline/best -webots worlds/field.wbt -``` diff --git a/tools/camera_debug.py b/tools/camera_debug.py deleted file mode 100644 index e937ed2..0000000 --- a/tools/camera_debug.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -Viewpoint inspector — prints position, orientation and FOV to the console -once per second. Attach as the controller of a dummy supervisor robot to -copy-paste exact camera values into field.wbt. -""" - -from controller import Supervisor - -robot = Supervisor() -timestep = int(robot.getBasicTimeStep()) -vp = robot.getFromDef("VIEWPOINT") - -step = 0 -while robot.step(timestep) != -1: - if step % 60 == 0: - pos = vp.getField("position").getSFVec3f() - ori = vp.getField("orientation").getSFRotation() - fov = vp.getField("fieldOfView").getSFFloat() - print(f"position: {pos[0]:.3f} {pos[1]:.3f} {pos[2]:.3f}") - print(f"orientation: {ori[0]:.3f} {ori[1]:.3f} {ori[2]:.3f} {ori[3]:.3f}") - print(f"fieldOfView: {fov:.3f}\n") - step += 1 diff --git a/tools/collect_demos.py b/tools/collect_demos.py index 7a767d6..a06a24c 100644 --- a/tools/collect_demos.py +++ b/tools/collect_demos.py @@ -27,11 +27,19 @@ if _PROJECT_ROOT not in sys.path: import numpy as np from herding.geometry import PEN_ENTRY -from herding.sequential import compute_action +from herding.sequential import compute_action as sequential_action +from herding.strombom import compute_action as strombom_action from training.herding_env import HerdingEnv -def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int): +TEACHERS = { + "sequential": sequential_action, + "strombom": strombom_action, +} + + +def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int, + teacher_fn): env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, difficulty=1.0, seed=seed) obs, _ = env.reset(seed=seed) @@ -41,7 +49,7 @@ def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int): for i in 
range(env.n_sheep) if not env.sheep_penned[i]} if not positions: break - vx, vy, _mode = compute_action( + vx, vy, _mode = teacher_fn( (env.dog_x, env.dog_y), positions, PEN_ENTRY, ) action = np.array([vx, vy], dtype=np.float32) @@ -70,7 +78,12 @@ def main(): help="Keep every Nth (obs, action) pair.") parser.add_argument("--keep-failures", action="store_true", help="Include partial-success trajectories. Default off.") + parser.add_argument("--teacher", default="sequential", + choices=list(TEACHERS.keys()), + help="Which analytic teacher to demonstrate.") args = parser.parse_args() + teacher_fn = TEACHERS[args.teacher] + print(f"[demos] teacher: {args.teacher}") n_sheep_list = [int(x) for x in args.n_sheep_list.split(",")] print(f"[demos] grid: n_sheep={n_sheep_list}, seeds={args.seeds_per_n}, " @@ -83,7 +96,7 @@ def main(): for n in n_sheep_list: for seed in range(args.seeds_per_n): obs, actions, success, total_steps = collect_one( - n, seed, args.max_steps, args.subsample, + n, seed, args.max_steps, args.subsample, teacher_fn, ) n_total += 1 if success: diff --git a/tools/run_webots.sh b/tools/run_webots.sh index cf26b74..27812ea 100755 --- a/tools/run_webots.sh +++ b/tools/run_webots.sh @@ -15,7 +15,7 @@ # tools/run_webots.sh 3 strombom # canonical baseline, 3 sheep # # Notes: -# * The RL mode loads training/runs/bc_pretrained/policy.zip by default. +# * The RL mode loads training/runs/bc_solo/policy.zip by default. # Override via HERDING_POLICY_DIR=/path/to/run env var. # * Conda env "tir" must be active (provides stable-baselines3 + torch). 
@@ -46,12 +46,12 @@ echo "------------------------------------------------------------" echo "World : $DST" echo "Mode : $MODE" echo "Sheep : $active active" -echo "Policy dir : ${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_pretrained}" +echo "Policy dir : ${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_solo}" echo "------------------------------------------------------------" # Webots strips HERDING_* env vars from controller subprocesses in some # setups, so we also write a runtime config file the controller reads. -RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_pretrained}" +RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_solo}" cat > "$ROOT/herding_runtime.cfg" </best/`. -2. In Webots, set the dog controller's environment variables: - - ```bash - export HERDING_MODE=rl - export HERDING_POLICY_DIR=$(pwd)/training/runs/baseline/best - webots worlds/field.wbt - ``` - - Or set them via Webots' controller args / a `.wbproj` if you prefer. - -3. To force the Strömbom baseline (same world, same controller): - - ```bash - export HERDING_MODE=strombom - webots worlds/field.wbt - ``` - -If `HERDING_MODE=rl` but the policy can't be loaded (SB3 not installed, -zip missing, etc.), the controller logs the error and falls back to -Strömbom automatically. - -## Curriculum knobs - -The default schedule in `configs/ppo_default.yaml` widens -`max_n_sheep` over training. Each reset samples `n_sheep ~ U[1, -max_n_sheep]`, so the final policy has seen every flock size from 1 to -10 in proportion. To pin a specific size, instantiate the env with -`HerdingEnv(n_sheep=N)` (see `eval.py`). - -## Reward shaping - -Weights live in class attributes on `HerdingEnv`. Tune from the 1-sheep -curriculum first — if the dog can't herd a single sheep cleanly, raising -`W_PROGRESS` or lowering `W_TIME` is usually the fix. For multi-sheep -collapse modes (dog spins between sheep), increase `W_COMPACT` so -tightening the flock pays. 
+The script is left in place because the abstractions are sound and the +code is reusable for follow-up work (e.g. KL-regularised fine-tune +with a frozen reference policy). Not part of the deliverable pipeline. diff --git a/training/bc_pretrain.py b/training/bc_pretrain.py index 3a82147..6e43b0a 100644 --- a/training/bc_pretrain.py +++ b/training/bc_pretrain.py @@ -1,20 +1,21 @@ -"""Behavior cloning of the sequential teacher into an SB3-compatible policy. +"""Behavior cloning of an analytic teacher into an SB3-compatible policy. -Trains the policy network (mean-action head) of an SB3 ``MlpPolicy`` to -mimic the demonstrations collected by ``tools.collect_demos``. The -saved zip is loadable via ``PPO.load(...)`` and can be passed to -``train_ppo.py --resume`` for fine-tuning. +Trains the policy network (mean-action head) of an SB3 ``MlpPolicy`` +to mimic the (obs, action) demonstrations produced by +``tools.collect_demos``. The saved zip is loadable via ``PPO.load(...)`` +and is what the Webots dog controller uses in ``HERDING_MODE=rl``. -Why this works: the teacher (sequential single-target driving) solves -n=10 at 80%+ in our env. BC gives the RL a competent starting policy, -so PPO doesn't have to discover behavior from scratch — it only has to -*refine* the teacher's strategy via the sparse pen reward. +Loss: MSE + (1 - cosine similarity). The cosine term is what stops +the policy mean from collapsing toward zero against unit-vector +targets. Best-by-val_cos checkpoint is restored at the end of training +so noisy multi-modal teachers (e.g. Strömbom) don't lose progress when +the last epoch lands on a bad gradient step. 
Usage:: python -m training.bc_pretrain \\ --demos training/demos.npz \\ - --out training/runs/bc_pretrained + --out training/runs/bc_flock """ from __future__ import annotations @@ -80,7 +81,7 @@ def policy_forward_mean(policy, obs_batch): def main(): parser = argparse.ArgumentParser() parser.add_argument("--demos", default="training/demos.npz") - parser.add_argument("--out", default="training/runs/bc_pretrained") + parser.add_argument("--out", default="training/runs/bc_solo") parser.add_argument("--epochs", type=int, default=60) parser.add_argument("--batch-size", type=int, default=256) parser.add_argument("--lr", type=float, default=1e-3) @@ -147,6 +148,11 @@ def main(): f"lr={args.lr} device={args.device}") t_start = time.time() best_val = float("inf") + best_cos = -1.0 + # Snapshot the best-by-val_cos policy weights and restore at the end — + # training is noisy on multi-modal teachers (e.g. Strömbom collect/drive), + # so the last epoch is often worse than an earlier one. + best_state = None def combined_loss(pred, target): mse = nn.functional.mse_loss(pred, target) @@ -201,6 +207,14 @@ def main(): f"val_mse={val_mse:.4f} val_cos={cos_sim:+.3f}") if val_mse < best_val: best_val = val_mse + if cos_sim > best_cos: + best_cos = cos_sim + best_state = {k: v.detach().cpu().clone() + for k, v in policy.state_dict().items()} + + if best_state is not None: + policy.load_state_dict(best_state) + print(f"[bc] restored best-val_cos snapshot (cos={best_cos:.3f})") elapsed = time.time() - t_start print(f"[bc] done in {elapsed:.0f}s best_val_mse={best_val:.4f}") diff --git a/training/demos.npz b/training/demos.npz deleted file mode 100644 index b84e4b2..0000000 Binary files a/training/demos.npz and /dev/null differ diff --git a/training/eval.py b/training/eval.py index af3af36..29ec099 100644 --- a/training/eval.py +++ b/training/eval.py @@ -26,8 +26,8 @@ if _PROJECT_ROOT not in sys.path: import numpy as np from herding.geometry import MAX_SHEEP, PEN_ENTRY -from 
herding.strombom import compute_action as strombom_action from herding.sequential import compute_action as sequential_action +from herding.strombom import compute_action as strombom_action from training.herding_env import HerdingEnv diff --git a/training/runs/bc_flock/policy.zip b/training/runs/bc_flock/policy.zip new file mode 100644 index 0000000..6dc913a Binary files /dev/null and b/training/runs/bc_flock/policy.zip differ diff --git a/training/runs/bc_solo/policy.zip b/training/runs/bc_solo/policy.zip new file mode 100644 index 0000000..10721ab Binary files /dev/null and b/training/runs/bc_solo/policy.zip differ diff --git a/training/train_ppo.py b/training/train_ppo.py index 4a674b0..c506aea 100644 --- a/training/train_ppo.py +++ b/training/train_ppo.py @@ -1,18 +1,31 @@ -"""Train a PPO shepherd-dog policy on ``HerdingEnv`` with curriculum. +"""PPO trainer for the shepherd-dog policy — EXPERIMENTAL. -Defaults to 16 parallel ``SubprocVecEnv`` workers feeding a GPU policy. -Saves checkpoints, the best-eval model, and the VecNormalize stats — -all three are needed at inference time by the Webots controller. +The deliverable pipeline is `bc_pretrain.py` (see ``training/README.md``). +This script is kept in the tree because it implements: -Usage:: +* PPO from scratch with curriculum over flock size + spawn area, and +* PPO fine-tune of a behavior-cloned policy. + +Both ran into stability issues in our setting (long-horizon credit +assignment for sparse pen reward, BC-degradation under PPO exploration +noise). The abstractions are reusable for follow-up work — e.g. +KL-regularised fine-tune with a frozen reference policy — so we leave +the code in place. 
+ +Usage (PPO from scratch):: python -m training.train_ppo \ --config training/configs/ppo_default.yaml \ - --out-dir training/runs/baseline + --out-dir training/runs/ppo_scratch -To resume from a checkpoint:: +Usage (PPO fine-tune of BC):: - python -m training.train_ppo --resume training/runs/baseline/checkpoints/ppo_500000_steps.zip + python -m training.train_ppo \ + --resume training/runs/bc_flock/policy.zip \ + --out-dir training/runs/bc_ppo \ + --no-vecnorm --no-curriculum --imitate-weight 0 \ + --difficulty 1.0 --log-std -1.5 --learning-rate 5e-5 \ + --total-timesteps 3000000 """ from __future__ import annotations diff --git a/worlds/.field.wbproj b/worlds/.field.wbproj deleted file mode 100644 index 255f061..0000000 --- a/worlds/.field.wbproj +++ /dev/null @@ -1,9 +0,0 @@ -Webots Project File version R2025a -perspectives: 000000ff00000000fd00000002000000010000011c000001bcfc0200000001fb0000001400540065007800740045006400690074006f00720100000000000001bc0000003f00ffffff00000003000005c600000220fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000005c60000006900ffffff000004a8000001bc00000001000000020000000100000008fc00000000 -simulationViewPerspectives: 000000ff000000010000000200000100000006250100000002010000000100 -sceneTreePerspectives: 000000ff00000001000000030000001f000000c0000000000100000002010000000200 -maximizedDockId: -1 -centralWidgetVisible: 1 -orthographicViewHeight: 1 -textFiles: -1 -consoles: Console:All:All