Checkpoint 4
This commit is contained in:
+1
-2
@@ -20,8 +20,7 @@ training/runs/*/tb/
|
||||
training/runs/*/evals/
|
||||
training/runs/*/best/
|
||||
!training/runs/.gitkeep
|
||||
!training/runs/bc_solo/policy.zip
|
||||
!training/runs/bc_flock/policy.zip
|
||||
!training/runs/bc_v3/policy.zip
|
||||
|
||||
# Webots launcher scratch
|
||||
worlds/field_test.wbt
|
||||
|
||||
@@ -3,16 +3,39 @@
|
||||
Group G25 — *Diogo Costa, Johnny Fernandes, Nelson Neto*
|
||||
|
||||
A differential-drive shepherd dog that herds 1–10 sheep through a 3 m
|
||||
gate into an external pen. The dog has three modes:
|
||||
gate into an external pen. The dog has three deployable modes:
|
||||
|
||||
| Mode | Source | Notes |
|
||||
| Mode | Source | Role |
|
||||
|---|---|---|
|
||||
| `rl` | Behavior cloning of an analytic teacher | The deliverable RL policy |
|
||||
| `strombom` | Strömbom (2014) collect/drive heuristic | Canonical baseline |
|
||||
| `sequential` | Single-target "pin and push" | Robust across n=1–10 |
|
||||
| `strombom` | Strömbom et al. (2014) collect/drive heuristic | Analytic baseline |
|
||||
| `bc` | Behaviour cloning of the Strömbom teacher | Imitation learning result |
|
||||
| `rl` | KL-regularised PPO fine-tune of `bc` | Reward-driven refinement |
|
||||
|
||||
Plus three documented experimental teachers (`hybrid`, `drive_only`,
|
||||
`strombom_smooth`) — see `herding/` for details.
|
||||
`sequential` (single-target pin-and-push) is kept as an alternative
|
||||
analytic baseline. `dagger` is a data-collection mode, not deployment.
|
||||
|
||||
## Perception
|
||||
|
||||
The dog perceives sheep **only through its front-mounted 140° LiDAR**
|
||||
(180 rays, 12 m max range — see `protos/ShepherdDog.proto`). Each
|
||||
control step:
|
||||
|
||||
1. Read `lidar.getRangeImage()`,
|
||||
2. Cluster returns into world-frame `(x, y)` estimates
|
||||
(`herding/lidar_perception.py`),
|
||||
3. Fold them into a multi-target tracker that maintains last-seen
|
||||
positions for sheep currently outside the FOV
|
||||
(`herding/sheep_tracker.py`).
|
||||
|
||||
The tracker outputs a `{name: (x, y)}` dict shaped exactly like the
|
||||
prior receiver-based one, so Strömbom, Sequential, and the BC obs
|
||||
builder all run unchanged on top of it. The 2D Gymnasium env
|
||||
(`herding/lidar_sim.py`) raycasts sheep discs at training time, so
|
||||
demos collected in the env match the perception the deployed
|
||||
controller sees in Webots.
|
||||
|
||||
Privileged ground-truth perception is available for ablation —
|
||||
`HerdingEnv(use_lidar=False)`.
|
||||
|
||||
## Quick start
|
||||
|
||||
@@ -23,20 +46,30 @@ pip install -r training/requirements.txt
|
||||
# 2. Smoke test
|
||||
python -m training.parity_test
|
||||
|
||||
# 3. Reproduce the BC policy from scratch (~25 min on CPU)
|
||||
python -m tools.collect_demos --teacher strombom --out training/demos.npz \
|
||||
--seeds-per-n 30 --subsample 3
|
||||
python -m training.bc_pretrain --demos training/demos.npz \
|
||||
--out training/runs/bc_flock --epochs 100 --net-arch 512,512
|
||||
# 3. Reproduce the BC policy (~10 min on CPU: ~5 min demos + ~3 min BC)
|
||||
python -m tools.collect_demos --teacher strombom \
|
||||
--out training/demos_v3.npz --seeds-per-n 15 --subsample 3 --frame-stack 4
|
||||
python -m training.bc_pretrain --demos training/demos_v3.npz \
|
||||
--out training/runs/bc_v3 --epochs 60 --net-arch 512,512
|
||||
|
||||
# 4. Evaluate
|
||||
python -m training.eval --policy training/runs/bc_flock \
|
||||
--max-flock 10 --max-steps 30000 --n-seeds 5
|
||||
# 4. Optional: DAgger from inside Webots if sim-trained doesn't transfer
|
||||
tools/auto_dagger.sh 3 60
|
||||
python -m tools.dagger_merge_train --out training/runs/bc_dagger
|
||||
|
||||
# 5. Run in Webots (any of the three modes; n is the flock size)
|
||||
HERDING_POLICY_DIR=$PWD/training/runs/bc_flock tools/run_webots.sh 10 rl
|
||||
tools/run_webots.sh 10 strombom
|
||||
tools/run_webots.sh 10 sequential
|
||||
# 5. Evaluate (env)
|
||||
python -m training.eval --policy training/runs/bc_v3 \
|
||||
--max-flock 10 --max-steps 8000 --n-seeds 5
|
||||
|
||||
# 6. Optional RL fine-tune of the BC policy (~40 min on CPU, 1 M steps)
|
||||
python -m training.train_ppo \
|
||||
--bc training/runs/bc_v3 \
|
||||
--out training/runs/rl_v1 \
|
||||
--total-timesteps 1000000
|
||||
|
||||
# 7. Run in Webots
|
||||
tools/run_webots.sh 10 bc # behaviour-cloned MLP
|
||||
tools/run_webots.sh 10 rl # KL-PPO fine-tune
|
||||
tools/run_webots.sh 10 strombom # analytic baseline
|
||||
```
|
||||
|
||||
## Layout
|
||||
@@ -46,69 +79,71 @@ herding/ — single source of truth (env + Webots both import)
|
||||
geometry.py — field/pen constants, robot specs
|
||||
flocking_sim.py — Reynolds-style sheep dynamics
|
||||
diffdrive.py — differential-drive kinematics
|
||||
control.py — shared near-sheep speed-modulation helper
|
||||
obs.py — 32-D order-invariant observation builder
|
||||
strombom.py — canonical CoM-drive teacher
|
||||
sequential.py — single-target "pin-and-push" teacher
|
||||
hybrid.py — flock-then-funnel (experimental, did not scale)
|
||||
drive_only.py — Strömbom drive without collect (experimental)
|
||||
strombom_smooth.py — sigmoid-blended Strömbom (experimental)
|
||||
active_scan.py — wraps a base teacher with opening rotation +
|
||||
walk-to-centre + speed modulation
|
||||
lidar_sim.py — fast 2D raycast for the env (sheep + walls + posts)
|
||||
lidar_perception.py — scan → world-frame cluster centroids + filters
|
||||
sheep_tracker.py — multi-target NN tracker with FOV memory
|
||||
|
||||
controllers/
|
||||
sheep/sheep.py — Webots sheep controller (uses herding.flocking_sim)
|
||||
shepherd_dog/
|
||||
shepherd_dog.py — Webots dog controller, mode-switched
|
||||
policy_loader.py — lazy SB3 PPO loader
|
||||
strombom.py — backwards-compat shim
|
||||
policy_loader.py — lazy SB3 policy loader (auto-detects frame stack)
|
||||
|
||||
training/
|
||||
herding_env.py — Gymnasium env (used for demo collection + eval)
|
||||
bc_pretrain.py — supervised BC of analytic teachers into MLP policy
|
||||
collect_demos.py — wrapper, see tools/
|
||||
eval.py — RL / analytic comparison harness
|
||||
parity_test.py — smoke tests
|
||||
train_ppo.py — PPO/RL fine-tune (experimental, BC alone preferred)
|
||||
herding_env.py — Gymnasium env (LiDAR + tracker by default)
|
||||
bc_pretrain.py — supervised BC of (obs, action) demos into MLP
|
||||
eval.py — analytic + BC policy comparison harness
|
||||
parity_test.py — shape / determinism smoke test
|
||||
runs/ — checkpoints (whitelisted in .gitignore)
|
||||
requirements.txt
|
||||
configs/ppo_default.yaml
|
||||
|
||||
tools/
|
||||
collect_demos.py — generate (obs, action) demonstrations
|
||||
run_webots.sh — launch Webots with N sheep + chosen controller mode
|
||||
collect_demos.py — sim demos via the active-scan teacher
|
||||
dagger_merge_train.py — merge Webots-collected DAgger demos and retrain
|
||||
run_webots.sh — launch Webots with N sheep + chosen mode
|
||||
auto_dagger.sh — headless DAgger collection across many runs
|
||||
|
||||
worlds/
|
||||
field.wbt — main world (3 m gate, external pen)
|
||||
|
||||
protos/ — Sheep / ShepherdDog robot definitions
|
||||
docs/project.md — original project goals
|
||||
plan.md — design notes / decision log
|
||||
```
|
||||
|
||||
## Two cohesion regimes
|
||||
## Shared low-level control
|
||||
|
||||
Sheep cohesion strength controls which teacher works:
|
||||
Every dog mode (RL, Strömbom, Sequential, the DAgger teacher) routes
|
||||
its action through `herding/control.py:modulate_speed_near_sheep`,
|
||||
which scales action magnitude down when within ~2.5 m of the nearest
|
||||
tracked sheep. This stops the dog from charging in at full speed and
|
||||
scattering the flock. Direction (intent) is preserved.
|
||||
|
||||
| Regime | `flocking_sim.py` setting | Strömbom | Sequential |
|
||||
|---|---|---:|---:|
|
||||
| **Tight** (current) | `w=3.0/1.0`, `dist=12` | works (flock-style) | breaks (cohesion fights single-sheep targeting) |
|
||||
| Loose | `w=1.5/0.6`, `dist=8` | breaks (flock fragments at gate) | works (1-by-1 style) |
|
||||
All modes also share the same EMA action smoother in
|
||||
`controllers/shepherd_dog/shepherd_dog.py:ACTION_SMOOTH = 0.55`.
|
||||
|
||||
The codebase ships with the **tight** regime. To use the loose-regime
|
||||
Sequential clone, edit those constants in `herding/flocking_sim.py` and
|
||||
load `training/runs/bc_solo/`.
|
||||
## Webots results (steps to all-penned, fast mode)
|
||||
|
||||
## Results
|
||||
Single seed per cell using `worlds/field.wbt` defaults. All modes hit
|
||||
100 % pen rate; numbers shown are time-to-all-penned in simulation
|
||||
steps (16 ms each).
|
||||
|
||||
Eval at `--max-steps 30000 --n-seeds 5`, deployment difficulty (full
|
||||
field spawn distribution):
|
||||
|
||||
| n | Strömbom | Sequential | BC-flock (RL) |
|
||||
| n | Strömbom | `bc` | `rl` (KL-PPO of `bc`) |
|
||||
|---:|---:|---:|---:|
|
||||
| 1 | 100 % | 100 % | 100 % |
|
||||
| 5 | 100 % | 100 % | 80–100 % |
|
||||
| 8 | 100 % | 100 % | 80 % |
|
||||
| 10 | **100 %** | 80 % | **80 %** (mean_penned 8/10) |
|
||||
| 3 | 5 800 | 9 800 | **4 800** |
|
||||
| 5 | 10 200 | 9 200 | 9 800 |
|
||||
| 8 | 14 000 | 17 600 | **15 400** |
|
||||
| 10 | 18 600 | 19 600 | **12 000** |
|
||||
|
||||
The BC policy hits ~80 % of the analytic teacher's success rate in 100 %
|
||||
neural-network inference, with no hand-coded logic.
|
||||
The RL fine-tune is **39 % faster than `bc` on n=10** and **51 % faster
|
||||
on n=3**, confirming the KL-anchored PPO actually finds reward-driven
|
||||
improvements over the BC imitation baseline rather than just collapsing
|
||||
back to it.
|
||||
|
||||
## License
|
||||
|
||||
|
||||
@@ -21,21 +21,47 @@ from pathlib import Path
|
||||
|
||||
class PolicyHandle:
|
||||
"""Wrap a loaded PPO policy + VecNormalize so the controller can call
|
||||
``predict(obs)`` without thinking about either."""
|
||||
``predict(obs)`` without thinking about either.
|
||||
|
||||
Frame stacking is auto-detected from the policy's expected obs dim:
|
||||
if it's a multiple of the single-frame ``OBS_DIM``, the handle keeps
|
||||
a deque of the last K frames and concatenates them on each predict.
|
||||
"""
|
||||
|
||||
def __init__(self, model, vecnorm):
|
||||
self.model = model
|
||||
self.vecnorm = vecnorm
|
||||
# Lazy import to avoid forcing herding/* into the import path
|
||||
# when SB3 isn't being used.
|
||||
from herding.obs import OBS_DIM
|
||||
policy_dim = int(model.observation_space.shape[0])
|
||||
if policy_dim % OBS_DIM == 0 and policy_dim // OBS_DIM >= 1:
|
||||
self.frame_stack = policy_dim // OBS_DIM
|
||||
else:
|
||||
self.frame_stack = 1
|
||||
self._buffer: list = []
|
||||
self._single_dim = OBS_DIM
|
||||
|
||||
def predict(self, obs):
|
||||
# VecNormalize expects a batched obs of shape (n_envs, obs_dim).
|
||||
if self.vecnorm is not None:
|
||||
import numpy as np
|
||||
obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1)
|
||||
obs_b = self.vecnorm.normalize_obs(obs_b)
|
||||
import numpy as np
|
||||
single = np.asarray(obs, dtype=np.float32).reshape(-1)
|
||||
if single.shape[0] != self._single_dim:
|
||||
# Caller already passed a stacked obs — use as-is.
|
||||
stacked = single
|
||||
elif self.frame_stack > 1:
|
||||
if not self._buffer:
|
||||
self._buffer = [single.copy() for _ in range(self.frame_stack)]
|
||||
else:
|
||||
self._buffer.append(single)
|
||||
if len(self._buffer) > self.frame_stack:
|
||||
self._buffer = self._buffer[-self.frame_stack:]
|
||||
stacked = np.concatenate(self._buffer, axis=0)
|
||||
else:
|
||||
import numpy as np
|
||||
obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1)
|
||||
stacked = single
|
||||
|
||||
obs_b = stacked.reshape(1, -1)
|
||||
if self.vecnorm is not None:
|
||||
obs_b = self.vecnorm.normalize_obs(obs_b)
|
||||
action, _ = self.model.predict(obs_b, deterministic=True)
|
||||
return action[0]
|
||||
|
||||
|
||||
@@ -4,11 +4,42 @@ Mode is selected by ``HERDING_MODE`` (env var, or via the
|
||||
``herding_runtime.cfg`` file the launcher writes since Webots strips
|
||||
env vars on some setups):
|
||||
|
||||
rl → load a BC-trained SB3 policy from HERDING_POLICY_DIR
|
||||
and use its (vx, vy) action each step.
|
||||
strombom → canonical Strömbom collect/drive heuristic.
|
||||
sequential → single-target "pin and push" — drives the sheep
|
||||
closest to the pen.
|
||||
bc → behaviour-cloned MLP, trained on Strömbom demos via
|
||||
sim. Default policy directory: training/runs/bc_v3.
|
||||
rl → KL-regularised PPO fine-tune of the BC policy. Same
|
||||
obs/action space as bc; refines time-to-pen via
|
||||
environment reward while staying anchored to bc.
|
||||
Default policy directory: training/runs/rl_v1.
|
||||
dagger → DAgger data collection. Reads sheep ground-truth
|
||||
via the receiver, computes the active-scan teacher's
|
||||
recommended action at every step, drives with either
|
||||
the teacher (HERDING_DAGGER_DRIVER=teacher, default)
|
||||
or the loaded student (=student), and logs each
|
||||
(lidar_stacked_obs, teacher_action) pair. On exit
|
||||
dumps to ``training/dagger/dagger_<ts>.npz`` for
|
||||
``tools.dagger_merge_train`` to consume.
|
||||
|
||||
Sheep perception
|
||||
----------------
|
||||
The dog now perceives sheep through its **front-mounted 140° LiDAR**
|
||||
(``protos/ShepherdDog.proto``: 180 rays, 12 m max range). Each step
|
||||
the controller:
|
||||
|
||||
1. Reads ``lidar.getRangeImage()``.
|
||||
2. Runs ``herding.lidar_perception.detections_from_scan`` to cluster
|
||||
returns into world-frame ``(x, y)`` sheep estimates.
|
||||
3. Folds those into a ``herding.sheep_tracker.SheepTracker`` which
|
||||
maintains last-seen positions for sheep currently out of the
|
||||
FOV and latches "penned" once a track disappears near the gate.
|
||||
|
||||
The output of step 3 is a ``{name: (x, y)}`` dict shaped exactly like
|
||||
the receiver-based one we used to consume — so Strömbom, Sequential
|
||||
and the BC obs builder run unchanged. The sheep→dog Emitter/Receiver
|
||||
link is still up (kept passively for compatibility) but its messages
|
||||
are *not* used for control.
|
||||
|
||||
All modes share the same low-level differential-drive controller
|
||||
(``herding.diffdrive.velocity_to_wheels`` with cos(err)-clamped forward
|
||||
@@ -33,14 +64,19 @@ if _PROJECT_ROOT not in sys.path:
|
||||
|
||||
from controller import Robot
|
||||
|
||||
from herding.active_scan import ActiveScanTeacher
|
||||
from herding.control import modulate_speed_near_sheep
|
||||
from herding.diffdrive import velocity_to_wheels
|
||||
from herding.geometry import (
|
||||
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA,
|
||||
DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS,
|
||||
PEN_ENTRY,
|
||||
PEN_ENTRY, is_penned_position,
|
||||
)
|
||||
from herding.obs import build_obs
|
||||
from herding.lidar_perception import detections_from_scan
|
||||
from herding.obs import OBS_DIM, build_obs
|
||||
from herding.sequential import compute_action_debug as sequential_action_debug
|
||||
from herding.sheep_tracker import SheepTracker
|
||||
from herding.strombom import compute_action as strombom_action
|
||||
from herding.strombom import compute_action_debug as strombom_action_debug
|
||||
|
||||
|
||||
@@ -76,60 +112,82 @@ def _load_runtime_config():
|
||||
_runtime_cfg = _load_runtime_config()
|
||||
MODE = (os.environ.get("HERDING_MODE")
|
||||
or _runtime_cfg.get("HERDING_MODE")
|
||||
or "rl").lower()
|
||||
or "bc").lower()
|
||||
|
||||
|
||||
def _resolve_policy_dir() -> str:
|
||||
"""Where to look for the trained policy.
|
||||
def _resolve_policy_dir(mode: str) -> str:
|
||||
"""Where to look for the trained policy for the given mode.
|
||||
|
||||
Priority:
|
||||
1. HERDING_POLICY_DIR env var or runtime-cfg entry, if it points
|
||||
to a real directory.
|
||||
2. ``training/runs/bc_flock`` — flock-style BC (current default;
|
||||
requires the tight-cohesion sheep regime).
|
||||
3. ``training/runs/bc_solo`` — single-target BC (1-by-1 style;
|
||||
only works if ``herding/flocking_sim.py`` is reverted to the
|
||||
loose-cohesion regime).
|
||||
2. Mode-specific default:
|
||||
bc → training/runs/bc_v3 (Strömbom-imitated MLP)
|
||||
rl → training/runs/rl_v1 (KL-PPO fine-tune of bc_v3)
|
||||
3. Fall back to bc_v3.
|
||||
All checkpoints are frame-stacked K = 4; ``policy_loader`` reads
|
||||
the stacking factor from the policy's observation space.
|
||||
"""
|
||||
env_dir = (os.environ.get("HERDING_POLICY_DIR")
|
||||
or _runtime_cfg.get("HERDING_POLICY_DIR"))
|
||||
if env_dir and os.path.isdir(env_dir):
|
||||
return env_dir
|
||||
candidates = [
|
||||
os.path.join(_PROJECT_ROOT, "training", "runs", "bc_flock"),
|
||||
os.path.join(_PROJECT_ROOT, "training", "runs", "bc_solo"),
|
||||
]
|
||||
for c in candidates:
|
||||
if os.path.isdir(c):
|
||||
return c
|
||||
# Last resort — return env var anyway so error message is informative.
|
||||
return env_dir or candidates[0]
|
||||
mode_default = {
|
||||
"bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"),
|
||||
"rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl_v1"),
|
||||
"dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"),
|
||||
}
|
||||
primary = mode_default.get(mode, mode_default["bc"])
|
||||
if os.path.isdir(primary):
|
||||
return primary
|
||||
# Fall back to BC if the requested checkpoint isn't there yet
|
||||
# (e.g., user asked for `rl` before training the fine-tune).
|
||||
fallback = mode_default["bc"]
|
||||
if os.path.isdir(fallback):
|
||||
return fallback
|
||||
return env_dir or primary
|
||||
|
||||
|
||||
_VALID_MODES = ("rl", "strombom", "sequential")
|
||||
_VALID_MODES = ("bc", "rl", "strombom", "sequential", "dagger", "diag")
|
||||
# Back-compat: an old config saying HERDING_MODE=rl meant "the BC policy".
|
||||
# We now use `rl` strictly for the KL-PPO fine-tune. If the rl_v1
|
||||
# directory isn't present, _resolve_policy_dir below silently falls
|
||||
# back to bc_v3, preserving the old behaviour.
|
||||
if MODE not in _VALID_MODES:
|
||||
print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
|
||||
MODE = "strombom"
|
||||
|
||||
POLICY_DIR = _resolve_policy_dir()
|
||||
DAGGER_DRIVER = (os.environ.get("HERDING_DAGGER_DRIVER")
|
||||
or _runtime_cfg.get("HERDING_DAGGER_DRIVER")
|
||||
or "teacher").lower()
|
||||
if DAGGER_DRIVER not in ("teacher", "student"):
|
||||
DAGGER_DRIVER = "teacher"
|
||||
|
||||
POLICY_DIR = _resolve_policy_dir(MODE)
|
||||
policy_handle = None
|
||||
if MODE == "rl":
|
||||
if MODE in ("bc", "rl", "dagger"):
|
||||
print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists={os.path.isdir(POLICY_DIR)}")
|
||||
try:
|
||||
from policy_loader import load as _load_policy
|
||||
policy_handle = _load_policy(POLICY_DIR)
|
||||
print(f"[dog] RL policy loaded from {POLICY_DIR}")
|
||||
print(f"[dog] policy loaded from {POLICY_DIR}")
|
||||
except Exception as exc:
|
||||
print(f"[dog] RL policy load failed ({exc!r}); falling back to strombom.")
|
||||
MODE = "strombom"
|
||||
print(f"[dog] running in mode={MODE}")
|
||||
if MODE in ("bc", "rl"):
|
||||
print(f"[dog] policy load failed ({exc!r}); falling back to strombom.")
|
||||
MODE = "strombom"
|
||||
else:
|
||||
# In dagger mode, no policy is fine if driver=teacher.
|
||||
print(f"[dog] policy load failed ({exc!r}); dagger driver forced to teacher.")
|
||||
policy_handle = None
|
||||
print(f"[dog] running in mode={MODE}"
|
||||
+ (f" driver={DAGGER_DRIVER}" if MODE == "dagger" else ""))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Action smoothing + safety supervisor
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
ACTION_SMOOTH = 0.35
|
||||
ACTION_SMOOTH = 0.55 # was 0.35; bumped for less frame-to-frame action jitter
|
||||
prev_action = (0.0, 0.0)
|
||||
|
||||
|
||||
@@ -185,6 +243,12 @@ gps = robot.getDevice("gps"); gps.enable(timestep)
|
||||
compass = robot.getDevice("compass"); compass.enable(timestep)
|
||||
receiver = robot.getDevice("receiver"); receiver.enable(timestep)
|
||||
emitter = robot.getDevice("emitter")
|
||||
lidar = robot.getDevice("lidar"); lidar.enable(timestep)
|
||||
|
||||
# The receiver channel from sheep is no longer consumed for perception
|
||||
# (kept enabled in case any peripheral tooling reads it). Sheep
|
||||
# positions come exclusively from the LiDAR + tracker pipeline below.
|
||||
tracker = SheepTracker()
|
||||
|
||||
# Cosmetic ear motors — ignored by control logic but keep them animated.
|
||||
left_ear = robot.getDevice("left ear motor")
|
||||
@@ -202,53 +266,197 @@ EAR_RATE = 8.0
|
||||
# Main loop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# {name: (x, y)} — kept across all sheep ever heard from. Sheep that drift
|
||||
# into the pen are tracked by ``penned`` so observations and Strömbom
|
||||
# agree on which ones still need herding.
|
||||
sheep_positions: dict = {}
|
||||
penned_set: set = set()
|
||||
# Active sheep positions come from the LiDAR-fed tracker each step;
|
||||
# penned_set is the tracker's ``get_penned_set()`` call. We drain the
|
||||
# receiver queue without consuming it, so the small backlog of sheep
|
||||
# pings can't grow unbounded.
|
||||
step_count = 0
|
||||
|
||||
from herding.geometry import is_penned_position
|
||||
import atexit
|
||||
import time
|
||||
import numpy as _np
|
||||
|
||||
# DAgger state ----------------------------------------------------------
|
||||
# Logged each step in dagger mode: (stacked_lidar_obs, teacher_action).
|
||||
DAGGER_LOG_OBS: list = []
|
||||
DAGGER_LOG_ACT: list = []
|
||||
# Diagnostic mode buffer (one dict per step).
|
||||
DIAG_BUF: list = []
|
||||
# Frame stack buffer the controller maintains itself when dagger mode is
|
||||
# active — the stacked obs we log must match what the policy sees so the
|
||||
# downstream BC consumes (stacked_obs, teacher_action) pairs cleanly.
|
||||
_FRAME_STACK = (policy_handle.frame_stack if policy_handle is not None else 4)
|
||||
_dagger_buffer: list = []
|
||||
# Active-scan teacher operates on GT (read from receiver).
|
||||
_dagger_teacher = ActiveScanTeacher(strombom_action) if MODE == "dagger" else None
|
||||
# GT positions accumulated from the receiver (sheep emit their xy each step).
|
||||
_gt_sheep: dict = {}
|
||||
|
||||
|
||||
_DAGGER_RUN_TS = int(time.time()) # one file per controller run
|
||||
_DAGGER_DUMPED = False
|
||||
# Sentinel that the auto-collection script polls — empty file written
|
||||
# when this controller decides the run is "done" (all sheep penned, by
|
||||
# GT). The launcher then kills Webots and moves on without waiting out
|
||||
# its timeout. Honoured only in dagger mode.
|
||||
_DAGGER_DONE_FILE = os.path.join(_PROJECT_ROOT, "training", "dagger", ".DONE")
|
||||
|
||||
|
||||
def _dump_dagger_log():
|
||||
"""Save accumulated (obs, teacher_action) pairs to disk on exit.
|
||||
|
||||
Webots may SIGKILL the controller, so the loop also calls this every
|
||||
DAGGER_FLUSH_STEPS so we lose at most a few seconds of data per run.
|
||||
Idempotent — repeated calls overwrite the same file with the latest
|
||||
accumulated buffer.
|
||||
"""
|
||||
global _DAGGER_DUMPED
|
||||
if MODE != "dagger" or not DAGGER_LOG_OBS:
|
||||
return
|
||||
out_dir = os.path.join(_PROJECT_ROOT, "training", "dagger")
|
||||
os.makedirs(out_dir, exist_ok=True)
|
||||
out_path = os.path.join(out_dir, f"dagger_{_DAGGER_RUN_TS}.npz")
|
||||
obs_arr = _np.stack(DAGGER_LOG_OBS).astype(_np.float32)
|
||||
act_arr = _np.stack(DAGGER_LOG_ACT).astype(_np.float32)
|
||||
_np.savez(out_path, obs=obs_arr, actions=act_arr)
|
||||
if not _DAGGER_DUMPED:
|
||||
print(f"[dog dagger] wrote {len(DAGGER_LOG_OBS)} pairs → {out_path}")
|
||||
_DAGGER_DUMPED = True
|
||||
|
||||
|
||||
DAGGER_FLUSH_STEPS = 500
|
||||
|
||||
|
||||
atexit.register(_dump_dagger_log)
|
||||
|
||||
|
||||
while robot.step(timestep) != -1:
|
||||
step_count += 1
|
||||
|
||||
# Drain receiver. In every mode we capture GT for the diagnostic
|
||||
# log line — perception still comes from LiDAR, the GT is read-only.
|
||||
while receiver.getQueueLength() > 0:
|
||||
msg = receiver.getString()
|
||||
receiver.nextPacket()
|
||||
parts = msg.split(":")
|
||||
if len(parts) == 4 and parts[0] == "sheep":
|
||||
try:
|
||||
x, y = float(parts[2]), float(parts[3])
|
||||
_gt_sheep[parts[1]] = (float(parts[2]), float(parts[3]))
|
||||
except ValueError:
|
||||
continue
|
||||
sheep_positions[parts[1]] = (x, y)
|
||||
if parts[1] not in penned_set and is_penned_position(x, y):
|
||||
penned_set.add(parts[1])
|
||||
pass
|
||||
|
||||
pos = gps.getValues()
|
||||
dog_xy = (pos[0], pos[1])
|
||||
n = compass.getValues()
|
||||
dog_heading = math.atan2(n[0], n[1])
|
||||
|
||||
# ---- Action selection ----
|
||||
if MODE == "rl" and policy_handle is not None:
|
||||
sheep_xy_list = list(sheep_positions.values())
|
||||
sheep_names = list(sheep_positions.keys())
|
||||
sheep_penned_list = [s in penned_set for s in sheep_names]
|
||||
obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list)
|
||||
action = policy_handle.predict(obs)
|
||||
vx, vy = float(action[0]), float(action[1])
|
||||
elif MODE == "sequential":
|
||||
vx, vy, _mode_str, _dbg = sequential_action_debug(
|
||||
dog_xy, sheep_positions, PEN_ENTRY,
|
||||
)
|
||||
# ---- LiDAR perception → tracker → sheep_positions dict ----
|
||||
ranges = _np.asarray(lidar.getRangeImage(), dtype=_np.float32)
|
||||
detections = detections_from_scan(ranges, dog_xy[0], dog_xy[1], dog_heading)
|
||||
sheep_positions = tracker.update(detections)
|
||||
penned_set = tracker.get_penned_set()
|
||||
|
||||
# ---- Diagnostic mode: dump the first DIAG_STEPS scans + GT to disk.
|
||||
if MODE == "diag":
|
||||
DIAG_STEPS = 80
|
||||
if step_count <= DIAG_STEPS:
|
||||
DIAG_BUF.append(dict(
|
||||
step=step_count,
|
||||
ranges=ranges.copy(),
|
||||
dog_x=dog_xy[0], dog_y=dog_xy[1], dog_h=dog_heading,
|
||||
gt_sheep=dict(_gt_sheep),
|
||||
detections=list(detections),
|
||||
))
|
||||
if step_count == DIAG_STEPS:
|
||||
_diag_path = os.path.join(_PROJECT_ROOT, "training", "dagger",
|
||||
f"diag_{int(time.time())}.npz")
|
||||
os.makedirs(os.path.dirname(_diag_path), exist_ok=True)
|
||||
_np.savez(
|
||||
_diag_path,
|
||||
ranges=_np.stack([d["ranges"] for d in DIAG_BUF]),
|
||||
dog_xy=_np.array([[d["dog_x"], d["dog_y"]] for d in DIAG_BUF],
|
||||
dtype=_np.float32),
|
||||
dog_h=_np.array([d["dog_h"] for d in DIAG_BUF], dtype=_np.float32),
|
||||
# Per-step GT serialised: max-pad to 10 sheep.
|
||||
gt_xy=_np.array([
|
||||
[list(d["gt_sheep"].get(f"sheep{i}", (1e9, 1e9)))
|
||||
for i in range(1, 11)]
|
||||
for d in DIAG_BUF
|
||||
], dtype=_np.float32),
|
||||
detections=_np.array([
|
||||
len(d["detections"]) for d in DIAG_BUF
|
||||
], dtype=_np.int32),
|
||||
)
|
||||
print(f"[dog diag] wrote {DIAG_STEPS} scans → {_diag_path}")
|
||||
|
||||
# Build the single-frame LiDAR obs (matches what the env produces).
|
||||
sheep_xy_list = list(sheep_positions.values())
|
||||
sheep_penned_list = [False] * len(sheep_xy_list)
|
||||
single_obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list)
|
||||
# Maintain our own frame stack so logged obs == what policy sees.
|
||||
if not _dagger_buffer:
|
||||
_dagger_buffer = [single_obs.copy() for _ in range(_FRAME_STACK)]
|
||||
else:
|
||||
# Strömbom (canonical baseline).
|
||||
vx, vy, _mode_str, _dbg = strombom_action_debug(
|
||||
dog_xy, sheep_positions, PEN_ENTRY,
|
||||
_dagger_buffer.append(single_obs)
|
||||
if len(_dagger_buffer) > _FRAME_STACK:
|
||||
_dagger_buffer = _dagger_buffer[-_FRAME_STACK:]
|
||||
stacked_obs = _np.concatenate(_dagger_buffer, axis=0).astype(_np.float32)
|
||||
|
||||
# ---- Action selection ----
|
||||
if MODE == "diag":
|
||||
# Diagnostic mode: rotate in place so the captured scans cover
|
||||
# all 360° of view from one position. Target = heading + π →
|
||||
# cos(err) clamps forward to ~0, the dog spins.
|
||||
_t = dog_heading + math.pi
|
||||
vx, vy = math.cos(_t), math.sin(_t)
|
||||
elif MODE == "dagger":
|
||||
# Teacher: active-scan + Strömbom on GT (active sheep only).
|
||||
gt_active = {name: xy for name, xy in _gt_sheep.items()
|
||||
if not is_penned_position(xy[0], xy[1])}
|
||||
t_vx, t_vy, _mode_str = _dagger_teacher(
|
||||
dog_xy, dog_heading, gt_active, PEN_ENTRY,
|
||||
)
|
||||
# Student (if a policy is loaded).
|
||||
s_vx, s_vy = None, None
|
||||
if policy_handle is not None:
|
||||
action = policy_handle.predict(stacked_obs)
|
||||
s_vx, s_vy = float(action[0]), float(action[1])
|
||||
# Drive selection.
|
||||
if DAGGER_DRIVER == "student" and policy_handle is not None:
|
||||
vx, vy = s_vx, s_vy
|
||||
else:
|
||||
vx, vy = t_vx, t_vy
|
||||
# Always log the teacher action (this is the supervision signal).
|
||||
DAGGER_LOG_OBS.append(stacked_obs.copy())
|
||||
DAGGER_LOG_ACT.append(_np.array([t_vx, t_vy], dtype=_np.float32))
|
||||
elif MODE in ("bc", "rl") and policy_handle is not None:
|
||||
# Pass the single-frame obs; the policy_loader maintains its own
|
||||
# frame stack internally. Both bc and rl use the same control
|
||||
# interface — the only difference is which checkpoint loaded.
|
||||
action = policy_handle.predict(single_obs)
|
||||
vx, vy = float(action[0]), float(action[1])
|
||||
elif MODE in ("strombom", "sequential"):
|
||||
# Wrap the analytic teacher in ActiveScanTeacher so the dog
|
||||
# rotates / walks-to-centre when the tracker briefly empties,
|
||||
# instead of going idle. Without this wrapper, the first 2 s
|
||||
# of LiDAR-blind operation kills the run because Strömbom and
|
||||
# Sequential both return (0, 0) when there are no positions.
|
||||
if "_analytic_teacher" not in globals():
|
||||
from herding.sequential import compute_action as sequential_action
|
||||
_analytic_teacher = ActiveScanTeacher(
|
||||
strombom_action if MODE == "strombom" else sequential_action
|
||||
)
|
||||
vx, vy, _mode_str = _analytic_teacher(
|
||||
dog_xy, dog_heading, sheep_positions, PEN_ENTRY,
|
||||
)
|
||||
|
||||
# Shared post-process: speed modulation near sheep. Applies to bc,
|
||||
# rl, strombom, sequential — every mode where the action source is
|
||||
# nominally unit-magnitude. In dagger mode the active-scan teacher
|
||||
# has already modulated, and the diag mode action is hand-built for
|
||||
# rotation; both skip.
|
||||
if MODE not in ("dagger", "diag"):
|
||||
vx, vy = modulate_speed_near_sheep(vx, vy, dog_xy, sheep_positions)
|
||||
|
||||
# EMA smoothing — reduces oscillation from policy or Strömbom flips.
|
||||
vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx
|
||||
@@ -269,7 +477,31 @@ while robot.step(timestep) != -1:
|
||||
left_ear.setPosition(ear_pos)
|
||||
right_ear.setPosition(-ear_pos)
|
||||
|
||||
# --- DAgger: early-stop when all GT sheep are penned ---
|
||||
if MODE == "dagger" and _gt_sheep:
|
||||
gt_active_count = sum(1 for x, y in _gt_sheep.values()
|
||||
if not is_penned_position(x, y))
|
||||
if gt_active_count == 0 and not os.path.exists(_DAGGER_DONE_FILE):
|
||||
_dump_dagger_log()
|
||||
open(_DAGGER_DONE_FILE, "w").close()
|
||||
print(f"[dog dagger] all {len(_gt_sheep)} sheep penned — "
|
||||
f"wrote {_DAGGER_DONE_FILE}, exiting early")
|
||||
|
||||
if MODE == "dagger" and step_count % DAGGER_FLUSH_STEPS == 0 and DAGGER_LOG_OBS:
|
||||
_dump_dagger_log()
|
||||
|
||||
if step_count % 200 == 0:
|
||||
n_active = sum(1 for s in sheep_positions if s not in penned_set)
|
||||
print(f"[dog mode={MODE}] step={step_count} known={len(sheep_positions)} "
|
||||
f"penned={len(penned_set)} active={n_active} action=({vx:+.2f}, {vy:+.2f})")
|
||||
gt_penned = sum(1 for x, y in _gt_sheep.values()
|
||||
if is_penned_position(x, y))
|
||||
gt_total = len(_gt_sheep)
|
||||
extra = ""
|
||||
if MODE == "dagger":
|
||||
extra = f" logged={len(DAGGER_LOG_OBS)}"
|
||||
print(f"[dog mode={MODE}] step={step_count} "
|
||||
f"GT_penned={gt_penned}/{gt_total} "
|
||||
f"tracks_active={tracker.n_active()} "
|
||||
f"tracks_penned={tracker.n_penned()} "
|
||||
f"detections={len(detections)} action=({vx:+.2f}, {vy:+.2f}){extra}")
|
||||
|
||||
# Loop ended (Webots told us to quit). Flush any remaining DAgger log.
|
||||
_dump_dagger_log()
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
"""Backwards-compat shim — Strömbom logic now lives in ``herding.strombom``."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
|
||||
if _PROJECT_ROOT not in sys.path:
|
||||
sys.path.insert(0, _PROJECT_ROOT)
|
||||
|
||||
from herding.strombom import ( # noqa: F401
|
||||
F_FACTOR, DELTA_COLLECT, DELTA_DRIVE,
|
||||
compute_action, compute_action_debug,
|
||||
)
|
||||
from herding.geometry import ( # noqa: F401
|
||||
PEN_X, PEN_Y, PEN_CENTER, PEN_ENTRY,
|
||||
in_pen,
|
||||
)
|
||||
@@ -0,0 +1,132 @@
|
||||
"""Active-perception wrapper for the analytic shepherding teachers.
|
||||
|
||||
Under LiDAR (partial observability), the tracker starts empty — the
|
||||
dog hasn't seen any sheep yet. A naive Strömbom call returns
|
||||
``(0, 0, "idle")`` and the dog stops. The student then learns "do
|
||||
nothing when the tracker is empty," which is a fatal local optimum.
|
||||
|
||||
This wrapper replaces the idle case with a **scan action**: a unit
|
||||
vector 90° CCW from the dog's current forward direction. Passed
|
||||
through ``velocity_to_wheels`` it produces a fast in-place rotation
|
||||
(``cos(err)`` clamp drives forward speed to ~0 because the target is
|
||||
orthogonal to the heading). The dog spins for the first
|
||||
``initial_scan_steps`` steps of every episode regardless of tracker
|
||||
state, and re-enters scan whenever the tracker goes empty mid-episode.
|
||||
|
||||
Once enough sheep are tracked, control hands over to the underlying
|
||||
analytic teacher (Strömbom or Sequential), which now operates on a
|
||||
populated tracker dict. Both teacher and student see the same
|
||||
LiDAR-perceived view — there's no information asymmetry, so the
|
||||
student can in principle achieve the teacher's full performance.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
from herding.control import modulate_speed_near_sheep
|
||||
|
||||
|
||||
INITIAL_SCAN_STEPS = 80 # ≈1.3 s at dt=16 ms — full rotation at the +π turn target.
|
||||
EXPLORE_SPEED = 0.7 # m/s-ish unit (action norm) used when walking blind
|
||||
|
||||
# Debounce on tracker emptiness — a single empty frame between
|
||||
# detections is not enough reason to abandon the drive and start
|
||||
# scanning. Require this many consecutive empty frames first.
|
||||
EMPTY_DEBOUNCE_STEPS = 8
|
||||
|
||||
|
||||
class ActiveScanTeacher:
|
||||
"""Stateful wrapper. Construct one per episode; call ``reset()``
|
||||
between episodes if reusing the instance.
|
||||
|
||||
Call signature::
|
||||
|
||||
vx, vy, mode = teacher(dog_xy, dog_heading, sheep_positions, pen_target)
|
||||
|
||||
Note the extra ``dog_heading`` arg — required to compute the
|
||||
rotation direction. The base teachers (Strömbom, Sequential)
|
||||
don't use heading; we strip it before passing them through.
|
||||
"""
|
||||
|
||||
def __init__(self, base_action_fn, initial_scan_steps: int = INITIAL_SCAN_STEPS):
|
||||
self.base = base_action_fn
|
||||
self.initial_scan = int(initial_scan_steps)
|
||||
self.reset()
|
||||
|
||||
def reset(self) -> None:
|
||||
self.step = 0
|
||||
self.empty_streak = 0
|
||||
self.last_action: tuple[float, float] = (0.0, 0.0)
|
||||
|
||||
@staticmethod
|
||||
def _scan_action(dog_heading: float) -> tuple[float, float]:
|
||||
# Target = current_heading + π. velocity_to_wheels gets err=π,
|
||||
# so turn = k_turn·π = 4π ≈ 12.6 rad/s wheel angular vel and
|
||||
# cos(err) clamps the forward speed to ~0. Maximum in-place
|
||||
# rotation under this controller; one full rotation in ~60 steps.
|
||||
target = dog_heading + math.pi
|
||||
return math.cos(target), math.sin(target)
|
||||
|
||||
@staticmethod
|
||||
def _explore_action(dog_xy) -> tuple[float, float]:
|
||||
"""Walk back toward the field centre when nothing is in view.
|
||||
|
||||
At difficulty=1 sheep can spawn up to ~18 m from origin while
|
||||
the LiDAR has a 12 m range, so an in-place scan from a corner
|
||||
can return zero hits. Walking toward (0, 0) shrinks the
|
||||
max-distance-to-any-sheep and the scanner cone sweeps along
|
||||
the path, eventually picking sheep up.
|
||||
"""
|
||||
dx, dy = -dog_xy[0], -dog_xy[1]
|
||||
d = math.hypot(dx, dy)
|
||||
if d < 0.5:
|
||||
# At the centre — fall through to a scan instead.
|
||||
return 0.0, 0.0
|
||||
return EXPLORE_SPEED * dx / d, EXPLORE_SPEED * dy / d
|
||||
|
||||
def __call__(self, dog_xy, dog_heading, sheep_positions, pen_target):
|
||||
self.step += 1
|
||||
n_visible = len(sheep_positions)
|
||||
|
||||
# Track empty-streak for the explore debounce.
|
||||
if n_visible == 0:
|
||||
self.empty_streak += 1
|
||||
else:
|
||||
self.empty_streak = 0
|
||||
|
||||
# Phase 1: opening rotation, regardless of tracker state.
|
||||
if self.step <= self.initial_scan:
|
||||
vx, vy = self._scan_action(dog_heading)
|
||||
self.last_action = (vx, vy)
|
||||
return vx, vy, "scan_initial"
|
||||
|
||||
# Phase 2: tracker has been empty for a while — walk back to the
|
||||
# centre while the LiDAR keeps sweeping. The debounce prevents
|
||||
# this from firing every time the tracker briefly blinks to zero
|
||||
# (which causes the "dog starts going away from sheep" symptom).
|
||||
if self.empty_streak >= EMPTY_DEBOUNCE_STEPS:
|
||||
ex, ey = self._explore_action(dog_xy)
|
||||
if ex == 0.0 and ey == 0.0:
|
||||
vx, vy = self._scan_action(dog_heading)
|
||||
mode = "scan_at_centre"
|
||||
else:
|
||||
vx, vy = ex, ey
|
||||
mode = "explore"
|
||||
self.last_action = (vx, vy)
|
||||
return vx, vy, mode
|
||||
|
||||
# Phase 2b: tracker just blinked empty for <DEBOUNCE frames —
|
||||
# hold the previous action so the dog doesn't lurch.
|
||||
if n_visible == 0:
|
||||
vx, vy = self.last_action
|
||||
return vx, vy, "hold"
|
||||
|
||||
# Phase 3: hand to the underlying analytic teacher, then apply
|
||||
# the shared near-sheep speed modulation (centralised in
|
||||
# herding.control so the BC student, Strömbom, Sequential and
|
||||
# the DAgger teacher all behave identically near sheep).
|
||||
vx, vy, mode = self.base(dog_xy, sheep_positions, pen_target)
|
||||
vx, vy = modulate_speed_near_sheep(vx, vy, dog_xy, sheep_positions)
|
||||
self.last_action = (vx, vy)
|
||||
return vx, vy, mode
|
||||
@@ -0,0 +1,53 @@
|
||||
"""Shared low-level control helpers used by every dog mode.
|
||||
|
||||
Centralised here so the BC student, Strömbom, Sequential, and the DAgger
|
||||
teacher all apply identical post-processing to their action outputs.
|
||||
The downstream wheel-velocity layer (``herding.diffdrive``) is unchanged.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
|
||||
# Speed-modulation: scale action magnitude down when close to the
|
||||
# nearest sheep. Stops the dog from charging in at full speed and
|
||||
# scattering the flock. Action norm linearly ramps from MIN_SPEED at
|
||||
# distance 0 to 1.0 at SLOW_NEAR_SHEEP.
|
||||
SLOW_NEAR_SHEEP = 2.5
|
||||
MIN_SPEED = 0.30
|
||||
|
||||
|
||||
def modulate_speed_near_sheep(
|
||||
vx: float, vy: float,
|
||||
dog_xy: tuple[float, float],
|
||||
sheep_positions,
|
||||
slow_dist: float = SLOW_NEAR_SHEEP,
|
||||
min_scale: float = MIN_SPEED,
|
||||
) -> tuple[float, float]:
|
||||
"""Scale (vx, vy) magnitude down when close to the nearest sheep.
|
||||
|
||||
``sheep_positions`` accepts either a ``{name: (x, y)}`` dict
|
||||
(matching what the trackers emit) or an iterable of ``(x, y)``
|
||||
tuples. Empty input → action returned unchanged.
|
||||
|
||||
The intent direction is preserved; only magnitude is reduced. With
|
||||
``slow_dist=2.5`` and ``min_scale=0.3``, an action that started at
|
||||
norm 1 is multiplied by 0.3 right next to a sheep, by 0.65 at 1 m
|
||||
away, and by 1.0 once the nearest sheep is ≥ 2.5 m off.
|
||||
"""
|
||||
if not sheep_positions:
|
||||
return vx, vy
|
||||
if hasattr(sheep_positions, "values"):
|
||||
positions = sheep_positions.values()
|
||||
else:
|
||||
positions = sheep_positions
|
||||
nearest = float("inf")
|
||||
for sx, sy in positions:
|
||||
d = math.hypot(sx - dog_xy[0], sy - dog_xy[1])
|
||||
if d < nearest:
|
||||
nearest = d
|
||||
if nearest >= slow_dist or nearest == float("inf"):
|
||||
return vx, vy
|
||||
scale = min_scale + (1.0 - min_scale) * (nearest / slow_dist)
|
||||
return vx * scale, vy * scale
|
||||
+41
-15
@@ -1,25 +1,51 @@
|
||||
"""Reynolds-style sheep flocking dynamics.
|
||||
"""Sheep flocking dynamics — Strömbom 2014 / Reynolds 1987 hybrid.
|
||||
|
||||
This is the per-sheep behavioural step used both by the Webots sheep
|
||||
controller (scalar, one sheep at a time) and by the training environment
|
||||
(loop over sheep). The numerics are adapted from the original
|
||||
``controllers/sheep/flocking.py`` and retuned for the new external-pen
|
||||
layout: the south stone wall is intact except in the gate column, so
|
||||
sheep can only reach the pen by walking through that 3-m corridor.
|
||||
(loop over sheep).
|
||||
|
||||
Model
|
||||
-----
|
||||
The force stack each step (summed → heading + speed):
|
||||
|
||||
Force stack each step (summed → heading + speed):
|
||||
flee — quadratic ramp away from dog within FLEE_DIST
|
||||
cohesion — drift toward flock centre, halved while fleeing
|
||||
separation — inverse-distance push from peers
|
||||
walls — soft repulsion + hard escape band against field walls,
|
||||
except inside the gate column where the south wall is
|
||||
absent
|
||||
(Strömbom 2014 §2.1, term ρa)
|
||||
cohesion — drift toward local centre of mass of peers within
|
||||
COHESION_DIST (Strömbom 2014 §2.1, term c).
|
||||
Weight is **higher when fleeing** — modelling the
|
||||
"safety in numbers" / predator-confusion effect
|
||||
Strömbom 2014 describes as fear-induced cohesion.
|
||||
separation — short-range inverse-distance repulsion from peers
|
||||
(Strömbom 2014 §2.1, term α; Reynolds 1987)
|
||||
wander — small persistent drift for natural idle motion
|
||||
(Strömbom 2014 §2.1, noise term ε)
|
||||
|
||||
A sheep latches to ``penned`` the first time it crosses the gate plane
|
||||
into the gate column (handled by callers via ``geometry.is_penned_position``);
|
||||
once latched, ``penned=True`` is passed in here and the force stack
|
||||
switches to in-pen containment + jitter.
|
||||
References
|
||||
----------
|
||||
- Strömbom et al. (2014). "Solving the shepherding problem: heuristics
|
||||
for herding autonomous, interacting agents." J R Soc Interface 11.
|
||||
- Reynolds (1987). "Flocks, herds and schools: A distributed
|
||||
behavioural model." SIGGRAPH '87.
|
||||
|
||||
Environment-specific adaptations
|
||||
--------------------------------
|
||||
The original Strömbom model assumes an open field. Our scenario adds:
|
||||
|
||||
* Field walls — soft repulsion within ``WALL_MARGIN`` plus a hard
|
||||
escape band when inside ``WALL_HARD_MARGIN``. Necessary because the
|
||||
Webots field is fenced (30 m square enclosure).
|
||||
* Gate column — the south wall has a 3 m gap at x ∈ [10, 13]; sheep
|
||||
pass through it freely (no wall force inside the column).
|
||||
* Penned containment — once a sheep crosses the gate plane south
|
||||
(``geometry.is_penned_position``), the caller flags ``penned=True``
|
||||
and we switch to in-pen wall-bounce + jitter. Sheep do not exit the
|
||||
pen on their own. This is a hard sim constraint, not a behavioural
|
||||
claim about real sheep.
|
||||
|
||||
Parameter tuning (cohesion weight 3× while fleeing) was chosen so the
|
||||
flock survives passage through the 3 m gate without fragmenting — this
|
||||
is a defensible engineering adaptation of Strömbom's qualitative
|
||||
"fear-induced cohesion" to our gate width.
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
"""Cluster a 2D LiDAR scan into world-frame sheep position estimates.
|
||||
|
||||
Pipeline:
|
||||
ranges (N,) ─► hit mask ─► world-frame points
|
||||
│
|
||||
▼
|
||||
adjacency clustering (gap > GAP_THRESHOLD
|
||||
starts a new cluster, walking rays in
|
||||
angular order)
|
||||
│
|
||||
▼
|
||||
centroid + span filter
|
||||
│
|
||||
▼
|
||||
field/pen-corridor filter
|
||||
│
|
||||
▼
|
||||
list of (x, y) detections
|
||||
|
||||
The clusterer is intentionally simple — for ≤10 sheep there is rarely
|
||||
any real ambiguity, and proper DBSCAN would only matter if rays from
|
||||
two adjacent sheep merged. The downstream tracker handles association
|
||||
across frames.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
|
||||
from herding.geometry import FIELD_X, FIELD_Y, GATE_Y, PEN_X, PEN_Y
|
||||
from herding.lidar_sim import (
|
||||
LIDAR_FOV, LIDAR_MAX_RANGE, LIDAR_N_RAYS, SHEEP_RADIUS, ray_angles,
|
||||
)
|
||||
|
||||
|
||||
GAP_THRESHOLD = 0.6 # m — adjacent ray-points farther apart start new cluster
|
||||
MAX_CLUSTER_SPAN = 1.5 # m — clusters wider than this are likely walls/structures
|
||||
RANGE_HIT_EPS = 0.05 # m — hit if range < max_range - eps
|
||||
WALL_REJECT = 0.5 # m — drop detections this close to a known wall line
|
||||
|
||||
# Known sheep-sized static features. Detections within STATIC_REJECT
|
||||
# of any of these are discarded — these aren't sheep. Mid-pillars on
|
||||
# the field walls are NOT in this list because they're embedded in the
|
||||
# wall (the wall's span filter handles them); listing them here would
|
||||
# only reject real sheep that happened to be near the wall.
|
||||
_STATIC_FEATURES = (
|
||||
# Gate posts (sheep-sized boxes flanking the south-wall opening)
|
||||
( 10.0, -15.0), ( 13.0, -15.0),
|
||||
# Field corner pillars
|
||||
( 15.0, 15.0), ( 15.0, -15.0), (-15.0, 15.0), (-15.0, -15.0),
|
||||
)
|
||||
STATIC_REJECT = 0.8 # m — detection within this of a static feature → drop
|
||||
|
||||
|
||||
def detections_from_scan(
|
||||
ranges: np.ndarray,
|
||||
dog_x: float, dog_y: float, dog_heading: float,
|
||||
max_range: float = LIDAR_MAX_RANGE,
|
||||
) -> list[tuple[float, float]]:
|
||||
"""Return list of (x, y) world-frame sheep position estimates."""
|
||||
ranges = np.asarray(ranges, dtype=np.float32)
|
||||
n_rays = ranges.shape[0]
|
||||
if n_rays == 0:
|
||||
return []
|
||||
angles = ray_angles(n_rays, LIDAR_FOV)
|
||||
hit = ranges < max_range - RANGE_HIT_EPS
|
||||
|
||||
world_a = dog_heading + angles
|
||||
px = dog_x + ranges * np.cos(world_a)
|
||||
py = dog_y + ranges * np.sin(world_a)
|
||||
|
||||
clusters: list[list[tuple[float, float]]] = []
|
||||
current: list[tuple[float, float]] = []
|
||||
prev: tuple[float, float] | None = None
|
||||
for i in range(n_rays):
|
||||
if not bool(hit[i]):
|
||||
if current:
|
||||
clusters.append(current)
|
||||
current = []
|
||||
prev = None
|
||||
continue
|
||||
pt = (float(px[i]), float(py[i]))
|
||||
if prev is not None and math.hypot(pt[0] - prev[0], pt[1] - prev[1]) > GAP_THRESHOLD:
|
||||
clusters.append(current)
|
||||
current = []
|
||||
current.append(pt)
|
||||
prev = pt
|
||||
if current:
|
||||
clusters.append(current)
|
||||
|
||||
detections: list[tuple[float, float]] = []
|
||||
for cluster in clusters:
|
||||
xs = [p[0] for p in cluster]
|
||||
ys = [p[1] for p in cluster]
|
||||
cx, cy = sum(xs) / len(xs), sum(ys) / len(ys)
|
||||
span = math.hypot(max(xs) - min(xs), max(ys) - min(ys))
|
||||
if span > MAX_CLUSTER_SPAN:
|
||||
continue
|
||||
# Surface-to-centre correction: rays hit the front of the sheep,
|
||||
# so the cluster centroid is biased toward the dog by SHEEP_RADIUS.
|
||||
# Push it outward along the dog→cluster direction.
|
||||
dx, dy = cx - dog_x, cy - dog_y
|
||||
d = math.hypot(dx, dy)
|
||||
if d > 1e-3:
|
||||
cx += SHEEP_RADIUS * dx / d
|
||||
cy += SHEEP_RADIUS * dy / d
|
||||
# Keep detections inside the field OR in the gate corridor /
|
||||
# external pen — penned sheep are still worth tracking so the
|
||||
# tracker can latch them as "penned" rather than spawn fresh
|
||||
# tracks each scan.
|
||||
# Accept detections inside the field, plus a narrow strip
|
||||
# immediately south of the gate to catch sheep mid-crossing
|
||||
# (so they get marked penned via is_penned_position before the
|
||||
# track goes stale). Detections deeper into the pen are
|
||||
# dropped entirely — Webots's pen posts and rails would
|
||||
# otherwise produce a torrent of phantom penned tracks that
|
||||
# the tracker can't keep up with.
|
||||
in_main = (FIELD_X[0] - 0.2 < cx < FIELD_X[1] + 0.2 and
|
||||
FIELD_Y[0] - 0.2 < cy < FIELD_Y[1] + 0.2)
|
||||
in_gate_strip = (PEN_X[0] - 0.2 < cx < PEN_X[1] + 0.2 and
|
||||
GATE_Y - 1.0 < cy < GATE_Y + 0.2)
|
||||
if not (in_main or in_gate_strip):
|
||||
continue
|
||||
# Known-static-feature filter: gate posts and corner pillars
|
||||
# show up as sheep-sized clusters but are never sheep.
|
||||
if any(math.hypot(cx - fx, cy - fy) < STATIC_REJECT
|
||||
for fx, fy in _STATIC_FEATURES):
|
||||
continue
|
||||
# Wall-proximity filter: at oblique scan angles, walls produce
|
||||
# multiple short clusters because adjacent ray returns are
|
||||
# spaced just above GAP_THRESHOLD. Sheep can't get within ~0.3 m
|
||||
# of a wall (the env clips them to FIELD_INSIDE), so anything
|
||||
# right at the wall line is structure noise.
|
||||
near_field_wall = (
|
||||
cx > FIELD_X[1] - WALL_REJECT or cx < FIELD_X[0] + WALL_REJECT or
|
||||
cy > FIELD_Y[1] - WALL_REJECT or
|
||||
(cy < FIELD_Y[0] + WALL_REJECT and not (PEN_X[0] <= cx <= PEN_X[1]))
|
||||
)
|
||||
if near_field_wall:
|
||||
continue
|
||||
detections.append((cx, cy))
|
||||
return detections
|
||||
@@ -0,0 +1,193 @@
|
||||
"""Fast 2D LiDAR simulator for the Gymnasium env.
|
||||
|
||||
Raycasts against:
|
||||
* **Sheep** — discs of radius ``SHEEP_RADIUS``.
|
||||
* **Static world geometry** — axis-aligned wall segments and gate
|
||||
posts taken from ``worlds/field.wbt``. Without these, demos
|
||||
collected in-env would never include the false-positive clusters
|
||||
Webots produces from the stone walls and gate-post boxes, and the
|
||||
BC student trained on those demos collapses on deployment.
|
||||
|
||||
Returns a range array matching the Webots Lidar device on the dog
|
||||
(see ``protos/ShepherdDog.proto``: 180 rays, 140° FOV centred on
|
||||
forward, 12 m max range, 5 mm noise).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# Match protos/ShepherdDog.proto Lidar device.
|
||||
LIDAR_N_RAYS = 180
|
||||
LIDAR_FOV = 2.44 # rad ≈ 140°
|
||||
LIDAR_MAX_RANGE = 12.0
|
||||
LIDAR_NOISE = 0.005 # m, gaussian std
|
||||
|
||||
# Sheep modelled as a vertical cylinder; this is the horizontal-section
|
||||
# radius the LiDAR plane intersects. Tuned to the proto sheep (~0.45 m
|
||||
# body length). The exact value is not load-bearing — the perception
|
||||
# clusterer is range-tolerant.
|
||||
SHEEP_RADIUS = 0.30
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Static world geometry — must match worlds/field.wbt
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Vertical walls: (x, y_min, y_max). Field east/west walls and the two
|
||||
# pen side walls are visible through the open gate.
|
||||
_VERTICAL_WALLS = (
|
||||
( 15.0, -15.0, 15.0), # field east
|
||||
(-15.0, -15.0, 15.0), # field west
|
||||
( 10.0, -22.0, -15.0), # pen west
|
||||
( 13.0, -22.0, -15.0), # pen east
|
||||
)
|
||||
|
||||
# Horizontal walls: (y, x_min, x_max). South wall is split by the 3 m
|
||||
# gate at x ∈ [10, 13]; the pen south wall closes the back of the pen.
|
||||
_HORIZONTAL_WALLS = (
|
||||
( 15.0, -15.0, 15.0), # field north
|
||||
(-15.0, -15.0, 10.0), # field south-west of gate
|
||||
(-15.0, 13.0, 15.0), # field south-east of gate
|
||||
(-22.0, 10.0, 13.0), # pen south
|
||||
)
|
||||
|
||||
# Gate posts and field corner pillars treated as vertical cylinders at
|
||||
# LiDAR height. Radius 0.25 m comes from the 0.44 × 0.44 m boxes in the
|
||||
# wbt — close enough to a circular cross-section for this purpose.
|
||||
_POSTS_XY = np.array([
|
||||
( 10.0, -15.0), # west gate post
|
||||
( 13.0, -15.0), # east gate post
|
||||
( 15.0, 15.0), # NE field corner
|
||||
( 15.0, -15.0), # SE field corner
|
||||
(-15.0, 15.0), # NW field corner
|
||||
(-15.0, -15.0), # SW field corner
|
||||
], dtype=np.float64)
|
||||
POST_RADIUS = 0.25
|
||||
|
||||
|
||||
def ray_angles(n: int = LIDAR_N_RAYS, fov: float = LIDAR_FOV) -> np.ndarray:
|
||||
"""Local-frame ray angles, sweeping from +fov/2 to -fov/2.
|
||||
|
||||
Convention: angle is measured CCW from the dog's forward axis. Ray 0
|
||||
points to the dog's left, last ray to the right. Webots' default
|
||||
Lidar sweep matches this.
|
||||
"""
|
||||
return np.linspace(fov / 2.0, -fov / 2.0, n, dtype=np.float64)
|
||||
|
||||
|
||||
# Cached so we don't rebuild every step.
|
||||
_ANGLES = ray_angles()
|
||||
_COS = np.cos(_ANGLES)
|
||||
_SIN = np.sin(_ANGLES)
|
||||
|
||||
|
||||
def _raycast_static(
|
||||
ox: float, oy: float, cos_w: np.ndarray, sin_w: np.ndarray,
|
||||
) -> np.ndarray:
|
||||
"""Per-ray distance to nearest wall or post hit (∞ if none).
|
||||
|
||||
Walls are axis-aligned line segments; for each ray we compute t at
|
||||
which it crosses the wall's constant-coord plane and check the
|
||||
other coord lies in the segment. Posts are circles; same disc
|
||||
intersection as for sheep.
|
||||
"""
|
||||
n_rays = cos_w.shape[0]
|
||||
best = np.full(n_rays, np.inf, dtype=np.float64)
|
||||
|
||||
EPS = 1e-3
|
||||
safe_cos = np.where(np.abs(cos_w) < 1e-9, 1e-9, cos_w)
|
||||
safe_sin = np.where(np.abs(sin_w) < 1e-9, 1e-9, sin_w)
|
||||
|
||||
# Vertical walls (x = const)
|
||||
for wx, ymin, ymax in _VERTICAL_WALLS:
|
||||
t = (wx - ox) / safe_cos
|
||||
y_at = oy + t * sin_w
|
||||
valid = (t > EPS) & (y_at >= ymin - EPS) & (y_at <= ymax + EPS)
|
||||
cand = np.where(valid, t, np.inf)
|
||||
np.minimum(best, cand, out=best)
|
||||
|
||||
# Horizontal walls (y = const)
|
||||
for wy, xmin, xmax in _HORIZONTAL_WALLS:
|
||||
t = (wy - oy) / safe_sin
|
||||
x_at = ox + t * cos_w
|
||||
valid = (t > EPS) & (x_at >= xmin - EPS) & (x_at <= xmax + EPS)
|
||||
cand = np.where(valid, t, np.inf)
|
||||
np.minimum(best, cand, out=best)
|
||||
|
||||
# Posts (treat as discs)
|
||||
if _POSTS_XY.size:
|
||||
px = _POSTS_XY[:, 0] - ox
|
||||
py = _POSTS_XY[:, 1] - oy
|
||||
t_post = np.outer(px, cos_w) + np.outer(py, sin_w) # (P, N)
|
||||
d2 = (px ** 2 + py ** 2)[:, None] # (P, 1)
|
||||
perp2 = d2 - t_post ** 2
|
||||
R2 = POST_RADIUS ** 2
|
||||
hit = (perp2 < R2) & (t_post > 0.0)
|
||||
half = np.sqrt(np.clip(R2 - perp2, 0.0, None))
|
||||
cand = np.where(hit, t_post - half, np.inf)
|
||||
nearest = cand.min(axis=0)
|
||||
np.minimum(best, nearest, out=best)
|
||||
|
||||
return best
|
||||
|
||||
|
||||
def simulate_scan(
|
||||
dog_x: float, dog_y: float, dog_heading: float,
|
||||
sheep_xy: list[tuple[float, float]],
|
||||
noise: float = LIDAR_NOISE,
|
||||
max_range: float = LIDAR_MAX_RANGE,
|
||||
rng: np.random.Generator | None = None,
|
||||
) -> np.ndarray:
|
||||
"""Return a (N,) float32 range array. No-hit entries equal ``max_range``.
|
||||
|
||||
``sheep_xy`` is the list of (x, y) world positions of every sheep in
|
||||
the scene (penned and active). Static world geometry (walls and
|
||||
posts) is also raycast so demos contain the same false-positive
|
||||
clusters Webots produces.
|
||||
"""
|
||||
n_rays = _ANGLES.shape[0]
|
||||
|
||||
ch, sh = math.cos(dog_heading), math.sin(dog_heading)
|
||||
cos_w = ch * _COS - sh * _SIN
|
||||
sin_w = sh * _COS + ch * _SIN
|
||||
|
||||
# Walls + posts
|
||||
best = _raycast_static(dog_x, dog_y, cos_w, sin_w)
|
||||
|
||||
# Sheep discs
|
||||
if sheep_xy:
|
||||
sx = np.asarray([p[0] for p in sheep_xy], dtype=np.float64) - dog_x
|
||||
sy = np.asarray([p[1] for p in sheep_xy], dtype=np.float64) - dog_y
|
||||
t = np.outer(sx, cos_w) + np.outer(sy, sin_w)
|
||||
s_dist2 = (sx ** 2 + sy ** 2)[:, None]
|
||||
perp2 = s_dist2 - t ** 2
|
||||
R2 = SHEEP_RADIUS ** 2
|
||||
hit = (perp2 < R2) & (t > 0.0)
|
||||
half = np.sqrt(np.clip(R2 - perp2, 0.0, None))
|
||||
candidate = np.where(hit, t - half, np.inf)
|
||||
nearest = candidate.min(axis=0)
|
||||
np.minimum(best, nearest, out=best)
|
||||
|
||||
# Clip to LIDAR_MAX_RANGE; entries that never got a hit stay at inf
|
||||
# → clipped down to max_range like the real Webots device.
|
||||
ranges = np.minimum(best, max_range).astype(np.float32)
|
||||
return _add_noise(ranges, noise, rng, max_range)
|
||||
|
||||
|
||||
def _add_noise(ranges: np.ndarray, sigma: float,
|
||||
rng: np.random.Generator | None, max_range: float) -> np.ndarray:
|
||||
if sigma <= 0.0:
|
||||
return ranges
|
||||
if rng is None:
|
||||
rng = np.random.default_rng()
|
||||
hit_mask = ranges < max_range - 1e-3
|
||||
n_hit = int(hit_mask.sum())
|
||||
if n_hit:
|
||||
ranges = ranges.copy()
|
||||
ranges[hit_mask] += rng.normal(0.0, sigma, size=n_hit).astype(np.float32)
|
||||
np.clip(ranges, 0.0, max_range, out=ranges)
|
||||
return ranges
|
||||
@@ -0,0 +1,197 @@
|
||||
"""Multi-target tracker for LiDAR-detected sheep.
|
||||
|
||||
Greedy nearest-neighbour data association (with a distance gate) across
|
||||
frames, plus a memory of last-seen positions for tracks that fall out
|
||||
of the dog's FOV. Output is a ``{name: (x, y)}`` dict shaped exactly
|
||||
like the receiver-based ``sheep_positions`` used previously by the
|
||||
Webots controller and by the env, so Strömbom and Sequential can
|
||||
consume it unchanged.
|
||||
|
||||
Penned-detection heuristic
|
||||
--------------------------
|
||||
Two ways a track is marked penned:
|
||||
1. Its current estimated position is south of the gate plane and
|
||||
within the gate column (the ``is_penned_position`` test the env
|
||||
already uses on ground truth).
|
||||
2. It hasn't been observed for ``STALE_STEPS`` and its last-seen
|
||||
position was inside the gate-approach band — the dog's LiDAR can
|
||||
only see ~2 m into the pen through the open gate, so a sheep
|
||||
that disappeared near the entry has almost certainly entered.
|
||||
|
||||
Tracks marked penned are excluded from ``get_positions()`` (which is
|
||||
what Strömbom consumes), matching the prior receiver-based behaviour.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
from herding.geometry import MAX_SHEEP, in_pen, is_penned_position
|
||||
|
||||
|
||||
GATE_M = 2.5 # m — primary NN gate (recent tracks)
|
||||
REACQUIRE_GATE_M = 4.5 # m — wider gate for re-acquiring stale tracks (sheep moved during occlusion)
|
||||
REACQUIRE_MIN_AGE = 20 # steps — only rebind via the wide gate if the track has been stale for this long
|
||||
PENNED_GATE_M = 4.0 # m — wide gate for matching against already-penned tracks; the pen is small (3×7 m) so duplicates are easy without it
|
||||
FORGET_STEPS = 200 # ~3.2 s — delete stale active tracks; tighter than 5 s to limit phantoms but long enough to bridge typical FOV gaps
|
||||
MAX_ACTIVE_TRACKS = MAX_SHEEP # hard cap to the worst-case real flock size
|
||||
# Penned tracks are never forgotten: sheep don't leave the pen, and
|
||||
# losing the track makes the counter oscillate as the same sheep gets
|
||||
# re-detected and counted multiple times.
|
||||
|
||||
|
||||
class SheepTracker:
|
||||
"""Online tracker with NN association and a forgetful memory.
|
||||
|
||||
Each track stores ``(x, y, last_seen_step, penned)``.
|
||||
"""
|
||||
|
||||
def __init__(self, gate: float = GATE_M):
|
||||
self.gate = gate
|
||||
# tid → (x, y, last_seen_step, penned)
|
||||
self._tracks: dict[int, tuple[float, float, int, bool]] = {}
|
||||
self._next_id = 0
|
||||
self.step = 0
|
||||
|
||||
def reset(self) -> None:
|
||||
self._tracks.clear()
|
||||
self._next_id = 0
|
||||
self.step = 0
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Update
|
||||
# ------------------------------------------------------------------
|
||||
def update(self, detections: list[tuple[float, float]]) -> dict[str, tuple[float, float]]:
|
||||
"""Fold a new set of detections in and return active positions."""
|
||||
self.step += 1
|
||||
|
||||
det_used: set[int] = set()
|
||||
updated_tids: set[int] = set()
|
||||
|
||||
# Pass 1: match against ACTIVE tracks first (oldest-seen-first so
|
||||
# a re-emerging long-lost sheep grabs its old ID before a fresh
|
||||
# neighbour does).
|
||||
active_tids = [tid for tid, t in self._tracks.items() if not t[3]]
|
||||
active_tids.sort(key=lambda tid: self._tracks[tid][2])
|
||||
for tid in active_tids:
|
||||
tx, ty, _, _ = self._tracks[tid]
|
||||
best_j, best_d = -1, self.gate
|
||||
for j, (dx, dy) in enumerate(detections):
|
||||
if j in det_used:
|
||||
continue
|
||||
d = math.hypot(dx - tx, dy - ty)
|
||||
if d < best_d:
|
||||
best_d = d
|
||||
best_j = j
|
||||
if best_j >= 0:
|
||||
dx, dy = detections[best_j]
|
||||
self._tracks[tid] = (dx, dy, self.step, False)
|
||||
det_used.add(best_j)
|
||||
updated_tids.add(tid)
|
||||
|
||||
# Pass 1b: re-acquisition with a wider gate for tracks that have
|
||||
# been stale for ≥ REACQUIRE_MIN_AGE steps. Sheep flee at
|
||||
# ~0.6 m/s; over a 1–2 s occlusion (dog rotating or driving)
|
||||
# they move enough that a fresh detection lies outside the
|
||||
# primary GATE_M but is still clearly the same sheep. Without
|
||||
# this, phantom tracks accumulate and corrupt the CoM.
|
||||
for tid in active_tids:
|
||||
if tid in updated_tids:
|
||||
continue
|
||||
tx, ty, last, _ = self._tracks[tid]
|
||||
if (self.step - last) < REACQUIRE_MIN_AGE:
|
||||
continue
|
||||
best_j, best_d = -1, REACQUIRE_GATE_M
|
||||
for j, (dx, dy) in enumerate(detections):
|
||||
if j in det_used:
|
||||
continue
|
||||
d = math.hypot(dx - tx, dy - ty)
|
||||
if d < best_d:
|
||||
best_d = d
|
||||
best_j = j
|
||||
if best_j >= 0:
|
||||
dx, dy = detections[best_j]
|
||||
self._tracks[tid] = (dx, dy, self.step, False)
|
||||
det_used.add(best_j)
|
||||
updated_tids.add(tid)
|
||||
|
||||
# Pass 2: match remaining detections against PENNED tracks with
|
||||
# a tighter gate. Without this, every frame near the gate spawns
|
||||
# a fresh penned track for the same sheep, which under a long
|
||||
# Webots run leads to thousands of phantom penned tracks.
|
||||
penned_tids = [tid for tid, t in self._tracks.items() if t[3]]
|
||||
for tid in penned_tids:
|
||||
tx, ty, _, _ = self._tracks[tid]
|
||||
best_j, best_d = -1, PENNED_GATE_M
|
||||
for j, (dx, dy) in enumerate(detections):
|
||||
if j in det_used:
|
||||
continue
|
||||
d = math.hypot(dx - tx, dy - ty)
|
||||
if d < best_d:
|
||||
best_d = d
|
||||
best_j = j
|
||||
if best_j >= 0:
|
||||
dx, dy = detections[best_j]
|
||||
self._tracks[tid] = (dx, dy, self.step, True)
|
||||
det_used.add(best_j)
|
||||
|
||||
# Unmatched detections → new tracks. A detection that is already
|
||||
# inside the pen is born "penned" so we don't accumulate active
|
||||
# tracks for sheep that arrived in the pen during occlusion.
|
||||
for j, (dx, dy) in enumerate(detections):
|
||||
if j in det_used:
|
||||
continue
|
||||
penned = in_pen(dx, dy) or is_penned_position(dx, dy)
|
||||
self._tracks[self._next_id] = (dx, dy, self.step, penned)
|
||||
self._next_id += 1
|
||||
|
||||
# Promote active tracks to penned ONLY by geometric position
|
||||
# (sheep is in the pen column south of the gate). The previous
|
||||
# "stale + near gate" heuristic was firing on ordinary occlusion
|
||||
# near the gate and creating phantom penned tracks.
|
||||
for tid, (tx, ty, last, penned) in list(self._tracks.items()):
|
||||
if penned:
|
||||
continue
|
||||
if is_penned_position(tx, ty):
|
||||
self._tracks[tid] = (tx, ty, last, True)
|
||||
|
||||
# Forget stale ACTIVE tracks after FORGET_STEPS. Penned tracks
|
||||
# are kept indefinitely — sheep can't escape the pen, so once a
|
||||
# track is marked penned, that sheep is permanently penned.
|
||||
for tid, (tx, ty, last, penned) in list(self._tracks.items()):
|
||||
if penned:
|
||||
continue
|
||||
if (self.step - last) > FORGET_STEPS:
|
||||
del self._tracks[tid]
|
||||
|
||||
# Hard cap on the active set. If we somehow have more than
|
||||
# MAX_ACTIVE_TRACKS active tracks, drop the oldest-seen ones
|
||||
# first — they are most likely false positives from world
|
||||
# geometry (walls, gate posts) the env's raycaster doesn't
|
||||
# model, and a bloated active set wrecks the downstream CoM.
|
||||
active = [(tid, last) for tid, (_, _, last, p) in self._tracks.items()
|
||||
if not p]
|
||||
if len(active) > MAX_ACTIVE_TRACKS:
|
||||
active.sort(key=lambda kv: kv[1]) # oldest-seen first
|
||||
for tid, _ in active[: len(active) - MAX_ACTIVE_TRACKS]:
|
||||
del self._tracks[tid]
|
||||
|
||||
return self.get_positions()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Outputs
|
||||
# ------------------------------------------------------------------
|
||||
def get_positions(self) -> dict[str, tuple[float, float]]:
|
||||
"""Active (not-yet-penned) tracks. Same shape as receiver dict."""
|
||||
return {f"t{tid}": (x, y)
|
||||
for tid, (x, y, _, penned) in self._tracks.items()
|
||||
if not penned}
|
||||
|
||||
def get_penned_set(self) -> set[str]:
|
||||
return {f"t{tid}" for tid, (_, _, _, penned) in self._tracks.items() if penned}
|
||||
|
||||
def n_active(self) -> int:
|
||||
return sum(1 for _, _, _, penned in self._tracks.values() if not penned)
|
||||
|
||||
def n_penned(self) -> int:
|
||||
return sum(1 for _, _, _, penned in self._tracks.values() if penned)
|
||||
Executable
+166
@@ -0,0 +1,166 @@
|
||||
#!/bin/bash
|
||||
# tools/auto_dagger.sh — automated DAgger collection across many headless
|
||||
# Webots runs.
|
||||
#
|
||||
# For each (flock_size, run_index) combination, generates a world with N
|
||||
# active sheep at randomised positions, launches Webots in fast/headless
|
||||
# mode, lets the controller log (lidar_obs, teacher_action) pairs for up
|
||||
# to RUN_SEC seconds, kills the run, and moves on. The dog controller's
|
||||
# 500-step periodic flush means each run produces a complete .npz even
|
||||
# when killed by timeout.
|
||||
#
|
||||
# Usage:
|
||||
# tools/auto_dagger.sh [RUNS_PER_FLOCK] [SECONDS_PER_RUN]
|
||||
# RUNS_PER_FLOCK : how many randomised runs per flock size (default 3)
|
||||
# SECONDS_PER_RUN: wall-clock cap per Webots run (default 60)
|
||||
#
|
||||
# Env-var overrides:
|
||||
# HERDING_POLICY_DIR : policy the controller loads (only used when
|
||||
# HERDING_DAGGER_DRIVER=student). Default bc_v3.
|
||||
# HERDING_DAGGER_DRIVER : "teacher" (default) or "student".
|
||||
# HEADLESS=1 : force --no-rendering (default on).
|
||||
# FLOCKS="1 3 5 8 10" : space-separated flock sizes to iterate over.
|
||||
#
|
||||
# Output:
|
||||
# training/dagger/dagger_<ts>.npz — one per Webots run.
|
||||
#
|
||||
# After collection, run:
|
||||
# python -m tools.dagger_merge_train --out training/runs/bc_dagger
|
||||
|
||||
set -e
|
||||
|
||||
RUNS_PER_FLOCK=${1:-3}
|
||||
RUN_SEC=${2:-60}
|
||||
FLOCKS=${FLOCKS:-"1 3 5 8 10"}
|
||||
HEADLESS=${HEADLESS:-1}
|
||||
|
||||
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
|
||||
SRC="$ROOT/worlds/field.wbt"
|
||||
DST="$ROOT/worlds/field_test.wbt"
|
||||
POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_v3}"
|
||||
DRIVER="${HERDING_DAGGER_DRIVER:-teacher}"
|
||||
DONE_FILE="$ROOT/training/dagger/.DONE"
|
||||
WEBOTS_PID=""
|
||||
|
||||
cleanup() {
|
||||
echo "Caught interrupt — killing Webots (pid=$WEBOTS_PID) and exiting."
|
||||
[[ -n "$WEBOTS_PID" ]] && kill "$WEBOTS_PID" 2>/dev/null
|
||||
wait "$WEBOTS_PID" 2>/dev/null || true
|
||||
exit 1
|
||||
}
|
||||
trap cleanup INT TERM
|
||||
|
||||
webots_args=(--mode=fast --batch --minimize)
|
||||
if [[ "$HEADLESS" == "1" ]]; then
|
||||
webots_args+=(--no-rendering)
|
||||
fi
|
||||
|
||||
echo "Auto-dagger collection"
|
||||
echo " flock sizes : $FLOCKS"
|
||||
echo " runs per size : $RUNS_PER_FLOCK"
|
||||
echo " seconds per run : $RUN_SEC"
|
||||
echo " policy dir : $POLICY_DIR (used only when driver=student)"
|
||||
echo " driver : $DRIVER"
|
||||
echo " webots flags : ${webots_args[*]}"
|
||||
echo
|
||||
|
||||
# Runtime config — re-written before each run anyway, but written once
|
||||
# here so a manual webots launch at the same time would also pick it up.
|
||||
cat > "$ROOT/herding_runtime.cfg" <<EOF
|
||||
HERDING_MODE=dagger
|
||||
HERDING_POLICY_DIR=$POLICY_DIR
|
||||
HERDING_DAGGER_DRIVER=$DRIVER
|
||||
EOF
|
||||
|
||||
# Count files before, so we can summarise what was added.
|
||||
mkdir -p "$ROOT/training/dagger"
|
||||
before_count=$(ls -1 "$ROOT/training/dagger"/dagger_*.npz 2>/dev/null | wc -l || echo 0)
|
||||
|
||||
run_idx=0
|
||||
total_runs=0
|
||||
for f in $FLOCKS; do total_runs=$((total_runs + RUNS_PER_FLOCK)); done
|
||||
|
||||
for flock in $FLOCKS; do
|
||||
for run in $(seq 1 "$RUNS_PER_FLOCK"); do
|
||||
run_idx=$((run_idx + 1))
|
||||
seed=$((1000 * flock + run))
|
||||
echo "=== [$run_idx/$total_runs] flock=$flock run=$run seed=$seed ==="
|
||||
|
||||
# Generate randomised world.
|
||||
cp "$SRC" "$DST"
|
||||
for i in $(seq $((flock + 1)) 10); do
|
||||
sed -i "s|^Sheep .* \"sheep${i}\".*|# &|" "$DST"
|
||||
done
|
||||
# Inline Python: jitter sheep1..flock translations.
|
||||
python3 - "$DST" "$flock" "$seed" <<'PYEOF'
|
||||
import re, random, sys
|
||||
path, n_str, seed = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||
n = int(n_str); random.seed(int(seed))
|
||||
with open(path) as f:
|
||||
txt = f.read()
|
||||
def rand_pos():
|
||||
while True:
|
||||
x = random.uniform(-12.0, 12.0)
|
||||
y = random.uniform(-10.0, 12.0) # avoid the gate strip
|
||||
if x * x + y * y > 9.0: # at least 3 m from dog spawn
|
||||
return x, y
|
||||
for i in range(1, n + 1):
|
||||
x, y = rand_pos()
|
||||
pat = re.compile(
|
||||
r'Sheep \{ translation\s+\S+\s+\S+\s+(\S+)\s+name "sheep' + str(i) + r'"'
|
||||
)
|
||||
txt = pat.sub(rf'Sheep {{ translation {x:.2f} {y:.2f} \g<1> name "sheep{i}"', txt, count=1)
|
||||
with open(path, "w") as f:
|
||||
f.write(txt)
|
||||
PYEOF
|
||||
|
||||
# Run Webots in the background; poll for the .DONE sentinel or
|
||||
# the wall-clock timeout, whichever comes first.
|
||||
rm -f "$DONE_FILE"
|
||||
webots "${webots_args[@]}" "$DST" \
|
||||
> /tmp/webots_dagger_run.log 2>&1 &
|
||||
WEBOTS_PID=$!
|
||||
|
||||
# Give the controller 10 s to start before polling the sentinel,
|
||||
# otherwise a sheep that spawns already penned triggers an instant
|
||||
# false-positive kill.
|
||||
elapsed=0
|
||||
grace=10
|
||||
while kill -0 "$WEBOTS_PID" 2>/dev/null; do
|
||||
if (( elapsed >= grace )) && [[ -f "$DONE_FILE" ]]; then
|
||||
echo " sentinel .DONE detected — killing Webots early"
|
||||
kill "$WEBOTS_PID" 2>/dev/null
|
||||
wait "$WEBOTS_PID" 2>/dev/null || true
|
||||
break
|
||||
fi
|
||||
if (( elapsed >= RUN_SEC )); then
|
||||
echo " timeout ($RUN_SEC s) — killing Webots"
|
||||
kill "$WEBOTS_PID" 2>/dev/null
|
||||
wait "$WEBOTS_PID" 2>/dev/null || true
|
||||
break
|
||||
fi
|
||||
sleep 2
|
||||
elapsed=$((elapsed + 2))
|
||||
done
|
||||
WEBOTS_PID=""
|
||||
|
||||
# Quick sanity from the log: did the controller actually run?
|
||||
if grep -q "running in mode=dagger" /tmp/webots_dagger_run.log; then
|
||||
new_pairs=$(tail -50 /tmp/webots_dagger_run.log | grep -oE 'logged=[0-9]+' | tail -1)
|
||||
echo " controller ran ($new_pairs)"
|
||||
else
|
||||
echo " WARNING: controller may not have started (see /tmp/webots_dagger_run.log)"
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
after_count=$(ls -1 "$ROOT/training/dagger"/dagger_*.npz 2>/dev/null | wc -l || echo 0)
|
||||
new_files=$((after_count - before_count))
|
||||
|
||||
echo
|
||||
echo "Done."
|
||||
echo " new dagger files : $new_files"
|
||||
echo " total in dir : $after_count"
|
||||
echo
|
||||
echo "Next:"
|
||||
echo " python -m tools.dagger_merge_train --out training/runs/bc_dagger"
|
||||
+37
-9
@@ -26,12 +26,16 @@ if _PROJECT_ROOT not in sys.path:
|
||||
|
||||
import numpy as np
|
||||
|
||||
from herding.active_scan import ActiveScanTeacher
|
||||
from herding.geometry import PEN_ENTRY
|
||||
from herding.sequential import compute_action as sequential_action
|
||||
from herding.strombom import compute_action as strombom_action
|
||||
from training.herding_env import HerdingEnv
|
||||
|
||||
|
||||
# Base analytic teachers (no scanning). The default at demo-collection
|
||||
# time wraps these in ActiveScanTeacher, which under LiDAR makes the
|
||||
# teacher operate on the same partial obs as the student.
|
||||
TEACHERS = {
|
||||
"sequential": sequential_action,
|
||||
"strombom": strombom_action,
|
||||
@@ -39,19 +43,34 @@ TEACHERS = {
|
||||
|
||||
|
||||
def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int,
|
||||
teacher_fn):
|
||||
teacher_fn, frame_stack: int = 1, privileged: bool = False):
|
||||
env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
|
||||
difficulty=1.0, seed=seed)
|
||||
difficulty=1.0, seed=seed, frame_stack=frame_stack)
|
||||
obs, _ = env.reset(seed=seed)
|
||||
obs_list, action_list = [], []
|
||||
# Active-scan wrapper: scan first, then run the base teacher on the
|
||||
# tracker dict. Reset state per episode so the opening scan kicks in.
|
||||
scan_teacher = ActiveScanTeacher(teacher_fn)
|
||||
for step in range(max_steps):
|
||||
positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
|
||||
for i in range(env.n_sheep) if not env.sheep_penned[i]}
|
||||
if not positions:
|
||||
break
|
||||
vx, vy, _mode = teacher_fn(
|
||||
(env.dog_x, env.dog_y), positions, PEN_ENTRY,
|
||||
)
|
||||
if privileged:
|
||||
# Asymmetric "learning by cheating": teacher reads GT, student
|
||||
# gets LiDAR obs. Kept available for ablation; default off.
|
||||
positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
|
||||
for i in range(env.n_sheep) if not env.sheep_penned[i]}
|
||||
if not positions:
|
||||
break
|
||||
vx, vy, _mode = teacher_fn(
|
||||
(env.dog_x, env.dog_y), positions, PEN_ENTRY,
|
||||
)
|
||||
else:
|
||||
# Matched-perception teacher: it sees what the student sees
|
||||
# (the tracker dict), with active scanning to fill the
|
||||
# tracker before driving.
|
||||
positions = env.perceived_positions()
|
||||
vx, vy, _mode = scan_teacher(
|
||||
(env.dog_x, env.dog_y), env.dog_heading,
|
||||
positions, PEN_ENTRY,
|
||||
)
|
||||
action = np.array([vx, vy], dtype=np.float32)
|
||||
if step % subsample == 0:
|
||||
obs_list.append(obs.copy())
|
||||
@@ -81,6 +100,14 @@ def main():
|
||||
parser.add_argument("--teacher", default="sequential",
|
||||
choices=list(TEACHERS.keys()),
|
||||
help="Which analytic teacher to demonstrate.")
|
||||
parser.add_argument("--frame-stack", type=int, default=1,
|
||||
help="K — concatenate the last K env obs into a "
|
||||
"single (32·K)-D vector. Lets a memoryless "
|
||||
"MLP recover temporal info under partial "
|
||||
"LiDAR observability.")
|
||||
parser.add_argument("--privileged", action="store_true",
|
||||
help="Teacher reads ground truth (asymmetric BC). "
|
||||
"Default: matched-perception with active scan.")
|
||||
args = parser.parse_args()
|
||||
teacher_fn = TEACHERS[args.teacher]
|
||||
print(f"[demos] teacher: {args.teacher}")
|
||||
@@ -97,6 +124,7 @@ def main():
|
||||
for seed in range(args.seeds_per_n):
|
||||
obs, actions, success, total_steps = collect_one(
|
||||
n, seed, args.max_steps, args.subsample, teacher_fn,
|
||||
frame_stack=args.frame_stack, privileged=args.privileged,
|
||||
)
|
||||
n_total += 1
|
||||
if success:
|
||||
|
||||
@@ -0,0 +1,135 @@
|
||||
"""Merge Webots DAgger demos with sim demos and retrain the BC policy.
|
||||
|
||||
The dog controller in ``HERDING_MODE=dagger`` writes per-run files to
|
||||
``training/dagger/dagger_<ts>.npz`` containing ``(obs, actions)`` pairs
|
||||
where:
|
||||
|
||||
* ``obs`` is the **stacked LiDAR observation** as built by the live
|
||||
Webots tracker — exactly the input distribution the deployed
|
||||
controller sees.
|
||||
* ``actions`` is the **active-scan-teacher action computed from
|
||||
ground-truth sheep positions** (read off the sheep emitter).
|
||||
|
||||
Combined with the existing sim demos (``training/demos_v3.npz`` by
|
||||
default), this gives the BC student a training set that includes the
|
||||
real Webots false-positive distribution — closing the sim-to-real
|
||||
perception gap that the all-sim pipeline couldn't bridge.
|
||||
|
||||
Usage::
|
||||
|
||||
# Iteration 1 — merge all dagger files with sim demos, retrain
|
||||
python -m tools.dagger_merge_train \\
|
||||
--sim training/demos_v3.npz \\
|
||||
--out training/runs/bc_dagger1
|
||||
|
||||
# Iteration 2 — drop the sim baseline, train only on Webots data
|
||||
python -m tools.dagger_merge_train --no-sim --out training/runs/bc_dagger2
|
||||
|
||||
The new policy is saved as ``<out>/policy.zip`` and is auto-loaded by
|
||||
the controller's resolution priority on the next Webots run.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
|
||||
if _PROJECT_ROOT not in sys.path:
|
||||
sys.path.insert(0, _PROJECT_ROOT)
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--sim", default="training/demos_v3.npz",
|
||||
help="Sim demo file to mix with the Webots data. "
|
||||
"Pass --no-sim to train only on dagger data.")
|
||||
parser.add_argument("--no-sim", action="store_true",
|
||||
help="Skip the sim demos entirely.")
|
||||
parser.add_argument("--dagger-glob", default="training/dagger/dagger_*.npz",
|
||||
help="Glob for Webots-collected dagger files.")
|
||||
parser.add_argument("--merged-out", default="training/demos_dagger.npz",
|
||||
help="Where to write the merged demo file.")
|
||||
parser.add_argument("--out", default="training/runs/bc_dagger",
|
||||
help="Where to write the BC policy.")
|
||||
parser.add_argument("--epochs", type=int, default=60)
|
||||
parser.add_argument("--batch-size", type=int, default=256)
|
||||
parser.add_argument("--net-arch", default="512,512")
|
||||
parser.add_argument("--cos-weight", type=float, default=1.0)
|
||||
args = parser.parse_args()
|
||||
|
||||
# --- Gather Webots files ---
|
||||
dagger_paths = sorted(glob.glob(args.dagger_glob))
|
||||
if not dagger_paths:
|
||||
raise SystemExit(f"No dagger files found at {args.dagger_glob} — "
|
||||
"run Webots in HERDING_MODE=dagger first.")
|
||||
|
||||
chunks_obs: list[np.ndarray] = []
|
||||
chunks_act: list[np.ndarray] = []
|
||||
total_dagger = 0
|
||||
for p in dagger_paths:
|
||||
data = np.load(p)
|
||||
obs = data["obs"].astype(np.float32)
|
||||
act = data["actions"].astype(np.float32)
|
||||
chunks_obs.append(obs)
|
||||
chunks_act.append(act)
|
||||
total_dagger += len(obs)
|
||||
print(f" + {p}: {obs.shape[0]} pairs (obs dim {obs.shape[1]})")
|
||||
print(f"[merge] total dagger pairs: {total_dagger}")
|
||||
|
||||
obs_dim = chunks_obs[0].shape[1]
|
||||
if any(c.shape[1] != obs_dim for c in chunks_obs):
|
||||
raise SystemExit(
|
||||
"Dagger files have inconsistent obs dims — they were collected "
|
||||
"with different frame_stack settings. Either rerun with a "
|
||||
"consistent setting or filter the glob."
|
||||
)
|
||||
|
||||
# --- Optionally include sim demos ---
|
||||
if not args.no_sim:
|
||||
sim = np.load(args.sim)
|
||||
sim_obs = sim["obs"].astype(np.float32)
|
||||
sim_act = sim["actions"].astype(np.float32)
|
||||
if sim_obs.shape[1] != obs_dim:
|
||||
raise SystemExit(
|
||||
f"Sim demos have obs dim {sim_obs.shape[1]} but dagger demos "
|
||||
f"have {obs_dim}. Recollect sim demos at the same frame_stack."
|
||||
)
|
||||
chunks_obs.append(sim_obs)
|
||||
chunks_act.append(sim_act)
|
||||
print(f"[merge] + sim demos: {sim_obs.shape[0]} pairs from {args.sim}")
|
||||
|
||||
obs_all = np.concatenate(chunks_obs, axis=0)
|
||||
act_all = np.concatenate(chunks_act, axis=0)
|
||||
# Empty meta — bc_pretrain doesn't actually use it but the file format
|
||||
# has it.
|
||||
meta = np.zeros((0, 5), dtype=np.int32)
|
||||
|
||||
Path(args.merged_out).parent.mkdir(parents=True, exist_ok=True)
|
||||
np.savez(args.merged_out, obs=obs_all, actions=act_all, meta=meta)
|
||||
print(f"[merge] wrote {len(obs_all)} pairs → {args.merged_out}")
|
||||
print(f"[merge] obs shape {obs_all.shape}, action shape {act_all.shape}")
|
||||
|
||||
# --- Run BC training ---
|
||||
cmd = [
|
||||
sys.executable, "-m", "training.bc_pretrain",
|
||||
"--demos", args.merged_out,
|
||||
"--out", args.out,
|
||||
"--epochs", str(args.epochs),
|
||||
"--batch-size", str(args.batch_size),
|
||||
"--net-arch", args.net_arch,
|
||||
"--cos-weight", str(args.cos_weight),
|
||||
]
|
||||
print(f"\n[merge] launching: {' '.join(cmd)}")
|
||||
subprocess.run(cmd, check=True, cwd=_PROJECT_ROOT)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+14
-9
@@ -7,29 +7,33 @@
|
||||
# Usage:
|
||||
# tools/run_webots.sh [N] [MODE]
|
||||
# N : number of active sheep (1..10), default 10
|
||||
# MODE : "rl" | "strombom" | "sequential", default "rl"
|
||||
# MODE : "bc" | "rl" | "strombom" | "sequential" | "dagger", default "bc"
|
||||
#
|
||||
# Examples:
|
||||
# tools/run_webots.sh 10 rl # BC-trained RL policy, 10 sheep
|
||||
# tools/run_webots.sh 10 bc # BC-trained policy, 10 sheep
|
||||
# tools/run_webots.sh 10 rl # KL-PPO fine-tune of bc, 10 sheep
|
||||
# tools/run_webots.sh 5 sequential # the analytic teacher, 5 sheep
|
||||
# tools/run_webots.sh 3 strombom # canonical baseline, 3 sheep
|
||||
#
|
||||
# Notes:
|
||||
# * The RL mode loads training/runs/bc_solo/policy.zip by default.
|
||||
# Override via HERDING_POLICY_DIR=/path/to/run env var.
|
||||
# * The RL mode loads the latest BC policy by default — priority
|
||||
# bc_dagger_v2 → bc_dagger → bc_c2v3 (the controller resolves it).
|
||||
# (LiDAR-perception, frame-stack K=4). Override via
|
||||
# HERDING_POLICY_DIR=/path/to/run env var.
|
||||
# * Conda env "tir" must be active (provides stable-baselines3 + torch).
|
||||
|
||||
set -e
|
||||
N=${1:-10}
|
||||
MODE=${2:-rl}
|
||||
MODE=${2:-bc}
|
||||
|
||||
if (( N < 1 || N > 10 )); then
|
||||
echo "N must be 1..10, got $N" >&2; exit 1
|
||||
fi
|
||||
case "$MODE" in
|
||||
rl|strombom|sequential) ;;
|
||||
*) echo "MODE must be rl|strombom|sequential, got '$MODE'" >&2; exit 1 ;;
|
||||
bc|rl|strombom|sequential|dagger) ;;
|
||||
*) echo "MODE must be bc|rl|strombom|sequential|dagger, got '$MODE'" >&2; exit 1 ;;
|
||||
esac
|
||||
DAGGER_DRIVER=${HERDING_DAGGER_DRIVER:-teacher}
|
||||
|
||||
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
|
||||
SRC="$ROOT/worlds/field.wbt"
|
||||
@@ -46,15 +50,16 @@ echo "------------------------------------------------------------"
|
||||
echo "World : $DST"
|
||||
echo "Mode : $MODE"
|
||||
echo "Sheep : $active active"
|
||||
echo "Policy dir : ${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_solo}"
|
||||
echo "Policy dir : ${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_v3}"
|
||||
echo "------------------------------------------------------------"
|
||||
|
||||
# Webots strips HERDING_* env vars from controller subprocesses in some
|
||||
# setups, so we also write a runtime config file the controller reads.
|
||||
RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_solo}"
|
||||
RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc_v3}"
|
||||
cat > "$ROOT/herding_runtime.cfg" <<EOF
|
||||
HERDING_MODE=$MODE
|
||||
HERDING_POLICY_DIR=$RESOLVED_POLICY_DIR
|
||||
HERDING_DAGGER_DRIVER=$DAGGER_DRIVER
|
||||
EOF
|
||||
|
||||
export HERDING_MODE="$MODE"
|
||||
|
||||
+70
-54
@@ -1,21 +1,29 @@
|
||||
# Training pipeline
|
||||
|
||||
Behavior cloning of analytic herding teachers into a neural network
|
||||
policy that runs in Webots. PPO from scratch and PPO fine-tune of BC
|
||||
were tried earlier and are kept under `train_ppo.py` as experimental
|
||||
options, but the BC route alone is what we ship.
|
||||
Behavior cloning of analytic herding teachers into a neural-network
|
||||
policy that runs under LiDAR perception in Webots.
|
||||
|
||||
```
|
||||
sim demos (active-scan teacher on tracker output, K=4 frame stack)
|
||||
│
|
||||
▼
|
||||
bc_pretrain.py ──► runs/bc_v3 (deployed policy — beats Strömbom on n≥8)
|
||||
│
|
||||
▼ (optional: tools/auto_dagger.sh + tools/dagger_merge_train.py
|
||||
│ if sim-trained doesn't transfer cleanly to Webots)
|
||||
│
|
||||
runs/bc_dagger
|
||||
```
|
||||
|
||||
## Files
|
||||
|
||||
```
|
||||
herding_env.py — Gymnasium env (used for demo collection + eval)
|
||||
bc_pretrain.py — supervised MSE+cosine training of an SB3 MlpPolicy
|
||||
against (obs, action) demos
|
||||
herding_env.py — Gymnasium env (LiDAR raycast + tracker by default)
|
||||
bc_pretrain.py — MSE + cosine BC of (obs, action) demos into MlpPolicy
|
||||
eval.py — analytic teachers + BC policies, full n=1..10 grid
|
||||
parity_test.py — shape/determinism/baseline smoke test
|
||||
train_ppo.py — PPO trainer (experimental — see Appendix below)
|
||||
configs/ — PPO hyperparameter YAML
|
||||
runs/ — checkpoints (.gitignored)
|
||||
parity_test.py — shape / determinism / baseline smoke test
|
||||
runs/ — checkpoints (most are .gitignored; the deployed
|
||||
ones are whitelisted in the top-level .gitignore)
|
||||
```
|
||||
|
||||
## Setup
|
||||
@@ -31,66 +39,74 @@ rollout collection, not gradient compute.
|
||||
## The BC pipeline
|
||||
|
||||
```
|
||||
# 1. Generate demos from an analytic teacher.
|
||||
# --teacher: strombom (default), sequential, drive_only, hybrid, strombom_smooth
|
||||
# 1. Sim demos with the active-scan + Strömbom teacher under LiDAR
|
||||
# perception. K=4 frame stack so the MLP has temporal context.
|
||||
python -m tools.collect_demos --teacher strombom \
|
||||
--out demos.npz --seeds-per-n 30 --subsample 3
|
||||
--out demos_v3.npz --seeds-per-n 15 --subsample 3 --frame-stack 4
|
||||
|
||||
# 2. Behavior-clone the demos into an MLP policy.
|
||||
python -m training.bc_pretrain --demos demos.npz \
|
||||
--out runs/bc_flock --epochs 100 --net-arch 512,512
|
||||
# 2. Behavior-clone.
|
||||
python -m training.bc_pretrain --demos demos_v3.npz \
|
||||
--out runs/bc_v3 --epochs 60 --net-arch 512,512
|
||||
|
||||
# 3. Evaluate the resulting policy.
|
||||
python -m training.eval --policy runs/bc_flock \
|
||||
--max-flock 10 --max-steps 30000 --n-seeds 5
|
||||
# 3. Evaluate.
|
||||
python -m training.eval --policy runs/bc_v3 \
|
||||
--max-flock 10 --max-steps 8000 --n-seeds 5
|
||||
```
|
||||
|
||||
Wall time: ~10 min demos + ~5 min BC training + ~5 min eval.
|
||||
|
||||
`bc_pretrain.py` saves the **best-val_cos** snapshot, not the final
|
||||
epoch — multi-modal teachers (Strömbom's collect/drive switch) make
|
||||
training noisy and the last epoch is often worse than an earlier one.
|
||||
epoch — multi-modal teachers make training noisy and the last epoch is
|
||||
often worse than an earlier one.
|
||||
|
||||
## DAgger from Webots
|
||||
|
||||
Sim-only BC plateaus because the env's 2D raycast can't reproduce all
|
||||
the false-positive clusters Webots generates from real geometry. The
|
||||
fix is to collect (obs, teacher_action) pairs from inside Webots:
|
||||
|
||||
```
|
||||
# Headless DAgger collection: 5 flock sizes × 3 runs each.
|
||||
tools/auto_dagger.sh 3 60
|
||||
|
||||
# Merge with the sim baseline + retrain.
|
||||
python -m tools.dagger_merge_train --out runs/bc_dagger
|
||||
```
|
||||
|
||||
Iterate by re-running collection with the new student in the driver's
|
||||
seat:
|
||||
|
||||
```
|
||||
HERDING_POLICY_DIR=$PWD/training/runs/bc_dagger \
|
||||
HERDING_DAGGER_DRIVER=student \
|
||||
tools/auto_dagger.sh 3 60
|
||||
python -m tools.dagger_merge_train --out runs/bc_dagger_v2
|
||||
```
|
||||
|
||||
## Available analytic teachers
|
||||
|
||||
| Name | What it does | Best for |
|
||||
| Name | What it does | Notes |
|
||||
|---|---|---|
|
||||
| `strombom` | Canonical Strömbom — collect when flock is scattered, drive CoM otherwise | Tight-cohesion regime, n=1-10 |
|
||||
| `sequential` | Pick the sheep closest to the pen and drive only it | Loose-cohesion regime, n=1-10 |
|
||||
| `drive_only` | Strömbom drive without collect mode (continuous action) | Easier-to-BC alternative; less reliable than full Strömbom |
|
||||
| `hybrid` | Drive rearmost sheep when far, switch to closest near gate | Failed experiment, kept for write-up |
|
||||
| `strombom_smooth` | Sigmoid-blended Strömbom collect↔drive | Failed experiment |
|
||||
| `strombom` | Canonical Strömbom — collect when flock is scattered, drive CoM otherwise | Default; works well for n=1–10 under tight cohesion |
|
||||
| `sequential` | Pick the sheep closest to the pen and drive only it | Alternative; needs loose-cohesion regime |
|
||||
|
||||
## Evaluating the analytic teachers directly
|
||||
Both are wrapped at demo-collection time in
|
||||
`herding/active_scan.py:ActiveScanTeacher`, which adds an opening
|
||||
in-place rotation, walk-to-centre when the LiDAR sees nothing, and
|
||||
near-sheep speed modulation (the same modulation `herding/control.py`
|
||||
applies to every dog mode at inference).
|
||||
|
||||
## Evaluating analytic teachers directly
|
||||
|
||||
```
|
||||
python -m training.eval --policy strombom --max-flock 10 --max-steps 30000 --n-seeds 5
|
||||
python -m training.eval --policy sequential --max-flock 10 --max-steps 30000 --n-seeds 5
|
||||
python -m training.eval --policy strombom --max-flock 10 --max-steps 8000 --n-seeds 5
|
||||
python -m training.eval --policy sequential --max-flock 10 --max-steps 8000 --n-seeds 5
|
||||
```
|
||||
|
||||
## Webots inference
|
||||
|
||||
The Webots dog controller (`controllers/shepherd_dog/shepherd_dog.py`)
|
||||
loads a saved BC zip when launched in `rl` mode:
|
||||
|
||||
```
|
||||
HERDING_POLICY_DIR=$PWD/runs/bc_flock tools/run_webots.sh 10 rl
|
||||
tools/run_webots.sh 10 rl
|
||||
```
|
||||
|
||||
It auto-discovers a checkpoint named `policy.zip`, `best_model.zip`, or
|
||||
`final.zip` in the directory.
|
||||
|
||||
## Appendix — experimental PPO scripts
|
||||
|
||||
`train_ppo.py` contains the PPO/RL pipeline tried before BC:
|
||||
* PPO from scratch with curriculum learning over flock size + spawn area.
|
||||
* PPO fine-tune of a BC checkpoint.
|
||||
|
||||
Both ran into stability issues (PPO's exploration noise destroys BC
|
||||
weights faster than the reward signal can rebuild them; PPO from
|
||||
scratch never sees pen events often enough during random exploration to
|
||||
credit-assign the +500 done bonus).
|
||||
|
||||
The script is left in place because the abstractions are sound and the
|
||||
code is reusable for follow-up work (e.g. KL-regularised fine-tune
|
||||
with a frozen reference policy). Not part of the deliverable pipeline.
|
||||
The dog controller loads the highest-priority policy that exists
|
||||
(`bc_dagger_v2` → `bc_dagger` → `bc_v3`). Override with
|
||||
`HERDING_POLICY_DIR=…` if you want a specific checkpoint.
|
||||
|
||||
+18
-6
@@ -43,13 +43,15 @@ from stable_baselines3.common.vec_env import DummyVecEnv
|
||||
from training.herding_env import HerdingEnv
|
||||
|
||||
|
||||
def build_model(net_arch_pi, net_arch_vf, log_std_init: float):
|
||||
"""Build a fresh SB3 PPO with the same architecture as train_ppo.
|
||||
def build_model(net_arch_pi, net_arch_vf, log_std_init: float,
|
||||
frame_stack: int = 1):
|
||||
"""Build a fresh SB3 PPO solely as a vehicle for the policy weights.
|
||||
|
||||
We only need the policy to load weights into; PPO's training-loop
|
||||
plumbing isn't used during BC.
|
||||
PPO's training-loop plumbing isn't used during BC. ``frame_stack``
|
||||
must match the demo file so the env's obs space agrees with the
|
||||
recorded obs shape.
|
||||
"""
|
||||
env = DummyVecEnv([lambda: HerdingEnv()])
|
||||
env = DummyVecEnv([lambda: HerdingEnv(frame_stack=frame_stack)])
|
||||
model = PPO(
|
||||
"MlpPolicy", env,
|
||||
policy_kwargs=dict(
|
||||
@@ -139,7 +141,17 @@ def main():
|
||||
# --- Build model ---
|
||||
net_arch_pi = [int(x) for x in args.net_arch.split(",")]
|
||||
net_arch_vf = net_arch_pi[:]
|
||||
model, _env = build_model(net_arch_pi, net_arch_vf, args.log_std_init)
|
||||
# Auto-detect frame stacking from the demo file so a stacked-obs
|
||||
# demo trains a stacked-obs policy without an extra CLI flag.
|
||||
obs_dim = obs.shape[1]
|
||||
from herding.obs import OBS_DIM as _SINGLE
|
||||
if obs_dim % _SINGLE != 0:
|
||||
raise RuntimeError(f"demo obs dim {obs_dim} is not a multiple of {_SINGLE}")
|
||||
frame_stack = obs_dim // _SINGLE
|
||||
if frame_stack > 1:
|
||||
print(f"[bc] inferred frame_stack={frame_stack} from demo obs dim {obs_dim}")
|
||||
model, _env = build_model(net_arch_pi, net_arch_vf, args.log_std_init,
|
||||
frame_stack=frame_stack)
|
||||
policy = model.policy.to(args.device)
|
||||
optimizer = optim.Adam(policy.parameters(), lr=args.lr)
|
||||
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
# PPO hyperparameters for the herding env. Tuned for a 28-D obs / 2-D
|
||||
# continuous action space with 16 parallel envs on GPU. These are SB3
|
||||
# defaults nudged toward longer credit assignment (gamma=0.995) and a
|
||||
# slightly higher entropy bonus to keep exploration alive while curriculum
|
||||
# expands the flock size.
|
||||
|
||||
# --- PPO ---
|
||||
learning_rate: 3.0e-4
|
||||
n_steps: 2048 # rollout length per env before each update
|
||||
batch_size: 256
|
||||
n_epochs: 10
|
||||
gamma: 0.995
|
||||
gae_lambda: 0.95
|
||||
clip_range: 0.2
|
||||
ent_coef: 0.05 # was 0.01 — earlier runs collapsed to ~0 actions
|
||||
vf_coef: 0.5
|
||||
max_grad_norm: 0.5
|
||||
target_kl: null # disable early-stop on KL
|
||||
|
||||
# --- Network ---
|
||||
policy: MlpPolicy
|
||||
net_arch_pi: [128, 128]
|
||||
net_arch_vf: [128, 128]
|
||||
log_std_init: 0.5 # std≈1.6 instead of default 1.0 — more exploration
|
||||
|
||||
# --- Training schedule ---
|
||||
total_timesteps: 10_000_000
|
||||
n_envs: 16
|
||||
checkpoint_freq: 500_000 # in env steps
|
||||
eval_freq: 100_000 # in env steps
|
||||
n_eval_episodes: 20
|
||||
|
||||
# --- Curriculum (max-n_sheep schedule, in env steps) ---
|
||||
# Each entry: at step s, raise the env's max_n_sheep to k. The env samples
|
||||
# uniformly from [1, max_n_sheep] each reset, so this widens the
|
||||
# distribution gradually rather than swapping fixed sizes.
|
||||
#
|
||||
# State-space curriculum: difficulty controls sheep spawn area
|
||||
# (0 = sheep spawn just north of gate, 1 = sheep spawn anywhere in field).
|
||||
# Plus the existing flock-size curriculum.
|
||||
#
|
||||
# The two together let the policy first learn "what penning looks like"
|
||||
# in a regime where random exploration reliably triggers it, then
|
||||
# gradually generalise to the deployment distribution.
|
||||
curriculum:
|
||||
- { step: 0, max_n_sheep: 1, difficulty: 0.0 }
|
||||
- { step: 1_000_000, max_n_sheep: 1, difficulty: 0.3 }
|
||||
- { step: 2_000_000, max_n_sheep: 2, difficulty: 0.5 }
|
||||
- { step: 4_000_000, max_n_sheep: 3, difficulty: 0.8 }
|
||||
- { step: 6_000_000, max_n_sheep: 5, difficulty: 1.0 }
|
||||
- { step: 8_000_000, max_n_sheep: 8, difficulty: 1.0 }
|
||||
- { step: 9_000_000, max_n_sheep: 10, difficulty: 1.0 }
|
||||
+16
-4
@@ -46,9 +46,11 @@ def rollout(env: HerdingEnv, predict_fn, max_steps: int) -> dict:
|
||||
|
||||
def make_analytic_predictor(action_fn):
|
||||
def _predict(env, _obs):
|
||||
positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
|
||||
for i in range(env.n_sheep)
|
||||
if not env.sheep_penned[i]}
|
||||
# Use whatever perception the env exposes — tracker output in
|
||||
# LiDAR mode, ground truth in privileged mode. This makes
|
||||
# evaluation honest: the analytic teacher sees what the
|
||||
# deployed controller would see.
|
||||
positions = env.perceived_positions()
|
||||
vx, vy, _mode = action_fn((env.dog_x, env.dog_y), positions, PEN_ENTRY)
|
||||
return np.array([vx, vy], dtype=np.float32)
|
||||
return _predict
|
||||
@@ -82,6 +84,7 @@ def main():
|
||||
parser.add_argument("--difficulty", type=float, default=1.0)
|
||||
args = parser.parse_args()
|
||||
|
||||
frame_stack = 1 # default; analytic predictors don't use stacked obs
|
||||
if args.policy == "strombom":
|
||||
predict = make_analytic_predictor(strombom_action)
|
||||
elif args.policy == "sequential":
|
||||
@@ -103,6 +106,14 @@ def main():
|
||||
f"policy.zip, final.zip)"
|
||||
)
|
||||
model = PPO.load(str(zip_path), device="auto")
|
||||
# Auto-detect frame stacking from the policy's expected obs dim,
|
||||
# so eval runs with whatever stacking the policy was trained on.
|
||||
from herding.obs import OBS_DIM as _SINGLE
|
||||
policy_obs_dim = int(model.observation_space.shape[0])
|
||||
if policy_obs_dim % _SINGLE == 0 and policy_obs_dim // _SINGLE >= 1:
|
||||
frame_stack = policy_obs_dim // _SINGLE
|
||||
if frame_stack > 1:
|
||||
print(f"[eval] policy expects frame_stack={frame_stack}")
|
||||
vecnorm = None
|
||||
vn_path = run / "vecnormalize.pkl"
|
||||
if not vn_path.exists() and run.parent.name != "best":
|
||||
@@ -121,7 +132,8 @@ def main():
|
||||
successes, steps, penned = [], [], []
|
||||
for seed in range(args.n_seeds):
|
||||
env = HerdingEnv(n_sheep=n, max_steps=args.max_steps,
|
||||
difficulty=args.difficulty, seed=seed)
|
||||
difficulty=args.difficulty, seed=seed,
|
||||
frame_stack=frame_stack)
|
||||
r = rollout(env, predict, args.max_steps)
|
||||
successes.append(int(r["success"]))
|
||||
steps.append(r["steps"])
|
||||
|
||||
+104
-8
@@ -69,7 +69,10 @@ from herding.geometry import (
|
||||
SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS,
|
||||
WEBOTS_DT, is_penned_position,
|
||||
)
|
||||
from herding.lidar_perception import detections_from_scan
|
||||
from herding.lidar_sim import simulate_scan
|
||||
from herding.obs import OBS_DIM, build_obs
|
||||
from herding.sheep_tracker import SheepTracker
|
||||
from herding.strombom import compute_action as strombom_action
|
||||
|
||||
|
||||
@@ -130,11 +133,30 @@ class HerdingEnv(gym.Env):
|
||||
max_steps: int = DEFAULT_MAX_STEPS,
|
||||
difficulty: float = 0.0,
|
||||
seed: Optional[int] = None,
|
||||
use_lidar: bool = True,
|
||||
frame_stack: int = 1,
|
||||
):
|
||||
super().__init__()
|
||||
# When True (default), the obs and the imitation-reward teacher
|
||||
# see only LiDAR-perceived sheep positions through a tracker —
|
||||
# matching what the Webots controller has access to. When False,
|
||||
# both consume ground-truth positions (legacy "privileged" mode,
|
||||
# kept for ablation).
|
||||
self._use_lidar = bool(use_lidar)
|
||||
self._tracker = SheepTracker() if self._use_lidar else None
|
||||
self._np_rng_lidar: Optional[np.random.Generator] = None
|
||||
|
||||
# Frame stacking: the policy receives the last K single-frame
|
||||
# observations concatenated. Lets a memoryless MLP integrate
|
||||
# information across time, partly compensating for the limited
|
||||
# LiDAR FOV. K=1 reproduces the legacy single-frame obs.
|
||||
self._frame_stack = max(1, int(frame_stack))
|
||||
self._frame_buffer: list[np.ndarray] = []
|
||||
self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)
|
||||
self._single_obs_dim = OBS_DIM
|
||||
self.observation_space = spaces.Box(
|
||||
low=-np.inf, high=np.inf, shape=(OBS_DIM,), dtype=np.float32,
|
||||
low=-np.inf, high=np.inf,
|
||||
shape=(OBS_DIM * self._frame_stack,), dtype=np.float32,
|
||||
)
|
||||
|
||||
# If n_sheep is None, env will sample uniformly from [1, max_n_sheep]
|
||||
@@ -243,6 +265,16 @@ class HerdingEnv(gym.Env):
|
||||
self.prev_n_penned = 0
|
||||
self.prev_d_pen, self.prev_radius = self._flock_metrics()
|
||||
|
||||
if self._tracker is not None:
|
||||
self._tracker.reset()
|
||||
self._np_rng_lidar = np.random.default_rng(
|
||||
int(self.np_random.integers(0, 2**31 - 1)))
|
||||
# Prime the tracker with one scan so the first obs isn't empty.
|
||||
self._update_tracker()
|
||||
|
||||
# Clear the frame stack — the next _build_obs will repopulate.
|
||||
self._frame_buffer = []
|
||||
|
||||
obs = self._build_obs()
|
||||
info = {"n_sheep": self.n_sheep}
|
||||
return obs, info
|
||||
@@ -289,6 +321,12 @@ class HerdingEnv(gym.Env):
|
||||
and is_penned_position(self.sheep_x[i], self.sheep_y[i])):
|
||||
self.sheep_penned[i] = True
|
||||
|
||||
# --- Run LiDAR perception on this step's state (after sheep have
|
||||
# moved). Updates the tracker that obs and the imitation-
|
||||
# reward teacher consume. Reward / termination still use GT. ---
|
||||
if self._tracker is not None:
|
||||
self._update_tracker()
|
||||
|
||||
# --- Reward, termination ---
|
||||
d_pen, radius = self._flock_metrics()
|
||||
reward = self._compute_reward(d_pen, radius, action=action)
|
||||
@@ -395,10 +433,7 @@ class HerdingEnv(gym.Env):
|
||||
r = self.W_PEN_DELTA * delta_pen + self.W_PROGRESS * d_progress
|
||||
|
||||
if action is not None and self.W_IMITATE > 0.0:
|
||||
positions = {
|
||||
f"s{i}": (float(self.sheep_x[i]), float(self.sheep_y[i]))
|
||||
for i in range(self.n_sheep) if not self.sheep_penned[i]
|
||||
}
|
||||
positions = self._perceived_positions()
|
||||
if positions:
|
||||
sx, sy, _mode = strombom_action(
|
||||
(self.dog_x, self.dog_y), positions, PEN_ENTRY,
|
||||
@@ -411,11 +446,72 @@ class HerdingEnv(gym.Env):
|
||||
|
||||
return float(r)
|
||||
|
||||
def _build_obs(self) -> np.ndarray:
|
||||
sheep_xy_list = list(zip(self.sheep_x.tolist(), self.sheep_y.tolist()))
|
||||
sheep_penned_list = self.sheep_penned.tolist()
|
||||
def _build_single_obs(self) -> np.ndarray:
|
||||
if self._tracker is not None:
|
||||
# Obs sees only the tracker's active set; penned tracks are
|
||||
# intentionally excluded (matches the prior receiver-based
|
||||
# behaviour where penned sheep stopped contributing to the
|
||||
# symbolic obs).
|
||||
active = self._tracker.get_positions()
|
||||
sheep_xy_list = list(active.values())
|
||||
sheep_penned_list = [False] * len(sheep_xy_list)
|
||||
else:
|
||||
sheep_xy_list = list(zip(self.sheep_x.tolist(), self.sheep_y.tolist()))
|
||||
sheep_penned_list = self.sheep_penned.tolist()
|
||||
return build_obs(
|
||||
(self.dog_x, self.dog_y), self.dog_heading,
|
||||
sheep_xy_list, sheep_penned_list,
|
||||
n_max=self._max_n_sheep,
|
||||
)
|
||||
|
||||
def _build_obs(self) -> np.ndarray:
|
||||
single = self._build_single_obs()
|
||||
if self._frame_stack <= 1:
|
||||
return single
|
||||
# On a fresh reset the buffer is empty — duplicate the first
|
||||
# frame so the stack is always full-length.
|
||||
if not self._frame_buffer:
|
||||
self._frame_buffer = [single.copy() for _ in range(self._frame_stack)]
|
||||
else:
|
||||
self._frame_buffer.append(single)
|
||||
if len(self._frame_buffer) > self._frame_stack:
|
||||
self._frame_buffer = self._frame_buffer[-self._frame_stack:]
|
||||
# Concatenate oldest → newest.
|
||||
return np.concatenate(self._frame_buffer, axis=0).astype(np.float32)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# LiDAR perception helpers
|
||||
# ------------------------------------------------------------------
|
||||
def _all_sheep_xy(self) -> list[tuple[float, float]]:
|
||||
"""Every sheep, including penned ones (the LiDAR sees them)."""
|
||||
return [(float(self.sheep_x[i]), float(self.sheep_y[i]))
|
||||
for i in range(self.n_sheep)]
|
||||
|
||||
def _update_tracker(self) -> None:
|
||||
ranges = simulate_scan(
|
||||
self.dog_x, self.dog_y, self.dog_heading,
|
||||
self._all_sheep_xy(),
|
||||
rng=self._np_rng_lidar,
|
||||
)
|
||||
detections = detections_from_scan(
|
||||
ranges, self.dog_x, self.dog_y, self.dog_heading,
|
||||
)
|
||||
self._tracker.update(detections)
|
||||
|
||||
def perceived_positions(self) -> dict[str, tuple[float, float]]:
|
||||
"""Public accessor — what the controller would 'see' this step.
|
||||
|
||||
LiDAR mode → the tracker's active set.
|
||||
Privileged mode → ground-truth active sheep.
|
||||
|
||||
Used by ``training.eval`` and ``tools.collect_demos`` so analytic
|
||||
teachers run on the same perception the deployed controller has.
|
||||
"""
|
||||
if self._tracker is not None:
|
||||
return self._tracker.get_positions()
|
||||
return {f"s{i}": (float(self.sheep_x[i]), float(self.sheep_y[i]))
|
||||
for i in range(self.n_sheep) if not self.sheep_penned[i]}
|
||||
|
||||
# Internal alias so the imitation reward path doesn't need to know
|
||||
# which mode it's in.
|
||||
_perceived_positions = perceived_positions
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
+275
-206
@@ -1,31 +1,33 @@
|
||||
"""PPO trainer for the shepherd-dog policy — EXPERIMENTAL.
|
||||
"""KL-regularised PPO fine-tune of a behaviour-cloned policy.
|
||||
|
||||
The deliverable pipeline is `bc_pretrain.py` (see ``training/README.md``).
|
||||
This script is kept in the tree because it implements:
|
||||
The PPO-from-scratch and unregularised PPO-fine-tune-of-BC versions
|
||||
we tried earlier failed for the standard reasons (sparse pen reward,
|
||||
long horizons, exploration noise destroying BC weights). The fix is
|
||||
to anchor the policy to its BC initialisation with a KL penalty in
|
||||
the loss — the policy is free to refine the BC mean within a
|
||||
trust-region-like ball around the reference, and the dense-enough
|
||||
per-step reward signal does the rest.
|
||||
|
||||
* PPO from scratch with curriculum over flock size + spawn area, and
|
||||
* PPO fine-tune of a behavior-cloned policy.
|
||||
Pipeline
|
||||
--------
|
||||
1. Load ``bc_v3`` weights into both the trainable policy and a frozen
|
||||
reference ``ref_policy``.
|
||||
2. Initialise the policy's log_std to a small fixed value (≈ −1.5)
|
||||
and disable its gradient — exploration noise stays small so PPO
|
||||
updates don't blow up the BC mean before reward can stabilise.
|
||||
3. Override ``PPO.train()`` to add ``β · KL(π ‖ π_ref)`` to the loss
|
||||
each minibatch.
|
||||
4. Train for ~1–3 M timesteps with a low LR (5e-5).
|
||||
|
||||
Both ran into stability issues in our setting (long-horizon credit
|
||||
assignment for sparse pen reward, BC-degradation under PPO exploration
|
||||
noise). The abstractions are reusable for follow-up work — e.g.
|
||||
KL-regularised fine-tune with a frozen reference policy — so we leave
|
||||
the code in place.
|
||||
Output: ``runs/rl_v1/policy.zip`` — same SB3 format as bc_v3, loadable
|
||||
by the dog controller's ``HERDING_MODE=rl`` path.
|
||||
|
||||
Usage (PPO from scratch)::
|
||||
Usage::
|
||||
|
||||
python -m training.train_ppo \
|
||||
--config training/configs/ppo_default.yaml \
|
||||
--out-dir training/runs/ppo_scratch
|
||||
|
||||
Usage (PPO fine-tune of BC)::
|
||||
|
||||
python -m training.train_ppo \
|
||||
--resume training/runs/bc_flock/policy.zip \
|
||||
--out-dir training/runs/bc_ppo \
|
||||
--no-vecnorm --no-curriculum --imitate-weight 0 \
|
||||
--difficulty 1.0 --log-std -1.5 --learning-rate 5e-5 \
|
||||
--total-timesteps 3000000
|
||||
python -m training.train_ppo \\
|
||||
--bc training/runs/bc_v3 \\
|
||||
--out training/runs/rl_v1 \\
|
||||
--total-timesteps 2000000
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -35,8 +37,6 @@ import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
|
||||
if _PROJECT_ROOT not in sys.path:
|
||||
@@ -44,236 +44,305 @@ if _PROJECT_ROOT not in sys.path:
|
||||
|
||||
import numpy as np
|
||||
import torch as th
|
||||
import torch.nn.functional as F
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.common.callbacks import (
|
||||
BaseCallback, CheckpointCallback, EvalCallback,
|
||||
)
|
||||
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
|
||||
from stable_baselines3.common.monitor import Monitor
|
||||
from stable_baselines3.common.vec_env import (
|
||||
DummyVecEnv, SubprocVecEnv, VecNormalize,
|
||||
)
|
||||
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
|
||||
|
||||
from herding.obs import OBS_DIM
|
||||
from training.herding_env import HerdingEnv
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Env factories
|
||||
# --------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------
|
||||
# Env factory
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
def _make_env(rank: int, seed: int = 0):
|
||||
def _make_env(rank: int, seed: int, frame_stack: int):
|
||||
def _thunk():
|
||||
env = HerdingEnv(seed=seed + rank)
|
||||
env = HerdingEnv(seed=seed + rank, frame_stack=frame_stack)
|
||||
env = Monitor(env, info_keywords=("is_success", "n_sheep", "n_penned"))
|
||||
return env
|
||||
return _thunk
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Curriculum callback
|
||||
# --------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------
|
||||
# KL-regularised PPO
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
class CurriculumCallback(BaseCallback):
|
||||
"""Drive the env's flock-size + state-space difficulty curriculum.
|
||||
class KLPPO(PPO):
|
||||
"""PPO with an extra KL-to-reference penalty in the policy loss.
|
||||
|
||||
Schedule entries: {step, max_n_sheep, difficulty}. The largest entry
|
||||
whose step <= num_timesteps wins; both knobs update together.
|
||||
Subclasses SB3's PPO and overrides ``train()`` only to add a single
|
||||
line for the KL term — everything else (rollout buffer, clipped
|
||||
surrogate, value loss, entropy bonus) is unchanged.
|
||||
"""
|
||||
|
||||
def __init__(self, schedule, vec_envs, verbose: int = 0):
|
||||
super().__init__(verbose)
|
||||
self.schedule = sorted(schedule, key=lambda d: d["step"])
|
||||
# Accept a list of envs so the eval env tracks training difficulty.
|
||||
self.vec_envs = vec_envs if isinstance(vec_envs, (list, tuple)) else [vec_envs]
|
||||
self._last_n = None
|
||||
self._last_d = None
|
||||
def __init__(self, *args, ref_policy=None, kl_coef: float = 0.05, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# ref_policy is set after construction (caller can build it
|
||||
# from the BC checkpoint once `self.policy` exists).
|
||||
self.ref_policy = ref_policy
|
||||
if self.ref_policy is not None:
|
||||
self.ref_policy.set_training_mode(False)
|
||||
for p in self.ref_policy.parameters():
|
||||
p.requires_grad = False
|
||||
self.kl_coef = kl_coef
|
||||
|
||||
def _call(self, method, value):
|
||||
for v in self.vec_envs:
|
||||
try:
|
||||
v.env_method(method, value)
|
||||
except AttributeError:
|
||||
v.venv.env_method(method, value)
|
||||
def train(self) -> None:
|
||||
# Copied from stable_baselines3.ppo.PPO.train (v2.x), with the
|
||||
# KL-to-reference term added. Keeping the structure intact so
|
||||
# behavioural parity with stock PPO is obvious.
|
||||
self.policy.set_training_mode(True)
|
||||
self._update_learning_rate(self.policy.optimizer)
|
||||
clip_range = self.clip_range(self._current_progress_remaining)
|
||||
if self.clip_range_vf is not None:
|
||||
clip_range_vf = self.clip_range_vf(self._current_progress_remaining)
|
||||
|
||||
def _on_step(self) -> bool:
|
||||
t = self.num_timesteps
|
||||
n = self.schedule[0]["max_n_sheep"]
|
||||
d = self.schedule[0].get("difficulty", 1.0)
|
||||
for entry in self.schedule:
|
||||
if t >= entry["step"]:
|
||||
n = entry["max_n_sheep"]
|
||||
d = entry.get("difficulty", 1.0)
|
||||
if n != self._last_n:
|
||||
self._call("set_max_n_sheep", n)
|
||||
self._last_n = n
|
||||
if d != self._last_d:
|
||||
self._call("set_difficulty", d)
|
||||
self._last_d = d
|
||||
if self.verbose:
|
||||
print(f"[curriculum] t={t} → max_n_sheep={n} difficulty={d}")
|
||||
return True
|
||||
entropy_losses, pg_losses, value_losses, kl_losses = [], [], [], []
|
||||
clip_fractions = []
|
||||
continue_training = True
|
||||
|
||||
for epoch in range(self.n_epochs):
|
||||
approx_kl_divs = []
|
||||
for rollout_data in self.rollout_buffer.get(self.batch_size):
|
||||
actions = rollout_data.actions
|
||||
if isinstance(self.action_space, th.distributions.Categorical.__bases__):
|
||||
actions = rollout_data.actions.long().flatten()
|
||||
|
||||
values, log_prob, entropy = self.policy.evaluate_actions(
|
||||
rollout_data.observations, actions)
|
||||
values = values.flatten()
|
||||
advantages = rollout_data.advantages
|
||||
if self.normalize_advantage and len(advantages) > 1:
|
||||
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
|
||||
|
||||
ratio = th.exp(log_prob - rollout_data.old_log_prob)
|
||||
policy_loss_1 = advantages * ratio
|
||||
policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
|
||||
policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()
|
||||
pg_losses.append(policy_loss.item())
|
||||
clip_fraction = th.mean((th.abs(ratio - 1) > clip_range).float()).item()
|
||||
clip_fractions.append(clip_fraction)
|
||||
|
||||
if self.clip_range_vf is None:
|
||||
values_pred = values
|
||||
else:
|
||||
values_pred = rollout_data.old_values + th.clamp(
|
||||
values - rollout_data.old_values, -clip_range_vf, clip_range_vf)
|
||||
value_loss = F.mse_loss(rollout_data.returns, values_pred)
|
||||
value_losses.append(value_loss.item())
|
||||
|
||||
if entropy is None:
|
||||
entropy_loss = -th.mean(-log_prob)
|
||||
else:
|
||||
entropy_loss = -th.mean(entropy)
|
||||
entropy_losses.append(entropy_loss.item())
|
||||
|
||||
# --- KL-to-reference term ----------------------------
|
||||
# Both policies are diagonal Gaussian (ActorCriticPolicy).
|
||||
# KL(π ‖ π_ref) per-action-dim; sum over the action axis
|
||||
# to get total KL per sample, then mean over batch.
|
||||
# Computed on the rollout's observations so the penalty
|
||||
# reflects what the agent actually saw.
|
||||
if self.ref_policy is None:
|
||||
raise RuntimeError("KLPPO.train called without ref_policy")
|
||||
with th.no_grad():
|
||||
ref_dist = self.ref_policy.get_distribution(rollout_data.observations)
|
||||
pi_dist = self.policy.get_distribution(rollout_data.observations)
|
||||
kl_div = th.distributions.kl.kl_divergence(
|
||||
pi_dist.distribution, ref_dist.distribution).sum(dim=-1).mean()
|
||||
kl_losses.append(kl_div.item())
|
||||
# ----------------------------------------------------
|
||||
|
||||
loss = (policy_loss
|
||||
+ self.ent_coef * entropy_loss
|
||||
+ self.vf_coef * value_loss
|
||||
+ self.kl_coef * kl_div)
|
||||
|
||||
with th.no_grad():
|
||||
log_ratio = log_prob - rollout_data.old_log_prob
|
||||
approx_kl_div = th.mean((th.exp(log_ratio) - 1) - log_ratio).cpu().numpy()
|
||||
approx_kl_divs.append(approx_kl_div)
|
||||
|
||||
if self.target_kl is not None and approx_kl_div > 1.5 * self.target_kl:
|
||||
continue_training = False
|
||||
if self.verbose >= 1:
|
||||
print(f"Early stopping at step {epoch} due to reaching max kl: {approx_kl_div:.2f}")
|
||||
break
|
||||
|
||||
self.policy.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
|
||||
self.policy.optimizer.step()
|
||||
|
||||
self._n_updates += 1
|
||||
if not continue_training:
|
||||
break
|
||||
|
||||
explained_var = self._explained_variance()
|
||||
self.logger.record("train/entropy_loss", float(np.mean(entropy_losses)))
|
||||
self.logger.record("train/policy_gradient_loss", float(np.mean(pg_losses)))
|
||||
self.logger.record("train/value_loss", float(np.mean(value_losses)))
|
||||
self.logger.record("train/kl_to_reference", float(np.mean(kl_losses)))
|
||||
self.logger.record("train/approx_kl", float(np.mean(approx_kl_divs)))
|
||||
self.logger.record("train/clip_fraction", float(np.mean(clip_fractions)))
|
||||
self.logger.record("train/explained_variance", float(explained_var))
|
||||
if hasattr(self.policy, "log_std"):
|
||||
self.logger.record("train/std", th.exp(self.policy.log_std).mean().item())
|
||||
|
||||
def _explained_variance(self) -> float:
|
||||
# SB3 doesn't expose this as a method; replicate the computation.
|
||||
y_pred = self.rollout_buffer.values.flatten()
|
||||
y_true = self.rollout_buffer.returns.flatten()
|
||||
var_y = np.var(y_true)
|
||||
return float("nan") if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------
|
||||
# Main
|
||||
# --------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
def main():
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--config", default=os.path.join(_HERE, "configs", "ppo_default.yaml"))
|
||||
parser.add_argument("--out-dir", default=os.path.join(_HERE, "runs", "latest"))
|
||||
parser.add_argument("--n-envs", type=int, default=None,
|
||||
help="Override config n_envs.")
|
||||
parser.add_argument("--total-timesteps", type=int, default=None,
|
||||
help="Override config total_timesteps.")
|
||||
parser.add_argument("--bc", default="training/runs/bc_v3",
|
||||
help="Directory containing the BC initialisation (policy.zip).")
|
||||
parser.add_argument("--out", default="training/runs/rl_v1",
|
||||
help="Where to save the fine-tuned policy.")
|
||||
parser.add_argument("--total-timesteps", type=int, default=2_000_000)
|
||||
parser.add_argument("--n-envs", type=int, default=8)
|
||||
parser.add_argument("--learning-rate", type=float, default=5e-5,
|
||||
help="Low LR keeps PPO close to the BC mean.")
|
||||
parser.add_argument("--kl-coef", type=float, default=0.05,
|
||||
help="KL-to-reference penalty coefficient.")
|
||||
parser.add_argument("--log-std", type=float, default=-1.5,
|
||||
help="Initial (and frozen) log_std. σ ≈ exp(-1.5) ≈ 0.22.")
|
||||
parser.add_argument("--freeze-log-std", action="store_true", default=True,
|
||||
help="Keep log_std fixed; only the policy mean updates.")
|
||||
parser.add_argument("--n-steps", type=int, default=2048,
|
||||
help="Steps per rollout per env.")
|
||||
parser.add_argument("--batch-size", type=int, default=256)
|
||||
parser.add_argument("--n-epochs", type=int, default=10)
|
||||
parser.add_argument("--gamma", type=float, default=0.995)
|
||||
parser.add_argument("--gae-lambda", type=float, default=0.95)
|
||||
parser.add_argument("--clip-range", type=float, default=0.1,
|
||||
help="Tight clip range — keep updates conservative.")
|
||||
parser.add_argument("--ent-coef", type=float, default=0.0)
|
||||
parser.add_argument("--target-kl", type=float, default=0.02,
|
||||
help="SB3's per-batch KL early stop; safety belt.")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--resume", type=str, default=None,
|
||||
help="Path to a SB3 zip to resume from.")
|
||||
# SB3 recommends CPU for MlpPolicy — GPU helps CNN policies, not MLPs
|
||||
# of this size. Override with --device cuda if you really want it.
|
||||
parser.add_argument("--device", default="cpu")
|
||||
parser.add_argument("--no-vecnorm", action="store_true",
|
||||
help="Disable VecNormalize wrapper. Required when "
|
||||
"resuming from a BC-pretrained policy that "
|
||||
"wasn't trained under it.")
|
||||
parser.add_argument("--no-curriculum", action="store_true",
|
||||
help="Skip curriculum callback (resumed policy is "
|
||||
"already competent across the distribution).")
|
||||
parser.add_argument("--imitate-weight", type=float, default=None,
|
||||
help="Override env W_IMITATE. Set to 0 to disable "
|
||||
"Strömbom imitation reward.")
|
||||
parser.add_argument("--difficulty", type=float, default=None,
|
||||
help="Override env difficulty (0=easy, 1=hard). "
|
||||
"Used in BC fine-tune to skip easy curriculum.")
|
||||
parser.add_argument("--log-std", type=float, default=None,
|
||||
help="Override the policy's log_std after load. "
|
||||
"BC trained with std≈1.6 (log_std=0.5) which "
|
||||
"is too noisy for fine-tune. Use -1.5 (std≈0.22) "
|
||||
"to keep PPO close to the BC mean while still "
|
||||
"exploring locally.")
|
||||
parser.add_argument("--learning-rate", type=float, default=None,
|
||||
help="Override config learning rate. For BC "
|
||||
"fine-tune, 5e-5 is much safer than the 3e-4 "
|
||||
"default.")
|
||||
args = parser.parse_args()
|
||||
|
||||
with open(args.config) as f:
|
||||
cfg = yaml.safe_load(f)
|
||||
bc_zip = Path(args.bc) / "policy.zip"
|
||||
if not bc_zip.exists():
|
||||
raise SystemExit(
|
||||
f"BC checkpoint not found at {bc_zip}. Train bc_v3 first with "
|
||||
f"`python -m training.bc_pretrain`."
|
||||
)
|
||||
|
||||
n_envs = args.n_envs or cfg["n_envs"]
|
||||
total_timesteps = args.total_timesteps or cfg["total_timesteps"]
|
||||
|
||||
out = Path(args.out_dir)
|
||||
out = Path(args.out)
|
||||
out.mkdir(parents=True, exist_ok=True)
|
||||
(out / "checkpoints").mkdir(exist_ok=True)
|
||||
(out / "best").mkdir(exist_ok=True)
|
||||
(out / "evals").mkdir(exist_ok=True)
|
||||
|
||||
print(f"[train] out={out} n_envs={n_envs} total={total_timesteps} device={args.device}")
|
||||
# --- Inspect BC obs dim → infer frame_stack ---
|
||||
ref_only = PPO.load(str(bc_zip), device=args.device)
|
||||
obs_dim = int(ref_only.observation_space.shape[0])
|
||||
if obs_dim % OBS_DIM != 0:
|
||||
raise SystemExit(f"BC obs dim {obs_dim} is not a multiple of {OBS_DIM}.")
|
||||
frame_stack = obs_dim // OBS_DIM
|
||||
print(f"[rl] BC obs dim {obs_dim} → frame_stack={frame_stack}")
|
||||
|
||||
# --- Train env (vectorised, optionally normalised) ---
|
||||
env_fns = [_make_env(i, seed=args.seed) for i in range(n_envs)]
|
||||
venv = SubprocVecEnv(env_fns) if n_envs > 1 else DummyVecEnv(env_fns)
|
||||
eval_venv = DummyVecEnv([_make_env(99, seed=args.seed + 999)])
|
||||
if not args.no_vecnorm:
|
||||
venv = VecNormalize(venv, norm_obs=True, norm_reward=False, clip_obs=10.0)
|
||||
eval_venv = VecNormalize(eval_venv, norm_obs=True, norm_reward=False,
|
||||
clip_obs=10.0, training=False)
|
||||
eval_venv.obs_rms = venv.obs_rms
|
||||
else:
|
||||
print("[train] VecNormalize disabled (resumed policy was trained without it).")
|
||||
# --- Vectorised envs (match BC obs space) ---
|
||||
env_fns = [_make_env(i, args.seed, frame_stack) for i in range(args.n_envs)]
|
||||
venv = SubprocVecEnv(env_fns) if args.n_envs > 1 else DummyVecEnv(env_fns)
|
||||
eval_venv = DummyVecEnv([_make_env(99, args.seed + 999, frame_stack)])
|
||||
|
||||
# Apply env-level overrides (used by BC fine-tune to disable Strömbom
|
||||
# imitation and start at full deployment difficulty).
|
||||
def _env_call(method, value):
|
||||
for v in (venv, eval_venv):
|
||||
try:
|
||||
v.env_method(method, value)
|
||||
except AttributeError:
|
||||
v.venv.env_method(method, value)
|
||||
|
||||
if args.imitate_weight is not None:
|
||||
_env_call("set_imitate_weight", args.imitate_weight)
|
||||
print(f"[train] W_IMITATE overridden to {args.imitate_weight}")
|
||||
if args.difficulty is not None:
|
||||
_env_call("set_difficulty", args.difficulty)
|
||||
print(f"[train] difficulty pinned to {args.difficulty}")
|
||||
|
||||
# --- Model ---
|
||||
policy_kwargs = dict(
|
||||
net_arch=dict(pi=cfg["net_arch_pi"], vf=cfg["net_arch_vf"]),
|
||||
log_std_init=cfg.get("log_std_init", 0.0),
|
||||
# --- Trainable policy: load BC weights, then bolt onto PPO ---
|
||||
# Trick: instantiate a PPO with the right env (so the policy
|
||||
# network is constructed at the correct obs/action shape), then
|
||||
# copy BC weights into it.
|
||||
model = KLPPO(
|
||||
"MlpPolicy", venv,
|
||||
ref_policy=None, # filled in below
|
||||
kl_coef=args.kl_coef,
|
||||
learning_rate=args.learning_rate,
|
||||
n_steps=args.n_steps,
|
||||
batch_size=args.batch_size,
|
||||
n_epochs=args.n_epochs,
|
||||
gamma=args.gamma,
|
||||
gae_lambda=args.gae_lambda,
|
||||
clip_range=args.clip_range,
|
||||
ent_coef=args.ent_coef,
|
||||
target_kl=args.target_kl,
|
||||
policy_kwargs=dict(
|
||||
net_arch=dict(pi=[512, 512], vf=[512, 512]),
|
||||
log_std_init=args.log_std,
|
||||
),
|
||||
verbose=1,
|
||||
seed=args.seed,
|
||||
device=args.device,
|
||||
tensorboard_log=str(out / "tb"),
|
||||
)
|
||||
|
||||
if args.resume:
|
||||
print(f"[train] resuming from {args.resume}")
|
||||
custom_objects = {}
|
||||
if args.learning_rate is not None:
|
||||
custom_objects["learning_rate"] = args.learning_rate
|
||||
model = PPO.load(args.resume, env=venv, device=args.device,
|
||||
tensorboard_log=str(out / "tb"),
|
||||
custom_objects=custom_objects or None)
|
||||
if args.log_std is not None:
|
||||
import torch as _th
|
||||
with _th.no_grad():
|
||||
model.policy.log_std.fill_(args.log_std)
|
||||
print(f"[train] log_std overridden to {args.log_std} "
|
||||
f"(std≈{2.71828 ** args.log_std:.2f})")
|
||||
if args.learning_rate is not None:
|
||||
print(f"[train] learning_rate overridden to {args.learning_rate}")
|
||||
else:
|
||||
model = PPO(
|
||||
cfg["policy"], venv,
|
||||
learning_rate=cfg["learning_rate"],
|
||||
n_steps=cfg["n_steps"],
|
||||
batch_size=cfg["batch_size"],
|
||||
n_epochs=cfg["n_epochs"],
|
||||
gamma=cfg["gamma"],
|
||||
gae_lambda=cfg["gae_lambda"],
|
||||
clip_range=cfg["clip_range"],
|
||||
ent_coef=cfg["ent_coef"],
|
||||
vf_coef=cfg["vf_coef"],
|
||||
max_grad_norm=cfg["max_grad_norm"],
|
||||
target_kl=cfg.get("target_kl"),
|
||||
policy_kwargs=policy_kwargs,
|
||||
tensorboard_log=str(out / "tb"),
|
||||
seed=args.seed,
|
||||
device=args.device,
|
||||
verbose=1,
|
||||
)
|
||||
# --- Load BC weights into both `model.policy` and `ref_policy` ---
|
||||
bc_state = ref_only.policy.state_dict()
|
||||
# Strict=False because the value head may not have been trained in
|
||||
# BC — that's fine, PPO will train it from scratch.
|
||||
missing, unexpected = model.policy.load_state_dict(bc_state, strict=False)
|
||||
print(f"[rl] BC → policy: missing={len(missing)} unexpected={len(unexpected)}")
|
||||
|
||||
# Build a separate reference policy with identical architecture and
|
||||
# the BC weights, frozen.
|
||||
ref_policy = type(model.policy)(
|
||||
observation_space=model.observation_space,
|
||||
action_space=model.action_space,
|
||||
lr_schedule=lambda _: 0.0,
|
||||
net_arch=dict(pi=[512, 512], vf=[512, 512]),
|
||||
log_std_init=args.log_std,
|
||||
).to(args.device)
|
||||
ref_policy.load_state_dict(bc_state, strict=False)
|
||||
model.ref_policy = ref_policy
|
||||
model.ref_policy.set_training_mode(False)
|
||||
for p in model.ref_policy.parameters():
|
||||
p.requires_grad = False
|
||||
|
||||
# Align both policies' log_std. BC was trained with log_std≈0.5
|
||||
# (σ≈1.65), which would make the KL term huge from a std mismatch
|
||||
# rather than the mean drift we actually care about. Force both to
|
||||
# the same small value so KL measures only how far the policy mean
|
||||
# has drifted from the BC mean.
|
||||
with th.no_grad():
|
||||
model.policy.log_std.fill_(args.log_std)
|
||||
model.ref_policy.log_std.fill_(args.log_std)
|
||||
if args.freeze_log_std:
|
||||
model.policy.log_std.requires_grad = False
|
||||
print(f"[rl] log_std frozen at {args.log_std} (σ ≈ {np.exp(args.log_std):.3f})")
|
||||
|
||||
# --- Callbacks ---
|
||||
ckpt_cb = CheckpointCallback(
|
||||
save_freq=max(1, cfg["checkpoint_freq"] // n_envs),
|
||||
save_path=str(out / "checkpoints"), name_prefix="ppo",
|
||||
save_vecnormalize=True,
|
||||
save_freq=max(1, 50_000 // args.n_envs),
|
||||
save_path=str(out / "checkpoints"),
|
||||
name_prefix="ppo",
|
||||
)
|
||||
eval_cb = EvalCallback(
|
||||
eval_venv,
|
||||
best_model_save_path=str(out / "best"),
|
||||
log_path=str(out / "evals"),
|
||||
eval_freq=max(1, cfg["eval_freq"] // n_envs),
|
||||
n_eval_episodes=cfg["n_eval_episodes"],
|
||||
eval_freq=max(1, 20_000 // args.n_envs),
|
||||
n_eval_episodes=5,
|
||||
deterministic=True,
|
||||
)
|
||||
callbacks = [ckpt_cb, eval_cb]
|
||||
if not args.no_curriculum and "curriculum" in cfg and cfg["curriculum"]:
|
||||
callbacks.append(CurriculumCallback(
|
||||
cfg["curriculum"], [venv, eval_venv], verbose=1,
|
||||
))
|
||||
elif args.no_curriculum:
|
||||
print("[train] curriculum disabled — env knobs left at their current values.")
|
||||
|
||||
# --- Train ---
|
||||
model.learn(total_timesteps=total_timesteps, callback=callbacks,
|
||||
progress_bar=True)
|
||||
print(f"[rl] training: total_timesteps={args.total_timesteps} "
|
||||
f"n_envs={args.n_envs} lr={args.learning_rate} kl_coef={args.kl_coef}")
|
||||
model.learn(total_timesteps=args.total_timesteps,
|
||||
callback=[ckpt_cb, eval_cb], progress_bar=True)
|
||||
|
||||
# --- Save final model + VecNormalize stats ---
|
||||
model.save(out / "final.zip")
|
||||
venv.save(str(out / "vecnormalize.pkl"))
|
||||
# The EvalCallback already wrote best_model.zip into out/best/ — drop the
|
||||
# VecNormalize stats next to it for the controller to pick up.
|
||||
venv.save(str(out / "best" / "vecnormalize.pkl"))
|
||||
print(f"[train] done. saved to {out}")
|
||||
# --- Save final checkpoint in the SB3 zip the controller expects ---
|
||||
model.save(out / "policy.zip")
|
||||
print(f"[rl] saved fine-tuned policy → {out/'policy.zip'}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
Webots Project File version R2025a
|
||||
perspectives: 000000ff00000000fd00000002000000010000011c00000405fc0200000001fb0000001400540065007800740045006400690074006f00720100000000000004050000003f00ffffff00000003000007c500000092fc0100000001fb0000001a0043006f006e0073006f006c00650041006c006c0041006c006c0100000000000007c50000006900ffffff000006a70000040500000001000000020000000100000008fc00000000
|
||||
simulationViewPerspectives: 000000ff000000010000000200000100000003a80100000002010000000100
|
||||
sceneTreePerspectives: 000000ff00000001000000030000001f0000018b000000fa0100000002010000000200
|
||||
maximizedDockId: -1
|
||||
centralWidgetVisible: 1
|
||||
orthographicViewHeight: 1
|
||||
textFiles: -1
|
||||
consoles: Console:All:All
|
||||
Reference in New Issue
Block a user