Checkpoint 6
This commit is contained in:
@@ -12,7 +12,7 @@ gate into an external pen. The dog has three deployable modes:
|
|||||||
| `rl` | KL-regularised PPO fine-tune of `bc` | Reward-driven refinement |
|
| `rl` | KL-regularised PPO fine-tune of `bc` | Reward-driven refinement |
|
||||||
|
|
||||||
`sequential` (single-target pin-and-push) is kept as an alternative
|
`sequential` (single-target pin-and-push) is kept as an alternative
|
||||||
analytic baseline. `dagger` is a data-collection mode, not deployment.
|
analytic baseline.
|
||||||
|
|
||||||
## Perception
|
## Perception
|
||||||
|
|
||||||
@@ -28,13 +28,13 @@ control step:
|
|||||||
(`herding/sheep_tracker.py`).
|
(`herding/sheep_tracker.py`).
|
||||||
|
|
||||||
**LiDAR validation** (intermediate-goal item v from `docs/project.md`):
|
**LiDAR validation** (intermediate-goal item v from `docs/project.md`):
|
||||||
run the dog controller in `HERDING_MODE=diag` mode to capture 80
|
during development a diagnostic-dump controller captured 80 real
|
||||||
real Webots scans plus the ground-truth sheep positions in
|
Webots scans plus the ground-truth sheep positions. Comparing
|
||||||
`training/dagger/diag_<ts>.npz`. Comparing detections against GT in
|
detections against GT showed clustered centroids match GT positions
|
||||||
that file showed clustered centroids match GT positions within 0.15 m
|
within 0.15 m after the +SHEEP_RADIUS surface-to-centre correction —
|
||||||
after the +SHEEP_RADIUS surface-to-centre correction — i.e. the
|
i.e. the LiDAR pipeline produces correct sheep-position estimates
|
||||||
LiDAR pipeline produces correct sheep-position estimates from the
|
from the real Webots scan, validating the sensor for the herding
|
||||||
real Webots scan, validating the sensor for the herding task.
|
task.
|
||||||
|
|
||||||
The tracker outputs a `{name: (x, y)}` dict shaped exactly like the
|
The tracker outputs a `{name: (x, y)}` dict shaped exactly like the
|
||||||
prior receiver-based one, so Strömbom, Sequential, and the BC obs
|
prior receiver-based one, so Strömbom, Sequential, and the BC obs
|
||||||
@@ -53,7 +53,7 @@ Privileged ground-truth perception is available for ablation —
|
|||||||
pip install -r training/requirements.txt
|
pip install -r training/requirements.txt
|
||||||
|
|
||||||
# 2. Smoke test
|
# 2. Smoke test
|
||||||
python -m training.parity_test
|
python -m tests.parity_test
|
||||||
|
|
||||||
# 3. Reproduce the BC policy (~10 min on CPU: ~5 min demos + ~3 min BC)
|
# 3. Reproduce the BC policy (~10 min on CPU: ~5 min demos + ~3 min BC)
|
||||||
python -m tools.collect_demos --teacher strombom \
|
python -m tools.collect_demos --teacher strombom \
|
||||||
@@ -61,21 +61,17 @@ python -m tools.collect_demos --teacher strombom \
|
|||||||
python -m training.bc_pretrain --demos training/demos.npz \
|
python -m training.bc_pretrain --demos training/demos.npz \
|
||||||
--out training/runs/bc --epochs 60 --net-arch 512,512
|
--out training/runs/bc --epochs 60 --net-arch 512,512
|
||||||
|
|
||||||
# 4. Optional: DAgger from inside Webots if sim-trained doesn't transfer
|
# 4. KL-PPO fine-tune of the BC policy (~30 min on CPU, 1 M steps)
|
||||||
tools/auto_dagger.sh 3 60
|
|
||||||
python -m tools.dagger_merge_train --out training/runs/bc_dagger
|
|
||||||
|
|
||||||
# 5. Evaluate (env)
|
|
||||||
python -m training.eval --policy training/runs/bc \
|
|
||||||
--max-flock 10 --max-steps 8000 --n-seeds 5
|
|
||||||
|
|
||||||
# 6. Optional RL fine-tune of the BC policy (~40 min on CPU, 1 M steps)
|
|
||||||
python -m training.train_ppo \
|
python -m training.train_ppo \
|
||||||
--bc training/runs/bc \
|
--bc training/runs/bc \
|
||||||
--out training/runs/rl \
|
--out training/runs/rl \
|
||||||
--total-timesteps 1000000
|
--total-timesteps 1000000
|
||||||
|
|
||||||
# 7. Run in Webots
|
# 5. Evaluate (env)
|
||||||
|
python -m training.eval --policy training/runs/rl \
|
||||||
|
--max-flock 10 --max-steps 15000 --n-seeds 10
|
||||||
|
|
||||||
|
# 6. Run in Webots
|
||||||
tools/run_webots.sh 10 bc # behaviour-cloned MLP
|
tools/run_webots.sh 10 bc # behaviour-cloned MLP
|
||||||
tools/run_webots.sh 10 rl # KL-PPO fine-tune
|
tools/run_webots.sh 10 rl # KL-PPO fine-tune
|
||||||
tools/run_webots.sh 10 strombom # analytic baseline
|
tools/run_webots.sh 10 strombom # analytic baseline
|
||||||
@@ -84,22 +80,25 @@ tools/run_webots.sh 10 strombom # analytic baseline
|
|||||||
## Layout
|
## Layout
|
||||||
|
|
||||||
```
|
```
|
||||||
herding/ — single source of truth (env + Webots both import)
|
herding/ — perception / control / world primitives
|
||||||
geometry.py — field/pen constants, robot specs
|
|
||||||
flocking_sim.py — Reynolds-style sheep dynamics
|
|
||||||
diffdrive.py — differential-drive kinematics
|
|
||||||
control.py — shared near-sheep speed-modulation helper
|
|
||||||
obs.py — 32-D order-invariant observation builder
|
obs.py — 32-D order-invariant observation builder
|
||||||
strombom.py — canonical CoM-drive teacher
|
world/ — environment-side physics & geometry
|
||||||
sequential.py — single-target "pin-and-push" teacher
|
geometry.py field/pen constants, robot specs
|
||||||
active_scan.py — wraps a base teacher with opening rotation +
|
diffdrive.py differential-drive kinematics
|
||||||
walk-to-centre + speed modulation
|
flocking_sim.py Reynolds + Strömbom 2014 sheep dynamics
|
||||||
lidar_sim.py — fast 2D raycast for the env (sheep + walls + posts)
|
perception/ — LiDAR → tracked-sheep pipeline
|
||||||
lidar_perception.py — scan → world-frame cluster centroids + filters
|
lidar_sim.py fast 2D raycast for the env
|
||||||
sheep_tracker.py — multi-target NN tracker with FOV memory
|
lidar_perception.py scan → world-frame cluster centroids + filters
|
||||||
|
sheep_tracker.py multi-target NN tracker with FOV memory
|
||||||
|
control/ — every dog mode's action source
|
||||||
|
strombom.py canonical CoM collect/drive heuristic
|
||||||
|
sequential.py single-target "pin-and-push" alternative
|
||||||
|
active_scan.py wraps a base teacher with opening rotation +
|
||||||
|
walk-to-centre fallback
|
||||||
|
modulation.py shared near-sheep speed-modulation helper
|
||||||
|
|
||||||
controllers/
|
controllers/
|
||||||
sheep/sheep.py — Webots sheep controller (uses herding.flocking_sim)
|
sheep/sheep.py — Webots sheep controller (uses herding.world.flocking_sim)
|
||||||
shepherd_dog/
|
shepherd_dog/
|
||||||
shepherd_dog.py — Webots dog controller, mode-switched
|
shepherd_dog.py — Webots dog controller, mode-switched
|
||||||
policy_loader.py — lazy SB3 policy loader (auto-detects frame stack)
|
policy_loader.py — lazy SB3 policy loader (auto-detects frame stack)
|
||||||
@@ -107,16 +106,17 @@ controllers/
|
|||||||
training/
|
training/
|
||||||
herding_env.py — Gymnasium env (LiDAR + tracker by default)
|
herding_env.py — Gymnasium env (LiDAR + tracker by default)
|
||||||
bc_pretrain.py — supervised BC of (obs, action) demos into MLP
|
bc_pretrain.py — supervised BC of (obs, action) demos into MLP
|
||||||
eval.py — analytic + BC policy comparison harness
|
train_ppo.py — KL-regularised PPO fine-tune of BC
|
||||||
parity_test.py — shape / determinism smoke test
|
eval.py — analytic + learned policy comparison harness
|
||||||
runs/ — checkpoints (whitelisted in .gitignore)
|
runs/ — checkpoints (whitelisted in .gitignore)
|
||||||
requirements.txt
|
requirements.txt
|
||||||
|
|
||||||
|
tests/
|
||||||
|
parity_test.py — shape / determinism / baseline smoke test
|
||||||
|
|
||||||
tools/
|
tools/
|
||||||
collect_demos.py — sim demos via the active-scan teacher
|
collect_demos.py — sim demos via the active-scan teacher
|
||||||
dagger_merge_train.py — merge Webots-collected DAgger demos and retrain
|
|
||||||
run_webots.sh — launch Webots with N sheep + chosen mode
|
run_webots.sh — launch Webots with N sheep + chosen mode
|
||||||
auto_dagger.sh — headless DAgger collection across many runs
|
|
||||||
|
|
||||||
worlds/
|
worlds/
|
||||||
field.wbt — main world (3 m gate, external pen)
|
field.wbt — main world (3 m gate, external pen)
|
||||||
@@ -127,8 +127,8 @@ docs/project.md — original project goals
|
|||||||
|
|
||||||
## Shared low-level control
|
## Shared low-level control
|
||||||
|
|
||||||
Every dog mode (RL, Strömbom, Sequential, the DAgger teacher) routes
|
Every dog mode (Strömbom, Sequential, BC, RL) routes its action
|
||||||
its action through `herding/control.py:modulate_speed_near_sheep`,
|
through `herding/control/modulation.py:modulate_speed_near_sheep`,
|
||||||
which scales action magnitude down when within ~2.5 m of the nearest
|
which scales action magnitude down when within ~2.5 m of the nearest
|
||||||
tracked sheep. This stops the dog from charging in at full speed and
|
tracked sheep. This stops the dog from charging in at full speed and
|
||||||
scattering the flock. Direction (intent) is preserved.
|
scattering the flock. Direction (intent) is preserved.
|
||||||
|
|||||||
@@ -11,14 +11,14 @@ _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
|
|||||||
if _PROJECT_ROOT not in sys.path:
|
if _PROJECT_ROOT not in sys.path:
|
||||||
sys.path.insert(0, _PROJECT_ROOT)
|
sys.path.insert(0, _PROJECT_ROOT)
|
||||||
|
|
||||||
from herding.flocking_sim import ( # noqa: F401
|
from herding.world.flocking_sim import ( # noqa: F401
|
||||||
MAX_SPEED, FLEE_SPEED, WANDER_SPEED,
|
MAX_SPEED, FLEE_SPEED, WANDER_SPEED,
|
||||||
WALL_MARGIN, WALL_HARD_MARGIN, WALL_HARD_GAIN,
|
WALL_MARGIN, WALL_HARD_MARGIN, WALL_HARD_GAIN,
|
||||||
FLEE_DIST, SEPARATION_DIST, COHESION_DIST,
|
FLEE_DIST, SEPARATION_DIST, COHESION_DIST,
|
||||||
PEN_MARGIN,
|
PEN_MARGIN,
|
||||||
compute_heading_speed,
|
compute_heading_speed,
|
||||||
)
|
)
|
||||||
from herding.geometry import ( # noqa: F401
|
from herding.world.geometry import ( # noqa: F401
|
||||||
FIELD_X, FIELD_Y, PEN_X, PEN_Y,
|
FIELD_X, FIELD_Y, PEN_X, PEN_Y,
|
||||||
in_pen,
|
in_pen,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -24,9 +24,9 @@ if _PROJECT_ROOT not in sys.path:
|
|||||||
|
|
||||||
from controller import Supervisor
|
from controller import Supervisor
|
||||||
|
|
||||||
from herding.diffdrive import heading_speed_to_wheels
|
from herding.world.diffdrive import heading_speed_to_wheels
|
||||||
from herding.flocking_sim import MAX_SPEED, compute_heading_speed
|
from herding.world.flocking_sim import MAX_SPEED, compute_heading_speed
|
||||||
from herding.geometry import (
|
from herding.world.geometry import (
|
||||||
SHEEP_MAX_WHEEL_OMEGA,
|
SHEEP_MAX_WHEEL_OMEGA,
|
||||||
is_penned_position,
|
is_penned_position,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -4,52 +4,39 @@ Mode is selected by ``HERDING_MODE`` (env var, or via the
|
|||||||
``herding_runtime.cfg`` file the launcher writes since Webots strips
|
``herding_runtime.cfg`` file the launcher writes since Webots strips
|
||||||
env vars on some setups):
|
env vars on some setups):
|
||||||
|
|
||||||
strombom → canonical Strömbom collect/drive heuristic.
|
strombom → canonical Strömbom (2014) collect/drive heuristic
|
||||||
sequential → single-target "pin and push" — drives the sheep
|
wrapped in ActiveScanTeacher (opening rotation +
|
||||||
closest to the pen.
|
walk-to-centre when the tracker briefly empties).
|
||||||
bc → behaviour-cloned MLP, trained on Strömbom demos via
|
sequential → single-target "pin-and-push", same wrapper.
|
||||||
sim. Default policy directory: training/runs/bc.
|
bc → behaviour-cloned MLP, trained on Strömbom demos.
|
||||||
rl → KL-regularised PPO fine-tune of the BC policy. Same
|
Default policy: training/runs/bc/policy.zip.
|
||||||
obs/action space as bc; refines time-to-pen via
|
rl → KL-regularised PPO fine-tune of bc. Same obs/action
|
||||||
environment reward while staying anchored to bc.
|
space as bc; refines time-to-pen via reward while
|
||||||
Default policy directory: training/runs/rl.
|
staying anchored to bc.
|
||||||
dagger → DAgger data collection. Reads sheep ground-truth
|
Default policy: training/runs/rl/policy.zip.
|
||||||
via the receiver, computes the active-scan teacher's
|
|
||||||
recommended action at every step, drives with either
|
|
||||||
the teacher (HERDING_DAGGER_DRIVER=teacher, default)
|
|
||||||
or the loaded student (=student), and logs each
|
|
||||||
(lidar_stacked_obs, teacher_action) pair. On exit
|
|
||||||
dumps to ``training/dagger/dagger_<ts>.npz`` for
|
|
||||||
``tools.dagger_merge_train`` to consume.
|
|
||||||
|
|
||||||
Sheep perception
|
Sheep perception
|
||||||
----------------
|
----------------
|
||||||
The dog now perceives sheep through its **front-mounted 140° LiDAR**
|
The dog perceives sheep through its **front-mounted 140° LiDAR**
|
||||||
(``protos/ShepherdDog.proto``: 180 rays, 12 m max range). Each step
|
(``protos/ShepherdDog.proto``: 180 rays, 12 m max range). Each step:
|
||||||
the controller:
|
|
||||||
|
|
||||||
1. Reads ``lidar.getRangeImage()``.
|
1. Reads ``lidar.getRangeImage()``.
|
||||||
2. Runs ``herding.lidar_perception.detections_from_scan`` to cluster
|
2. Runs ``herding.perception.lidar_perception.detections_from_scan``
|
||||||
returns into world-frame ``(x, y)`` sheep estimates.
|
to cluster returns into world-frame ``(x, y)`` sheep estimates.
|
||||||
3. Folds those into a ``herding.sheep_tracker.SheepTracker`` which
|
3. Folds those into a ``SheepTracker`` which maintains last-seen
|
||||||
maintains last-seen positions for sheep currently out of the
|
positions for sheep currently out of FOV and latches "penned"
|
||||||
FOV and latches "penned" once a track disappears near the gate.
|
once a track crosses the gate plane heading south.
|
||||||
|
|
||||||
The output of step 3 is a ``{name: (x, y)}`` dict shaped exactly like
|
Sheep ``emitter`` messages are read **for diagnostic logging only**
|
||||||
the receiver-based one we used to consume — so Strömbom, Sequential
|
(GT_penned counter + auto-finish sentinel); they are never used to
|
||||||
and the BC obs builder run unchanged. The sheep→dog Emitter/Receiver
|
drive the policy. Perception for control comes entirely from LiDAR.
|
||||||
link is still up (kept passively for compatibility) but its messages
|
|
||||||
are *not* used for control.
|
|
||||||
|
|
||||||
All modes share the same low-level differential-drive controller
|
Auto-finish
|
||||||
(``herding.diffdrive.velocity_to_wheels`` with cos(err)-clamped forward
|
-----------
|
||||||
speed), so switching modes does not retune actuation.
|
When the dog observes (via GT, read off the receiver) that all sheep
|
||||||
|
are penned, it writes ``training/.run_done`` and the launcher
|
||||||
A safety supervisor enforces the "dog stays out of the pen" invariant:
|
(``tools/run_webots.sh``) detects it and closes Webots. This keeps
|
||||||
if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is
|
batch evaluation runs bounded.
|
||||||
overridden with a north-driving correction. RL fallback: if the policy
|
|
||||||
zip can't be loaded (SB3 missing, file missing), the controller drops
|
|
||||||
to strombom mode automatically.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
@@ -62,26 +49,27 @@ _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
|
|||||||
if _PROJECT_ROOT not in sys.path:
|
if _PROJECT_ROOT not in sys.path:
|
||||||
sys.path.insert(0, _PROJECT_ROOT)
|
sys.path.insert(0, _PROJECT_ROOT)
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from controller import Robot
|
from controller import Robot
|
||||||
|
|
||||||
from herding.active_scan import ActiveScanTeacher
|
from herding.control.active_scan import ActiveScanTeacher
|
||||||
from herding.control import modulate_speed_near_sheep
|
from herding.control.modulation import modulate_speed_near_sheep
|
||||||
from herding.diffdrive import velocity_to_wheels
|
from herding.control.sequential import compute_action as sequential_action
|
||||||
from herding.geometry import (
|
from herding.control.strombom import compute_action as strombom_action
|
||||||
|
from herding.obs import build_obs
|
||||||
|
from herding.perception.lidar_perception import detections_from_scan
|
||||||
|
from herding.perception.sheep_tracker import SheepTracker
|
||||||
|
from herding.world.diffdrive import velocity_to_wheels
|
||||||
|
from herding.world.geometry import (
|
||||||
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA,
|
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA,
|
||||||
DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS,
|
DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS,
|
||||||
PEN_ENTRY, is_penned_position,
|
PEN_ENTRY, is_penned_position,
|
||||||
)
|
)
|
||||||
from herding.lidar_perception import detections_from_scan
|
|
||||||
from herding.obs import OBS_DIM, build_obs
|
|
||||||
from herding.sequential import compute_action_debug as sequential_action_debug
|
|
||||||
from herding.sheep_tracker import SheepTracker
|
|
||||||
from herding.strombom import compute_action as strombom_action
|
|
||||||
from herding.strombom import compute_action_debug as strombom_action_debug
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Mode selection
|
# Mode + policy resolution
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _load_runtime_config():
|
def _load_runtime_config():
|
||||||
@@ -135,60 +123,41 @@ def _resolve_policy_dir(mode: str) -> str:
|
|||||||
mode_default = {
|
mode_default = {
|
||||||
"bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
|
"bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
|
||||||
"rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl"),
|
"rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl"),
|
||||||
"dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
|
|
||||||
}
|
}
|
||||||
primary = mode_default.get(mode, mode_default["bc"])
|
primary = mode_default.get(mode, mode_default["bc"])
|
||||||
if os.path.isdir(primary):
|
if os.path.isdir(primary):
|
||||||
return primary
|
return primary
|
||||||
# Fall back to BC if the requested checkpoint isn't there yet
|
|
||||||
# (e.g., user asked for `rl` before training the fine-tune).
|
|
||||||
fallback = mode_default["bc"]
|
fallback = mode_default["bc"]
|
||||||
if os.path.isdir(fallback):
|
if os.path.isdir(fallback):
|
||||||
return fallback
|
return fallback
|
||||||
return env_dir or primary
|
return env_dir or primary
|
||||||
|
|
||||||
|
|
||||||
_VALID_MODES = ("bc", "rl", "strombom", "sequential", "dagger", "diag")
|
_VALID_MODES = ("bc", "rl", "strombom", "sequential")
|
||||||
# Back-compat: an old config saying HERDING_MODE=rl meant "the BC policy".
|
|
||||||
# We now use `rl` strictly for the KL-PPO fine-tune. If the rl
|
|
||||||
# directory isn't present, _resolve_policy_dir below silently falls
|
|
||||||
# back to bc, preserving the old behaviour.
|
|
||||||
if MODE not in _VALID_MODES:
|
if MODE not in _VALID_MODES:
|
||||||
print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
|
print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
|
||||||
MODE = "strombom"
|
MODE = "strombom"
|
||||||
|
|
||||||
DAGGER_DRIVER = (os.environ.get("HERDING_DAGGER_DRIVER")
|
|
||||||
or _runtime_cfg.get("HERDING_DAGGER_DRIVER")
|
|
||||||
or "teacher").lower()
|
|
||||||
if DAGGER_DRIVER not in ("teacher", "student"):
|
|
||||||
DAGGER_DRIVER = "teacher"
|
|
||||||
|
|
||||||
POLICY_DIR = _resolve_policy_dir(MODE)
|
POLICY_DIR = _resolve_policy_dir(MODE)
|
||||||
policy_handle = None
|
policy_handle = None
|
||||||
if MODE in ("bc", "rl", "dagger"):
|
if MODE in ("bc", "rl"):
|
||||||
print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists={os.path.isdir(POLICY_DIR)}")
|
print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists={os.path.isdir(POLICY_DIR)}")
|
||||||
try:
|
try:
|
||||||
from policy_loader import load as _load_policy
|
from policy_loader import load as _load_policy
|
||||||
policy_handle = _load_policy(POLICY_DIR)
|
policy_handle = _load_policy(POLICY_DIR)
|
||||||
print(f"[dog] policy loaded from {POLICY_DIR}")
|
print(f"[dog] policy loaded from {POLICY_DIR}")
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
if MODE in ("bc", "rl"):
|
|
||||||
print(f"[dog] policy load failed ({exc!r}); falling back to strombom.")
|
print(f"[dog] policy load failed ({exc!r}); falling back to strombom.")
|
||||||
MODE = "strombom"
|
MODE = "strombom"
|
||||||
else:
|
print(f"[dog] running in mode={MODE}")
|
||||||
# In dagger mode, no policy is fine if driver=teacher.
|
|
||||||
print(f"[dog] policy load failed ({exc!r}); dagger driver forced to teacher.")
|
|
||||||
policy_handle = None
|
|
||||||
print(f"[dog] running in mode={MODE}"
|
|
||||||
+ (f" driver={DAGGER_DRIVER}" if MODE == "dagger" else ""))
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Action smoothing + safety supervisor
|
# Control parameters
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
ACTION_SMOOTH = 0.55 # was 0.35; bumped for less frame-to-frame action jitter
|
ACTION_SMOOTH = 0.55 # EMA on (vx, vy) — kills frame-to-frame jitter
|
||||||
prev_action = (0.0, 0.0)
|
RUN_DONE_FILE = os.path.join(_PROJECT_ROOT, "training", ".run_done")
|
||||||
|
|
||||||
|
|
||||||
def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
|
def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
|
||||||
@@ -202,10 +171,6 @@ def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
|
|||||||
return (vx, vy)
|
return (vx, vy)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Driving
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float):
|
def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float):
|
||||||
if math.hypot(vx, vy) < 1e-3:
|
if math.hypot(vx, vy) < 1e-3:
|
||||||
left_motor.setVelocity(0.0)
|
left_motor.setVelocity(0.0)
|
||||||
@@ -245,12 +210,9 @@ receiver = robot.getDevice("receiver"); receiver.enable(timestep)
|
|||||||
emitter = robot.getDevice("emitter")
|
emitter = robot.getDevice("emitter")
|
||||||
lidar = robot.getDevice("lidar"); lidar.enable(timestep)
|
lidar = robot.getDevice("lidar"); lidar.enable(timestep)
|
||||||
|
|
||||||
# The receiver channel from sheep is no longer consumed for perception
|
|
||||||
# (kept enabled in case any peripheral tooling reads it). Sheep
|
|
||||||
# positions come exclusively from the LiDAR + tracker pipeline below.
|
|
||||||
tracker = SheepTracker()
|
tracker = SheepTracker()
|
||||||
|
|
||||||
# Cosmetic ear motors — ignored by control logic but keep them animated.
|
# Cosmetic ear motors — animated; not used by control.
|
||||||
left_ear = robot.getDevice("left ear motor")
|
left_ear = robot.getDevice("left ear motor")
|
||||||
right_ear = robot.getDevice("right ear motor")
|
right_ear = robot.getDevice("right ear motor")
|
||||||
left_ear.setPosition(float("inf"))
|
left_ear.setPosition(float("inf"))
|
||||||
@@ -266,75 +228,26 @@ EAR_RATE = 8.0
|
|||||||
# Main loop
|
# Main loop
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
# Active sheep positions come from the LiDAR-fed tracker each step;
|
# Analytic-teacher wrapper, built only when MODE is strombom/sequential
|
||||||
# penned_set is the tracker's ``get_penned_set()`` call. We drain the
|
# (bc/rl use the loaded policy). Each gets the same ActiveScanTeacher treatment:
|
||||||
# receiver queue without consuming it, so the small backlog of sheep
|
# rotate-on-empty, walk-to-centre, near-sheep speed modulation.
|
||||||
# pings can't grow unbounded.
|
analytic_teacher = None
|
||||||
step_count = 0
|
if MODE in ("strombom", "sequential"):
|
||||||
|
base_fn = strombom_action if MODE == "strombom" else sequential_action
|
||||||
|
analytic_teacher = ActiveScanTeacher(base_fn)
|
||||||
|
|
||||||
import atexit
|
# GT positions from sheep emitters — used **only** for the auto-finish
|
||||||
import time
|
# sentinel and the GT_penned diagnostic line. Never fed into control.
|
||||||
import numpy as _np
|
|
||||||
|
|
||||||
# DAgger state ----------------------------------------------------------
|
|
||||||
# Logged each step in dagger mode: (stacked_lidar_obs, teacher_action).
|
|
||||||
DAGGER_LOG_OBS: list = []
|
|
||||||
DAGGER_LOG_ACT: list = []
|
|
||||||
# Diagnostic mode buffer (one dict per step).
|
|
||||||
DIAG_BUF: list = []
|
|
||||||
# Frame stack buffer the controller maintains itself when dagger mode is
|
|
||||||
# active — the stacked obs we log must match what the policy sees so the
|
|
||||||
# downstream BC consumes (stacked_obs, teacher_action) pairs cleanly.
|
|
||||||
_FRAME_STACK = (policy_handle.frame_stack if policy_handle is not None else 4)
|
|
||||||
_dagger_buffer: list = []
|
|
||||||
# Active-scan teacher operates on GT (read from receiver).
|
|
||||||
_dagger_teacher = ActiveScanTeacher(strombom_action) if MODE == "dagger" else None
|
|
||||||
# GT positions accumulated from the receiver (sheep emit their xy each step).
|
|
||||||
_gt_sheep: dict = {}
|
_gt_sheep: dict = {}
|
||||||
|
_run_done = False
|
||||||
|
|
||||||
|
prev_action = (0.0, 0.0)
|
||||||
_DAGGER_RUN_TS = int(time.time()) # one file per controller run
|
step_count = 0
|
||||||
_DAGGER_DUMPED = False
|
|
||||||
# Sentinel that the auto-collection script polls — empty file written
|
|
||||||
# when this controller decides the run is "done" (all sheep penned, by
|
|
||||||
# GT). The launcher then kills Webots and moves on without waiting out
|
|
||||||
# its timeout. Honoured only in dagger mode.
|
|
||||||
_DAGGER_DONE_FILE = os.path.join(_PROJECT_ROOT, "training", "dagger", ".DONE")
|
|
||||||
|
|
||||||
|
|
||||||
def _dump_dagger_log():
|
|
||||||
"""Save accumulated (obs, teacher_action) pairs to disk on exit.
|
|
||||||
|
|
||||||
Webots may SIGKILL the controller, so the loop also calls this every
|
|
||||||
DAGGER_FLUSH_STEPS so we lose at most a few seconds of data per run.
|
|
||||||
Idempotent — repeated calls overwrite the same file with the latest
|
|
||||||
accumulated buffer.
|
|
||||||
"""
|
|
||||||
global _DAGGER_DUMPED
|
|
||||||
if MODE != "dagger" or not DAGGER_LOG_OBS:
|
|
||||||
return
|
|
||||||
out_dir = os.path.join(_PROJECT_ROOT, "training", "dagger")
|
|
||||||
os.makedirs(out_dir, exist_ok=True)
|
|
||||||
out_path = os.path.join(out_dir, f"dagger_{_DAGGER_RUN_TS}.npz")
|
|
||||||
obs_arr = _np.stack(DAGGER_LOG_OBS).astype(_np.float32)
|
|
||||||
act_arr = _np.stack(DAGGER_LOG_ACT).astype(_np.float32)
|
|
||||||
_np.savez(out_path, obs=obs_arr, actions=act_arr)
|
|
||||||
if not _DAGGER_DUMPED:
|
|
||||||
print(f"[dog dagger] wrote {len(DAGGER_LOG_OBS)} pairs → {out_path}")
|
|
||||||
_DAGGER_DUMPED = True
|
|
||||||
|
|
||||||
|
|
||||||
DAGGER_FLUSH_STEPS = 500
|
|
||||||
|
|
||||||
|
|
||||||
atexit.register(_dump_dagger_log)
|
|
||||||
|
|
||||||
|
|
||||||
while robot.step(timestep) != -1:
|
while robot.step(timestep) != -1:
|
||||||
step_count += 1
|
step_count += 1
|
||||||
|
|
||||||
# Drain receiver. In every mode we capture GT for the diagnostic
|
# Drain sheep emitter messages → GT (diagnostics + auto-finish only; never control).
|
||||||
# log line — perception still comes from LiDAR, the GT is read-only.
|
|
||||||
while receiver.getQueueLength() > 0:
|
while receiver.getQueueLength() > 0:
|
||||||
msg = receiver.getString()
|
msg = receiver.getString()
|
||||||
receiver.nextPacket()
|
receiver.nextPacket()
|
||||||
@@ -350,115 +263,28 @@ while robot.step(timestep) != -1:
|
|||||||
n = compass.getValues()
|
n = compass.getValues()
|
||||||
dog_heading = math.atan2(n[0], n[1])
|
dog_heading = math.atan2(n[0], n[1])
|
||||||
|
|
||||||
# ---- LiDAR perception → tracker → sheep_positions dict ----
|
# ---- LiDAR perception → tracker → active sheep positions ----
|
||||||
ranges = _np.asarray(lidar.getRangeImage(), dtype=_np.float32)
|
ranges = np.asarray(lidar.getRangeImage(), dtype=np.float32)
|
||||||
detections = detections_from_scan(ranges, dog_xy[0], dog_xy[1], dog_heading)
|
detections = detections_from_scan(ranges, dog_xy[0], dog_xy[1], dog_heading)
|
||||||
sheep_positions = tracker.update(detections)
|
sheep_positions = tracker.update(detections)
|
||||||
penned_set = tracker.get_penned_set()
|
|
||||||
|
|
||||||
# ---- Diagnostic mode: dump the first DIAG_STEPS scans + GT to disk.
|
|
||||||
if MODE == "diag":
|
|
||||||
DIAG_STEPS = 80
|
|
||||||
if step_count <= DIAG_STEPS:
|
|
||||||
DIAG_BUF.append(dict(
|
|
||||||
step=step_count,
|
|
||||||
ranges=ranges.copy(),
|
|
||||||
dog_x=dog_xy[0], dog_y=dog_xy[1], dog_h=dog_heading,
|
|
||||||
gt_sheep=dict(_gt_sheep),
|
|
||||||
detections=list(detections),
|
|
||||||
))
|
|
||||||
if step_count == DIAG_STEPS:
|
|
||||||
_diag_path = os.path.join(_PROJECT_ROOT, "training", "dagger",
|
|
||||||
f"diag_{int(time.time())}.npz")
|
|
||||||
os.makedirs(os.path.dirname(_diag_path), exist_ok=True)
|
|
||||||
_np.savez(
|
|
||||||
_diag_path,
|
|
||||||
ranges=_np.stack([d["ranges"] for d in DIAG_BUF]),
|
|
||||||
dog_xy=_np.array([[d["dog_x"], d["dog_y"]] for d in DIAG_BUF],
|
|
||||||
dtype=_np.float32),
|
|
||||||
dog_h=_np.array([d["dog_h"] for d in DIAG_BUF], dtype=_np.float32),
|
|
||||||
# Per-step GT serialised: max-pad to 10 sheep.
|
|
||||||
gt_xy=_np.array([
|
|
||||||
[list(d["gt_sheep"].get(f"sheep{i}", (1e9, 1e9)))
|
|
||||||
for i in range(1, 11)]
|
|
||||||
for d in DIAG_BUF
|
|
||||||
], dtype=_np.float32),
|
|
||||||
detections=_np.array([
|
|
||||||
len(d["detections"]) for d in DIAG_BUF
|
|
||||||
], dtype=_np.int32),
|
|
||||||
)
|
|
||||||
print(f"[dog diag] wrote {DIAG_STEPS} scans → {_diag_path}")
|
|
||||||
|
|
||||||
# Build the single-frame LiDAR obs (matches what the env produces).
|
|
||||||
sheep_xy_list = list(sheep_positions.values())
|
sheep_xy_list = list(sheep_positions.values())
|
||||||
sheep_penned_list = [False] * len(sheep_xy_list)
|
sheep_penned_list = [False] * len(sheep_xy_list)
|
||||||
single_obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list)
|
single_obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list)
|
||||||
# Maintain our own frame stack so logged obs == what policy sees.
|
|
||||||
if not _dagger_buffer:
|
|
||||||
_dagger_buffer = [single_obs.copy() for _ in range(_FRAME_STACK)]
|
|
||||||
else:
|
|
||||||
_dagger_buffer.append(single_obs)
|
|
||||||
if len(_dagger_buffer) > _FRAME_STACK:
|
|
||||||
_dagger_buffer = _dagger_buffer[-_FRAME_STACK:]
|
|
||||||
stacked_obs = _np.concatenate(_dagger_buffer, axis=0).astype(_np.float32)
|
|
||||||
|
|
||||||
# ---- Action selection ----
|
# ---- Action selection ----
|
||||||
if MODE == "diag":
|
if MODE in ("bc", "rl") and policy_handle is not None:
|
||||||
# Diagnostic mode: rotate in place so the captured scans cover
|
|
||||||
# all 360° of view from one position. Target = heading + π →
|
|
||||||
# cos(err) clamps forward to ~0, the dog spins.
|
|
||||||
_t = dog_heading + math.pi
|
|
||||||
vx, vy = math.cos(_t), math.sin(_t)
|
|
||||||
elif MODE == "dagger":
|
|
||||||
# Teacher: active-scan + Strömbom on GT (active sheep only).
|
|
||||||
gt_active = {name: xy for name, xy in _gt_sheep.items()
|
|
||||||
if not is_penned_position(xy[0], xy[1])}
|
|
||||||
t_vx, t_vy, _mode_str = _dagger_teacher(
|
|
||||||
dog_xy, dog_heading, gt_active, PEN_ENTRY,
|
|
||||||
)
|
|
||||||
# Student (if a policy is loaded).
|
|
||||||
s_vx, s_vy = None, None
|
|
||||||
if policy_handle is not None:
|
|
||||||
action = policy_handle.predict(stacked_obs)
|
|
||||||
s_vx, s_vy = float(action[0]), float(action[1])
|
|
||||||
# Drive selection.
|
|
||||||
if DAGGER_DRIVER == "student" and policy_handle is not None:
|
|
||||||
vx, vy = s_vx, s_vy
|
|
||||||
else:
|
|
||||||
vx, vy = t_vx, t_vy
|
|
||||||
# Always log the teacher action (this is the supervision signal).
|
|
||||||
DAGGER_LOG_OBS.append(stacked_obs.copy())
|
|
||||||
DAGGER_LOG_ACT.append(_np.array([t_vx, t_vy], dtype=_np.float32))
|
|
||||||
elif MODE in ("bc", "rl") and policy_handle is not None:
|
|
||||||
# Pass the single-frame obs; the policy_loader maintains its own
|
|
||||||
# frame stack internally. Both bc and rl use the same control
|
|
||||||
# interface — the only difference is which checkpoint loaded.
|
|
||||||
action = policy_handle.predict(single_obs)
|
action = policy_handle.predict(single_obs)
|
||||||
vx, vy = float(action[0]), float(action[1])
|
vx, vy = float(action[0]), float(action[1])
|
||||||
elif MODE in ("strombom", "sequential"):
|
else:
|
||||||
# Wrap the analytic teacher in ActiveScanTeacher so the dog
|
vx, vy, _mode_str = analytic_teacher(
|
||||||
# rotates / walks-to-centre when the tracker briefly empties,
|
|
||||||
# instead of going idle. Without this wrapper, the first 2 s
|
|
||||||
# of LiDAR-blind operation kills the run because Strömbom and
|
|
||||||
# Sequential both return (0, 0) when there are no positions.
|
|
||||||
if "_analytic_teacher" not in globals():
|
|
||||||
from herding.sequential import compute_action as sequential_action
|
|
||||||
_analytic_teacher = ActiveScanTeacher(
|
|
||||||
strombom_action if MODE == "strombom" else sequential_action
|
|
||||||
)
|
|
||||||
vx, vy, _mode_str = _analytic_teacher(
|
|
||||||
dog_xy, dog_heading, sheep_positions, PEN_ENTRY,
|
dog_xy, dog_heading, sheep_positions, PEN_ENTRY,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Shared post-process: speed modulation near sheep. Applies to bc,
|
# Near-sheep speed modulation (shared by every mode).
|
||||||
# rl, strombom, sequential — every mode where the action source is
|
|
||||||
# nominally unit-magnitude. In dagger mode the active-scan teacher
|
|
||||||
# has already modulated, and the diag mode action is hand-built for
|
|
||||||
# rotation; both skip.
|
|
||||||
if MODE not in ("dagger", "diag"):
|
|
||||||
vx, vy = modulate_speed_near_sheep(vx, vy, dog_xy, sheep_positions)
|
vx, vy = modulate_speed_near_sheep(vx, vy, dog_xy, sheep_positions)
|
||||||
|
|
||||||
# EMA smoothing — reduces oscillation from policy or Strömbom flips.
|
# EMA smoothing — kills frame-to-frame action jitter.
|
||||||
vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx
|
vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx
|
||||||
vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy
|
vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy
|
||||||
|
|
||||||
@@ -469,7 +295,7 @@ while robot.step(timestep) != -1:
|
|||||||
drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX)
|
drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX)
|
||||||
emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}")
|
emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}")
|
||||||
|
|
||||||
# Cosmetic ear wiggle — purely visual.
|
# Cosmetic ear wiggle.
|
||||||
ear_phase += 0.12
|
ear_phase += 0.12
|
||||||
ear_pos = EAR_AMPLITUDE * math.sin(ear_phase)
|
ear_pos = EAR_AMPLITUDE * math.sin(ear_phase)
|
||||||
left_ear.setVelocity(EAR_RATE)
|
left_ear.setVelocity(EAR_RATE)
|
||||||
@@ -477,38 +303,26 @@ while robot.step(timestep) != -1:
|
|||||||
left_ear.setPosition(ear_pos)
|
left_ear.setPosition(ear_pos)
|
||||||
right_ear.setPosition(-ear_pos)
|
right_ear.setPosition(-ear_pos)
|
||||||
|
|
||||||
# --- Early-stop when all GT sheep are penned (all modes) ---
|
# Auto-finish: when all GT sheep are penned, write the sentinel.
|
||||||
# The dog isn't a Supervisor so it can't call simulationQuit() —
|
# The launcher polls for it and closes Webots so batch evals don't
|
||||||
# instead we write a sentinel file the launcher polls for and uses
|
# hang after the task is done. Gated on a non-empty `_gt_sheep` so we don't
|
||||||
# to kill the Webots process. Bounded by `_gt_sheep` so we don't
|
|
||||||
# fire during the first few steps while the receiver fills.
|
# fire during the first few steps while the receiver fills.
|
||||||
if _gt_sheep and not os.path.exists(_DAGGER_DONE_FILE):
|
if _gt_sheep and not _run_done:
|
||||||
gt_active_count = sum(1 for x, y in _gt_sheep.values()
|
gt_active = sum(1 for x, y in _gt_sheep.values()
|
||||||
if not is_penned_position(x, y))
|
if not is_penned_position(x, y))
|
||||||
if gt_active_count == 0:
|
if gt_active == 0:
|
||||||
if MODE == "dagger":
|
os.makedirs(os.path.dirname(RUN_DONE_FILE), exist_ok=True)
|
||||||
_dump_dagger_log()
|
open(RUN_DONE_FILE, "w").close()
|
||||||
os.makedirs(os.path.dirname(_DAGGER_DONE_FILE), exist_ok=True)
|
_run_done = True
|
||||||
open(_DAGGER_DONE_FILE, "w").close()
|
|
||||||
print(f"[dog] all {len(_gt_sheep)} sheep penned at step "
|
print(f"[dog] all {len(_gt_sheep)} sheep penned at step "
|
||||||
f"{step_count} — wrote {_DAGGER_DONE_FILE}, "
|
f"{step_count} — wrote sentinel, launcher will close Webots")
|
||||||
f"launcher will close Webots")
|
|
||||||
|
|
||||||
if MODE == "dagger" and step_count % DAGGER_FLUSH_STEPS == 0 and DAGGER_LOG_OBS:
|
|
||||||
_dump_dagger_log()
|
|
||||||
|
|
||||||
if step_count % 200 == 0:
|
if step_count % 200 == 0:
|
||||||
gt_penned = sum(1 for x, y in _gt_sheep.values()
|
gt_penned = sum(1 for x, y in _gt_sheep.values()
|
||||||
if is_penned_position(x, y))
|
if is_penned_position(x, y))
|
||||||
gt_total = len(_gt_sheep)
|
gt_total = len(_gt_sheep)
|
||||||
extra = ""
|
|
||||||
if MODE == "dagger":
|
|
||||||
extra = f" logged={len(DAGGER_LOG_OBS)}"
|
|
||||||
print(f"[dog mode={MODE}] step={step_count} "
|
print(f"[dog mode={MODE}] step={step_count} "
|
||||||
f"GT_penned={gt_penned}/{gt_total} "
|
f"GT_penned={gt_penned}/{gt_total} "
|
||||||
f"tracks_active={tracker.n_active()} "
|
f"tracks_active={tracker.n_active()} "
|
||||||
f"tracks_penned={tracker.n_penned()} "
|
f"tracks_penned={tracker.n_penned()} "
|
||||||
f"detections={len(detections)} action=({vx:+.2f}, {vy:+.2f}){extra}")
|
f"detections={len(detections)} action=({vx:+.2f}, {vy:+.2f})")
|
||||||
|
|
||||||
# Loop ended (Webots told us to quit). Flush any remaining DAgger log.
|
|
||||||
_dump_dagger_log()
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
||||||
from herding.control import modulate_speed_near_sheep
|
from herding.control.modulation import modulate_speed_near_sheep
|
||||||
|
|
||||||
|
|
||||||
INITIAL_SCAN_STEPS = 80 # ≈1.3 s at dt=16 ms — full rotation at the +π turn target.
|
INITIAL_SCAN_STEPS = 80 # ≈1.3 s at dt=16 ms — full rotation at the +π turn target.
|
||||||
@@ -24,7 +24,7 @@ flock size and works up to at least n=10 within a 15 000-step budget.
|
|||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
||||||
from herding.geometry import GATE_Y, PEN_ENTRY, in_pen
|
from herding.world.geometry import GATE_Y, PEN_ENTRY, in_pen
|
||||||
|
|
||||||
|
|
||||||
DELTA_DRIVE = 1.5 # standoff behind the target sheep
|
DELTA_DRIVE = 1.5 # standoff behind the target sheep
|
||||||
@@ -9,7 +9,7 @@ Reference: Strömbom et al. 2014, "Solving the shepherding problem".
|
|||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
||||||
from herding.geometry import PEN_ENTRY, GATE_Y, in_pen
|
from herding.world.geometry import PEN_ENTRY, GATE_Y, in_pen
|
||||||
|
|
||||||
# Algorithm parameters. DELTA_DRIVE / DELTA_COLLECT were tightened from
|
# Algorithm parameters. DELTA_DRIVE / DELTA_COLLECT were tightened from
|
||||||
# the original (4.0 / 2.5) because the new external pen sits ~26 m from
|
# the original (4.0 / 2.5) because the new external pen sits ~26 m from
|
||||||
+1
-1
@@ -31,7 +31,7 @@ Layout (all components normalised so values stay roughly in [-1, 1]):
|
|||||||
import math
|
import math
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from herding.geometry import (
|
from herding.world.geometry import (
|
||||||
FIELD_X, FIELD_Y, PEN_ENTRY, MAX_SHEEP,
|
FIELD_X, FIELD_Y, PEN_ENTRY, MAX_SHEEP,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -29,8 +29,8 @@ import math
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from herding.geometry import FIELD_X, FIELD_Y, GATE_Y, PEN_X, PEN_Y
|
from herding.world.geometry import FIELD_X, FIELD_Y, GATE_Y, PEN_X, PEN_Y
|
||||||
from herding.lidar_sim import (
|
from herding.perception.lidar_sim import (
|
||||||
LIDAR_FOV, LIDAR_MAX_RANGE, LIDAR_N_RAYS, SHEEP_RADIUS, ray_angles,
|
LIDAR_FOV, LIDAR_MAX_RANGE, LIDAR_N_RAYS, SHEEP_RADIUS, ray_angles,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -26,7 +26,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
||||||
from herding.geometry import MAX_SHEEP, in_pen, is_penned_position
|
from herding.world.geometry import MAX_SHEEP, in_pen, is_penned_position
|
||||||
|
|
||||||
|
|
||||||
GATE_M = 2.5 # m — primary NN gate (recent tracks)
|
GATE_M = 2.5 # m — primary NN gate (recent tracks)
|
||||||
@@ -51,7 +51,7 @@ is a defensible engineering adaptation of Strömbom's qualitative
|
|||||||
import math
|
import math
|
||||||
import random
|
import random
|
||||||
|
|
||||||
from herding.geometry import (
|
from herding.world.geometry import (
|
||||||
FIELD_X, FIELD_Y,
|
FIELD_X, FIELD_Y,
|
||||||
PEN_X, PEN_Y,
|
PEN_X, PEN_Y,
|
||||||
GATE_X,
|
GATE_X,
|
||||||
@@ -21,9 +21,9 @@ if _PROJECT_ROOT not in sys.path:
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from herding.geometry import MAX_SHEEP, PEN_ENTRY
|
from herding.world.geometry import MAX_SHEEP, PEN_ENTRY
|
||||||
from herding.obs import OBS_DIM
|
from herding.obs import OBS_DIM
|
||||||
from herding.strombom import compute_action
|
from herding.control.strombom import compute_action
|
||||||
from training.herding_env import HerdingEnv
|
from training.herding_env import HerdingEnv
|
||||||
|
|
||||||
|
|
||||||
@@ -1,166 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# tools/auto_dagger.sh — automated DAgger collection across many headless
|
|
||||||
# Webots runs.
|
|
||||||
#
|
|
||||||
# For each (flock_size, run_index) combination, generates a world with N
|
|
||||||
# active sheep at randomised positions, launches Webots in fast/headless
|
|
||||||
# mode, lets the controller log (lidar_obs, teacher_action) pairs for up
|
|
||||||
# to RUN_SEC seconds, kills the run, and moves on. The dog controller's
|
|
||||||
# 500-step periodic flush means each run produces a complete .npz even
|
|
||||||
# when killed by timeout.
|
|
||||||
#
|
|
||||||
# Usage:
|
|
||||||
# tools/auto_dagger.sh [RUNS_PER_FLOCK] [SECONDS_PER_RUN]
|
|
||||||
# RUNS_PER_FLOCK : how many randomised runs per flock size (default 3)
|
|
||||||
# SECONDS_PER_RUN: wall-clock cap per Webots run (default 60)
|
|
||||||
#
|
|
||||||
# Env-var overrides:
|
|
||||||
# HERDING_POLICY_DIR : policy the controller loads (only used when
|
|
||||||
# HERDING_DAGGER_DRIVER=student). Default bc.
|
|
||||||
# HERDING_DAGGER_DRIVER : "teacher" (default) or "student".
|
|
||||||
# HEADLESS=1 : force --no-rendering (default on).
|
|
||||||
# FLOCKS="1 3 5 8 10" : space-separated flock sizes to iterate over.
|
|
||||||
#
|
|
||||||
# Output:
|
|
||||||
# training/dagger/dagger_<ts>.npz — one per Webots run.
|
|
||||||
#
|
|
||||||
# After collection, run:
|
|
||||||
# python -m tools.dagger_merge_train --out training/runs/bc_dagger
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
RUNS_PER_FLOCK=${1:-3}
|
|
||||||
RUN_SEC=${2:-60}
|
|
||||||
FLOCKS=${FLOCKS:-"1 3 5 8 10"}
|
|
||||||
HEADLESS=${HEADLESS:-1}
|
|
||||||
|
|
||||||
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
|
|
||||||
SRC="$ROOT/worlds/field.wbt"
|
|
||||||
DST="$ROOT/worlds/field_test.wbt"
|
|
||||||
POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc}"
|
|
||||||
DRIVER="${HERDING_DAGGER_DRIVER:-teacher}"
|
|
||||||
DONE_FILE="$ROOT/training/dagger/.DONE"
|
|
||||||
WEBOTS_PID=""
|
|
||||||
|
|
||||||
cleanup() {
    # INT/TERM trap handler: stop the in-flight Webots run (if any) and abort
    # the whole collection with exit status 1.
    #
    # WEBOTS_PID is reset to "" between runs, so only signal/reap when it is
    # set. Both kill and wait are followed by `|| true`: the script runs under
    # `set -e`, and a failing kill (process already gone) as the last command
    # of an `&&` list would otherwise end the handler before wait/exit 1.
    if [[ -n "$WEBOTS_PID" ]]; then
        echo "Caught interrupt — killing Webots (pid=$WEBOTS_PID) and exiting."
        kill "$WEBOTS_PID" 2>/dev/null || true
        wait "$WEBOTS_PID" 2>/dev/null || true
    else
        echo "Caught interrupt — no Webots run in flight, exiting."
    fi
    exit 1
}
|
|
||||||
trap cleanup INT TERM
|
|
||||||
|
|
||||||
webots_args=(--mode=fast --batch --minimize)
|
|
||||||
if [[ "$HEADLESS" == "1" ]]; then
|
|
||||||
webots_args+=(--no-rendering)
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Auto-dagger collection"
|
|
||||||
echo " flock sizes : $FLOCKS"
|
|
||||||
echo " runs per size : $RUNS_PER_FLOCK"
|
|
||||||
echo " seconds per run : $RUN_SEC"
|
|
||||||
echo " policy dir : $POLICY_DIR (used only when driver=student)"
|
|
||||||
echo " driver : $DRIVER"
|
|
||||||
echo " webots flags : ${webots_args[*]}"
|
|
||||||
echo
|
|
||||||
|
|
||||||
# Runtime config — written once here (nothing in the run loop below
|
|
||||||
# rewrites it), so a manual webots launch at the same time picks it up too.
|
|
||||||
cat > "$ROOT/herding_runtime.cfg" <<EOF
|
|
||||||
HERDING_MODE=dagger
|
|
||||||
HERDING_POLICY_DIR=$POLICY_DIR
|
|
||||||
HERDING_DAGGER_DRIVER=$DRIVER
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Count files before, so we can summarise what was added.
|
|
||||||
mkdir -p "$ROOT/training/dagger"
|
|
||||||
before_count=$(ls -1 "$ROOT/training/dagger"/dagger_*.npz 2>/dev/null | wc -l || echo 0)
|
|
||||||
|
|
||||||
run_idx=0
|
|
||||||
total_runs=0
|
|
||||||
for f in $FLOCKS; do total_runs=$((total_runs + RUNS_PER_FLOCK)); done
|
|
||||||
|
|
||||||
for flock in $FLOCKS; do
|
|
||||||
for run in $(seq 1 "$RUNS_PER_FLOCK"); do
|
|
||||||
run_idx=$((run_idx + 1))
|
|
||||||
seed=$((1000 * flock + run))
|
|
||||||
echo "=== [$run_idx/$total_runs] flock=$flock run=$run seed=$seed ==="
|
|
||||||
|
|
||||||
# Generate randomised world.
|
|
||||||
cp "$SRC" "$DST"
|
|
||||||
for i in $(seq $((flock + 1)) 10); do
|
|
||||||
sed -i "s|^Sheep .* \"sheep${i}\".*|# &|" "$DST"
|
|
||||||
done
|
|
||||||
# Inline Python: jitter sheep1..flock translations.
|
|
||||||
python3 - "$DST" "$flock" "$seed" <<'PYEOF'
|
|
||||||
import re, random, sys
|
|
||||||
path, n_str, seed = sys.argv[1], sys.argv[2], sys.argv[3]
|
|
||||||
n = int(n_str); random.seed(int(seed))
|
|
||||||
with open(path) as f:
|
|
||||||
txt = f.read()
|
|
||||||
def rand_pos():
    """Draw a random (x, y) spawn point by rejection sampling.

    x is uniform in [-12, 12] and y is uniform in [-10, 12] (the lower y
    bound keeps sheep out of the gate strip). Candidates whose squared
    distance from the origin is not greater than 9.0, i.e. within 3 m of
    the dog spawn, are discarded and redrawn. x is always drawn before y
    so the sequence is reproducible for a given ``random.seed``.
    """
    min_dist_sq = 9.0
    px = random.uniform(-12.0, 12.0)
    py = random.uniform(-10.0, 12.0)
    while px * px + py * py <= min_dist_sq:
        # Too close to the dog spawn: redraw both coordinates.
        px = random.uniform(-12.0, 12.0)
        py = random.uniform(-10.0, 12.0)
    return px, py
|
|
||||||
for i in range(1, n + 1):
|
|
||||||
x, y = rand_pos()
|
|
||||||
pat = re.compile(
|
|
||||||
r'Sheep \{ translation\s+\S+\s+\S+\s+(\S+)\s+name "sheep' + str(i) + r'"'
|
|
||||||
)
|
|
||||||
txt = pat.sub(rf'Sheep {{ translation {x:.2f} {y:.2f} \g<1> name "sheep{i}"', txt, count=1)
|
|
||||||
with open(path, "w") as f:
|
|
||||||
f.write(txt)
|
|
||||||
PYEOF
|
|
||||||
|
|
||||||
# Run Webots in the background; poll for the .DONE sentinel or
|
|
||||||
# the wall-clock timeout, whichever comes first.
|
|
||||||
rm -f "$DONE_FILE"
|
|
||||||
webots "${webots_args[@]}" "$DST" \
|
|
||||||
> /tmp/webots_dagger_run.log 2>&1 &
|
|
||||||
WEBOTS_PID=$!
|
|
||||||
|
|
||||||
# Give the controller 10 s to start before polling the sentinel,
|
|
||||||
# otherwise a sheep that spawns already penned triggers an instant
|
|
||||||
# false-positive kill.
|
|
||||||
elapsed=0
|
|
||||||
grace=10
|
|
||||||
while kill -0 "$WEBOTS_PID" 2>/dev/null; do
|
|
||||||
if (( elapsed >= grace )) && [[ -f "$DONE_FILE" ]]; then
|
|
||||||
echo " sentinel .DONE detected — killing Webots early"
|
|
||||||
kill "$WEBOTS_PID" 2>/dev/null
|
|
||||||
wait "$WEBOTS_PID" 2>/dev/null || true
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
if (( elapsed >= RUN_SEC )); then
|
|
||||||
echo " timeout ($RUN_SEC s) — killing Webots"
|
|
||||||
kill "$WEBOTS_PID" 2>/dev/null
|
|
||||||
wait "$WEBOTS_PID" 2>/dev/null || true
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 2
|
|
||||||
elapsed=$((elapsed + 2))
|
|
||||||
done
|
|
||||||
WEBOTS_PID=""
|
|
||||||
|
|
||||||
# Quick sanity from the log: did the controller actually run?
|
|
||||||
if grep -q "running in mode=dagger" /tmp/webots_dagger_run.log; then
|
|
||||||
new_pairs=$(tail -50 /tmp/webots_dagger_run.log | grep -oE 'logged=[0-9]+' | tail -1)
|
|
||||||
echo " controller ran ($new_pairs)"
|
|
||||||
else
|
|
||||||
echo " WARNING: controller may not have started (see /tmp/webots_dagger_run.log)"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
done
|
|
||||||
|
|
||||||
after_count=$(ls -1 "$ROOT/training/dagger"/dagger_*.npz 2>/dev/null | wc -l || echo 0)
|
|
||||||
new_files=$((after_count - before_count))
|
|
||||||
|
|
||||||
echo
|
|
||||||
echo "Done."
|
|
||||||
echo " new dagger files : $new_files"
|
|
||||||
echo " total in dir : $after_count"
|
|
||||||
echo
|
|
||||||
echo "Next:"
|
|
||||||
echo " python -m tools.dagger_merge_train --out training/runs/bc_dagger"
|
|
||||||
@@ -26,10 +26,10 @@ if _PROJECT_ROOT not in sys.path:
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from herding.active_scan import ActiveScanTeacher
|
from herding.control.active_scan import ActiveScanTeacher
|
||||||
from herding.geometry import PEN_ENTRY
|
from herding.world.geometry import PEN_ENTRY
|
||||||
from herding.sequential import compute_action as sequential_action
|
from herding.control.sequential import compute_action as sequential_action
|
||||||
from herding.strombom import compute_action as strombom_action
|
from herding.control.strombom import compute_action as strombom_action
|
||||||
from training.herding_env import HerdingEnv
|
from training.herding_env import HerdingEnv
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,135 +0,0 @@
|
|||||||
"""Merge Webots DAgger demos with sim demos and retrain the BC policy.
|
|
||||||
|
|
||||||
The dog controller in ``HERDING_MODE=dagger`` writes per-run files to
|
|
||||||
``training/dagger/dagger_<ts>.npz`` containing ``(obs, actions)`` pairs
|
|
||||||
where:
|
|
||||||
|
|
||||||
* ``obs`` is the **stacked LiDAR observation** as built by the live
|
|
||||||
Webots tracker — exactly the input distribution the deployed
|
|
||||||
controller sees.
|
|
||||||
* ``actions`` is the **active-scan-teacher action computed from
|
|
||||||
ground-truth sheep positions** (read off the sheep emitter).
|
|
||||||
|
|
||||||
Combined with the existing sim demos (``training/demos.npz`` by
|
|
||||||
default), this gives the BC student a training set that includes the
|
|
||||||
real Webots false-positive distribution — closing the sim-to-real
|
|
||||||
perception gap that the all-sim pipeline couldn't bridge.
|
|
||||||
|
|
||||||
Usage::
|
|
||||||
|
|
||||||
# Iteration 1 — merge all dagger files with sim demos, retrain
|
|
||||||
python -m tools.dagger_merge_train \\
|
|
||||||
--sim training/demos.npz \\
|
|
||||||
--out training/runs/bc_dagger1
|
|
||||||
|
|
||||||
# Iteration 2 — drop the sim baseline, train only on Webots data
|
|
||||||
python -m tools.dagger_merge_train --no-sim --out training/runs/bc_dagger2
|
|
||||||
|
|
||||||
The new policy is saved as ``<out>/policy.zip`` and is auto-loaded by
|
|
||||||
the controller's resolution priority on the next Webots run.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import glob
|
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
|
|
||||||
if _PROJECT_ROOT not in sys.path:
|
|
||||||
sys.path.insert(0, _PROJECT_ROOT)
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """Merge Webots DAgger demos with optional sim demos, then retrain BC.

    Parses the command line, loads every ``.npz`` matched by
    ``--dagger-glob`` (each must hold ``obs`` and ``actions`` arrays),
    optionally appends the ``--sim`` demo file, writes the concatenated
    set to ``--merged-out`` and finally runs ``training.bc_pretrain`` on
    it as a subprocess.

    Raises:
        SystemExit: no dagger file matches the glob, a file's ``obs`` /
            ``actions`` arrays are malformed or misaligned, or the
            observation widths disagree across the inputs.
        subprocess.CalledProcessError: the training subprocess exits with
            a non-zero status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--sim", default="training/demos.npz",
                        help="Sim demo file to mix with the Webots data. "
                             "Pass --no-sim to train only on dagger data.")
    parser.add_argument("--no-sim", action="store_true",
                        help="Skip the sim demos entirely.")
    parser.add_argument("--dagger-glob", default="training/dagger/dagger_*.npz",
                        help="Glob for Webots-collected dagger files.")
    parser.add_argument("--merged-out", default="training/demos_dagger.npz",
                        help="Where to write the merged demo file.")
    parser.add_argument("--out", default="training/runs/bc_dagger",
                        help="Where to write the BC policy.")
    parser.add_argument("--epochs", type=int, default=60)
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--net-arch", default="512,512")
    parser.add_argument("--cos-weight", type=float, default=1.0)
    args = parser.parse_args()

    # --- Gather Webots files ---
    dagger_paths = sorted(glob.glob(args.dagger_glob))
    if not dagger_paths:
        raise SystemExit(f"No dagger files found at {args.dagger_glob} — "
                         "run Webots in HERDING_MODE=dagger first.")

    chunks_obs: list[np.ndarray] = []
    chunks_act: list[np.ndarray] = []
    total_dagger = 0
    for p in dagger_paths:
        obs, act = _load_demo_pairs(p)
        chunks_obs.append(obs)
        chunks_act.append(act)
        total_dagger += len(obs)
        print(f"  + {p}: {obs.shape[0]} pairs (obs dim {obs.shape[1]})")
    print(f"[merge] total dagger pairs: {total_dagger}")

    obs_dim = chunks_obs[0].shape[1]
    if any(c.shape[1] != obs_dim for c in chunks_obs):
        raise SystemExit(
            "Dagger files have inconsistent obs dims — they were collected "
            "with different frame_stack settings. Either rerun with a "
            "consistent setting or filter the glob."
        )

    # --- Optionally include sim demos ---
    if not args.no_sim:
        sim_obs, sim_act = _load_demo_pairs(args.sim)
        if sim_obs.shape[1] != obs_dim:
            raise SystemExit(
                f"Sim demos have obs dim {sim_obs.shape[1]} but dagger demos "
                f"have {obs_dim}. Recollect sim demos at the same frame_stack."
            )
        chunks_obs.append(sim_obs)
        chunks_act.append(sim_act)
        print(f"[merge] + sim demos: {sim_obs.shape[0]} pairs from {args.sim}")

    obs_all = np.concatenate(chunks_obs, axis=0)
    act_all = np.concatenate(chunks_act, axis=0)
    # Empty meta — bc_pretrain doesn't actually use it but the file format
    # has it.
    meta = np.zeros((0, 5), dtype=np.int32)

    Path(args.merged_out).parent.mkdir(parents=True, exist_ok=True)
    np.savez(args.merged_out, obs=obs_all, actions=act_all, meta=meta)
    print(f"[merge] wrote {len(obs_all)} pairs → {args.merged_out}")
    print(f"[merge] obs shape {obs_all.shape}, action shape {act_all.shape}")

    # --- Run BC training ---
    # The merged file was written relative to *this* process's working
    # directory, but the trainer runs with cwd=_PROJECT_ROOT, so hand it an
    # absolute path; a relative one would point at a different file whenever
    # the tool is launched from outside the project root.
    cmd = [
        sys.executable, "-m", "training.bc_pretrain",
        "--demos", os.path.abspath(args.merged_out),
        "--out", args.out,
        "--epochs", str(args.epochs),
        "--batch-size", str(args.batch_size),
        "--net-arch", args.net_arch,
        "--cos-weight", str(args.cos_weight),
    ]
    print(f"\n[merge] launching: {' '.join(cmd)}")
    subprocess.run(cmd, check=True, cwd=_PROJECT_ROOT)


def _load_demo_pairs(path: str) -> tuple[np.ndarray, np.ndarray]:
    """Load one demo ``.npz`` and return its ``(obs, actions)`` as float32.

    The archive is closed before returning (``astype`` copies the data out),
    and the two arrays are checked for row alignment so a truncated or
    malformed file cannot silently shift observations against actions.

    Raises:
        SystemExit: ``obs`` is not 2-D, or ``obs`` and ``actions`` have a
            different number of rows.
    """
    with np.load(path) as data:
        obs = data["obs"].astype(np.float32)
        act = data["actions"].astype(np.float32)
    if obs.ndim != 2:
        raise SystemExit(
            f"{path}: obs must be 2-D (pairs x obs_dim), got shape {obs.shape}"
        )
    if len(obs) != len(act):
        raise SystemExit(
            f"{path}: obs has {len(obs)} rows but actions has {len(act)} rows"
        )
    return obs, act
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
+9
-13
@@ -7,19 +7,17 @@
|
|||||||
# Usage:
|
# Usage:
|
||||||
# tools/run_webots.sh [N] [MODE]
|
# tools/run_webots.sh [N] [MODE]
|
||||||
# N : number of active sheep (1..10), default 10
|
# N : number of active sheep (1..10), default 10
|
||||||
# MODE : "bc" | "rl" | "strombom" | "sequential" | "dagger", default "bc"
|
# MODE : "bc" | "rl" | "strombom" | "sequential", default "bc"
|
||||||
#
|
#
|
||||||
# Examples:
|
# Examples:
|
||||||
# tools/run_webots.sh 10 bc # BC-trained policy, 10 sheep
|
# tools/run_webots.sh 10 bc # behaviour-cloned MLP, 10 sheep
|
||||||
# tools/run_webots.sh 10 rl # KL-PPO fine-tune of bc, 10 sheep
|
# tools/run_webots.sh 10 rl # KL-PPO fine-tune of bc, 10 sheep
|
||||||
# tools/run_webots.sh 5 sequential # the analytic teacher, 5 sheep
|
# tools/run_webots.sh 5 sequential # single-target analytic baseline
|
||||||
# tools/run_webots.sh 3 strombom # canonical baseline, 3 sheep
|
# tools/run_webots.sh 3 strombom # canonical Strömbom analytic
|
||||||
#
|
#
|
||||||
# Notes:
|
# Notes:
|
||||||
# * The RL mode loads the latest BC policy by default — priority
|
# * bc loads training/runs/bc/policy.zip, rl loads training/runs/rl.
|
||||||
# the BC policy (bc/policy.zip) (the controller resolves it).
|
# Override via HERDING_POLICY_DIR=/path/to/run env var.
|
||||||
# (LiDAR-perception, frame-stack K=4). Override via
|
|
||||||
# HERDING_POLICY_DIR=/path/to/run env var.
|
|
||||||
# * Conda env "tir" must be active (provides stable-baselines3 + torch).
|
# * Conda env "tir" must be active (provides stable-baselines3 + torch).
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
@@ -30,10 +28,9 @@ if (( N < 1 || N > 10 )); then
|
|||||||
echo "N must be 1..10, got $N" >&2; exit 1
|
echo "N must be 1..10, got $N" >&2; exit 1
|
||||||
fi
|
fi
|
||||||
case "$MODE" in
|
case "$MODE" in
|
||||||
bc|rl|strombom|sequential|dagger) ;;
|
bc|rl|strombom|sequential) ;;
|
||||||
*) echo "MODE must be bc|rl|strombom|sequential|dagger, got '$MODE'" >&2; exit 1 ;;
|
*) echo "MODE must be bc|rl|strombom|sequential, got '$MODE'" >&2; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
DAGGER_DRIVER=${HERDING_DAGGER_DRIVER:-teacher}
|
|
||||||
|
|
||||||
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
|
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
|
||||||
SRC="$ROOT/worlds/field.wbt"
|
SRC="$ROOT/worlds/field.wbt"
|
||||||
@@ -59,7 +56,6 @@ RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc}"
|
|||||||
cat > "$ROOT/herding_runtime.cfg" <<EOF
|
cat > "$ROOT/herding_runtime.cfg" <<EOF
|
||||||
HERDING_MODE=$MODE
|
HERDING_MODE=$MODE
|
||||||
HERDING_POLICY_DIR=$RESOLVED_POLICY_DIR
|
HERDING_POLICY_DIR=$RESOLVED_POLICY_DIR
|
||||||
HERDING_DAGGER_DRIVER=$DAGGER_DRIVER
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
export HERDING_MODE="$MODE"
|
export HERDING_MODE="$MODE"
|
||||||
@@ -68,7 +64,7 @@ export HERDING_POLICY_DIR="$RESOLVED_POLICY_DIR"
|
|||||||
# The controller writes this sentinel when all GT sheep are penned. We
|
# The controller writes this sentinel when all GT sheep are penned. We
|
||||||
# poll for it and kill Webots so the run finishes cleanly instead of
|
# poll for it and kill Webots so the run finishes cleanly instead of
|
||||||
# idling for minutes after the task is done.
|
# idling for minutes after the task is done.
|
||||||
DONE_FILE="$ROOT/training/dagger/.DONE"
|
DONE_FILE="$ROOT/training/.run_done"
|
||||||
mkdir -p "$(dirname "$DONE_FILE")"
|
mkdir -p "$(dirname "$DONE_FILE")"
|
||||||
rm -f "$DONE_FILE"
|
rm -f "$DONE_FILE"
|
||||||
|
|
||||||
|
|||||||
+35
-54
@@ -1,21 +1,16 @@
|
|||||||
# Training pipeline
|
# Training pipeline
|
||||||
|
|
||||||
Behavior cloning of analytic herding teachers into a neural-network
|
Two stages, strictly sequential:
|
||||||
policy that runs under LiDAR perception in Webots.
|
|
||||||
|
|
||||||
```
|
```
|
||||||
sim demos (active-scan teacher on tracker output, K=4 frame stack)
|
sim demos (Strömbom on tracker output, K=4 frame stack)
|
||||||
│
|
│
|
||||||
▼
|
▼
|
||||||
bc_pretrain.py ──► runs/bc (BC baseline)
|
bc_pretrain.py ──► runs/bc (Strömbom-imitated MLP)
|
||||||
│
|
│
|
||||||
▼ KL-regularised PPO fine-tune (training/train_ppo.py)
|
▼ KL-regularised PPO fine-tune
|
||||||
│
|
│
|
||||||
runs/rl (deployed `rl` mode)
|
runs/rl (deployed `rl` mode — beats BC and Strömbom)
|
||||||
|
|
||||||
# optional branch — kept for reference, not deployed:
|
|
||||||
runs/bc_dagger (Webots-grounded DAgger refinement, useful if a
|
|
||||||
modified world breaks sim-to-real transfer)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Files
|
## Files
|
||||||
@@ -23,10 +18,9 @@ runs/bc_dagger (Webots-grounded DAgger refinement, useful if a
|
|||||||
```
|
```
|
||||||
herding_env.py — Gymnasium env (LiDAR raycast + tracker by default)
|
herding_env.py — Gymnasium env (LiDAR raycast + tracker by default)
|
||||||
bc_pretrain.py — MSE + cosine BC of (obs, action) demos into MlpPolicy
|
bc_pretrain.py — MSE + cosine BC of (obs, action) demos into MlpPolicy
|
||||||
eval.py — analytic teachers + BC policies, full n=1..10 grid
|
train_ppo.py — KL-regularised PPO fine-tune of a BC checkpoint
|
||||||
parity_test.py — shape / determinism / baseline smoke test
|
eval.py — multi-seed analytic / learned policy comparison
|
||||||
runs/ — checkpoints (most are .gitignored; the deployed
|
runs/ — checkpoints (whitelisted entries in top-level .gitignore)
|
||||||
ones are whitelisted in the top-level .gitignore)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
@@ -39,75 +33,62 @@ CPU is the default and recommended device — SB3 PPO with an MLP policy
|
|||||||
of this size runs faster on CPU than GPU because the bottleneck is
|
of this size runs faster on CPU than GPU because the bottleneck is
|
||||||
rollout collection, not gradient compute.
|
rollout collection, not gradient compute.
|
||||||
|
|
||||||
## The BC pipeline
|
## End-to-end pipeline
|
||||||
|
|
||||||
```
|
```bash
|
||||||
# 1. Sim demos with the active-scan + Strömbom teacher under LiDAR
|
# 1. Sim demos with the active-scan + Strömbom teacher under LiDAR
|
||||||
# perception. K=4 frame stack so the MLP has temporal context.
|
# perception. K=4 frame stack so the MLP has temporal context.
|
||||||
python -m tools.collect_demos --teacher strombom \
|
python -m tools.collect_demos --teacher strombom \
|
||||||
--out demos.npz --seeds-per-n 15 --subsample 3 --frame-stack 4
|
--out training/demos.npz --seeds-per-n 15 --subsample 3 --frame-stack 4
|
||||||
|
|
||||||
# 2. Behavior-clone.
|
# 2. Behaviour-clone.
|
||||||
python -m training.bc_pretrain --demos demos.npz \
|
python -m training.bc_pretrain --demos training/demos.npz \
|
||||||
--out runs/bc --epochs 60 --net-arch 512,512
|
--out training/runs/bc --epochs 60 --net-arch 512,512
|
||||||
|
|
||||||
# 3. Evaluate.
|
# 3. KL-regularised PPO fine-tune of bc.
|
||||||
python -m training.eval --policy runs/bc \
|
python -m training.train_ppo \
|
||||||
--max-flock 10 --max-steps 8000 --n-seeds 5
|
--bc training/runs/bc --out training/runs/rl \
|
||||||
|
--total-timesteps 1000000
|
||||||
|
|
||||||
|
# 4. Multi-seed eval (env-side, fast).
|
||||||
|
python -m training.eval --policy training/runs/rl \
|
||||||
|
--max-flock 10 --max-steps 15000 --n-seeds 10
|
||||||
```
|
```
|
||||||
|
|
||||||
`bc_pretrain.py` saves the **best-val_cos** snapshot, not the final
|
`bc_pretrain.py` saves the **best-val_cos** snapshot, not the final
|
||||||
epoch — multi-modal teachers make training noisy and the last epoch is
|
epoch — multi-modal teachers make training noisy and the last epoch is
|
||||||
often worse than an earlier one.
|
often worse than an earlier one.
|
||||||
|
|
||||||
## DAgger from Webots
|
`train_ppo.py` loads BC weights into both a trainable policy and a
|
||||||
|
frozen reference, pins `log_std` to a small value, and adds `β · KL(π‖π_ref)` to
|
||||||
Sim-only BC plateaus because the env's 2D raycast can't reproduce all
|
the loss so the policy can only move within a trust region around BC.
|
||||||
the false-positive clusters Webots generates from real geometry. The
|
See the file header for hyperparameter rationale.
|
||||||
fix is to collect (obs, teacher_action) pairs from inside Webots:
|
|
||||||
|
|
||||||
```
|
|
||||||
# Headless DAgger collection: 5 flock sizes × 3 runs each.
|
|
||||||
tools/auto_dagger.sh 3 60
|
|
||||||
|
|
||||||
# Merge with the sim baseline + retrain.
|
|
||||||
python -m tools.dagger_merge_train --out runs/bc_dagger
|
|
||||||
```
|
|
||||||
|
|
||||||
Iterate by re-running collection with the new student in the driver's
|
|
||||||
seat:
|
|
||||||
|
|
||||||
```
|
|
||||||
HERDING_POLICY_DIR=$PWD/training/runs/bc_dagger \
|
|
||||||
HERDING_DAGGER_DRIVER=student \
|
|
||||||
tools/auto_dagger.sh 3 60
|
|
||||||
python -m tools.dagger_merge_train --out runs/bc_dagger
|
|
||||||
```
|
|
||||||
|
|
||||||
## Available analytic teachers
|
## Available analytic teachers
|
||||||
|
|
||||||
| Name | What it does | Notes |
|
| Name | What it does | Notes |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| `strombom` | Canonical Strömbom — collect when flock is scattered, drive CoM otherwise | Default; works well for n=1–10 under tight cohesion |
|
| `strombom` | Strömbom 2014 — collect when flock is scattered, drive CoM otherwise | Default; works for n=1–10 under tight cohesion |
|
||||||
| `sequential` | Pick the sheep closest to the pen and drive only it | Alternative; needs loose-cohesion regime |
|
| `sequential` | Pick the sheep closest to the pen and drive only it | Alternative; needs loose-cohesion regime |
|
||||||
|
|
||||||
Both are wrapped at demo-collection time in
|
Both are wrapped at demo-collection time in
|
||||||
`herding/active_scan.py:ActiveScanTeacher`, which adds an opening
|
`herding/control/active_scan.py:ActiveScanTeacher`, which adds an
|
||||||
in-place rotation, walk-to-centre when the LiDAR sees nothing, and
|
opening in-place rotation, walk-to-centre when the LiDAR sees
|
||||||
near-sheep speed modulation (the same modulation `herding/control.py`
|
nothing, and near-sheep speed modulation (same modulation
|
||||||
applies to every dog mode at inference).
|
`herding/control/modulation.py` applies to every dog mode at
|
||||||
|
inference).
|
||||||
|
|
||||||
## Evaluating analytic teachers directly
|
## Evaluating analytic teachers directly
|
||||||
|
|
||||||
```
|
```
|
||||||
python -m training.eval --policy strombom --max-flock 10 --max-steps 8000 --n-seeds 5
|
python -m training.eval --policy strombom --max-flock 10 --max-steps 15000 --n-seeds 10
|
||||||
python -m training.eval --policy sequential --max-flock 10 --max-steps 8000 --n-seeds 5
|
python -m training.eval --policy sequential --max-flock 10 --max-steps 15000 --n-seeds 10
|
||||||
```
|
```
|
||||||
|
|
||||||
## Webots inference
|
## Webots inference
|
||||||
|
|
||||||
```
|
```
|
||||||
tools/run_webots.sh 10 rl
|
tools/run_webots.sh 10 bc # or rl, strombom, sequential
|
||||||
```
|
```
|
||||||
|
|
||||||
The dog controller loads `runs/bc` for `bc` mode and `runs/rl` for
|
The dog controller loads `runs/bc` for `bc` mode and `runs/rl` for
|
||||||
|
|||||||
+3
-3
@@ -25,9 +25,9 @@ if _PROJECT_ROOT not in sys.path:
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from herding.geometry import MAX_SHEEP, PEN_ENTRY
|
from herding.world.geometry import MAX_SHEEP, PEN_ENTRY
|
||||||
from herding.sequential import compute_action as sequential_action
|
from herding.control.sequential import compute_action as sequential_action
|
||||||
from herding.strombom import compute_action as strombom_action
|
from herding.control.strombom import compute_action as strombom_action
|
||||||
from training.herding_env import HerdingEnv
|
from training.herding_env import HerdingEnv
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -56,24 +56,24 @@ _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
|
|||||||
if _PROJECT_ROOT not in sys.path:
|
if _PROJECT_ROOT not in sys.path:
|
||||||
sys.path.insert(0, _PROJECT_ROOT)
|
sys.path.insert(0, _PROJECT_ROOT)
|
||||||
|
|
||||||
from herding.diffdrive import (
|
from herding.world.diffdrive import (
|
||||||
heading_speed_to_wheels, kinematics_step, velocity_to_wheels,
|
heading_speed_to_wheels, kinematics_step, velocity_to_wheels,
|
||||||
)
|
)
|
||||||
from herding.flocking_sim import (
|
from herding.world.flocking_sim import (
|
||||||
FLEE_SPEED, MAX_SPEED, WANDER_SPEED, compute_heading_speed,
|
FLEE_SPEED, MAX_SPEED, WANDER_SPEED, compute_heading_speed,
|
||||||
)
|
)
|
||||||
from herding.geometry import (
|
from herding.world.geometry import (
|
||||||
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_BASE,
|
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_BASE,
|
||||||
DOG_WHEEL_RADIUS, FIELD_X, FIELD_Y, GATE_X, MAX_SHEEP,
|
DOG_WHEEL_RADIUS, FIELD_X, FIELD_Y, GATE_X, MAX_SHEEP,
|
||||||
PEN_ENTRY, PEN_X, PEN_Y,
|
PEN_ENTRY, PEN_X, PEN_Y,
|
||||||
SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS,
|
SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS,
|
||||||
WEBOTS_DT, is_penned_position,
|
WEBOTS_DT, is_penned_position,
|
||||||
)
|
)
|
||||||
from herding.lidar_perception import detections_from_scan
|
from herding.perception.lidar_perception import detections_from_scan
|
||||||
from herding.lidar_sim import simulate_scan
|
from herding.perception.lidar_sim import simulate_scan
|
||||||
from herding.obs import OBS_DIM, build_obs
|
from herding.obs import OBS_DIM, build_obs
|
||||||
from herding.sheep_tracker import SheepTracker
|
from herding.perception.sheep_tracker import SheepTracker
|
||||||
from herding.strombom import compute_action as strombom_action
|
from herding.control.strombom import compute_action as strombom_action
|
||||||
|
|
||||||
|
|
||||||
class HerdingEnv(gym.Env):
|
class HerdingEnv(gym.Env):
|
||||||
|
|||||||
Reference in New Issue
Block a user