Webots sim-to-real fixes, DAgger pipeline, 360° proto variant
This session covered the full Webots delivery stack: it found and
fixed a cluster of bugs blocking the BC/RL transfer, then explored
training-side mitigations for the residual perception gap.
Bug fixes:
- Makefile FP_RATE default 2.0 → 0.0: BC demos used fp_rate=0 but RL
fine-tune defaulted to fp_rate=2, so fine-tuning observations no longer
matched the distribution the BC policy was trained on, stalling PPO at
0% success across 1.46M+ steps.
- controllers/{shepherd_dog,sheep}/runtime.ini: Webots was launching
controllers under system python3 (no numpy) and they were crashing
silently. Pinned to the conda tir env.
- herding/config.py HERDING_WEBOTS preset: pen_latch_depth 0.5 → 2.0,
max_new_tracks_per_step 3 → 1, static_reject 0.8 → 1.2. Stops phantom
FPs near the gate from latching as permanently-penned tracks.
- herding/perception/sheep_tracker.py: penned tracks now decay at
forget_steps × 8 instead of living forever. Adds a min_freshness
filter to get_positions for deploy-time use.
Training/eval matches deployment:
- training/bc/collect.py: --dagger-policy flag for DAgger rollouts
(policy drives, teacher labels) + --use-webots-preset for matched
140° tracker + DR config.
- controllers/shepherd_dog/shepherd_dog.py: scan-fallback (0, 0.6) when
BC/RL sees empty sheep_positions — recovers from FOV gaps.
Tooling:
- tools/dagger_round.sh: one-shot DAgger round (collect + concat + bc).
- tools/webots_sweep_gt.sh: full sweep with HERDING_USE_GT=1 for the
perception-gap diagnosis matrix.
- protos/ShepherdDog360.proto: 360° FOV variant for the FOV-ablation
comparison. Canonical proto stays at 140° per project spec.
Artifacts: v1 BC/RL policies for all 4 (drive × world) combos trained
in clean gym (success: diff/field 90-100%, diff/round 58%, mec/field
60-100%, mec/round 50-100%). DAgger r1/r2 BCs for diff/field show
12%→38% progression on the gym HERDING_WEBOTS proxy but did not close
the gap to actual Webots LiDAR (0/5 throughout). Next: LSTM policy or
learned tracker per the project-state memory.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+86
-23
@@ -21,22 +21,9 @@ from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Early CLI parse so we can configure geometry before heavy imports.
|
||||
# (argparse is used again below for the full parse; this is a lightweight
|
||||
# pre-pass that only reads --world.)
|
||||
_pre_argv = [a for a in os.sys.argv[1:]]
|
||||
_pre_world = None
|
||||
for i, a in enumerate(_pre_argv):
|
||||
if a == "--world" and i + 1 < len(_pre_argv):
|
||||
_pre_world = _pre_argv[i + 1]
|
||||
break
|
||||
if a.startswith("--world="):
|
||||
_pre_world = a.split("=", 1)[1]
|
||||
break
|
||||
if _pre_world is not None:
|
||||
from herding.world.geometry import configure as _geo_configure
|
||||
_geo_configure(_pre_world)
|
||||
os.environ["HERDING_WORLD"] = _pre_world
|
||||
# Configure field geometry before other herding imports read it at module level.
|
||||
from herding.world.geometry import configure_from_args as _configure_from_args
|
||||
_configure_from_args()
|
||||
|
||||
from herding.control.active_scan import ActiveScanTeacher
|
||||
from herding.world.geometry import PEN_ENTRY, FIELD_SHAPE
|
||||
@@ -83,10 +70,17 @@ def _call_teacher(fn, dog_xy, dog_heading, sheep_positions, pen_target,
|
||||
|
||||
def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int,
|
||||
teacher_fn, frame_stack: int = 1, privileged: bool = False,
|
||||
drive_mode: str = "differential"):
|
||||
drive_mode: str = "differential", herding_cfg=None,
|
||||
actor_policy=None):
|
||||
"""Collect (obs, teacher_action) pairs from one episode.
|
||||
|
||||
``actor_policy`` (DAgger mode): a callable ``policy(obs) -> action`` that
|
||||
drives the env. The teacher still labels each visited state. If ``None``
|
||||
(default), the teacher drives.
|
||||
"""
|
||||
env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
|
||||
difficulty=1.0, seed=seed, frame_stack=frame_stack,
|
||||
drive_mode=drive_mode)
|
||||
drive_mode=drive_mode, herding_cfg=herding_cfg)
|
||||
obs, _ = env.reset(seed=seed)
|
||||
obs_list, action_list = [], []
|
||||
scan_teacher = ActiveScanTeacher(teacher_fn)
|
||||
@@ -108,13 +102,16 @@ def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int,
|
||||
)
|
||||
vx, vy, omega, _mode = result
|
||||
if drive_mode == "mecanum":
|
||||
action = np.array([vx, vy, omega], dtype=np.float32)
|
||||
teacher_action = np.array([vx, vy, omega], dtype=np.float32)
|
||||
else:
|
||||
action = np.array([vx, vy], dtype=np.float32)
|
||||
teacher_action = np.array([vx, vy], dtype=np.float32)
|
||||
if step % subsample == 0:
|
||||
obs_list.append(obs.copy())
|
||||
action_list.append(action.copy())
|
||||
obs, _r, term, trunc, _info = env.step(action)
|
||||
action_list.append(teacher_action.copy())
|
||||
# In DAgger mode the policy drives; otherwise the teacher does.
|
||||
step_action = (actor_policy(obs) if actor_policy is not None
|
||||
else teacher_action)
|
||||
obs, _r, term, trunc, _info = env.step(step_action)
|
||||
if term or trunc:
|
||||
break
|
||||
success = bool(env.sheep_penned.all())
|
||||
@@ -153,6 +150,24 @@ def main():
|
||||
help="World shape. If not set, uses HERDING_WORLD "
|
||||
"env var or defaults to 'field'. Must be set "
|
||||
"before geometry is imported.")
|
||||
# Domain randomisation — applied to the gym env during collection so
|
||||
# the teacher demonstrates under the same noise the policy will face.
|
||||
parser.add_argument("--fp-rate", type=float, default=0.0,
|
||||
help="Mean false-positive detections injected per "
|
||||
"step (Poisson λ). 0 = clean sim (default).")
|
||||
parser.add_argument("--action-smooth", type=float, default=0.0,
|
||||
help="EMA coefficient on dog actions (0 = none). "
|
||||
"Set to 0.55 to match the Webots controller.")
|
||||
parser.add_argument("--wheel-slip-std", type=float, default=0.0,
|
||||
help="Gaussian noise (rad/s) on wheel speeds for "
|
||||
"mecanum dynamics domain randomisation.")
|
||||
parser.add_argument("--dagger-policy", default=None,
|
||||
help="Path to a BC/PPO policy directory. When set, "
|
||||
"the policy drives the env (DAgger) while the "
|
||||
"teacher labels every visited state.")
|
||||
parser.add_argument("--use-webots-preset", action="store_true",
|
||||
help="Use HERDING_WEBOTS preset (140° FOV + tight "
|
||||
"tracker). Match this to deployment for DAgger.")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate --world matches geometry (already configured by the
|
||||
@@ -161,6 +176,53 @@ def main():
|
||||
print(f"[demos] WARNING: --world={args.world} but geometry is "
|
||||
f"'{FIELD_SHAPE}'. This should not happen — file a bug.")
|
||||
|
||||
from herding.config import HerdingConfig, HERDING_WEBOTS, DomainRandomConfig, RobotConfig
|
||||
if args.use_webots_preset:
|
||||
herding_cfg = HERDING_WEBOTS.replace(
|
||||
domain_random=DomainRandomConfig(
|
||||
fp_rate=args.fp_rate,
|
||||
wheel_slip_std=args.wheel_slip_std,
|
||||
),
|
||||
robot=RobotConfig(action_smooth=args.action_smooth),
|
||||
)
|
||||
print(f"[demos] HERDING_WEBOTS preset + DR: fp_rate={args.fp_rate} "
|
||||
f"action_smooth={args.action_smooth} wheel_slip_std={args.wheel_slip_std}")
|
||||
else:
|
||||
herding_cfg = None
|
||||
if args.fp_rate > 0.0 or args.action_smooth > 0.0 or args.wheel_slip_std > 0.0:
|
||||
herding_cfg = HerdingConfig(
|
||||
domain_random=DomainRandomConfig(
|
||||
fp_rate=args.fp_rate,
|
||||
wheel_slip_std=args.wheel_slip_std,
|
||||
),
|
||||
robot=RobotConfig(action_smooth=args.action_smooth),
|
||||
)
|
||||
print(f"[demos] domain-random: fp_rate={args.fp_rate} "
|
||||
f"action_smooth={args.action_smooth} "
|
||||
f"wheel_slip_std={args.wheel_slip_std}")
|
||||
|
||||
actor_policy = None
|
||||
if args.dagger_policy is not None:
|
||||
# DAgger: failures are the most valuable data (off-policy states
|
||||
# where the student needs teacher correction). Always keep them.
|
||||
args.keep_failures = True
|
||||
from stable_baselines3 import PPO
|
||||
from pathlib import Path as _P
|
||||
run = _P(args.dagger_policy)
|
||||
for name in ("policy.zip", "final.zip"):
|
||||
if (run / name).exists():
|
||||
zip_path = run / name
|
||||
break
|
||||
else:
|
||||
raise FileNotFoundError(
|
||||
f"No policy found in {run} (tried policy.zip, final.zip)")
|
||||
_model = PPO.load(str(zip_path), device="auto")
|
||||
print(f"[demos] DAgger mode: actor = {zip_path}")
|
||||
def actor_policy(obs):
|
||||
obs_b = np.asarray(obs, dtype=np.float32).reshape(1, -1)
|
||||
a, _ = _model.predict(obs_b, deterministic=True)
|
||||
return a[0]
|
||||
|
||||
teacher_fn = TEACHERS[args.teacher]
|
||||
print(f"[demos] teacher: {args.teacher} world: {FIELD_SHAPE}")
|
||||
|
||||
@@ -177,7 +239,8 @@ def main():
|
||||
obs, actions, success, total_steps = collect_one(
|
||||
n, seed, args.max_steps, args.subsample, teacher_fn,
|
||||
frame_stack=args.frame_stack, privileged=args.privileged,
|
||||
drive_mode=args.drive_mode,
|
||||
drive_mode=args.drive_mode, herding_cfg=herding_cfg,
|
||||
actor_policy=actor_policy,
|
||||
)
|
||||
n_total += 1
|
||||
if success:
|
||||
|
||||
Reference in New Issue
Block a user