Checkpoint 6

2026-05-11 10:35:48 +01:00
parent b457155538
commit fce0e0c786
27 changed files with 194 additions and 704 deletions
@@ -12,7 +12,7 @@ gate into an external pen. The dog has three deployable modes:
 | `rl` | KL-regularised PPO fine-tune of `bc` | Reward-driven refinement |
 `sequential` (single-target pin-and-push) is kept as an alternative
-analytic baseline. `dagger` is a data-collection mode, not deployment.
+analytic baseline.
 ## Perception
@@ -28,13 +28,13 @@ control step:
   (`herding/sheep_tracker.py`).
 **LiDAR validation** (intermediate-goal item v from `docs/project.md`):
-run the dog controller in `HERDING_MODE=diag` mode to capture 80
+during development a diagnostic-dump controller captured 80 real
-real Webots scans plus the ground-truth sheep positions in
+Webots scans plus the ground-truth sheep positions. Comparing
-`training/dagger/diag_<ts>.npz`. Comparing detections against GT in
+detections against GT showed clustered centroids match GT positions
-that file showed clustered centroids match GT positions within 0.15 m
+within 0.15 m after the +SHEEP_RADIUS surface-to-centre correction —
-after the +SHEEP_RADIUS surface-to-centre correction — i.e. the
+i.e. the LiDAR pipeline produces correct sheep-position estimates
-LiDAR pipeline produces correct sheep-position estimates from the
+from the real Webots scan, validating the sensor for the herding
-real Webots scan, validating the sensor for the herding task.
+task.
 The tracker outputs a `{name: (x, y)}` dict shaped exactly like the
 prior receiver-based one, so Strömbom, Sequential, and the BC obs
@@ -53,7 +53,7 @@ Privileged ground-truth perception is available for ablation —
 pip install -r training/requirements.txt
 # 2. Smoke test
-python -m training.parity_test
+python -m tests.parity_test
 # 3. Reproduce the BC policy (~10 min on CPU: ~5 min demos + ~3 min BC)
 python -m tools.collect_demos --teacher strombom \
@@ -61,21 +61,17 @@ python -m tools.collect_demos --teacher strombom \
 python -m training.bc_pretrain --demos training/demos.npz \
    --out training/runs/bc --epochs 60 --net-arch 512,512
-# 4. Optional: DAgger from inside Webots if sim-trained doesn't transfer
+# 4. KL-PPO fine-tune of the BC policy (~30 min on CPU, 1 M steps)
 tools/auto_dagger.sh 3 60
 python -m tools.dagger_merge_train --out training/runs/bc_dagger
 # 5. Evaluate (env)
 python -m training.eval --policy training/runs/bc \
    --max-flock 10 --max-steps 8000 --n-seeds 5
 # 6. Optional RL fine-tune of the BC policy (~40 min on CPU, 1 M steps)
 python -m training.train_ppo \
    --bc training/runs/bc \
    --out training/runs/rl \
    --total-timesteps 1000000
-# 7. Run in Webots
+# 5. Evaluate (env)
 python -m training.eval --policy training/runs/rl \
    --max-flock 10 --max-steps 15000 --n-seeds 10
 # 6. Run in Webots
 tools/run_webots.sh 10 bc          # behaviour-cloned MLP
 tools/run_webots.sh 10 rl          # KL-PPO fine-tune
 tools/run_webots.sh 10 strombom    # analytic baseline
@@ -84,22 +80,25 @@ tools/run_webots.sh 10 strombom    # analytic baseline
 ## Layout
 ```
-herding/                  — single source of truth (env + Webots both import)
+herding/                  — perception / control / world primitives
  geometry.py             — field/pen constants, robot specs
  flocking_sim.py         — Reynolds-style sheep dynamics
  diffdrive.py            — differential-drive kinematics
  control.py              — shared near-sheep speed-modulation helper
  obs.py                  — 32-D order-invariant observation builder
-  strombom.py             — canonical CoM-drive teacher
+  world/                  — environment-side physics & geometry
-  sequential.py           — single-target "pin-and-push" teacher
+    geometry.py             field/pen constants, robot specs
-  active_scan.py          — wraps a base teacher with opening rotation +
+    diffdrive.py            differential-drive kinematics
-                            walk-to-centre + speed modulation
+    flocking_sim.py         Reynolds + Strömbom 2014 sheep dynamics
-  lidar_sim.py            — fast 2D raycast for the env (sheep + walls + posts)
+  perception/             — LiDAR → tracked-sheep pipeline
-  lidar_perception.py     — scan → world-frame cluster centroids + filters
+    lidar_sim.py            fast 2D raycast for the env
-  sheep_tracker.py        — multi-target NN tracker with FOV memory
+    lidar_perception.py     scan → world-frame cluster centroids + filters
    sheep_tracker.py        multi-target NN tracker with FOV memory
  control/                — every dog mode's action source
    strombom.py             canonical CoM collect/drive heuristic
    sequential.py           single-target "pin-and-push" alternative
    active_scan.py          wraps a base teacher with opening rotation +
                            walk-to-centre fallback
    modulation.py           shared near-sheep speed-modulation helper
 controllers/
-  sheep/sheep.py          — Webots sheep controller (uses herding.flocking_sim)
+  sheep/sheep.py          — Webots sheep controller (uses herding.world.flocking_sim)
  shepherd_dog/
    shepherd_dog.py       — Webots dog controller, mode-switched
    policy_loader.py      — lazy SB3 policy loader (auto-detects frame stack)
@@ -107,16 +106,17 @@ controllers/
 training/
  herding_env.py          — Gymnasium env (LiDAR + tracker by default)
  bc_pretrain.py          — supervised BC of (obs, action) demos into MLP
-  eval.py                 — analytic + BC policy comparison harness
+  train_ppo.py            — KL-regularised PPO fine-tune of BC
-  parity_test.py          — shape / determinism smoke test
+  eval.py                 — analytic + learned policy comparison harness
  runs/                   — checkpoints (whitelisted in .gitignore)
  requirements.txt
 tests/
  parity_test.py          — shape / determinism / baseline smoke test
 tools/
  collect_demos.py        — sim demos via the active-scan teacher
  dagger_merge_train.py   — merge Webots-collected DAgger demos and retrain
  run_webots.sh           — launch Webots with N sheep + chosen mode
  auto_dagger.sh          — headless DAgger collection across many runs
 worlds/
  field.wbt               — main world (3 m gate, external pen)
@@ -127,8 +127,8 @@ docs/project.md           — original project goals
 ## Shared low-level control
-Every dog mode (RL, Strömbom, Sequential, the DAgger teacher) routes
+Every dog mode (Strömbom, Sequential, BC, RL) routes its action
-its action through `herding/control.py:modulate_speed_near_sheep`,
+through `herding/control/modulation.py:modulate_speed_near_sheep`,
 which scales action magnitude down when within ~2.5 m of the nearest
 tracked sheep. This stops the dog from charging in at full speed and
 scattering the flock. Direction (intent) is preserved.
@@ -11,14 +11,14 @@ _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
-from herding.flocking_sim import (  # noqa: F401
+from herding.world.flocking_sim import (  # noqa: F401
    MAX_SPEED, FLEE_SPEED, WANDER_SPEED,
    WALL_MARGIN, WALL_HARD_MARGIN, WALL_HARD_GAIN,
    FLEE_DIST, SEPARATION_DIST, COHESION_DIST,
    PEN_MARGIN,
    compute_heading_speed,
 )
-from herding.geometry import (  # noqa: F401
+from herding.world.geometry import (  # noqa: F401
    FIELD_X, FIELD_Y, PEN_X, PEN_Y,
    in_pen,
 )
@@ -24,9 +24,9 @@ if _PROJECT_ROOT not in sys.path:
 from controller import Supervisor
-from herding.diffdrive import heading_speed_to_wheels
+from herding.world.diffdrive import heading_speed_to_wheels
-from herding.flocking_sim import MAX_SPEED, compute_heading_speed
+from herding.world.flocking_sim import MAX_SPEED, compute_heading_speed
-from herding.geometry import (
+from herding.world.geometry import (
    SHEEP_MAX_WHEEL_OMEGA,
    is_penned_position,
 )
@@ -4,52 +4,39 @@ Mode is selected by ``HERDING_MODE`` (env var, or via the
 ``herding_runtime.cfg`` file the launcher writes since Webots strips
 env vars on some setups):
-    strombom    → canonical Strömbom collect/drive heuristic.
+    strombom    → canonical Strömbom (2014) collect/drive heuristic
-    sequential  → single-target "pin and push" — drives the sheep
+                  wrapped in ActiveScanTeacher (opening rotation +
-                  closest to the pen.
+                  walk-to-centre when the tracker briefly empties).
-    bc          → behaviour-cloned MLP, trained on Strömbom demos via
+    sequential  → single-target "pin-and-push", same wrapper.
-                  sim. Default policy directory: training/runs/bc.
+    bc          → behaviour-cloned MLP, trained on Strömbom demos.
-    rl          → KL-regularised PPO fine-tune of the BC policy. Same
+                  Default policy: training/runs/bc/policy.zip.
-                  obs/action space as bc; refines time-to-pen via
+    rl          → KL-regularised PPO fine-tune of bc. Same obs/action
-                  environment reward while staying anchored to bc.
+                  space as bc; refines time-to-pen via reward while
-                  Default policy directory: training/runs/rl.
+                  staying anchored to bc.
-    dagger      → DAgger data collection. Reads sheep ground-truth
+                  Default policy: training/runs/rl/policy.zip.
                  via the receiver, computes the active-scan teacher's
                  recommended action at every step, drives with either
                  the teacher (HERDING_DAGGER_DRIVER=teacher, default)
                  or the loaded student (=student), and logs each
                  (lidar_stacked_obs, teacher_action) pair. On exit
                  dumps to ``training/dagger/dagger_<ts>.npz`` for
                  ``tools.dagger_merge_train`` to consume.
 Sheep perception
 ----------------
-The dog now perceives sheep through its **front-mounted 140° LiDAR**
+The dog perceives sheep through its **front-mounted 140° LiDAR**
-(``protos/ShepherdDog.proto``: 180 rays, 12 m max range). Each step
+(``protos/ShepherdDog.proto``: 180 rays, 12 m max range). Each step
 the controller:
    1. Reads ``lidar.getRangeImage()``.
-    2. Runs ``herding.lidar_perception.detections_from_scan`` to cluster
+    2. Runs ``herding.perception.lidar_perception.detections_from_scan``
-       returns into world-frame ``(x, y)`` sheep estimates.
+       to cluster returns into world-frame ``(x, y)`` sheep estimates.
-    3. Folds those into a ``herding.sheep_tracker.SheepTracker`` which
+    3. Folds those into a ``SheepTracker`` which maintains last-seen
-       maintains last-seen positions for sheep currently out of the
+       positions for sheep currently out of FOV and latches "penned"
-       FOV and latches "penned" once a track disappears near the gate.
+       once a track crosses the gate plane south.
-The output of step 3 is a ``{name: (x, y)}`` dict shaped exactly like
+Sheep ``emitter`` messages are read **for diagnostic logging only**
-the receiver-based one we used to consume — so Strömbom, Sequential
+(GT_penned counter + auto-finish sentinel); they are never used to
-and the BC obs builder run unchanged. The sheep→dog Emitter/Receiver
+drive the policy. Perception for control comes entirely from LiDAR.
 link is still up (kept passively for compatibility) but its messages
 are *not* used for control.
-All modes share the same low-level differential-drive controller
+Auto-finish
-(``herding.diffdrive.velocity_to_wheels`` with cos(err)-clamped forward
+-----------
-speed), so switching modes does not retune actuation.
+When the dog observes (via GT, read off the receiver) that all sheep
-
+are penned, it writes ``training/.run_done`` and the launcher
-A safety supervisor enforces the "dog stays out of the pen" invariant:
+(``tools/run_webots.sh``) detects it and closes Webots. This keeps
-if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is
+batch evaluation runs bounded.
 overridden with a north-driving correction. RL fallback: if the policy
 zip can't be loaded (SB3 missing, file missing), the controller drops
 to strombom mode automatically.
 """
 import math
@@ -62,26 +49,27 @@ _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
 import numpy as np
 from controller import Robot
-from herding.active_scan import ActiveScanTeacher
+from herding.control.active_scan import ActiveScanTeacher
-from herding.control import modulate_speed_near_sheep
+from herding.control.modulation import modulate_speed_near_sheep
-from herding.diffdrive import velocity_to_wheels
+from herding.control.sequential import compute_action as sequential_action
-from herding.geometry import (
+from herding.control.strombom import compute_action as strombom_action
 from herding.obs import build_obs
 from herding.perception.lidar_perception import detections_from_scan
 from herding.perception.sheep_tracker import SheepTracker
 from herding.world.diffdrive import velocity_to_wheels
 from herding.world.geometry import (
    DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA,
    DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS,
    PEN_ENTRY, is_penned_position,
 )
 from herding.lidar_perception import detections_from_scan
 from herding.obs import OBS_DIM, build_obs
 from herding.sequential import compute_action_debug as sequential_action_debug
 from herding.sheep_tracker import SheepTracker
 from herding.strombom import compute_action as strombom_action
 from herding.strombom import compute_action_debug as strombom_action_debug
 # ---------------------------------------------------------------------------
-# Mode selection
+# Mode + policy resolution
 # ---------------------------------------------------------------------------
 def _load_runtime_config():
@@ -135,60 +123,41 @@ def _resolve_policy_dir(mode: str) -> str:
    mode_default = {
        "bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
        "rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl"),
        "dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
    }
    primary = mode_default.get(mode, mode_default["bc"])
    if os.path.isdir(primary):
        return primary
    # Fall back to BC if the requested checkpoint isn't there yet
    # (e.g., user asked for `rl` before training the fine-tune).
    fallback = mode_default["bc"]
    if os.path.isdir(fallback):
        return fallback
    return env_dir or primary
-_VALID_MODES = ("bc", "rl", "strombom", "sequential", "dagger", "diag")
+_VALID_MODES = ("bc", "rl", "strombom", "sequential")
 # Back-compat: an old config saying HERDING_MODE=rl meant "the BC policy".
 # We now use `rl` strictly for the KL-PPO fine-tune. If the rl
 # directory isn't present, _resolve_policy_dir below silently falls
 # back to bc, preserving the old behaviour.
 if MODE not in _VALID_MODES:
    print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
    MODE = "strombom"
 DAGGER_DRIVER = (os.environ.get("HERDING_DAGGER_DRIVER")
                 or _runtime_cfg.get("HERDING_DAGGER_DRIVER")
                 or "teacher").lower()
 if DAGGER_DRIVER not in ("teacher", "student"):
    DAGGER_DRIVER = "teacher"
 POLICY_DIR = _resolve_policy_dir(MODE)
 policy_handle = None
-if MODE in ("bc", "rl", "dagger"):
+if MODE in ("bc", "rl"):
    print(f"[dog] resolved POLICY_DIR={POLICY_DIR}  exists={os.path.isdir(POLICY_DIR)}")
    try:
        from policy_loader import load as _load_policy
        policy_handle = _load_policy(POLICY_DIR)
        print(f"[dog] policy loaded from {POLICY_DIR}")
    except Exception as exc:
        if MODE in ("bc", "rl"):
        print(f"[dog] policy load failed ({exc!r}); falling back to strombom.")
        MODE = "strombom"
-        else:
+print(f"[dog] running in mode={MODE}")
            # In dagger mode, no policy is fine if driver=teacher.
            print(f"[dog] policy load failed ({exc!r}); dagger driver forced to teacher.")
            policy_handle = None
 print(f"[dog] running in mode={MODE}"
      + (f" driver={DAGGER_DRIVER}" if MODE == "dagger" else ""))
 # ---------------------------------------------------------------------------
-# Action smoothing + safety supervisor
+# Control parameters
 # ---------------------------------------------------------------------------
-ACTION_SMOOTH = 0.55  # was 0.35; bumped for less frame-to-frame action jitter
+ACTION_SMOOTH = 0.55           # EMA on (vx, vy) — kills frame-to-frame jitter
-prev_action = (0.0, 0.0)
+RUN_DONE_FILE = os.path.join(_PROJECT_ROOT, "training", ".run_done")
 def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
@@ -202,10 +171,6 @@ def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple:
    return (vx, vy)
 # ---------------------------------------------------------------------------
 # Driving
 # ---------------------------------------------------------------------------
 def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float):
    if math.hypot(vx, vy) < 1e-3:
        left_motor.setVelocity(0.0)
@@ -245,12 +210,9 @@ receiver = robot.getDevice("receiver"); receiver.enable(timestep)
 emitter = robot.getDevice("emitter")
 lidar = robot.getDevice("lidar");       lidar.enable(timestep)
 # The receiver channel from sheep is no longer consumed for perception
 # (kept enabled in case any peripheral tooling reads it). Sheep
 # positions come exclusively from the LiDAR + tracker pipeline below.
 tracker = SheepTracker()
-# Cosmetic ear motors — ignored by control logic but keep them animated.
+# Cosmetic ear motors — animated; not used by control.
 left_ear = robot.getDevice("left ear motor")
 right_ear = robot.getDevice("right ear motor")
 left_ear.setPosition(float("inf"))
@@ -266,75 +228,26 @@ EAR_RATE = 8.0
 # Main loop
 # ---------------------------------------------------------------------------
-# Active sheep positions come from the LiDAR-fed tracker each step;
+# Analytic-teacher wrapper (built only when MODE is strombom/sequential,
-# penned_set is the tracker's ``get_penned_set()`` call. We drain the
+# so BC/RL runs never construct one). Each gets the same ActiveScanTeacher treatment:
-# receiver queue without consuming it, so the small backlog of sheep
+# rotate-on-empty, walk-to-centre, near-sheep speed modulation.
-# pings can't grow unbounded.
+analytic_teacher = None
-step_count = 0
+if MODE in ("strombom", "sequential"):
    base_fn = strombom_action if MODE == "strombom" else sequential_action
    analytic_teacher = ActiveScanTeacher(base_fn)
-import atexit
+# GT positions from sheep emitters — used **only** for the auto-finish
-import time
+# sentinel and the GT_penned diagnostic line. Never fed into control.
 import numpy as _np
 # DAgger state ----------------------------------------------------------
 # Logged each step in dagger mode: (stacked_lidar_obs, teacher_action).
 DAGGER_LOG_OBS: list = []
 DAGGER_LOG_ACT: list = []
 # Diagnostic mode buffer (one dict per step).
 DIAG_BUF: list = []
 # Frame stack buffer the controller maintains itself when dagger mode is
 # active — the stacked obs we log must match what the policy sees so the
 # downstream BC consumes (stacked_obs, teacher_action) pairs cleanly.
 _FRAME_STACK = (policy_handle.frame_stack if policy_handle is not None else 4)
 _dagger_buffer: list = []
 # Active-scan teacher operates on GT (read from receiver).
 _dagger_teacher = ActiveScanTeacher(strombom_action) if MODE == "dagger" else None
 # GT positions accumulated from the receiver (sheep emit their xy each step).
 _gt_sheep: dict = {}
 _run_done = False
-
+prev_action = (0.0, 0.0)
-_DAGGER_RUN_TS = int(time.time())  # one file per controller run
+step_count = 0
 _DAGGER_DUMPED = False
 # Sentinel that the auto-collection script polls — empty file written
 # when this controller decides the run is "done" (all sheep penned, by
 # GT). The launcher then kills Webots and moves on without waiting out
 # its timeout. Honoured only in dagger mode.
 _DAGGER_DONE_FILE = os.path.join(_PROJECT_ROOT, "training", "dagger", ".DONE")
 def _dump_dagger_log():
     """Save accumulated (obs, teacher_action) pairs to disk.
     Registered with ``atexit`` and also called periodically from the main
     loop, because Webots may SIGKILL the controller; periodic flushing
     bounds how much data a killed run can lose.
     Idempotent — repeated calls overwrite the same per-run file with the
     latest accumulated buffer. The write goes to a sibling temp file that
     is then renamed over the target, so a kill in the middle of a flush
     cannot truncate the previously written snapshot.
     No-op unless MODE is "dagger" and at least one pair has been logged.
     """
     global _DAGGER_DUMPED
     if MODE != "dagger" or not DAGGER_LOG_OBS:
         return
     out_dir = os.path.join(_PROJECT_ROOT, "training", "dagger")
     os.makedirs(out_dir, exist_ok=True)
     out_path = os.path.join(out_dir, f"dagger_{_DAGGER_RUN_TS}.npz")
     # Suffix is deliberately not ".npz" so a leftover temp file from a
     # killed run is not mistaken for a finished dump.
     tmp_path = out_path + ".tmp"
     obs_arr = _np.stack(DAGGER_LOG_OBS).astype(_np.float32)
     act_arr = _np.stack(DAGGER_LOG_ACT).astype(_np.float32)
     # Pass an open file object: given a path, savez would append ".npz"
     # to the temp name and the rename below would miss it.
     with open(tmp_path, "wb") as fh:
         _np.savez(fh, obs=obs_arr, actions=act_arr)
     # Rename within the same directory; replaces any existing dump.
     os.replace(tmp_path, out_path)
     # Announce only the first successful dump to keep the log quiet.
     if not _DAGGER_DUMPED:
         print(f"[dog dagger] wrote {len(DAGGER_LOG_OBS)} pairs → {out_path}")
         _DAGGER_DUMPED = True
 DAGGER_FLUSH_STEPS = 500
 atexit.register(_dump_dagger_log)
 while robot.step(timestep) != -1:
    step_count += 1
-    # Drain receiver. In every mode we capture GT for the diagnostic
+    # Drain sheep emitter messages → GT (diagnostic only).
    # log line — perception still comes from LiDAR, the GT is read-only.
    while receiver.getQueueLength() > 0:
        msg = receiver.getString()
        receiver.nextPacket()
@@ -350,115 +263,28 @@ while robot.step(timestep) != -1:
    n = compass.getValues()
    dog_heading = math.atan2(n[0], n[1])
-    # ---- LiDAR perception → tracker → sheep_positions dict ----
+    # ---- LiDAR perception → tracker → active sheep positions ----
-    ranges = _np.asarray(lidar.getRangeImage(), dtype=_np.float32)
+    ranges = np.asarray(lidar.getRangeImage(), dtype=np.float32)
    detections = detections_from_scan(ranges, dog_xy[0], dog_xy[1], dog_heading)
    sheep_positions = tracker.update(detections)
    penned_set = tracker.get_penned_set()
    # ---- Diagnostic mode: dump the first DIAG_STEPS scans + GT to disk.
    if MODE == "diag":
        DIAG_STEPS = 80
        if step_count <= DIAG_STEPS:
            DIAG_BUF.append(dict(
                step=step_count,
                ranges=ranges.copy(),
                dog_x=dog_xy[0], dog_y=dog_xy[1], dog_h=dog_heading,
                gt_sheep=dict(_gt_sheep),
                detections=list(detections),
            ))
            if step_count == DIAG_STEPS:
                _diag_path = os.path.join(_PROJECT_ROOT, "training", "dagger",
                                          f"diag_{int(time.time())}.npz")
                os.makedirs(os.path.dirname(_diag_path), exist_ok=True)
                _np.savez(
                    _diag_path,
                    ranges=_np.stack([d["ranges"] for d in DIAG_BUF]),
                    dog_xy=_np.array([[d["dog_x"], d["dog_y"]] for d in DIAG_BUF],
                                     dtype=_np.float32),
                    dog_h=_np.array([d["dog_h"] for d in DIAG_BUF], dtype=_np.float32),
                    # Per-step GT serialised: max-pad to 10 sheep.
                    gt_xy=_np.array([
                        [list(d["gt_sheep"].get(f"sheep{i}", (1e9, 1e9)))
                         for i in range(1, 11)]
                        for d in DIAG_BUF
                    ], dtype=_np.float32),
                    detections=_np.array([
                        len(d["detections"]) for d in DIAG_BUF
                    ], dtype=_np.int32),
                )
                print(f"[dog diag] wrote {DIAG_STEPS} scans → {_diag_path}")
    # Build the single-frame LiDAR obs (matches what the env produces).
    sheep_xy_list = list(sheep_positions.values())
    sheep_penned_list = [False] * len(sheep_xy_list)
    single_obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list)
    # Maintain our own frame stack so logged obs == what policy sees.
    if not _dagger_buffer:
        _dagger_buffer = [single_obs.copy() for _ in range(_FRAME_STACK)]
    else:
        _dagger_buffer.append(single_obs)
        if len(_dagger_buffer) > _FRAME_STACK:
            _dagger_buffer = _dagger_buffer[-_FRAME_STACK:]
    stacked_obs = _np.concatenate(_dagger_buffer, axis=0).astype(_np.float32)
    # ---- Action selection ----
-    if MODE == "diag":
+    if MODE in ("bc", "rl") and policy_handle is not None:
        # Diagnostic mode: rotate in place so the captured scans cover
        # all 360° of view from one position. Target = heading + π →
        # cos(err) clamps forward to ~0, the dog spins.
        _t = dog_heading + math.pi
        vx, vy = math.cos(_t), math.sin(_t)
    elif MODE == "dagger":
        # Teacher: active-scan + Strömbom on GT (active sheep only).
        gt_active = {name: xy for name, xy in _gt_sheep.items()
                     if not is_penned_position(xy[0], xy[1])}
        t_vx, t_vy, _mode_str = _dagger_teacher(
            dog_xy, dog_heading, gt_active, PEN_ENTRY,
        )
        # Student (if a policy is loaded).
        s_vx, s_vy = None, None
        if policy_handle is not None:
            action = policy_handle.predict(stacked_obs)
            s_vx, s_vy = float(action[0]), float(action[1])
        # Drive selection.
        if DAGGER_DRIVER == "student" and policy_handle is not None:
            vx, vy = s_vx, s_vy
        else:
            vx, vy = t_vx, t_vy
        # Always log the teacher action (this is the supervision signal).
        DAGGER_LOG_OBS.append(stacked_obs.copy())
        DAGGER_LOG_ACT.append(_np.array([t_vx, t_vy], dtype=_np.float32))
    elif MODE in ("bc", "rl") and policy_handle is not None:
        # Pass the single-frame obs; the policy_loader maintains its own
        # frame stack internally. Both bc and rl use the same control
        # interface — the only difference is which checkpoint loaded.
        action = policy_handle.predict(single_obs)
        vx, vy = float(action[0]), float(action[1])
-    elif MODE in ("strombom", "sequential"):
+    else:
-        # Wrap the analytic teacher in ActiveScanTeacher so the dog
+        vx, vy, _mode_str = analytic_teacher(
        # rotates / walks-to-centre when the tracker briefly empties,
        # instead of going idle. Without this wrapper, the first 2 s
        # of LiDAR-blind operation kills the run because Strömbom and
        # Sequential both return (0, 0) when there are no positions.
        if "_analytic_teacher" not in globals():
            from herding.sequential import compute_action as sequential_action
            _analytic_teacher = ActiveScanTeacher(
                strombom_action if MODE == "strombom" else sequential_action
            )
        vx, vy, _mode_str = _analytic_teacher(
            dog_xy, dog_heading, sheep_positions, PEN_ENTRY,
        )
-    # Shared post-process: speed modulation near sheep. Applies to bc,
+    # Near-sheep speed modulation (shared by every mode).
    # rl, strombom, sequential — every mode where the action source is
    # nominally unit-magnitude. In dagger mode the active-scan teacher
    # has already modulated, and the diag mode action is hand-built for
    # rotation; both skip.
    if MODE not in ("dagger", "diag"):
    vx, vy = modulate_speed_near_sheep(vx, vy, dog_xy, sheep_positions)
-    # EMA smoothing — reduces oscillation from policy or Strömbom flips.
+    # EMA smoothing — kills frame-to-frame action jitter.
    vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx
    vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy
@@ -469,7 +295,7 @@ while robot.step(timestep) != -1:
    drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX)
    emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}")
-    # Cosmetic ear wiggle — purely visual.
+    # Cosmetic ear wiggle.
    ear_phase += 0.12
    ear_pos = EAR_AMPLITUDE * math.sin(ear_phase)
    left_ear.setVelocity(EAR_RATE)
@@ -477,38 +303,26 @@ while robot.step(timestep) != -1:
    left_ear.setPosition(ear_pos)
    right_ear.setPosition(-ear_pos)
-    # --- Early-stop when all GT sheep are penned (all modes) ---
+    # Auto-finish: when all GT sheep are penned, write the sentinel.
-    # The dog isn't a Supervisor so it can't call simulationQuit() —
+    # The launcher polls for it and closes Webots so batch evals don't
-    # instead we write a sentinel file the launcher polls for and uses
+    # hang after the task is done. Bounded by `_gt_sheep` so we don't
    # to kill the Webots process. Bounded by `_gt_sheep` so we don't
    # fire during the first few steps while the receiver fills.
-    if _gt_sheep and not os.path.exists(_DAGGER_DONE_FILE):
+    if _gt_sheep and not _run_done:
-        gt_active_count = sum(1 for x, y in _gt_sheep.values()
+        gt_active = sum(1 for x, y in _gt_sheep.values()
                        if not is_penned_position(x, y))
-        if gt_active_count == 0:
+        if gt_active == 0:
-            if MODE == "dagger":
+            os.makedirs(os.path.dirname(RUN_DONE_FILE), exist_ok=True)
-                _dump_dagger_log()
+            open(RUN_DONE_FILE, "w").close()
-            os.makedirs(os.path.dirname(_DAGGER_DONE_FILE), exist_ok=True)
+            _run_done = True
            open(_DAGGER_DONE_FILE, "w").close()
            print(f"[dog] all {len(_gt_sheep)} sheep penned at step "
-                  f"{step_count} — wrote {_DAGGER_DONE_FILE}, "
+                  f"{step_count} — wrote sentinel, launcher will close Webots")
                  f"launcher will close Webots")
    if MODE == "dagger" and step_count % DAGGER_FLUSH_STEPS == 0 and DAGGER_LOG_OBS:
        _dump_dagger_log()
    if step_count % 200 == 0:
        gt_penned = sum(1 for x, y in _gt_sheep.values()
                        if is_penned_position(x, y))
        gt_total = len(_gt_sheep)
        extra = ""
        if MODE == "dagger":
            extra = f" logged={len(DAGGER_LOG_OBS)}"
        print(f"[dog mode={MODE}] step={step_count} "
              f"GT_penned={gt_penned}/{gt_total} "
              f"tracks_active={tracker.n_active()} "
              f"tracks_penned={tracker.n_penned()} "
-              f"detections={len(detections)} action=({vx:+.2f}, {vy:+.2f}){extra}")
+              f"detections={len(detections)} action=({vx:+.2f}, {vy:+.2f})")
 # Loop ended (Webots told us to quit). Flush any remaining DAgger log.
 _dump_dagger_log()
@@ -24,7 +24,7 @@ from __future__ import annotations
 import math
-from herding.control import modulate_speed_near_sheep
+from herding.control.modulation import modulate_speed_near_sheep
 INITIAL_SCAN_STEPS = 80    # ≈1.3 s at dt=16 ms — full rotation at the +π turn target.
@@ -24,7 +24,7 @@ flock size and works up to at least n=10 within a 15 000-step budget.
 import math
-from herding.geometry import GATE_Y, PEN_ENTRY, in_pen
+from herding.world.geometry import GATE_Y, PEN_ENTRY, in_pen
 DELTA_DRIVE = 1.5     # standoff behind the target sheep
@@ -9,7 +9,7 @@ Reference: Strömbom et al. 2014, "Solving the shepherding problem".
 import math
-from herding.geometry import PEN_ENTRY, GATE_Y, in_pen
+from herding.world.geometry import PEN_ENTRY, GATE_Y, in_pen
 # Algorithm parameters. DELTA_DRIVE / DELTA_COLLECT were tightened from
 # the original (4.0 / 2.5) because the new external pen sits ~26 m from
@@ -31,7 +31,7 @@ Layout (all components normalised so values stay roughly in [-1, 1]):
 import math
 import numpy as np
-from herding.geometry import (
+from herding.world.geometry import (
    FIELD_X, FIELD_Y, PEN_ENTRY, MAX_SHEEP,
 )
@@ -29,8 +29,8 @@ import math
 import numpy as np
-from herding.geometry import FIELD_X, FIELD_Y, GATE_Y, PEN_X, PEN_Y
+from herding.world.geometry import FIELD_X, FIELD_Y, GATE_Y, PEN_X, PEN_Y
-from herding.lidar_sim import (
+from herding.perception.lidar_sim import (
    LIDAR_FOV, LIDAR_MAX_RANGE, LIDAR_N_RAYS, SHEEP_RADIUS, ray_angles,
 )
@@ -26,7 +26,7 @@ from __future__ import annotations
 import math
-from herding.geometry import MAX_SHEEP, in_pen, is_penned_position
+from herding.world.geometry import MAX_SHEEP, in_pen, is_penned_position
 GATE_M = 2.5              # m — primary NN gate (recent tracks)
@@ -51,7 +51,7 @@ is a defensible engineering adaptation of Strömbom's qualitative
 import math
 import random
-from herding.geometry import (
+from herding.world.geometry import (
    FIELD_X, FIELD_Y,
    PEN_X, PEN_Y,
    GATE_X,
@@ -21,9 +21,9 @@ if _PROJECT_ROOT not in sys.path:
 import numpy as np
-from herding.geometry import MAX_SHEEP, PEN_ENTRY
+from herding.world.geometry import MAX_SHEEP, PEN_ENTRY
 from herding.obs import OBS_DIM
-from herding.strombom import compute_action
+from herding.control.strombom import compute_action
 from training.herding_env import HerdingEnv
@@ -1,166 +0,0 @@
 #!/bin/bash
 # tools/auto_dagger.sh — automated DAgger collection across many headless
 # Webots runs.
 #
 # For each (flock_size, run_index) combination, generates a world with N
 # active sheep at randomised positions, launches Webots in fast/headless
 # mode, lets the controller log (lidar_obs, teacher_action) pairs for up
 # to RUN_SEC seconds, kills the run, and moves on. The dog controller's
 # 500-step periodic flush means each run produces a complete .npz even
 # when killed by timeout.
 #
 # Usage:
 #   tools/auto_dagger.sh [RUNS_PER_FLOCK] [SECONDS_PER_RUN]
 #     RUNS_PER_FLOCK : how many randomised runs per flock size (default 3)
 #     SECONDS_PER_RUN: wall-clock cap per Webots run (default 60)
 #
 # Env-var overrides:
 #   HERDING_POLICY_DIR : policy the controller loads (only used when
 #                        HERDING_DAGGER_DRIVER=student). Default bc.
 #   HERDING_DAGGER_DRIVER : "teacher" (default) or "student".
 #   HEADLESS=1          : force --no-rendering (default on).
 #   FLOCKS="1 3 5 8 10" : space-separated flock sizes to iterate over.
 #
 # Output:
 #   training/dagger/dagger_<ts>.npz — one per Webots run.
 #
 # After collection, run:
 #   python -m tools.dagger_merge_train --out training/runs/bc_dagger
 # Abort the whole collection on the first unhandled command failure.
 set -e
 # Positional arguments (see Usage above), each with a default.
 RUNS_PER_FLOCK=${1:-3}
 RUN_SEC=${2:-60}
 # Environment overrides: flock sizes to sweep and the headless toggle.
 FLOCKS=${FLOCKS:-"1 3 5 8 10"}
 HEADLESS=${HEADLESS:-1}
 # Repository root = parent of the directory that contains this script.
 ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
 # SRC is the template world; DST is the per-run copy that is edited and
 # then handed to Webots.
 SRC="$ROOT/worlds/field.wbt"
 DST="$ROOT/worlds/field_test.wbt"
 # Values written into herding_runtime.cfg for the controller to read.
 POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc}"
 DRIVER="${HERDING_DAGGER_DRIVER:-teacher}"
 # Sentinel file polled during each run; its presence ends the run early.
 DONE_FILE="$ROOT/training/dagger/.DONE"
 # PID of the Webots process currently running ("" when none); read by
 # cleanup() so an interrupt can stop the in-flight run.
 WEBOTS_PID=""
 # Signal handler (INT/TERM): stop the in-flight Webots run, if any, and
 # abort the collection with a non-zero status.
 cleanup() {
    echo "Caught interrupt — killing Webots (pid=$WEBOTS_PID) and exiting."
    # WEBOTS_PID is empty between runs, so only signal when a run is live.
    if [[ -n "$WEBOTS_PID" ]]; then
        kill "$WEBOTS_PID" 2>/dev/null
    fi
    # Reap the child; a failing wait must not mask the exit below.
    wait "$WEBOTS_PID" 2>/dev/null || true
    exit 1
 }
 trap cleanup INT TERM
 webots_args=(--mode=fast --batch --minimize)
 if [[ "$HEADLESS" == "1" ]]; then
    webots_args+=(--no-rendering)
 fi
 echo "Auto-dagger collection"
 echo "  flock sizes      : $FLOCKS"
 echo "  runs per size    : $RUNS_PER_FLOCK"
 echo "  seconds per run  : $RUN_SEC"
 echo "  policy dir       : $POLICY_DIR  (used only when driver=student)"
 echo "  driver           : $DRIVER"
 echo "  webots flags     : ${webots_args[*]}"
 echo
 # Runtime config — written once up front and left in place for every run
 # below, so a manual webots launch at the same time would also pick it up.
 cat > "$ROOT/herding_runtime.cfg" <<EOF
 HERDING_MODE=dagger
 HERDING_POLICY_DIR=$POLICY_DIR
 HERDING_DAGGER_DRIVER=$DRIVER
 EOF
 # Count files before, so we can summarise what was added.
 mkdir -p "$ROOT/training/dagger"
 before_count=$(ls -1 "$ROOT/training/dagger"/dagger_*.npz 2>/dev/null | wc -l || echo 0)
 run_idx=0
 total_runs=0
 for f in $FLOCKS; do total_runs=$((total_runs + RUNS_PER_FLOCK)); done
 for flock in $FLOCKS; do
    for run in $(seq 1 "$RUNS_PER_FLOCK"); do
        run_idx=$((run_idx + 1))
        seed=$((1000 * flock + run))
        echo "=== [$run_idx/$total_runs] flock=$flock run=$run seed=$seed ==="
        # Generate randomised world.
        cp "$SRC" "$DST"
        for i in $(seq $((flock + 1)) 10); do
            sed -i "s|^Sheep .* \"sheep${i}\".*|# &|" "$DST"
        done
        # Inline Python: jitter sheep1..flock translations.
        python3 - "$DST" "$flock" "$seed" <<'PYEOF'
 import re, random, sys
 path, n_str, seed = sys.argv[1], sys.argv[2], sys.argv[3]
 n = int(n_str); random.seed(int(seed))
 with open(path) as f:
    txt = f.read()
 def rand_pos():
    """Rejection-sample an (x, y) spawn point more than 3 m from the origin.

    Draws x then y on every attempt, so the sequence of random numbers
    consumed for a given seed is fixed.
    """
    # Start at the origin: it fails the distance test, forcing a first draw.
    x = y = 0.0
    while x * x + y * y <= 9.0:           # still within 3 m of dog spawn
        x = random.uniform(-12.0, 12.0)
        y = random.uniform(-10.0, 12.0)  # avoid the gate strip
    return x, y
 for i in range(1, n + 1):
    x, y = rand_pos()
    pat = re.compile(
        r'Sheep \{ translation\s+\S+\s+\S+\s+(\S+)\s+name "sheep' + str(i) + r'"'
    )
    txt = pat.sub(rf'Sheep {{ translation {x:.2f} {y:.2f} \g<1> name "sheep{i}"', txt, count=1)
 with open(path, "w") as f:
    f.write(txt)
 PYEOF
        # Run Webots in the background; poll for the .DONE sentinel or
        # the wall-clock timeout, whichever comes first.
        rm -f "$DONE_FILE"
        webots "${webots_args[@]}" "$DST" \
            > /tmp/webots_dagger_run.log 2>&1 &
        WEBOTS_PID=$!
        # Give the controller 10 s to start before polling the sentinel,
        # otherwise a sheep that spawns already penned triggers an instant
        # false-positive kill.
        elapsed=0
        grace=10
        # Poll every 2 s until Webots exits on its own, the .DONE sentinel
        # appears (only honoured after the grace period), or the wall-clock
        # cap is reached. The kills are guarded with `|| true`: the script
        # runs under `set -e`, and Webots can exit between the `kill -0`
        # liveness check and the `kill`, which would otherwise abort the
        # entire multi-run collection.
        while kill -0 "$WEBOTS_PID" 2>/dev/null; do
            if (( elapsed >= grace )) && [[ -f "$DONE_FILE" ]]; then
                echo "  sentinel .DONE detected — killing Webots early"
                kill "$WEBOTS_PID" 2>/dev/null || true
                wait "$WEBOTS_PID" 2>/dev/null || true
                break
            fi
            if (( elapsed >= RUN_SEC )); then
                echo "  timeout ($RUN_SEC s) — killing Webots"
                kill "$WEBOTS_PID" 2>/dev/null || true
                wait "$WEBOTS_PID" 2>/dev/null || true
                break
            fi
            sleep 2
            elapsed=$((elapsed + 2))
        done
        WEBOTS_PID=""
        # Quick sanity from the log: did the controller actually run?
        if grep -q "running in mode=dagger" /tmp/webots_dagger_run.log; then
            new_pairs=$(tail -50 /tmp/webots_dagger_run.log | grep -oE 'logged=[0-9]+' | tail -1)
            echo "  controller ran  ($new_pairs)"
        else
            echo "  WARNING: controller may not have started (see /tmp/webots_dagger_run.log)"
        fi
    done
 done
 after_count=$(ls -1 "$ROOT/training/dagger"/dagger_*.npz 2>/dev/null | wc -l || echo 0)
 new_files=$((after_count - before_count))
 echo
 echo "Done."
 echo "  new dagger files : $new_files"
 echo "  total in dir     : $after_count"
 echo
 echo "Next:"
 echo "  python -m tools.dagger_merge_train --out training/runs/bc_dagger"
@@ -26,10 +26,10 @@ if _PROJECT_ROOT not in sys.path:
 import numpy as np
-from herding.active_scan import ActiveScanTeacher
+from herding.control.active_scan import ActiveScanTeacher
-from herding.geometry import PEN_ENTRY
+from herding.world.geometry import PEN_ENTRY
-from herding.sequential import compute_action as sequential_action
+from herding.control.sequential import compute_action as sequential_action
-from herding.strombom import compute_action as strombom_action
+from herding.control.strombom import compute_action as strombom_action
 from training.herding_env import HerdingEnv
@@ -1,135 +0,0 @@
 """Merge Webots DAgger demos with sim demos and retrain the BC policy.
 The dog controller in ``HERDING_MODE=dagger`` writes per-run files to
 ``training/dagger/dagger_<ts>.npz`` containing ``(obs, actions)`` pairs
 where:
 * ``obs`` is the **stacked LiDAR observation** as built by the live
  Webots tracker — exactly the input distribution the deployed
  controller sees.
 * ``actions`` is the **active-scan-teacher action computed from
  ground-truth sheep positions** (read off the sheep emitter).
 Combined with the existing sim demos (``training/demos.npz`` by
 default), this gives the BC student a training set that includes the
 real Webots false-positive distribution — closing the sim-to-real
 perception gap that the all-sim pipeline couldn't bridge.
 Usage::
    # Iteration 1 — merge all dagger files with sim demos, retrain
    python -m tools.dagger_merge_train \\
        --sim training/demos.npz \\
        --out training/runs/bc_dagger1
    # Iteration 2 — drop the sim baseline, train only on Webots data
    python -m tools.dagger_merge_train --no-sim --out training/runs/bc_dagger2
 The new policy is saved as ``<out>/policy.zip``; the controller's
 policy-resolution order picks it up on the next Webots run.
 """
 from __future__ import annotations
 import argparse
 import glob
 import os
 import subprocess
 import sys
 from pathlib import Path
 _HERE = os.path.dirname(os.path.abspath(__file__))
 _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
 import numpy as np
 def main() -> None:
    """Merge Webots DAgger demos with (optional) sim demos and retrain BC.

    Loads every ``.npz`` matching ``--dagger-glob`` (arrays ``obs`` and
    ``actions``), optionally appends the sim demos from ``--sim``, writes
    the concatenated set to ``--merged-out`` and then launches
    ``training.bc_pretrain`` on it as a subprocess.

    Raises:
        SystemExit: if no dagger file matches the glob, if the dagger
            files disagree on observation width, or if the sim demos have
            a different observation width from the dagger data.
        subprocess.CalledProcessError: if the BC training run exits
            with a non-zero status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--sim", default="training/demos.npz",
                        help="Sim demo file to mix with the Webots data. "
                             "Pass --no-sim to train only on dagger data.")
    parser.add_argument("--no-sim", action="store_true",
                        help="Skip the sim demos entirely.")
    parser.add_argument("--dagger-glob", default="training/dagger/dagger_*.npz",
                        help="Glob for Webots-collected dagger files.")
    parser.add_argument("--merged-out", default="training/demos_dagger.npz",
                        help="Where to write the merged demo file.")
    parser.add_argument("--out", default="training/runs/bc_dagger",
                        help="Where to write the BC policy.")
    parser.add_argument("--epochs", type=int, default=60)
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--net-arch", default="512,512")
    parser.add_argument("--cos-weight", type=float, default=1.0)
    args = parser.parse_args()
    # --- Gather Webots files ---
    dagger_paths = sorted(glob.glob(args.dagger_glob))
    if not dagger_paths:
        raise SystemExit(f"No dagger files found at {args.dagger_glob} — "
                         "run Webots in HERDING_MODE=dagger first.")
    chunks_obs: list[np.ndarray] = []
    chunks_act: list[np.ndarray] = []
    total_dagger = 0
    for p in dagger_paths:
        # np.load on an .npz returns an NpzFile that keeps the archive open
        # until closed; the context manager stops a large glob from
        # accumulating open file handles. Indexing reads the arrays into
        # memory, so they remain valid after the archive is closed.
        with np.load(p) as data:
            obs = data["obs"].astype(np.float32)
            act = data["actions"].astype(np.float32)
        chunks_obs.append(obs)
        chunks_act.append(act)
        total_dagger += len(obs)
        print(f"  + {p}: {obs.shape[0]} pairs (obs dim {obs.shape[1]})")
    print(f"[merge] total dagger pairs: {total_dagger}")
    # All files must share one observation width or the concatenate below
    # cannot stack them.
    obs_dim = chunks_obs[0].shape[1]
    if any(c.shape[1] != obs_dim for c in chunks_obs):
        raise SystemExit(
            "Dagger files have inconsistent obs dims — they were collected "
            "with different frame_stack settings. Either rerun with a "
            "consistent setting or filter the glob."
        )
    # --- Optionally include sim demos ---
    if not args.no_sim:
        with np.load(args.sim) as sim:
            sim_obs = sim["obs"].astype(np.float32)
            sim_act = sim["actions"].astype(np.float32)
        if sim_obs.shape[1] != obs_dim:
            raise SystemExit(
                f"Sim demos have obs dim {sim_obs.shape[1]} but dagger demos "
                f"have {obs_dim}. Recollect sim demos at the same frame_stack."
            )
        chunks_obs.append(sim_obs)
        chunks_act.append(sim_act)
        print(f"[merge] + sim demos: {sim_obs.shape[0]} pairs from {args.sim}")
    obs_all = np.concatenate(chunks_obs, axis=0)
    act_all = np.concatenate(chunks_act, axis=0)
    # Empty meta — bc_pretrain doesn't actually use it but the file format
    # has it.
    meta = np.zeros((0, 5), dtype=np.int32)
    Path(args.merged_out).parent.mkdir(parents=True, exist_ok=True)
    np.savez(args.merged_out, obs=obs_all, actions=act_all, meta=meta)
    print(f"[merge] wrote {len(obs_all)} pairs → {args.merged_out}")
    print(f"[merge] obs shape {obs_all.shape}, action shape {act_all.shape}")
    # --- Run BC training ---
    # Argument list (no shell) run from the project root so that
    # ``-m training.bc_pretrain`` resolves.
    cmd = [
        sys.executable, "-m", "training.bc_pretrain",
        "--demos", args.merged_out,
        "--out", args.out,
        "--epochs", str(args.epochs),
        "--batch-size", str(args.batch_size),
        "--net-arch", args.net_arch,
        "--cos-weight", str(args.cos_weight),
    ]
    print(f"\n[merge] launching: {' '.join(cmd)}")
    subprocess.run(cmd, check=True, cwd=_PROJECT_ROOT)
 if __name__ == "__main__":
    main()
@@ -7,19 +7,17 @@
 # Usage:
 #   tools/run_webots.sh [N] [MODE]
 #     N    : number of active sheep (1..10), default 10
-#     MODE : "bc" | "rl" | "strombom" | "sequential" | "dagger", default "bc"
+#     MODE : "bc" | "rl" | "strombom" | "sequential", default "bc"
 #
 # Examples:
-#   tools/run_webots.sh 10 bc         # BC-trained policy, 10 sheep
+#   tools/run_webots.sh 10 bc         # behaviour-cloned MLP, 10 sheep
 #   tools/run_webots.sh 10 rl         # KL-PPO fine-tune of bc, 10 sheep
-#   tools/run_webots.sh 5 sequential  # the analytic teacher, 5 sheep
+#   tools/run_webots.sh 5 sequential  # single-target analytic baseline
-#   tools/run_webots.sh 3 strombom    # canonical baseline, 3 sheep
+#   tools/run_webots.sh 3 strombom    # canonical Strömbom analytic
 #
 # Notes:
-# * The RL mode loads the latest BC policy by default — priority
+# * bc loads training/runs/bc/policy.zip, rl loads training/runs/rl.
-#   the BC policy (bc/policy.zip) (the controller resolves it).
+#   Override via HERDING_POLICY_DIR=/path/to/run env var.
 #   (LiDAR-perception, frame-stack K=4). Override via
 #   HERDING_POLICY_DIR=/path/to/run env var.
 # * Conda env "tir" must be active (provides stable-baselines3 + torch).
 set -e
@@ -30,10 +28,9 @@ if (( N < 1 || N > 10 )); then
    echo "N must be 1..10, got $N" >&2; exit 1
 fi
 case "$MODE" in
-    bc|rl|strombom|sequential|dagger) ;;
+    bc|rl|strombom|sequential) ;;
-    *) echo "MODE must be bc|rl|strombom|sequential|dagger, got '$MODE'" >&2; exit 1 ;;
+    *) echo "MODE must be bc|rl|strombom|sequential, got '$MODE'" >&2; exit 1 ;;
 esac
 DAGGER_DRIVER=${HERDING_DAGGER_DRIVER:-teacher}
 ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
 SRC="$ROOT/worlds/field.wbt"
@@ -59,7 +56,6 @@ RESOLVED_POLICY_DIR="${HERDING_POLICY_DIR:-$ROOT/training/runs/bc}"
 cat > "$ROOT/herding_runtime.cfg" <<EOF
 HERDING_MODE=$MODE
 HERDING_POLICY_DIR=$RESOLVED_POLICY_DIR
 HERDING_DAGGER_DRIVER=$DAGGER_DRIVER
 EOF
 export HERDING_MODE="$MODE"
@@ -68,7 +64,7 @@ export HERDING_POLICY_DIR="$RESOLVED_POLICY_DIR"
 # The controller writes this sentinel when all GT sheep are penned. We
 # poll for it and kill Webots so the run finishes cleanly instead of
 # idling for minutes after the task is done.
-DONE_FILE="$ROOT/training/dagger/.DONE"
+DONE_FILE="$ROOT/training/.run_done"
 mkdir -p "$(dirname "$DONE_FILE")"
 rm -f "$DONE_FILE"
@@ -1,21 +1,16 @@
 # Training pipeline
-Behavior cloning of analytic herding teachers into a neural-network
+Two stages, strictly sequential:
 policy that runs under LiDAR perception in Webots.
 ```
-sim demos (active-scan teacher on tracker output, K=4 frame stack)
+sim demos (Strömbom on tracker output, K=4 frame stack)
    │
    ▼
-bc_pretrain.py  ──►  runs/bc   (BC baseline)
+bc_pretrain.py  ──►  runs/bc   (Strömbom-imitated MLP)
    │
-    ▼  KL-regularised PPO fine-tune (training/train_ppo.py)
+    ▼  KL-regularised PPO fine-tune
    │
-runs/rl         (deployed `rl` mode)
+runs/rl                        (deployed `rl` mode — beats BC and Strömbom)
 # optional branch — kept for reference, not deployed:
 runs/bc_dagger     (Webots-grounded DAgger refinement, useful if a
                    modified world breaks sim-to-real transfer)
 ```
 ## Files
@@ -23,10 +18,9 @@ runs/bc_dagger     (Webots-grounded DAgger refinement, useful if a
 ```
 herding_env.py     — Gymnasium env (LiDAR raycast + tracker by default)
 bc_pretrain.py     — MSE + cosine BC of (obs, action) demos into MlpPolicy
-eval.py            — analytic teachers + BC policies, full n=1..10 grid
+train_ppo.py       — KL-regularised PPO fine-tune of a BC checkpoint
-parity_test.py     — shape / determinism / baseline smoke test
+eval.py            — multi-seed analytic / learned policy comparison
-runs/              — checkpoints (most are .gitignored; the deployed
+runs/              — checkpoints (whitelisted entries in top-level .gitignore)
                      ones are whitelisted in the top-level .gitignore)
 ```
 ## Setup
@@ -39,75 +33,62 @@ CPU is the default and recommended device — SB3 PPO with an MLP policy
 of this size runs faster on CPU than GPU because the bottleneck is
 rollout collection, not gradient compute.
-## The BC pipeline
+## End-to-end pipeline
-```
+```bash
 # 1. Sim demos with the active-scan + Strömbom teacher under LiDAR
 #    perception. K=4 frame stack so the MLP has temporal context.
 python -m tools.collect_demos --teacher strombom \
-    --out demos.npz --seeds-per-n 15 --subsample 3 --frame-stack 4
+    --out training/demos.npz --seeds-per-n 15 --subsample 3 --frame-stack 4
-# 2. Behavior-clone.
+# 2. Behaviour-clone.
-python -m training.bc_pretrain --demos demos.npz \
+python -m training.bc_pretrain --demos training/demos.npz \
-    --out runs/bc --epochs 60 --net-arch 512,512
+    --out training/runs/bc --epochs 60 --net-arch 512,512
-# 3. Evaluate.
+# 3. KL-regularised PPO fine-tune of bc.
-python -m training.eval --policy runs/bc \
+python -m training.train_ppo \
-    --max-flock 10 --max-steps 8000 --n-seeds 5
+    --bc training/runs/bc --out training/runs/rl \
    --total-timesteps 1000000
 # 4. Multi-seed eval (env-side, fast).
 python -m training.eval --policy training/runs/rl \
    --max-flock 10 --max-steps 15000 --n-seeds 10
 ```
 `bc_pretrain.py` saves the **best-val_cos** snapshot, not the final
 epoch — multi-modal teachers make training noisy and the last epoch is
 often worse than an earlier one.
-## DAgger from Webots
+`train_ppo.py` loads BC weights into both a trainable policy and a
-
+frozen reference, fixes `log_std` small, and adds `β · KL(π‖π_ref)` to
-Sim-only BC plateaus because the env's 2D raycast can't reproduce all
+the loss so the policy can only move within a trust region around BC.
-the false-positive clusters Webots generates from real geometry. The
+See the file header for hyperparameter rationale.
 fix is to collect (obs, teacher_action) pairs from inside Webots:
 ```
 # Headless DAgger collection: 5 flock sizes × 3 runs each.
 tools/auto_dagger.sh 3 60
 # Merge with the sim baseline + retrain.
 python -m tools.dagger_merge_train --out runs/bc_dagger
 ```
 Iterate by re-running collection with the new student in the driver's
 seat:
 ```
 HERDING_POLICY_DIR=$PWD/training/runs/bc_dagger \
 HERDING_DAGGER_DRIVER=student \
 tools/auto_dagger.sh 3 60
 python -m tools.dagger_merge_train --out runs/bc_dagger
 ```
 ## Available analytic teachers
 | Name | What it does | Notes |
 |---|---|---|
-| `strombom` | Canonical Strömbom — collect when flock is scattered, drive CoM otherwise | Default; works well for n=1–10 under tight cohesion |
+| `strombom` | Strömbom 2014 — collect when flock is scattered, drive CoM otherwise | Default; works for n=1–10 under tight cohesion |
 | `sequential` | Pick the sheep closest to the pen and drive only it | Alternative; needs loose-cohesion regime |
 Both are wrapped at demo-collection time in
-`herding/active_scan.py:ActiveScanTeacher`, which adds an opening
+`herding/control/active_scan.py:ActiveScanTeacher`, which adds an
-in-place rotation, walk-to-centre when the LiDAR sees nothing, and
+opening in-place rotation, walk-to-centre when the LiDAR sees
-near-sheep speed modulation (the same modulation `herding/control.py`
+nothing, and near-sheep speed modulation (same modulation
-applies to every dog mode at inference).
+`herding/control/modulation.py` applies to every dog mode at
 inference).
 ## Evaluating analytic teachers directly
 ```
-python -m training.eval --policy strombom    --max-flock 10 --max-steps 8000 --n-seeds 5
+python -m training.eval --policy strombom    --max-flock 10 --max-steps 15000 --n-seeds 10
-python -m training.eval --policy sequential  --max-flock 10 --max-steps 8000 --n-seeds 5
+python -m training.eval --policy sequential  --max-flock 10 --max-steps 15000 --n-seeds 10
 ```
 ## Webots inference
 ```
-tools/run_webots.sh 10 rl
+tools/run_webots.sh 10 bc          # or rl, strombom, sequential
 ```
 The dog controller loads `runs/bc` for `bc` mode and `runs/rl` for
@@ -25,9 +25,9 @@ if _PROJECT_ROOT not in sys.path:
 import numpy as np
-from herding.geometry import MAX_SHEEP, PEN_ENTRY
+from herding.world.geometry import MAX_SHEEP, PEN_ENTRY
-from herding.sequential import compute_action as sequential_action
+from herding.control.sequential import compute_action as sequential_action
-from herding.strombom import compute_action as strombom_action
+from herding.control.strombom import compute_action as strombom_action
 from training.herding_env import HerdingEnv
@@ -56,24 +56,24 @@ _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
 if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
-from herding.diffdrive import (
+from herding.world.diffdrive import (
    heading_speed_to_wheels, kinematics_step, velocity_to_wheels,
 )
-from herding.flocking_sim import (
+from herding.world.flocking_sim import (
    FLEE_SPEED, MAX_SPEED, WANDER_SPEED, compute_heading_speed,
 )
-from herding.geometry import (
+from herding.world.geometry import (
    DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_BASE,
    DOG_WHEEL_RADIUS, FIELD_X, FIELD_Y, GATE_X, MAX_SHEEP,
    PEN_ENTRY, PEN_X, PEN_Y,
    SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS,
    WEBOTS_DT, is_penned_position,
 )
-from herding.lidar_perception import detections_from_scan
+from herding.perception.lidar_perception import detections_from_scan
-from herding.lidar_sim import simulate_scan
+from herding.perception.lidar_sim import simulate_scan
 from herding.obs import OBS_DIM, build_obs
-from herding.sheep_tracker import SheepTracker
+from herding.perception.sheep_tracker import SheepTracker
-from herding.strombom import compute_action as strombom_action
+from herding.control.strombom import compute_action as strombom_action
 class HerdingEnv(gym.Env):