Checkpoint 3

2026-05-10 12:46:14 +01:00
parent 1bb9415414
commit 2a6db038df
16 changed files with 305 additions and 662 deletions
@@ -1,26 +1,24 @@
 """Shepherd Dog controller (Webots).

-Runs in one of two modes selected by the ``HERDING_MODE`` environment
-variable:
+Mode is selected by ``HERDING_MODE``, read from the environment or from
+the ``herding_runtime.cfg`` file that the launcher writes (Webots
+strips env vars on some setups):

-    HERDING_MODE=rl        → load an SB3 PPO policy from
-                             HERDING_POLICY_DIR (default
-                             training/runs/latest/best) and use its
-                             (vx, vy) action each step.
-    HERDING_MODE=strombom  → use the analytic Strömbom collect/drive
-                             heuristic. This is the fallback if the RL
-                             policy can't be loaded (e.g. SB3 not
-                             installed in the Webots Python env, or no
-                             checkpoint yet).
+    rl          → load a BC-trained SB3 policy from HERDING_POLICY_DIR
+                  and use its (vx, vy) action each step.
+    strombom    → canonical Strömbom collect/drive heuristic.
+    sequential  → single-target "pin and push" — drives the sheep
+                  closest to the pen.

-Both modes share the same low-level differential-drive controller
-(``herding.diffdrive.velocity_to_wheels`` + clamped forward speed), so
-switching modes does not retune the actuation layer.
+All modes share the same low-level differential-drive controller
+(``herding.diffdrive.velocity_to_wheels`` with cos(err)-clamped forward
+speed), so switching modes does not retune actuation.

 A safety supervisor enforces the "dog stays out of the pen" invariant:
 if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is
-overridden with a north-driving correction. This is a hard guarantee
-the policy cannot escape.
+overridden with a north-driving correction. Separately, if the RL
+policy zip can't be loaded (SB3 missing, file missing), the controller
+falls back to strombom mode automatically.
 """

 import math
@@ -85,19 +83,21 @@ def _resolve_policy_dir() -> str:
    """Where to look for the trained policy.

    Priority:
-      1. HERDING_POLICY_DIR env var (if set and points to a real dir)
-      2. training/runs/bc_pretrained/  (BC-only checkpoint)
-      3. training/runs/bc_ppo/best/    (PPO fine-tuned best)
-      4. training/runs/latest/best/    (legacy default)
+      1. HERDING_POLICY_DIR env var or runtime-cfg entry, if it points
+         to a real directory.
+      2. ``training/runs/bc_flock`` — flock-style BC (current default;
+         requires the tight-cohesion sheep regime).
+      3. ``training/runs/bc_solo`` — single-target BC (1-by-1 style;
+         only works if ``herding/flocking_sim.py`` is reverted to the
+         loose-cohesion regime).
    """
    env_dir = (os.environ.get("HERDING_POLICY_DIR")
               or _runtime_cfg.get("HERDING_POLICY_DIR"))
    if env_dir and os.path.isdir(env_dir):
        return env_dir
    candidates = [
-        os.path.join(_PROJECT_ROOT, "training", "runs", "bc_pretrained"),
-        os.path.join(_PROJECT_ROOT, "training", "runs", "bc_ppo", "best"),
-        os.path.join(_PROJECT_ROOT, "training", "runs", "latest", "best"),
+        os.path.join(_PROJECT_ROOT, "training", "runs", "bc_flock"),
+        os.path.join(_PROJECT_ROOT, "training", "runs", "bc_solo"),
    ]
    for c in candidates:
        if os.path.isdir(c):
@@ -106,30 +106,22 @@ def _resolve_policy_dir() -> str:
    return env_dir or candidates[0]


-POLICY_DIR = _resolve_policy_dir()
+_VALID_MODES = ("rl", "strombom", "sequential")
+if MODE not in _VALID_MODES:
+    print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
+    MODE = "strombom"

+POLICY_DIR = _resolve_policy_dir()
 policy_handle = None
 if MODE == "rl":
-    print(f"[dog] HERDING_MODE={MODE}  HERDING_POLICY_DIR(env)="
-          f"{os.environ.get('HERDING_POLICY_DIR', '<unset>')}")
-    print(f"[dog] resolved POLICY_DIR={POLICY_DIR}  exists="
-          f"{os.path.isdir(POLICY_DIR)}")
-    if os.path.isdir(POLICY_DIR):
-        try:
-            entries = sorted(os.listdir(POLICY_DIR))
-        except OSError:
-            entries = []
-        print(f"[dog] dir contents: {entries}")
+    print(f"[dog] resolved POLICY_DIR={POLICY_DIR}  exists={os.path.isdir(POLICY_DIR)}")
    try:
        from policy_loader import load as _load_policy
        policy_handle = _load_policy(POLICY_DIR)
        print(f"[dog] RL policy loaded from {POLICY_DIR}")
    except Exception as exc:
-        print(f"[dog] RL policy load failed ({exc!r}); falling back to Strömbom.")
+        print(f"[dog] RL policy load failed ({exc!r}); falling back to strombom.")
        MODE = "strombom"
-if MODE not in ("rl", "strombom", "sequential"):
-    print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
-    MODE = "strombom"
 print(f"[dog] running in mode={MODE}")