Checkpoint 5 - incomplete

2026-05-11 10:35:39 +01:00
parent 6688325d89
commit b457155538
13 changed files with 174 additions and 74 deletions
@@ -8,11 +8,11 @@ env vars on some setups):
    sequential  → single-target "pin and push" — drives the sheep
                  closest to the pen.
    bc          → behaviour-cloned MLP, trained on Strömbom demos via
-                  sim. Default policy directory: training/runs/bc_v3.
+                  sim. Default policy directory: training/runs/bc.
    rl          → KL-regularised PPO fine-tune of the BC policy. Same
                  obs/action space as bc; refines time-to-pen via
                  environment reward while staying anchored to bc.
-                  Default policy directory: training/runs/rl_v1.
+                  Default policy directory: training/runs/rl.
    dagger      → DAgger data collection. Reads sheep ground-truth
                  via the receiver, computes the active-scan teacher's
                  recommended action at every step, drives with either
@@ -122,9 +122,9 @@ def _resolve_policy_dir(mode: str) -> str:
      1. HERDING_POLICY_DIR env var or runtime-cfg entry, if it points
         to a real directory.
      2. Mode-specific default:
-            bc → training/runs/bc_v3 (Strömbom-imitated MLP)
-            rl → training/runs/rl_v1 (KL-PPO fine-tune of bc_v3)
-      3. Fall back to bc_v3.
+            bc → training/runs/bc (Strömbom-imitated MLP)
+            rl → training/runs/rl (KL-PPO fine-tune of bc)
+      3. Fall back to bc.
    All checkpoints are frame-stacked K = 4; ``policy_loader`` reads
    the stacking factor from the policy's observation space.
    """
@@ -133,9 +133,9 @@ def _resolve_policy_dir(mode: str) -> str:
    if env_dir and os.path.isdir(env_dir):
        return env_dir
    mode_default = {
-        "bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"),
-        "rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl_v1"),
-        "dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"),
+        "bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
+        "rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl"),
+        "dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
    }
    primary = mode_default.get(mode, mode_default["bc"])
    if os.path.isdir(primary):
@@ -150,9 +150,9 @@ def _resolve_policy_dir(mode: str) -> str:

 _VALID_MODES = ("bc", "rl", "strombom", "sequential", "dagger", "diag")
 # Back-compat: an old config saying HERDING_MODE=rl meant "the BC policy".
-# We now use `rl` strictly for the KL-PPO fine-tune. If the rl_v1
+# We now use `rl` strictly for the KL-PPO fine-tune. If the rl
 # directory isn't present, _resolve_policy_dir below silently falls
-# back to bc_v3, preserving the old behaviour.
+# back to bc, preserving the old behaviour.
 if MODE not in _VALID_MODES:
    print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
    MODE = "strombom"
@@ -477,15 +477,22 @@ while robot.step(timestep) != -1:
    left_ear.setPosition(ear_pos)
    right_ear.setPosition(-ear_pos)

-    # --- DAgger: early-stop when all GT sheep are penned ---
-    if MODE == "dagger" and _gt_sheep:
+    # --- Early-stop when all GT sheep are penned (all modes) ---
+    # The dog isn't a Supervisor so it can't call simulationQuit() —
+    # instead we write a sentinel file the launcher polls for and uses
+    # to kill the Webots process. Guarded by a non-empty `_gt_sheep` so
+    # we don't fire in the first few steps, before the receiver fills.
+    if _gt_sheep and not os.path.exists(_DAGGER_DONE_FILE):
        gt_active_count = sum(1 for x, y in _gt_sheep.values()
                              if not is_penned_position(x, y))
-        if gt_active_count == 0 and not os.path.exists(_DAGGER_DONE_FILE):
-            _dump_dagger_log()
+        if gt_active_count == 0:
+            if MODE == "dagger":
+                _dump_dagger_log()
+            os.makedirs(os.path.dirname(_DAGGER_DONE_FILE), exist_ok=True)
            open(_DAGGER_DONE_FILE, "w").close()
-            print(f"[dog dagger] all {len(_gt_sheep)} sheep penned — "
-                  f"wrote {_DAGGER_DONE_FILE}, exiting early")
+            print(f"[dog] all {len(_gt_sheep)} sheep penned at step "
+                  f"{step_count} — wrote {_DAGGER_DONE_FILE}, "
+                  f"launcher will close Webots")

    if MODE == "dagger" and step_count % DAGGER_FLUSH_STEPS == 0 and DAGGER_LOG_OBS:
        _dump_dagger_log()