Checkpoint 5 - incomplete

This commit is contained in:
Johnny Fernandes
2026-05-11 10:35:39 +01:00
parent 6688325d89
commit b457155538
13 changed files with 174 additions and 74 deletions
+23 -16
View File
@@ -8,11 +8,11 @@ env vars on some setups):
sequential → single-target "pin and push" — drives the sheep
closest to the pen.
bc → behaviour-cloned MLP, trained on Strömbom demos via
sim. Default policy directory: training/runs/bc_v3.
sim. Default policy directory: training/runs/bc.
rl → KL-regularised PPO fine-tune of the BC policy. Same
obs/action space as bc; refines time-to-pen via
environment reward while staying anchored to bc.
Default policy directory: training/runs/rl_v1.
Default policy directory: training/runs/rl.
dagger → DAgger data collection. Reads sheep ground-truth
via the receiver, computes the active-scan teacher's
recommended action at every step, drives with either
@@ -122,9 +122,9 @@ def _resolve_policy_dir(mode: str) -> str:
1. HERDING_POLICY_DIR env var or runtime-cfg entry, if it points
to a real directory.
2. Mode-specific default:
bc → training/runs/bc_v3 (Strömbom-imitated MLP)
rl → training/runs/rl_v1 (KL-PPO fine-tune of bc_v3)
3. Fall back to bc_v3.
bc → training/runs/bc (Strömbom-imitated MLP)
rl → training/runs/rl (KL-PPO fine-tune of bc)
3. Fall back to bc.
All checkpoints are frame-stacked K = 4; ``policy_loader`` reads
the stacking factor from the policy's observation space.
"""
@@ -133,9 +133,9 @@ def _resolve_policy_dir(mode: str) -> str:
if env_dir and os.path.isdir(env_dir):
return env_dir
mode_default = {
"bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"),
"rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl_v1"),
"dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"),
"bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
"rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl"),
"dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
}
primary = mode_default.get(mode, mode_default["bc"])
if os.path.isdir(primary):
@@ -150,9 +150,9 @@ def _resolve_policy_dir(mode: str) -> str:
_VALID_MODES = ("bc", "rl", "strombom", "sequential", "dagger", "diag")
# Back-compat: an old config saying HERDING_MODE=rl meant "the BC policy".
# We now use `rl` strictly for the KL-PPO fine-tune. If the rl_v1
# We now use `rl` strictly for the KL-PPO fine-tune. If the rl
# directory isn't present, _resolve_policy_dir below silently falls
# back to bc_v3, preserving the old behaviour.
# back to bc, preserving the old behaviour.
if MODE not in _VALID_MODES:
print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
MODE = "strombom"
@@ -477,15 +477,22 @@ while robot.step(timestep) != -1:
left_ear.setPosition(ear_pos)
right_ear.setPosition(-ear_pos)
# --- DAgger: early-stop when all GT sheep are penned ---
if MODE == "dagger" and _gt_sheep:
# --- Early-stop when all GT sheep are penned (all modes) ---
# The dog isn't a Supervisor so it can't call simulationQuit() —
# instead we write a sentinel file the launcher polls for and uses
to kill the Webots process. Gated on `_gt_sheep` being non-empty so we
don't fire during the first few steps while the receiver fills.
if _gt_sheep and not os.path.exists(_DAGGER_DONE_FILE):
gt_active_count = sum(1 for x, y in _gt_sheep.values()
if not is_penned_position(x, y))
if gt_active_count == 0 and not os.path.exists(_DAGGER_DONE_FILE):
_dump_dagger_log()
if gt_active_count == 0:
if MODE == "dagger":
_dump_dagger_log()
os.makedirs(os.path.dirname(_DAGGER_DONE_FILE), exist_ok=True)
open(_DAGGER_DONE_FILE, "w").close()
print(f"[dog dagger] all {len(_gt_sheep)} sheep penned "
f"wrote {_DAGGER_DONE_FILE}, exiting early")
print(f"[dog] all {len(_gt_sheep)} sheep penned at step "
f"{step_count}wrote {_DAGGER_DONE_FILE}, "
f"launcher will close Webots")
if MODE == "dagger" and step_count % DAGGER_FLUSH_STEPS == 0 and DAGGER_LOG_OBS:
_dump_dagger_log()