Checkpoint 5 - incomplete
This commit is contained in:
@@ -8,11 +8,11 @@ env vars on some setups):
|
||||
sequential → single-target "pin and push" — drives the sheep
|
||||
closest to the pen.
|
||||
bc → behaviour-cloned MLP, trained on Strömbom demos via
|
||||
sim. Default policy directory: training/runs/bc_v3.
|
||||
sim. Default policy directory: training/runs/bc.
|
||||
rl → KL-regularised PPO fine-tune of the BC policy. Same
|
||||
obs/action space as bc; refines time-to-pen via
|
||||
environment reward while staying anchored to bc.
|
||||
Default policy directory: training/runs/rl_v1.
|
||||
Default policy directory: training/runs/rl.
|
||||
dagger → DAgger data collection. Reads sheep ground-truth
|
||||
via the receiver, computes the active-scan teacher's
|
||||
recommended action at every step, drives with either
|
||||
@@ -122,9 +122,9 @@ def _resolve_policy_dir(mode: str) -> str:
|
||||
1. HERDING_POLICY_DIR env var or runtime-cfg entry, if it points
|
||||
to a real directory.
|
||||
2. Mode-specific default:
|
||||
bc → training/runs/bc_v3 (Strömbom-imitated MLP)
|
||||
rl → training/runs/rl_v1 (KL-PPO fine-tune of bc_v3)
|
||||
3. Fall back to bc_v3.
|
||||
bc → training/runs/bc (Strömbom-imitated MLP)
|
||||
rl → training/runs/rl (KL-PPO fine-tune of bc)
|
||||
3. Fall back to bc.
|
||||
All checkpoints are frame-stacked K = 4; ``policy_loader`` reads
|
||||
the stacking factor from the policy's observation space.
|
||||
"""
|
||||
@@ -133,9 +133,9 @@ def _resolve_policy_dir(mode: str) -> str:
|
||||
if env_dir and os.path.isdir(env_dir):
|
||||
return env_dir
|
||||
mode_default = {
|
||||
"bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"),
|
||||
"rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl_v1"),
|
||||
"dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"),
|
||||
"bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
|
||||
"rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl"),
|
||||
"dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc"),
|
||||
}
|
||||
primary = mode_default.get(mode, mode_default["bc"])
|
||||
if os.path.isdir(primary):
|
||||
@@ -150,9 +150,9 @@ def _resolve_policy_dir(mode: str) -> str:
|
||||
|
||||
_VALID_MODES = ("bc", "rl", "strombom", "sequential", "dagger", "diag")
|
||||
# Back-compat: an old config saying HERDING_MODE=rl meant "the BC policy".
|
||||
# We now use `rl` strictly for the KL-PPO fine-tune. If the rl_v1
|
||||
# We now use `rl` strictly for the KL-PPO fine-tune. If the rl
|
||||
# directory isn't present, _resolve_policy_dir below silently falls
|
||||
# back to bc_v3, preserving the old behaviour.
|
||||
# back to bc, preserving the old behaviour.
|
||||
if MODE not in _VALID_MODES:
|
||||
print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.")
|
||||
MODE = "strombom"
|
||||
@@ -477,15 +477,22 @@ while robot.step(timestep) != -1:
|
||||
left_ear.setPosition(ear_pos)
|
||||
right_ear.setPosition(-ear_pos)
|
||||
|
||||
# --- DAgger: early-stop when all GT sheep are penned ---
|
||||
if MODE == "dagger" and _gt_sheep:
|
||||
# --- Early-stop when all GT sheep are penned (all modes) ---
|
||||
# The dog isn't a Supervisor so it can't call simulationQuit() —
|
||||
# instead we write a sentinel file the launcher polls for and uses
|
||||
# to kill the Webots process. Guarded by `_gt_sheep` so we don't
|
||||
# fire during the first few steps while the receiver fills.
|
||||
if _gt_sheep and not os.path.exists(_DAGGER_DONE_FILE):
|
||||
gt_active_count = sum(1 for x, y in _gt_sheep.values()
|
||||
if not is_penned_position(x, y))
|
||||
if gt_active_count == 0 and not os.path.exists(_DAGGER_DONE_FILE):
|
||||
_dump_dagger_log()
|
||||
if gt_active_count == 0:
|
||||
if MODE == "dagger":
|
||||
_dump_dagger_log()
|
||||
os.makedirs(os.path.dirname(_DAGGER_DONE_FILE), exist_ok=True)
|
||||
open(_DAGGER_DONE_FILE, "w").close()
|
||||
print(f"[dog dagger] all {len(_gt_sheep)} sheep penned — "
|
||||
f"wrote {_DAGGER_DONE_FILE}, exiting early")
|
||||
print(f"[dog] all {len(_gt_sheep)} sheep penned at step "
|
||||
f"{step_count} — wrote {_DAGGER_DONE_FILE}, "
|
||||
f"launcher will close Webots")
|
||||
|
||||
if MODE == "dagger" and step_count % DAGGER_FLUSH_STEPS == 0 and DAGGER_LOG_OBS:
|
||||
_dump_dagger_log()
|
||||
|
||||
Reference in New Issue
Block a user