Sheep training flock _ improver
This commit is contained in:
@@ -0,0 +1,109 @@
|
|||||||
|
"""
|
||||||
|
Load a saved run and evaluate the policy at every n_sheep from 1..N.
|
||||||
|
Tells you exactly where the curriculum stopped working.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python eval_per_sheep.py --run-dir runs/ppo_v3
|
||||||
|
python eval_per_sheep.py --run-dir runs/ppo_v3 --max-sheep 10 --episodes 20
|
||||||
|
python eval_per_sheep.py --model runs/ppo_v3/final_model.zip \
|
||||||
|
--vecnorm runs/ppo_v3/vecnorm.pkl
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from stable_baselines3 import PPO
|
||||||
|
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
|
||||||
|
|
||||||
|
from herding_env import HerdingEnv
|
||||||
|
from train import _classify, COMPACT_RADIUS
|
||||||
|
|
||||||
|
|
||||||
|
def _record_geometry(env, radii, com_to_pen, dog_to_com):
    """Append the env's current flock radius, COM-to-pen and dog-to-COM distances."""
    com, radius, _ = env._flock_stats()
    radii.append(radius)
    com_to_pen.append(float(np.linalg.norm(com - env.PEN_CENTER)))
    dog_to_com.append(float(np.linalg.norm(env.dog_pos - com)))


def evaluate(model, vn_template, n_sheep, n_episodes, max_steps):
    """Roll out the policy deterministically on a fixed flock size.

    A fresh single-env ``DummyVecEnv`` is wrapped in a non-training
    ``VecNormalize`` whose running statistics are deep-copied from
    ``vn_template``, so evaluation never mutates the template.

    Args:
        model: Policy exposing ``predict(obs, deterministic=True)``.
        vn_template: Object providing ``obs_rms`` and ``ret_rms`` to copy.
        n_sheep: Flock size passed to ``HerdingEnv``.
        n_episodes: Number of episodes to run; must be at least 1.
        max_steps: Per-episode step limit passed to ``HerdingEnv``.

    Returns:
        dict with keys ``n_sheep``, ``success_rate`` (fraction of episodes
        where ``n_penned == n_sheep``), ``failure`` (count per label returned
        by ``_classify``), ``mean_action`` (mean action norm over all steps),
        ``mean_min_radius``, ``mean_min_dog_com`` and ``mean_min_pen`` (each
        the mean over episodes of the per-episode minimum).

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    # Fail before building the env instead of dividing by zero at the end.
    if n_episodes < 1:
        raise ValueError(f"n_episodes must be >= 1, got {n_episodes}")

    raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep, max_steps=max_steps)])
    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
    vn.obs_rms = deepcopy(vn_template.obs_rms)
    vn.ret_rms = deepcopy(vn_template.ret_rms)

    failure = {}
    successes = 0
    act_mags, min_radii, min_dog_com, min_pen = [], [], [], []

    try:
        # The same env object is reused across resets, so look it up once.
        inner = vn.envs[0]

        for _ in range(n_episodes):
            obs = vn.reset()
            ep_radius, ep_com_dist, ep_dog_com, ep_act = [], [], [], []
            # Sample the starting state so the per-episode lists are never
            # empty, even if the episode ends on its very first step.
            _record_geometry(inner, ep_radius, ep_com_dist, ep_dog_com)

            done = False
            while not done:
                action, _state = model.predict(obs, deterministic=True)
                obs, _reward, dones, infos = vn.step(action)
                done = dones[0]
                ep_act.append(float(np.linalg.norm(action[0])))
                # DummyVecEnv resets the env as soon as an episode ends, so on
                # the terminal step the inner env already holds the NEXT
                # episode's initial state; sampling it would leak foreign data
                # into this episode's statistics.
                if not done:
                    _record_geometry(inner, ep_radius, ep_com_dist, ep_dog_com)

            # `infos` from the terminal step still describes the finished episode.
            npen = infos[0].get("n_penned", 0)
            success = npen == n_sheep
            successes += int(success)
            mode = _classify(ep_radius, ep_com_dist, npen, n_sheep, success)
            failure[mode] = failure.get(mode, 0) + 1

            act_mags.extend(ep_act)
            min_radii.append(min(ep_radius))
            min_dog_com.append(min(ep_dog_com))
            min_pen.append(min(ep_com_dist))
    finally:
        # Release the env even when a rollout raises.
        vn.close()

    return {
        "n_sheep": n_sheep,
        "success_rate": successes / n_episodes,
        "failure": failure,
        "mean_action": float(np.mean(act_mags)),
        "mean_min_radius": float(np.mean(min_radii)),
        "mean_min_dog_com": float(np.mean(min_dog_com)),
        "mean_min_pen": float(np.mean(min_pen)),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None):
    """Evaluate a saved policy at every flock size from 1 to ``--max-sheep``.

    The model and normalisation statistics come either from ``--run-dir``
    (``final_model.zip``, falling back to ``best_model/best_model.zip``, plus
    ``vecnorm.pkl``) or from an explicit ``--model`` / ``--vecnorm`` pair.
    One table row is printed per flock size.

    Args:
        argv: Optional argument list; ``None`` makes argparse read
            ``sys.argv[1:]``, which is what a plain ``main()`` call does.

    Raises:
        SystemExit: Via ``ArgumentParser.error`` when the arguments are
            incomplete or out of range.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--run-dir", type=str, default=None)
    p.add_argument("--model", type=str, default=None)
    p.add_argument("--vecnorm", type=str, default=None)
    p.add_argument("--max-sheep", type=int, default=10)
    p.add_argument("--episodes", type=int, default=10)
    p.add_argument("--max-steps", type=int, default=2000)
    args = p.parse_args(argv)

    # Reject inputs that would otherwise fail deep inside the loaders or
    # produce an empty or meaningless table.
    if not args.run_dir and not (args.model and args.vecnorm):
        p.error("provide --run-dir, or both --model and --vecnorm")
    if args.max_sheep < 1:
        p.error("--max-sheep must be >= 1")
    if args.episodes < 1:
        p.error("--episodes must be >= 1")

    if args.run_dir:
        model_path = os.path.join(args.run_dir, "final_model.zip")
        if not os.path.exists(model_path):
            model_path = os.path.join(args.run_dir, "best_model", "best_model.zip")
        vn_path = os.path.join(args.run_dir, "vecnorm.pkl")
    else:
        model_path = args.model
        vn_path = args.vecnorm

    print(f"Loading model: {model_path}")
    print(f"Loading vecnorm: {vn_path}\n")
    model = PPO.load(model_path, device="cpu")
    # The template env only exists so VecNormalize.load has something to wrap;
    # evaluate() copies the statistics onto its own env.
    raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=1, max_steps=args.max_steps)])
    vn_template = VecNormalize.load(vn_path, raw)

    try:
        print(f"{'n_sheep':>7} {'success':>8} {'act':>6} {'min_r':>7} "
              f"{'dog→com':>8} {'com→pen':>8} failure breakdown")
        print("-" * 90)
        for n in range(1, args.max_sheep + 1):
            r = evaluate(model, vn_template, n, args.episodes, args.max_steps)
            # Most frequent outcome first.
            fb = " ".join(f"{m}={c}" for m, c in
                          sorted(r["failure"].items(), key=lambda x: -x[1]))
            print(f"{n:>7d} {r['success_rate']*100:>6.0f}% "
                  f"{r['mean_action']:>6.2f} "
                  f"{r['mean_min_radius']:>6.2f}m "
                  f"{r['mean_min_dog_com']:>7.2f}m "
                  f"{r['mean_min_pen']:>7.2f}m {fb}")
    finally:
        # Release the template env even if an evaluation fails.
        vn_template.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the per-sheep evaluation only when executed directly.
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
+16
-9
@@ -179,10 +179,11 @@ class HerdingEnv(gym.Env):
|
|||||||
newly_penned = n_penned - self._prev_penned
|
newly_penned = n_penned - self._prev_penned
|
||||||
self._prev_penned = n_penned
|
self._prev_penned = n_penned
|
||||||
|
|
||||||
reward = self._reward(n_penned, newly_penned)
|
reward, rcomps = self._reward(n_penned, newly_penned)
|
||||||
terminated = n_penned == self.n_sheep
|
terminated = n_penned == self.n_sheep
|
||||||
truncated = self._step_count >= self.max_steps
|
truncated = self._step_count >= self.max_steps
|
||||||
info = {"n_penned": n_penned, "n_sheep": self.n_sheep}
|
info = {"n_penned": n_penned, "n_sheep": self.n_sheep,
|
||||||
|
"rcomps": rcomps}
|
||||||
|
|
||||||
if self.render_mode == "human":
|
if self.render_mode == "human":
|
||||||
self.render()
|
self.render()
|
||||||
@@ -297,7 +298,7 @@ class HerdingEnv(gym.Env):
|
|||||||
active_mask.sum() / self.n_sheep,
|
active_mask.sum() / self.n_sheep,
|
||||||
], dtype=np.float32)
|
], dtype=np.float32)
|
||||||
|
|
||||||
def _reward(self, n_penned: int, newly_penned: int) -> float:
|
def _reward(self, n_penned: int, newly_penned: int):
|
||||||
active = ~self.penned[:self.n_sheep]
|
active = ~self.penned[:self.n_sheep]
|
||||||
|
|
||||||
# Per-sheep progress toward pen: fires whenever any sheep moves closer.
|
# Per-sheep progress toward pen: fires whenever any sheep moves closer.
|
||||||
@@ -326,12 +327,18 @@ class HerdingEnv(gym.Env):
|
|||||||
else:
|
else:
|
||||||
alignment = 0.0
|
alignment = 0.0
|
||||||
|
|
||||||
reward = r_progress + alignment
|
r_pen_bonus = newly_penned * self.W_PEN_BONUS
|
||||||
reward += newly_penned * self.W_PEN_BONUS
|
r_step_cost = -self.W_STEP_COST
|
||||||
reward -= self.W_STEP_COST
|
r_complete = self.W_COMPLETE if n_penned == self.n_sheep else 0.0
|
||||||
if n_penned == self.n_sheep:
|
reward = r_progress + alignment + r_pen_bonus + r_step_cost + r_complete
|
||||||
reward += self.W_COMPLETE
|
rcomps = {
|
||||||
return reward
|
"progress": float(r_progress),
|
||||||
|
"alignment": float(alignment),
|
||||||
|
"pen_bonus": float(r_pen_bonus),
|
||||||
|
"step_cost": float(r_step_cost),
|
||||||
|
"complete": float(r_complete),
|
||||||
|
}
|
||||||
|
return reward, rcomps
|
||||||
|
|
||||||
def _step_sheep(self, i: int) -> np.ndarray:
|
def _step_sheep(self, i: int) -> np.ndarray:
|
||||||
"""Apply one timestep of boid dynamics to sheep i (mirrors sheep.py)."""
|
"""Apply one timestep of boid dynamics to sheep i (mirrors sheep.py)."""
|
||||||
|
|||||||
+65
-15
@@ -83,6 +83,13 @@ class CurriculumCallback(BaseCallback):
|
|||||||
self._stage_start = 0
|
self._stage_start = 0
|
||||||
|
|
||||||
def _advance(self):
|
def _advance(self):
|
||||||
|
prev_sheep = self._cur_sheep
|
||||||
|
recent_sr = (np.mean(self._successes) if self._successes else float("nan"))
|
||||||
|
if self.verbose:
|
||||||
|
print(f"\n[Curriculum] leaving stage n_sheep={prev_sheep} "
|
||||||
|
f"after {self.num_timesteps - self._stage_start:,} steps "
|
||||||
|
f"| training success rate (last {len(self._successes)} eps) = "
|
||||||
|
f"{recent_sr*100:.0f}%")
|
||||||
self._cur_sheep += 1
|
self._cur_sheep += 1
|
||||||
self.training_env.env_method("set_n_sheep", self._cur_sheep)
|
self.training_env.env_method("set_n_sheep", self._cur_sheep)
|
||||||
if self.eval_env is not None:
|
if self.eval_env is not None:
|
||||||
@@ -90,26 +97,26 @@ class CurriculumCallback(BaseCallback):
|
|||||||
self._stage_start = self.num_timesteps
|
self._stage_start = self.num_timesteps
|
||||||
self._successes.clear()
|
self._successes.clear()
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"\n[Curriculum] → {self._cur_sheep} sheep "
|
print(f"[Curriculum] → {self._cur_sheep} sheep "
|
||||||
f"at step {self.num_timesteps:,}\n")
|
f"at step {self.num_timesteps:,}\n")
|
||||||
|
|
||||||
def _on_step(self) -> bool:
|
def _on_step(self) -> bool:
|
||||||
if self._cur_sheep >= self.max_sheep:
|
if self._cur_sheep >= self.max_sheep:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
# Always track training-side success (success = sheep all penned, not truncated)
|
||||||
|
for info, done in zip(self.locals["infos"], self.locals["dones"]):
|
||||||
|
if done:
|
||||||
|
npen = info.get("n_penned", 0)
|
||||||
|
nshp = info.get("n_sheep", self._cur_sheep)
|
||||||
|
self._successes.append(1 if npen == nshp else 0)
|
||||||
|
if len(self._successes) > self.window:
|
||||||
|
self._successes.pop(0)
|
||||||
|
|
||||||
if self.steps_per_stage is not None:
|
if self.steps_per_stage is not None:
|
||||||
# Time-based: advance every steps_per_stage env steps
|
|
||||||
if self.num_timesteps - self._stage_start >= self.steps_per_stage:
|
if self.num_timesteps - self._stage_start >= self.steps_per_stage:
|
||||||
self._advance()
|
self._advance()
|
||||||
else:
|
else:
|
||||||
# Success-rate based
|
|
||||||
for info, done in zip(self.locals["infos"], self.locals["dones"]):
|
|
||||||
if done:
|
|
||||||
truncated = info.get("TimeLimit.truncated", False)
|
|
||||||
self._successes.append(0 if truncated else 1)
|
|
||||||
if len(self._successes) > self.window:
|
|
||||||
self._successes.pop(0)
|
|
||||||
|
|
||||||
if (len(self._successes) >= self.min_episodes
|
if (len(self._successes) >= self.min_episodes
|
||||||
and np.mean(self._successes) >= self.threshold):
|
and np.mean(self._successes) >= self.threshold):
|
||||||
self._advance()
|
self._advance()
|
||||||
@@ -131,11 +138,13 @@ class DiagnosticCallback(BaseCallback):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, diag_freq: int = 500_000, n_episodes: int = 20,
|
def __init__(self, diag_freq: int = 500_000, n_episodes: int = 20,
|
||||||
max_steps: int = 2000, verbose: int = 1):
|
max_steps: int = 2000, abort_on_stall: bool = True,
|
||||||
|
verbose: int = 1):
|
||||||
super().__init__(verbose)
|
super().__init__(verbose)
|
||||||
self.diag_freq = diag_freq
|
self.diag_freq = diag_freq
|
||||||
self.n_episodes = n_episodes
|
self.n_episodes = n_episodes
|
||||||
self.max_steps = max_steps
|
self.max_steps = max_steps
|
||||||
|
self.abort_on_stall = abort_on_stall
|
||||||
self._last_diag = 0
|
self._last_diag = 0
|
||||||
self._prev_dominant = None # (n_sheep, mode) from last check
|
self._prev_dominant = None # (n_sheep, mode) from last check
|
||||||
self._stall_count = 0
|
self._stall_count = 0
|
||||||
@@ -156,11 +165,19 @@ class DiagnosticCallback(BaseCallback):
|
|||||||
|
|
||||||
failure_counts = {}
|
failure_counts = {}
|
||||||
successes = 0
|
successes = 0
|
||||||
|
all_action_mags = []
|
||||||
|
ep_min_radii = []
|
||||||
|
ep_min_dog_com = [] # closest the dog ever got to flock COM
|
||||||
|
ep_min_pen_dists = [] # closest COM ever got to pen
|
||||||
|
rcomp_sums = {"progress":0.0,"alignment":0.0,"pen_bonus":0.0,
|
||||||
|
"step_cost":0.0,"complete":0.0}
|
||||||
|
rcomp_n = 0
|
||||||
|
|
||||||
for _ in range(self.n_episodes):
|
for _ in range(self.n_episodes):
|
||||||
obs = vn.reset()
|
obs = vn.reset()
|
||||||
done = False
|
done = False
|
||||||
ep_radius, ep_com_dist = [], []
|
ep_radius, ep_com_dist, ep_dog_com = [], [], []
|
||||||
|
ep_actions = []
|
||||||
n_penned = 0
|
n_penned = 0
|
||||||
|
|
||||||
while not done:
|
while not done:
|
||||||
@@ -173,12 +190,24 @@ class DiagnosticCallback(BaseCallback):
|
|||||||
ep_com_dist.append(
|
ep_com_dist.append(
|
||||||
float(np.linalg.norm(com - inner.PEN_CENTER))
|
float(np.linalg.norm(com - inner.PEN_CENTER))
|
||||||
)
|
)
|
||||||
|
ep_dog_com.append(
|
||||||
|
float(np.linalg.norm(inner.dog_pos - com))
|
||||||
|
)
|
||||||
|
ep_actions.append(float(np.linalg.norm(action[0])))
|
||||||
|
rc = infos[0].get("rcomps")
|
||||||
|
if rc is not None:
|
||||||
|
for k in rcomp_sums: rcomp_sums[k] += rc[k]
|
||||||
|
rcomp_n += 1
|
||||||
|
|
||||||
n_penned = infos[0].get("n_penned", 0)
|
n_penned = infos[0].get("n_penned", 0)
|
||||||
success = n_penned == n_sheep
|
success = n_penned == n_sheep
|
||||||
successes += int(success)
|
successes += int(success)
|
||||||
mode = _classify(ep_radius, ep_com_dist, n_penned, n_sheep, success)
|
mode = _classify(ep_radius, ep_com_dist, n_penned, n_sheep, success)
|
||||||
failure_counts[mode] = failure_counts.get(mode, 0) + 1
|
failure_counts[mode] = failure_counts.get(mode, 0) + 1
|
||||||
|
all_action_mags.extend(ep_actions)
|
||||||
|
ep_min_radii.append(min(ep_radius))
|
||||||
|
ep_min_dog_com.append(min(ep_dog_com))
|
||||||
|
ep_min_pen_dists.append(min(ep_com_dist))
|
||||||
|
|
||||||
vn.close()
|
vn.close()
|
||||||
|
|
||||||
@@ -190,13 +219,30 @@ class DiagnosticCallback(BaseCallback):
|
|||||||
f"success={success_rate*100:.0f}%]")
|
f"success={success_rate*100:.0f}%]")
|
||||||
for m, c in sorted(failure_counts.items(), key=lambda x: -x[1]):
|
for m, c in sorted(failure_counts.items(), key=lambda x: -x[1]):
|
||||||
print(f" {m:<26} {c}/{self.n_episodes}")
|
print(f" {m:<26} {c}/{self.n_episodes}")
|
||||||
|
mean_act = float(np.mean(all_action_mags)) if all_action_mags else 0.0
|
||||||
|
p10 = float(np.percentile(all_action_mags, 10)) if all_action_mags else 0.0
|
||||||
|
p90 = float(np.percentile(all_action_mags, 90)) if all_action_mags else 0.0
|
||||||
|
print(f" action_mag mean={mean_act:.3f} p10={p10:.3f} p90={p90:.3f} "
|
||||||
|
f"(0=stopped, 1=full speed)")
|
||||||
|
print(f" min_flock_radius mean={np.mean(ep_min_radii):.2f}m "
|
||||||
|
f"best={np.min(ep_min_radii):.2f}m (target <5m to compact)")
|
||||||
|
print(f" min_dog_to_com mean={np.mean(ep_min_dog_com):.2f}m "
|
||||||
|
f"best={np.min(ep_min_dog_com):.2f}m (FLEE_DIST=7m)")
|
||||||
|
print(f" min_com_to_pen mean={np.mean(ep_min_pen_dists):.2f}m "
|
||||||
|
f"best={np.min(ep_min_pen_dists):.2f}m")
|
||||||
|
if rcomp_n > 0:
|
||||||
|
print(f" reward/step (mean): " + " ".join(
|
||||||
|
f"{k}={rcomp_sums[k]/rcomp_n:+.4f}" for k in
|
||||||
|
("progress","alignment","pen_bonus","step_cost","complete")
|
||||||
|
))
|
||||||
|
|
||||||
# Stall detection: same dominant failure at same n_sheep 5 checks in a row,
|
# Stall detection — disabled when --no-stall-abort or when we've never
|
||||||
# and only after 3M total steps (give early stages time to warm up).
|
# seen any stage succeed (we want full visibility into what's happening).
|
||||||
key = (n_sheep, dominant)
|
key = (n_sheep, dominant)
|
||||||
if key == self._prev_dominant and dominant != "SUCCESS":
|
if key == self._prev_dominant and dominant != "SUCCESS":
|
||||||
self._stall_count += 1
|
self._stall_count += 1
|
||||||
if self._stall_count >= 5 and self.num_timesteps >= 3_000_000:
|
if (self.abort_on_stall and self._stall_count >= 5
|
||||||
|
and self.num_timesteps >= 3_000_000):
|
||||||
print(f"\n[Diag] STALL DETECTED — '{dominant}' on {n_sheep} sheep "
|
print(f"\n[Diag] STALL DETECTED — '{dominant}' on {n_sheep} sheep "
|
||||||
f"for {self._stall_count} consecutive checks. "
|
f"for {self._stall_count} consecutive checks. "
|
||||||
f"Aborting training early.")
|
f"Aborting training early.")
|
||||||
@@ -250,6 +296,9 @@ def parse_args():
|
|||||||
p.add_argument("--eval-eps", type=int, default=20)
|
p.add_argument("--eval-eps", type=int, default=20)
|
||||||
p.add_argument("--diag-freq", type=int, default=500_000,
|
p.add_argument("--diag-freq", type=int, default=500_000,
|
||||||
help="Run failure-mode diagnostics every N env steps")
|
help="Run failure-mode diagnostics every N env steps")
|
||||||
|
p.add_argument("--no-stall-abort", action="store_true",
|
||||||
|
help="Disable early-abort on stall — run full --total-steps "
|
||||||
|
"for diagnostics")
|
||||||
p.add_argument("--mixed", action="store_true",
|
p.add_argument("--mixed", action="store_true",
|
||||||
help="Randomise n_sheep each episode (consolidation pass, "
|
help="Randomise n_sheep each episode (consolidation pass, "
|
||||||
"use with --resume after curriculum training)")
|
"use with --resume after curriculum training)")
|
||||||
@@ -306,6 +355,7 @@ def main():
|
|||||||
diag_freq=args.diag_freq,
|
diag_freq=args.diag_freq,
|
||||||
n_episodes=20,
|
n_episodes=20,
|
||||||
max_steps=args.max_steps,
|
max_steps=args.max_steps,
|
||||||
|
abort_on_stall=not args.no_stall_abort,
|
||||||
)
|
)
|
||||||
callbacks = [checkpoint_cb, eval_cb, diag_cb]
|
callbacks = [checkpoint_cb, eval_cb, diag_cb]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user