r"""
Load a saved run and evaluate the policy at every n_sheep from 1..N.
Tells you exactly where the curriculum stopped working.

Usage:
    python eval_per_sheep.py --run-dir runs/ppo_v3
    python eval_per_sheep.py --run-dir runs/ppo_v3 --max-sheep 10 --episodes 20
    python eval_per_sheep.py --model runs/ppo_v3/final_model.zip \
        --vecnorm runs/ppo_v3/vecnorm.pkl
"""
# NOTE: the docstring above is a raw string so the trailing backslash in the
# usage example is kept verbatim instead of acting as a line continuation.
import argparse
import os
from copy import deepcopy

import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

from herding_env import HerdingEnv
from train import _classify, COMPACT_RADIUS


def evaluate(model: PPO, vn_template: VecNormalize, n_sheep: int,
             n_episodes: int, max_steps: int) -> dict:
    """Roll out deterministic episodes at a fixed flock size and summarise them.

    A fresh single-env ``VecNormalize`` is built for ``n_sheep`` and given
    copies of the normalisation statistics held by ``vn_template``, so the
    policy sees observations scaled the same way as in the saved run.

    Args:
        model: Policy exposing ``predict(obs, deterministic=True)``.
        vn_template: Loaded ``VecNormalize`` whose ``obs_rms`` / ``ret_rms``
            (and clipping settings) are copied; it is not modified.
        n_sheep: Flock size passed to ``HerdingEnv``.
        n_episodes: Number of episodes to run; must be at least 1.
        max_steps: Per-episode step limit passed to ``HerdingEnv``.

    Returns:
        A dict with keys ``n_sheep``, ``success_rate``, ``failure`` (mapping
        of ``_classify`` label -> episode count), ``mean_action``,
        ``mean_min_radius``, ``mean_min_dog_com`` and ``mean_min_pen``.

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    if n_episodes < 1:
        # Previously this surfaced as ZeroDivisionError after building the env.
        raise ValueError(f"n_episodes must be >= 1, got {n_episodes}")

    raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep, max_steps=max_steps)])
    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
    vn.obs_rms = deepcopy(vn_template.obs_rms)
    vn.ret_rms = deepcopy(vn_template.ret_rms)
    # Keep clipping/epsilon in sync with the saved wrapper; otherwise a run
    # trained with non-default values would be evaluated on differently
    # scaled observations.
    vn.clip_obs = vn_template.clip_obs
    vn.epsilon = vn_template.epsilon

    failure = {}
    successes = 0
    act_mags, min_radii, min_dog_com, min_pen = [], [], [], []

    try:
        inner = vn.envs[0]  # the single underlying env; same object all run
        for _ in range(n_episodes):
            obs = vn.reset()
            done = False
            ep_radius, ep_com_dist, ep_dog_com, ep_act = [], [], [], []
            while not done:
                # Sample the flock state BEFORE stepping. DummyVecEnv resets
                # the env automatically as soon as an episode ends, so reading
                # the env after the terminal step would record the *next*
                # episode's initial state and feed it to _classify / the
                # min-distance statistics.
                com, radius, _ = inner._flock_stats()
                ep_radius.append(radius)
                ep_com_dist.append(float(np.linalg.norm(com - inner.PEN_CENTER)))
                ep_dog_com.append(float(np.linalg.norm(inner.dog_pos - com)))

                action, _ = model.predict(obs, deterministic=True)
                ep_act.append(float(np.linalg.norm(action[0])))
                obs, _, dones, infos = vn.step(action)
                done = dones[0]

            # ``infos`` from the terminal step still describes the finished
            # episode (the auto-reset only replaces the observation).
            npen = infos[0].get("n_penned", 0)
            success = npen == n_sheep
            successes += int(success)
            mode = _classify(ep_radius, ep_com_dist, npen, n_sheep, success)
            failure[mode] = failure.get(mode, 0) + 1
            act_mags.extend(ep_act)
            min_radii.append(min(ep_radius))
            min_dog_com.append(min(ep_dog_com))
            min_pen.append(min(ep_com_dist))
    finally:
        # Release the env even if the rollout raises.
        vn.close()

    return {
        "n_sheep": n_sheep,
        "success_rate": successes / n_episodes,
        "failure": failure,
        "mean_action": float(np.mean(act_mags)),
        "mean_min_radius": float(np.mean(min_radii)),
        "mean_min_dog_com": float(np.mean(min_dog_com)),
        "mean_min_pen": float(np.mean(min_pen)),
    }


def _resolve_paths(parser, args):
    """Return ``(model_path, vecnorm_path)`` for the parsed CLI arguments.

    ``--run-dir`` takes precedence; inside it ``final_model.zip`` is preferred
    and ``best_model/best_model.zip`` is the fallback. Without ``--run-dir``
    both ``--model`` and ``--vecnorm`` are required; otherwise the parser
    exits with a usage error instead of passing ``None`` to the loaders.
    """
    if args.run_dir:
        model_path = os.path.join(args.run_dir, "final_model.zip")
        if not os.path.exists(model_path):
            model_path = os.path.join(args.run_dir, "best_model", "best_model.zip")
        return model_path, os.path.join(args.run_dir, "vecnorm.pkl")
    if args.model and args.vecnorm:
        return args.model, args.vecnorm
    parser.error("pass --run-dir, or both --model and --vecnorm")


def main():
    """CLI entry point: print one summary row per flock size 1..--max-sheep."""
    p = argparse.ArgumentParser()
    p.add_argument("--run-dir", type=str, default=None)
    p.add_argument("--model", type=str, default=None)
    p.add_argument("--vecnorm", type=str, default=None)
    p.add_argument("--max-sheep", type=int, default=10)
    p.add_argument("--episodes", type=int, default=10)
    p.add_argument("--max-steps", type=int, default=2000)
    args = p.parse_args()

    if args.episodes < 1:
        p.error("--episodes must be >= 1")
    model_path, vn_path = _resolve_paths(p, args)

    print(f"Loading model: {model_path}")
    print(f"Loading vecnorm: {vn_path}\n")
    model = PPO.load(model_path, device="cpu")
    # The n_sheep=1 env exists only because VecNormalize.load needs a venv to
    # wrap; evaluate() builds its own env per flock size.
    # NOTE(review): VecNormalize.load unpickles the file — only point this at
    # run directories you trust.
    raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=1, max_steps=args.max_steps)])
    vn_template = VecNormalize.load(vn_path, raw)

    try:
        print(f"{'n_sheep':>7} {'success':>8} {'act':>6} {'min_r':>7} "
              f"{'dog→com':>8} {'com→pen':>8} failure breakdown")
        print("-" * 90)
        for n in range(1, args.max_sheep + 1):
            r = evaluate(model, vn_template, n, args.episodes, args.max_steps)
            # Most frequent outcome first.
            fb = " ".join(f"{m}={c}" for m, c in
                          sorted(r["failure"].items(), key=lambda x: -x[1]))
            print(f"{n:>7d} {r['success_rate']*100:>6.0f}% "
                  f"{r['mean_action']:>6.2f} "
                  f"{r['mean_min_radius']:>6.2f}m "
                  f"{r['mean_min_dog_com']:>7.2f}m "
                  f"{r['mean_min_pen']:>7.2f}m {fb}")
    finally:
        # Close the template env, which was previously leaked.
        vn_template.close()


if __name__ == "__main__":
    main()
b/training/herding_env.py index 7c408a0..440d319 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -179,10 +179,11 @@ class HerdingEnv(gym.Env): newly_penned = n_penned - self._prev_penned self._prev_penned = n_penned - reward = self._reward(n_penned, newly_penned) + reward, rcomps = self._reward(n_penned, newly_penned) terminated = n_penned == self.n_sheep truncated = self._step_count >= self.max_steps - info = {"n_penned": n_penned, "n_sheep": self.n_sheep} + info = {"n_penned": n_penned, "n_sheep": self.n_sheep, + "rcomps": rcomps} if self.render_mode == "human": self.render() @@ -297,7 +298,7 @@ class HerdingEnv(gym.Env): active_mask.sum() / self.n_sheep, ], dtype=np.float32) - def _reward(self, n_penned: int, newly_penned: int) -> float: + def _reward(self, n_penned: int, newly_penned: int): active = ~self.penned[:self.n_sheep] # Per-sheep progress toward pen: fires whenever any sheep moves closer. @@ -326,12 +327,18 @@ class HerdingEnv(gym.Env): else: alignment = 0.0 - reward = r_progress + alignment - reward += newly_penned * self.W_PEN_BONUS - reward -= self.W_STEP_COST - if n_penned == self.n_sheep: - reward += self.W_COMPLETE - return reward + r_pen_bonus = newly_penned * self.W_PEN_BONUS + r_step_cost = -self.W_STEP_COST + r_complete = self.W_COMPLETE if n_penned == self.n_sheep else 0.0 + reward = r_progress + alignment + r_pen_bonus + r_step_cost + r_complete + rcomps = { + "progress": float(r_progress), + "alignment": float(alignment), + "pen_bonus": float(r_pen_bonus), + "step_cost": float(r_step_cost), + "complete": float(r_complete), + } + return reward, rcomps def _step_sheep(self, i: int) -> np.ndarray: """Apply one timestep of boid dynamics to sheep i (mirrors sheep.py).""" diff --git a/training/train.py b/training/train.py index 7f4fc05..b961cd6 100644 --- a/training/train.py +++ b/training/train.py @@ -83,6 +83,13 @@ class CurriculumCallback(BaseCallback): self._stage_start = 0 def _advance(self): + prev_sheep = 
self._cur_sheep + recent_sr = (np.mean(self._successes) if self._successes else float("nan")) + if self.verbose: + print(f"\n[Curriculum] leaving stage n_sheep={prev_sheep} " + f"after {self.num_timesteps - self._stage_start:,} steps " + f"| training success rate (last {len(self._successes)} eps) = " + f"{recent_sr*100:.0f}%") self._cur_sheep += 1 self.training_env.env_method("set_n_sheep", self._cur_sheep) if self.eval_env is not None: @@ -90,26 +97,26 @@ class CurriculumCallback(BaseCallback): self._stage_start = self.num_timesteps self._successes.clear() if self.verbose: - print(f"\n[Curriculum] → {self._cur_sheep} sheep " + print(f"[Curriculum] → {self._cur_sheep} sheep " f"at step {self.num_timesteps:,}\n") def _on_step(self) -> bool: if self._cur_sheep >= self.max_sheep: return True + # Always track training-side success (success = sheep all penned, not truncated) + for info, done in zip(self.locals["infos"], self.locals["dones"]): + if done: + npen = info.get("n_penned", 0) + nshp = info.get("n_sheep", self._cur_sheep) + self._successes.append(1 if npen == nshp else 0) + if len(self._successes) > self.window: + self._successes.pop(0) + if self.steps_per_stage is not None: - # Time-based: advance every steps_per_stage env steps if self.num_timesteps - self._stage_start >= self.steps_per_stage: self._advance() else: - # Success-rate based - for info, done in zip(self.locals["infos"], self.locals["dones"]): - if done: - truncated = info.get("TimeLimit.truncated", False) - self._successes.append(0 if truncated else 1) - if len(self._successes) > self.window: - self._successes.pop(0) - if (len(self._successes) >= self.min_episodes and np.mean(self._successes) >= self.threshold): self._advance() @@ -131,11 +138,13 @@ class DiagnosticCallback(BaseCallback): """ def __init__(self, diag_freq: int = 500_000, n_episodes: int = 20, - max_steps: int = 2000, verbose: int = 1): + max_steps: int = 2000, abort_on_stall: bool = True, + verbose: int = 1): 
super().__init__(verbose) self.diag_freq = diag_freq self.n_episodes = n_episodes self.max_steps = max_steps + self.abort_on_stall = abort_on_stall self._last_diag = 0 self._prev_dominant = None # (n_sheep, mode) from last check self._stall_count = 0 @@ -156,11 +165,19 @@ class DiagnosticCallback(BaseCallback): failure_counts = {} successes = 0 + all_action_mags = [] + ep_min_radii = [] + ep_min_dog_com = [] # closest the dog ever got to flock COM + ep_min_pen_dists = [] # closest COM ever got to pen + rcomp_sums = {"progress":0.0,"alignment":0.0,"pen_bonus":0.0, + "step_cost":0.0,"complete":0.0} + rcomp_n = 0 for _ in range(self.n_episodes): obs = vn.reset() done = False - ep_radius, ep_com_dist = [], [] + ep_radius, ep_com_dist, ep_dog_com = [], [], [] + ep_actions = [] n_penned = 0 while not done: @@ -173,12 +190,24 @@ class DiagnosticCallback(BaseCallback): ep_com_dist.append( float(np.linalg.norm(com - inner.PEN_CENTER)) ) + ep_dog_com.append( + float(np.linalg.norm(inner.dog_pos - com)) + ) + ep_actions.append(float(np.linalg.norm(action[0]))) + rc = infos[0].get("rcomps") + if rc is not None: + for k in rcomp_sums: rcomp_sums[k] += rc[k] + rcomp_n += 1 n_penned = infos[0].get("n_penned", 0) success = n_penned == n_sheep successes += int(success) mode = _classify(ep_radius, ep_com_dist, n_penned, n_sheep, success) failure_counts[mode] = failure_counts.get(mode, 0) + 1 + all_action_mags.extend(ep_actions) + ep_min_radii.append(min(ep_radius)) + ep_min_dog_com.append(min(ep_dog_com)) + ep_min_pen_dists.append(min(ep_com_dist)) vn.close() @@ -190,13 +219,30 @@ class DiagnosticCallback(BaseCallback): f"success={success_rate*100:.0f}%]") for m, c in sorted(failure_counts.items(), key=lambda x: -x[1]): print(f" {m:<26} {c}/{self.n_episodes}") + mean_act = float(np.mean(all_action_mags)) if all_action_mags else 0.0 + p10 = float(np.percentile(all_action_mags, 10)) if all_action_mags else 0.0 + p90 = float(np.percentile(all_action_mags, 90)) if all_action_mags else 
0.0 + print(f" action_mag mean={mean_act:.3f} p10={p10:.3f} p90={p90:.3f} " + f"(0=stopped, 1=full speed)") + print(f" min_flock_radius mean={np.mean(ep_min_radii):.2f}m " + f"best={np.min(ep_min_radii):.2f}m (target <5m to compact)") + print(f" min_dog_to_com mean={np.mean(ep_min_dog_com):.2f}m " + f"best={np.min(ep_min_dog_com):.2f}m (FLEE_DIST=7m)") + print(f" min_com_to_pen mean={np.mean(ep_min_pen_dists):.2f}m " + f"best={np.min(ep_min_pen_dists):.2f}m") + if rcomp_n > 0: + print(f" reward/step (mean): " + " ".join( + f"{k}={rcomp_sums[k]/rcomp_n:+.4f}" for k in + ("progress","alignment","pen_bonus","step_cost","complete") + )) - # Stall detection: same dominant failure at same n_sheep 5 checks in a row, - # and only after 3M total steps (give early stages time to warm up). + # Stall detection — disabled when --no-stall-abort or when we've never + # seen any stage succeed (we want full visibility into what's happening). key = (n_sheep, dominant) if key == self._prev_dominant and dominant != "SUCCESS": self._stall_count += 1 - if self._stall_count >= 5 and self.num_timesteps >= 3_000_000: + if (self.abort_on_stall and self._stall_count >= 5 + and self.num_timesteps >= 3_000_000): print(f"\n[Diag] STALL DETECTED — '{dominant}' on {n_sheep} sheep " f"for {self._stall_count} consecutive checks. 
" f"Aborting training early.") @@ -250,6 +296,9 @@ def parse_args(): p.add_argument("--eval-eps", type=int, default=20) p.add_argument("--diag-freq", type=int, default=500_000, help="Run failure-mode diagnostics every N env steps") + p.add_argument("--no-stall-abort", action="store_true", + help="Disable early-abort on stall — run full --total-steps " + "for diagnostics") p.add_argument("--mixed", action="store_true", help="Randomise n_sheep each episode (consolidation pass, " "use with --resume after curriculum training)") @@ -306,6 +355,7 @@ def main(): diag_freq=args.diag_freq, n_episodes=20, max_steps=args.max_steps, + abort_on_stall=not args.no_stall_abort, ) callbacks = [checkpoint_cb, eval_cb, diag_cb]