From 17eb25864e1bedbfb6d38d3c748c17bc3e8bdced Mon Sep 17 00:00:00 2001 From: Johnny Fernandes Date: Fri, 24 Apr 2026 10:58:36 +0100 Subject: [PATCH] Sheep training flock of 10 fix? --- .../shepherd_dog_rl/shepherd_dog_rl.py | 15 +- training/diagnose.py | 223 ++++++++++++++++++ training/herding_env.py | 82 ++++--- 3 files changed, 280 insertions(+), 40 deletions(-) create mode 100644 training/diagnose.py diff --git a/controllers/shepherd_dog_rl/shepherd_dog_rl.py b/controllers/shepherd_dog_rl/shepherd_dog_rl.py index ed91682..d94c574 100644 --- a/controllers/shepherd_dog_rl/shepherd_dog_rl.py +++ b/controllers/shepherd_dog_rl/shepherd_dog_rl.py @@ -80,14 +80,15 @@ def build_obs(dog_pos: np.ndarray, n_active = len(active_pos) if n_active > 0: - com = active_pos.mean(axis=0) + com = active_pos.mean(axis=0) d_from_com = np.linalg.norm(active_pos - com, axis=1) - radius = float(d_from_com.max()) - mean_disp = float(d_from_com.mean()) - far = active_pos[int(np.argmax(d_from_com))] + sorted_idx = np.argsort(d_from_com)[::-1] + radius = float(d_from_com[sorted_idx[0]]) + far = active_pos[sorted_idx[0]] + second_far_dist = float(d_from_com[sorted_idx[1]]) if len(sorted_idx) > 1 else 0.0 else: com = PEN_CENTER.copy() - radius = mean_disp = 0.0 + radius = second_far_dist = 0.0 far = PEN_CENTER.copy() frac_active = n_active / max(n_sheep, 1) @@ -98,8 +99,8 @@ def build_obs(dog_pos: np.ndarray, (far[0] - dog_pos[0]) / D, (far[1] - dog_pos[1]) / D, (PEN_CENTER[0] - com[0]) / D, (PEN_CENTER[1] - com[1]) / D, (PEN_CENTER[0] - far[0]) / D, (PEN_CENTER[1] - far[1]) / D, - radius / D, - mean_disp / D, + radius / D, + second_far_dist / D, frac_active, ], dtype=np.float32) diff --git a/training/diagnose.py b/training/diagnose.py new file mode 100644 index 0000000..59022a1 --- /dev/null +++ b/training/diagnose.py @@ -0,0 +1,223 @@ +""" +Episode-level diagnostics for the herding policy. + +Runs N episodes and for each one tracks: + - flock radius over time + - COM-to-pen distance over time + - dog position over time + - when (if ever) the flock first became compact + - failure mode classification + +Then produces: + 1. Console summary of failure modes + 2. Per-episode time-series plots (radius + com_dist) + 3. Optional rendered playback of the worst episodes + +Usage +----- + python diagnose.py --model runs/ppo_consolidation/final_model.zip \ + --vecnorm runs/ppo_consolidation/vecnorm.pkl \ + --n-sheep 5 --episodes 20 + + # Watch the policy live (first episode rendered): + python diagnose.py ... --render + + # Save plots to a directory instead of showing interactively: + python diagnose.py ... --plot-dir debug_plots/ +""" + +import argparse +import os +import numpy as np +import matplotlib +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches + +from stable_baselines3 import PPO +from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize +from herding_env import HerdingEnv + + +# ── failure mode constants ──────────────────────────────────────────────────── + +COMPACT_RADIUS = 5.0 # must match DRIVE_GATE_RADIUS in herding_env.py + + +def classify_failure(ep_radius, ep_com_dist, n_penned, n_sheep, success): + if success: + return "SUCCESS" + if min(ep_radius) > COMPACT_RADIUS: + return "NEVER_COMPACT" # flock was always too scattered + first_compact = next(i for i, r in enumerate(ep_radius) if r <= COMPACT_RADIUS) + min_com_after = min(ep_com_dist[first_compact:]) + pen_close = 3.0 # COM within 3m of pen counts as "got close" + if min_com_after > pen_close: + return "COMPACT_CANT_DRIVE" # compacted but never drove to pen + if n_penned == 0: + return "DROVE_NO_SHEEP" # got near pen, nothing went in + return f"PARTIAL_{n_penned}of{n_sheep}" # some in, not all + + +# ── main ───────────────────────────────────────────────────────────────────── + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--model", required=True) + p.add_argument("--vecnorm", default=None) + p.add_argument("--n-sheep", type=int, default=5) + p.add_argument("--episodes", type=int, default=20) + p.add_argument("--max-steps", type=int, default=4000) + p.add_argument("--render", action="store_true", + help="Show matplotlib animation of the first episode") + p.add_argument("--plot-dir", default=None, + help="Save time-series plots here (one per episode)") + p.add_argument("--seed", type=int, default=0) + return p.parse_args() + + +def make_env(n_sheep, max_steps, render_mode=None): + def _init(): + return HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, + render_mode=render_mode) + return _init + + +def main(): + args = parse_args() + + if args.plot_dir: + os.makedirs(args.plot_dir, exist_ok=True) + matplotlib.use("Agg") + + render_mode = "human" if args.render else None + raw_env = DummyVecEnv([make_env(args.n_sheep, args.max_steps, render_mode)]) + + if args.vecnorm: + env = VecNormalize.load(args.vecnorm, raw_env) + env.training = False + env.norm_reward = False + else: + env = raw_env + + model = PPO.load(args.model, env=env) + + failure_counts = {} + all_ep_data = [] + + for ep in range(args.episodes): + obs = env.reset() + done = False + step = 0 + + ep_radius = [] + ep_com_dist = [] + ep_dog_x = [] + ep_dog_y = [] + ep_n_penned = [] + + while not done: + action, _ = model.predict(obs, deterministic=True) + obs, _, dones, infos = env.step(action) + done = dones[0] + step += 1 + + inner = env.envs[0] if hasattr(env, "envs") else env.venv.envs[0] + com, radius, _ = inner._flock_stats() + com_dist = float(np.linalg.norm(com - inner.PEN_CENTER)) + n_penned = int(inner.penned[:inner.n_sheep].sum()) + + ep_radius.append(radius) + ep_com_dist.append(com_dist) + ep_dog_x.append(float(inner.dog_pos[0])) + ep_dog_y.append(float(inner.dog_pos[1])) + ep_n_penned.append(n_penned) + + info = infos[0] + n_pen = info.get("n_penned", 0) + n_sheep = info.get("n_sheep", args.n_sheep) + success = n_pen == n_sheep + mode = classify_failure(ep_radius, ep_com_dist, n_pen, n_sheep, success) + + failure_counts[mode] = failure_counts.get(mode, 0) + 1 + + compact_step = next((i for i, r in enumerate(ep_radius) + if r <= COMPACT_RADIUS), None) + min_radius = min(ep_radius) + min_com_dist = min(ep_com_dist) + + print(f" ep {ep+1:>3} steps={step:>5} penned={n_pen}/{n_sheep}" + f" min_r={min_radius:.1f}m" + f" min_com={min_com_dist:.1f}m" + f" compact@step={compact_step if compact_step is not None else 'NEVER'}" + f" [{mode}]") + + all_ep_data.append(dict( + ep=ep, radius=ep_radius, com_dist=ep_com_dist, + dog_x=ep_dog_x, dog_y=ep_dog_y, n_penned=ep_n_penned, + steps=step, mode=mode, success=success, + )) + + # ── per-episode time-series plot ────────────────────────────────── + if args.plot_dir or (not args.render and ep < 5): + fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True) + t = np.arange(len(ep_radius)) + + axes[0].plot(t, ep_radius, color="steelblue", label="flock radius (m)") + axes[0].axhline(COMPACT_RADIUS, color="orange", linestyle="--", + label=f"compact threshold ({COMPACT_RADIUS}m)") + if compact_step is not None: + axes[0].axvline(compact_step, color="green", linestyle=":", + alpha=0.6, label=f"first compact (step {compact_step})") + axes[0].set_ylabel("radius (m)") + axes[0].legend(fontsize=8) + axes[0].set_title(f"ep {ep+1} | n_sheep={n_sheep} | {mode}") + + axes[1].plot(t, ep_com_dist, color="tomato", label="COM-to-pen dist (m)") + axes[1].set_ylabel("COM-to-pen (m)") + axes[1].set_xlabel("step") + axes[1].legend(fontsize=8) + + plt.tight_layout() + if args.plot_dir: + fig.savefig(os.path.join(args.plot_dir, f"ep{ep+1:03d}_{mode}.png"), + dpi=100) + plt.close(fig) + else: + plt.show(block=False) + plt.pause(0.5) + + env.close() + + # ── summary ────────────────────────────────────────────────────────────── + print("\n" + "=" * 55) + print(f" Model : {args.model}") + print(f" n_sheep : {args.n_sheep} episodes : {args.episodes}") + print("-" * 55) + total = sum(failure_counts.values()) + for mode, cnt in sorted(failure_counts.items(), key=lambda x: -x[1]): + bar = "█" * cnt + print(f" {mode:<26} {cnt:>3}/{total} {bar}") + print("-" * 55) + + never_compact = failure_counts.get("NEVER_COMPACT", 0) + cant_drive = failure_counts.get("COMPACT_CANT_DRIVE", 0) + partial = sum(v for k, v in failure_counts.items() if k.startswith("PARTIAL")) + successes = failure_counts.get("SUCCESS", 0) + + print(f"\n Diagnosis:") + if never_compact / total > 0.5: + print(" ► COLLECT problem: dog rarely compacts the flock.") + print(" → Phase-gate W_DRIVE, increase W_COLLECT, check alignment reward.") + if cant_drive / total > 0.3: + print(" ► DRIVE problem: flock compacts but doesn't reach pen.") + print(" → Check dog alignment, pen direction, W_DRIVE magnitude.") + if partial / total > 0.3: + print(" ► PARTIAL problem: some sheep penned, stragglers remain.") + print(" → Flock splits; need better straggler-chasing behavior.") + if successes / total > 0.5: + print(" ► Mostly working! Fine-tune for consistency.") + print("=" * 55) + + +if __name__ == "__main__": + main() diff --git a/training/herding_env.py b/training/herding_env.py index b0778d2..c20ff0c 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -51,14 +51,17 @@ class HerdingEnv(gym.Env): WALL_MARGIN = 3.5 # ----------------------------------------------------------------------- - # Reward weights (progress-based potential shaping + sparse bonuses) + # Reward weights (two-phase: collect first, then drive) # ----------------------------------------------------------------------- - W_DRIVE = 2.0 # progress: flock COM moved toward pen - W_COLLECT = 2.0 # progress: flock radius shrank (was 0.5 — must match W_DRIVE) - W_ALIGN = 0.5 # position: dog on anti-pen side of flock COM - W_PEN_BONUS = 10.0 # per sheep penned (was 5.0) - W_COMPLETE = 100.0 # all sheep penned (was 20.0 — must dominate dense rewards) - W_STEP_COST = 0.002 # time penalty + W_DRIVE = 2.0 # progress: COM moved toward pen (only when compact) + W_COLLECT = 4.0 # progress: radius shrank (2× stronger when scattered) + W_ALIGN = 0.5 # position: dog on anti-pen side of COM + W_COMPACT_BONUS = 0.1 # per-step bonus for staying compact (sustained signal) + W_PEN_BONUS = 10.0 # per sheep penned + W_COMPLETE = 100.0 # all sheep penned + W_STEP_COST = 0.002 # time penalty + + DRIVE_GATE_RADIUS = 5.0 # flock must compact below this (m) before drive reward fires def __init__(self, n_sheep: int = 1, max_steps: int = 2000, render_mode: str = None, random_n_sheep: bool = False): @@ -71,7 +74,7 @@ class HerdingEnv(gym.Env): # Fixed 13-dim observation regardless of n_sheep: # dog_pos(2) + rel_com(2) + rel_far(2) + com_to_pen(2) - # + far_to_pen(2) + radius(1) + mean_disp(1) + frac_penned(1) + # + far_to_pen(2) + radius(1) + second_far_dist(1) + frac_penned(1) self.observation_space = spaces.Box( low=-np.inf, high=np.inf, shape=(13,), dtype=np.float32 ) @@ -259,60 +262,73 @@ class HerdingEnv(gym.Env): return com, float(dists.max()), float(dists.mean()) def _obs(self) -> np.ndarray: - com, radius, mean_disp = self._flock_stats() + com, radius, _ = self._flock_stats() active_mask = ~self.penned[:self.n_sheep] - # Farthest active sheep from COM (outlier the dog needs to chase) if active_mask.any(): pts = self.sheep_pos[:self.n_sheep][active_mask] - idx = int(np.argmax(np.linalg.norm(pts - com, axis=1))) - far = pts[idx] + dists = np.linalg.norm(pts - com, axis=1) + sorted_idx = np.argsort(dists)[::-1] # farthest first + far = pts[sorted_idx[0]] + # 2nd farthest — if only 1 active sheep, reuse the same position + far2 = pts[sorted_idx[1]] if len(sorted_idx) > 1 else far + second_far_dist = float(dists[sorted_idx[1]]) if len(sorted_idx) > 1 else 0.0 else: - far = self.PEN_CENTER.copy() + far = far2 = self.PEN_CENTER.copy() + second_far_dist = 0.0 - S = self.FIELD # normalisation scale for positions - D = 2 * self.FIELD # for relative vectors that can span the whole field + S = self.FIELD + D = 2 * self.FIELD return np.array([ - self.dog_pos[0] / S, self.dog_pos[1] / S, # dog abs pos - (com[0] - self.dog_pos[0]) / D, # COM relative to dog + self.dog_pos[0] / S, self.dog_pos[1] / S, + (com[0] - self.dog_pos[0]) / D, (com[1] - self.dog_pos[1]) / D, - (far[0] - self.dog_pos[0]) / D, # farthest relative to dog + (far[0] - self.dog_pos[0]) / D, (far[1] - self.dog_pos[1]) / D, - (self.PEN_CENTER[0] - com[0]) / D, # COM to pen + (self.PEN_CENTER[0] - com[0]) / D, (self.PEN_CENTER[1] - com[1]) / D, - (self.PEN_CENTER[0] - far[0]) / D, # farthest to pen + (self.PEN_CENTER[0] - far[0]) / D, (self.PEN_CENTER[1] - far[1]) / D, - radius / D, # flock compactness - mean_disp / D, # mean spread - active_mask.sum() / self.n_sheep, # fraction still active + radius / D, + second_far_dist / D, # replaced mean_disp: 2nd farthest sheep from COM + active_mask.sum() / self.n_sheep, ], dtype=np.float32) def _reward(self, n_penned: int, newly_penned: int) -> float: com, radius, _ = self._flock_stats() com_dist = float(np.linalg.norm(com - self.PEN_CENTER)) - # Progress rewards: positive when state improves - drive_progress = (self._prev_com_dist - com_dist) * self.W_DRIVE - collect_progress = (self._prev_radius - radius) * self.W_COLLECT + drive_delta = self._prev_com_dist - com_dist + collect_delta = self._prev_radius - radius self._prev_com_dist = com_dist self._prev_radius = radius - # Alignment: reward dog for being on the anti-pen side of the flock - # COM, gated by proximity so only nearby positioning counts. - # +1 = dog directly behind flock, -1 = dog on pen side (wrong). + # Alignment: dog on anti-pen side of COM, gated by proximity. d_dog_com = float(np.linalg.norm(self.dog_pos - com)) if d_dog_com > 0.1 and com_dist > 0.1: - pen_dir = (self.PEN_CENTER - com) / com_dist # COM → pen - dog_dir = (self.dog_pos - com) / d_dog_com # COM → dog - cosine = -float(np.dot(pen_dir, dog_dir)) # +1 when opposite + pen_dir = (self.PEN_CENTER - com) / com_dist + dog_dir = (self.dog_pos - com) / d_dog_com + cosine = -float(np.dot(pen_dir, dog_dir)) proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST) alignment = cosine * proximity * self.W_ALIGN else: alignment = 0.0 - reward = drive_progress + collect_progress + alignment + scattered = radius > self.DRIVE_GATE_RADIUS + + # Collect always on; 2× scale when scattered to force collect-first. + r_collect = collect_delta * self.W_COLLECT * (2.0 if scattered else 1.0) + + # Drive only fires when flock is compact — prevents rewarding COM movement + # while sheep are spread across the field. + r_drive = 0.0 if scattered else drive_delta * self.W_DRIVE + + # Small sustained reward for maintaining a compact flock. + r_compact = 0.0 if scattered else self.W_COMPACT_BONUS + + reward = r_drive + r_collect + r_compact + alignment reward += newly_penned * self.W_PEN_BONUS reward -= self.W_STEP_COST if n_penned == self.n_sheep: