From 17eb25864e1bedbfb6d38d3c748c17bc3e8bdced Mon Sep 17 00:00:00 2001
From: Johnny Fernandes <up202402612@up.pt>
Date: Fri, 24 Apr 2026 10:58:36 +0100
Subject: [PATCH] Sheep training flock of 10 fix?

---
 .../shepherd_dog_rl/shepherd_dog_rl.py        |  15 +-
 training/diagnose.py                          | 223 ++++++++++++++++++
 training/herding_env.py                       |  82 ++++---
 3 files changed, 280 insertions(+), 40 deletions(-)
 create mode 100644 training/diagnose.py

diff --git a/controllers/shepherd_dog_rl/shepherd_dog_rl.py b/controllers/shepherd_dog_rl/shepherd_dog_rl.py
index ed91682..d94c574 100644
--- a/controllers/shepherd_dog_rl/shepherd_dog_rl.py
+++ b/controllers/shepherd_dog_rl/shepherd_dog_rl.py
@@ -80,14 +80,15 @@ def build_obs(dog_pos: np.ndarray,
     n_active = len(active_pos)
 
     if n_active > 0:
-        com       = active_pos.mean(axis=0)
+        com        = active_pos.mean(axis=0)
         d_from_com = np.linalg.norm(active_pos - com, axis=1)
-        radius    = float(d_from_com.max())
-        mean_disp = float(d_from_com.mean())
-        far       = active_pos[int(np.argmax(d_from_com))]
+        sorted_idx = np.argsort(d_from_com)[::-1]
+        radius     = float(d_from_com[sorted_idx[0]])
+        far        = active_pos[sorted_idx[0]]
+        second_far_dist = float(d_from_com[sorted_idx[1]]) if len(sorted_idx) > 1 else 0.0
     else:
         com = PEN_CENTER.copy()
-        radius = mean_disp = 0.0
+        radius = second_far_dist = 0.0
         far = PEN_CENTER.copy()
 
     frac_active = n_active / max(n_sheep, 1)
@@ -98,8 +99,8 @@ def build_obs(dog_pos: np.ndarray,
         (far[0] - dog_pos[0]) / D,  (far[1] - dog_pos[1]) / D,
         (PEN_CENTER[0] - com[0]) / D,  (PEN_CENTER[1] - com[1]) / D,
         (PEN_CENTER[0] - far[0]) / D,  (PEN_CENTER[1] - far[1]) / D,
-        radius    / D,
-        mean_disp / D,
+        radius          / D,
+        second_far_dist / D,
         frac_active,
     ], dtype=np.float32)
 
diff --git a/training/diagnose.py b/training/diagnose.py
new file mode 100644
index 0000000..59022a1
--- /dev/null
+++ b/training/diagnose.py
@@ -0,0 +1,223 @@
+"""
+Episode-level diagnostics for the herding policy.
+
+Runs N episodes and for each one tracks:
+  - flock radius over time
+  - COM-to-pen distance over time
+  - dog position over time
+  - when (if ever) the flock first became compact
+  - failure mode classification
+
+Then produces:
+  1. Console summary of failure modes
+  2. Per-episode time-series plots (radius + com_dist)
+  3. Optional live rendering while the episodes run (--render)
+
+Usage
+-----
+    python diagnose.py --model runs/ppo_consolidation/final_model.zip \
+                       --vecnorm runs/ppo_consolidation/vecnorm.pkl \
+                       --n-sheep 5 --episodes 20
+
+    # Watch the policy live (first episode rendered):
+    python diagnose.py ... --render
+
+    # Save plots to a directory instead of showing interactively:
+    python diagnose.py ... --plot-dir debug_plots/
+"""
+
+import argparse
+import os
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+
+from stable_baselines3 import PPO
+from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
+from herding_env import HerdingEnv
+
+
+# ── failure mode constants ────────────────────────────────────────────────────
+
+COMPACT_RADIUS = HerdingEnv.DRIVE_GATE_RADIUS   # same gate (m) the env's drive reward uses
+
+
+def classify_failure(ep_radius, ep_com_dist, n_penned, n_sheep, success):
+    """Label an episode "SUCCESS" or with the herding phase in which it stalled."""
+    if success:
+        return "SUCCESS"
+    # First step with radius within the threshold; None covers "never" and empty input.
+    first_compact = next((i for i, r in enumerate(ep_radius) if r <= COMPACT_RADIUS), None)
+    if first_compact is None:
+        return "NEVER_COMPACT"         # flock was always too scattered
+    if min(ep_com_dist[first_compact:]) > 3.0:   # COM never got within 3 m of the pen
+        return "COMPACT_CANT_DRIVE"    # compacted but never drove to pen
+    if n_penned == 0:
+        return "DROVE_NO_SHEEP"        # got near pen, nothing went in
+    return f"PARTIAL_{n_penned}of{n_sheep}"   # some in, not all
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def parse_args():
+    """Parse the command-line options of the diagnostics script."""
+    parser = argparse.ArgumentParser()
+    add = parser.add_argument
+    add("--model", required=True)
+    add("--vecnorm", default=None)
+    add("--n-sheep", type=int, default=5)
+    add("--episodes", type=int, default=20)
+    add("--max-steps", type=int, default=4000)
+    add("--render", action="store_true", help="Show matplotlib animation of the first episode")
+    add("--plot-dir", default=None, help="Save time-series plots here (one per episode)")
+    add("--seed", type=int, default=0)
+    return parser.parse_args()
+
+
+def make_env(n_sheep, max_steps, render_mode=None):
+    """Return a zero-argument factory that builds one configured HerdingEnv."""
+    env_kwargs = dict(n_sheep=n_sheep, max_steps=max_steps, render_mode=render_mode)
+    # The caller hands this callable to DummyVecEnv, so construction is deferred.
+    return lambda: HerdingEnv(**env_kwargs)
+
+
+def main():
+    """Run the episodes, print one line per episode, plot, then summarise failures."""
+    args = parse_args()
+
+    if args.plot_dir:
+        os.makedirs(args.plot_dir, exist_ok=True)
+        matplotlib.use("Agg")   # figures are only saved to disk in this mode
+
+    render_mode = "human" if args.render else None
+    raw_env = DummyVecEnv([make_env(args.n_sheep, args.max_steps, render_mode)])
+
+    if args.vecnorm:
+        env = VecNormalize.load(args.vecnorm, raw_env)
+        env.training    = False   # evaluation: do not update running statistics
+        env.norm_reward = False
+    else:
+        env = raw_env
+
+    # Honour --seed (previously parsed but never applied).  NOTE(review): the global
+    # NumPy seed only matters if HerdingEnv draws from np.random — confirm.
+    np.random.seed(args.seed)
+    env.seed(args.seed)
+
+    model = PPO.load(args.model, env=env)
+    inner = raw_env.envs[0]   # the one underlying env; read for raw flock state
+
+    failure_counts = {}
+    all_ep_data    = []
+
+    for ep in range(args.episodes):
+        obs   = env.reset()
+        done  = False
+        step  = 0
+
+        ep_radius, ep_com_dist = [], []
+        ep_dog_x, ep_dog_y, ep_n_penned = [], [], []
+
+        while not done:
+            # Sample the state *before* stepping: DummyVecEnv resets a finished env
+            # inside step(), so a post-step read on the last step would log the next
+            # episode's start state.  Index i is the state seen before action i+1.
+            com, radius, _ = inner._flock_stats()
+            ep_radius.append(radius)
+            ep_com_dist.append(float(np.linalg.norm(com - inner.PEN_CENTER)))
+            ep_dog_x.append(float(inner.dog_pos[0]))
+            ep_dog_y.append(float(inner.dog_pos[1]))
+            ep_n_penned.append(int(inner.penned[:inner.n_sheep].sum()))
+
+            action, _ = model.predict(obs, deterministic=True)
+            obs, _, dones, infos = env.step(action)
+            done  = dones[0]
+            step += 1
+
+        info    = infos[0]   # info of the terminating step
+        n_pen   = info.get("n_penned", 0)
+        n_sheep = info.get("n_sheep", args.n_sheep)
+        success = n_pen == n_sheep
+        mode    = classify_failure(ep_radius, ep_com_dist, n_pen, n_sheep, success)
+
+        failure_counts[mode] = failure_counts.get(mode, 0) + 1
+
+        compact_step = next((i for i, r in enumerate(ep_radius)
+                             if r <= COMPACT_RADIUS), None)
+        min_radius   = min(ep_radius)
+        min_com_dist = min(ep_com_dist)
+
+        print(f"  ep {ep+1:>3}  steps={step:>5}  penned={n_pen}/{n_sheep}"
+              f"  min_r={min_radius:.1f}m"
+              f"  min_com={min_com_dist:.1f}m"
+              f"  compact@step={compact_step if compact_step is not None else 'NEVER'}"
+              f"  [{mode}]")
+
+        all_ep_data.append(dict(ep=ep, radius=ep_radius, com_dist=ep_com_dist,
+                                dog_x=ep_dog_x, dog_y=ep_dog_y, n_penned=ep_n_penned,
+                                steps=step, mode=mode, success=success))
+
+        # ── per-episode time-series plot ──────────────────────────────────
+        if args.plot_dir or (not args.render and ep < 5):
+            fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
+            t = np.arange(len(ep_radius))
+
+            axes[0].plot(t, ep_radius, color="steelblue", label="flock radius (m)")
+            axes[0].axhline(COMPACT_RADIUS, color="orange", linestyle="--",
+                            label=f"compact threshold ({COMPACT_RADIUS}m)")
+            if compact_step is not None:
+                axes[0].axvline(compact_step, color="green", linestyle=":",
+                                alpha=0.6, label=f"first compact (step {compact_step})")
+            axes[0].set_ylabel("radius (m)")
+            axes[0].legend(fontsize=8)
+            axes[0].set_title(f"ep {ep+1} | n_sheep={n_sheep} | {mode}")
+
+            axes[1].plot(t, ep_com_dist, color="tomato", label="COM-to-pen dist (m)")
+            axes[1].set_ylabel("COM-to-pen (m)")
+            axes[1].set_xlabel("step")
+            axes[1].legend(fontsize=8)
+
+            plt.tight_layout()
+            if args.plot_dir:
+                fig.savefig(os.path.join(args.plot_dir, f"ep{ep+1:03d}_{mode}.png"), dpi=100)
+                plt.close(fig)
+            else:
+                plt.show(block=False)
+                plt.pause(0.5)
+
+    env.close()
+
+    # ── summary ──────────────────────────────────────────────────────────────
+    print("\n" + "=" * 55)
+    print(f"  Model   : {args.model}")
+    print(f"  n_sheep : {args.n_sheep}   episodes : {args.episodes}")
+    print("-" * 55)
+    total = sum(failure_counts.values()) or 1   # "or 1": --episodes 0 must not divide by zero
+    for mode, cnt in sorted(failure_counts.items(), key=lambda x: -x[1]):
+        bar = "█" * cnt
+        print(f"  {mode:<26} {cnt:>3}/{total}  {bar}")
+    print("-" * 55)
+
+    never_compact = failure_counts.get("NEVER_COMPACT", 0)
+    cant_drive    = failure_counts.get("COMPACT_CANT_DRIVE", 0)
+    partial       = sum(v for k, v in failure_counts.items() if k.startswith("PARTIAL"))
+    successes     = failure_counts.get("SUCCESS", 0)
+
+    print("\n  Diagnosis:")
+    if never_compact / total > 0.5:
+        print("  ► COLLECT problem: dog rarely compacts the flock.")
+        print("    → Phase-gate W_DRIVE, increase W_COLLECT, check alignment reward.")
+    if cant_drive / total > 0.3:
+        print("  ► DRIVE problem: flock compacts but doesn't reach pen.")
+        print("    → Check dog alignment, pen direction, W_DRIVE magnitude.")
+    if partial / total > 0.3:
+        print("  ► PARTIAL problem: some sheep penned, stragglers remain.")
+        print("    → Flock splits; need better straggler-chasing behavior.")
+    if successes / total > 0.5:
+        print("  ► Mostly working! Fine-tune for consistency.")
+    print("=" * 55)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/training/herding_env.py b/training/herding_env.py
index b0778d2..c20ff0c 100644
--- a/training/herding_env.py
+++ b/training/herding_env.py
@@ -51,14 +51,17 @@ class HerdingEnv(gym.Env):
     WALL_MARGIN     = 3.5
 
     # -----------------------------------------------------------------------
-    # Reward weights  (progress-based potential shaping + sparse bonuses)
+    # Reward weights  (two-phase: collect first, then drive)
     # -----------------------------------------------------------------------
-    W_DRIVE     = 2.0    # progress: flock COM moved toward pen
-    W_COLLECT   = 2.0   # progress: flock radius shrank (was 0.5 — must match W_DRIVE)
-    W_ALIGN     = 0.5   # position: dog on anti-pen side of flock COM
-    W_PEN_BONUS = 10.0  # per sheep penned (was 5.0)
-    W_COMPLETE  = 100.0 # all sheep penned (was 20.0 — must dominate dense rewards)
-    W_STEP_COST = 0.002 # time penalty
+    W_DRIVE          = 2.0    # progress: COM moved toward pen (only when compact)
+    W_COLLECT        = 4.0    # progress: radius shrank (2× stronger when scattered)
+    W_ALIGN          = 0.5    # position: dog on anti-pen side of COM
+    W_COMPACT_BONUS  = 0.1    # per-step bonus for staying compact (sustained signal)
+    W_PEN_BONUS      = 10.0   # per sheep penned
+    W_COMPLETE       = 100.0  # all sheep penned
+    W_STEP_COST      = 0.002  # time penalty
+
+    DRIVE_GATE_RADIUS = 5.0   # flock must compact below this (m) before drive reward fires
 
     def __init__(self, n_sheep: int = 1, max_steps: int = 2000,
                  render_mode: str = None, random_n_sheep: bool = False):
@@ -71,7 +74,7 @@ class HerdingEnv(gym.Env):
 
         # Fixed 13-dim observation regardless of n_sheep:
         #   dog_pos(2) + rel_com(2) + rel_far(2) + com_to_pen(2)
-        #   + far_to_pen(2) + radius(1) + mean_disp(1) + frac_penned(1)
+        #   + far_to_pen(2) + radius(1) + second_far_dist(1) + frac_penned(1)
         self.observation_space = spaces.Box(
             low=-np.inf, high=np.inf, shape=(13,), dtype=np.float32
         )
@@ -259,60 +262,73 @@ class HerdingEnv(gym.Env):
         return com, float(dists.max()), float(dists.mean())
 
     def _obs(self) -> np.ndarray:
-        com, radius, mean_disp = self._flock_stats()
+        com, radius, _ = self._flock_stats()
         active_mask = ~self.penned[:self.n_sheep]
 
-        # Farthest active sheep from COM (outlier the dog needs to chase)
         if active_mask.any():
             pts   = self.sheep_pos[:self.n_sheep][active_mask]
-            idx   = int(np.argmax(np.linalg.norm(pts - com, axis=1)))
-            far   = pts[idx]
+            dists = np.linalg.norm(pts - com, axis=1)
+            sorted_idx = np.argsort(dists)[::-1]   # farthest first
+            far  = pts[sorted_idx[0]]
+            # 2nd farthest — if only 1 active sheep, reuse the same position
+            far2 = pts[sorted_idx[1]] if len(sorted_idx) > 1 else far
+            second_far_dist = float(dists[sorted_idx[1]]) if len(sorted_idx) > 1 else 0.0
         else:
-            far = self.PEN_CENTER.copy()
+            far = far2 = self.PEN_CENTER.copy()
+            second_far_dist = 0.0
 
-        S = self.FIELD       # normalisation scale for positions
-        D = 2 * self.FIELD   # for relative vectors that can span the whole field
+        S = self.FIELD
+        D = 2 * self.FIELD
 
         return np.array([
-            self.dog_pos[0] / S,  self.dog_pos[1] / S,      # dog abs pos
-            (com[0] - self.dog_pos[0]) / D,                  # COM relative to dog
+            self.dog_pos[0] / S,  self.dog_pos[1] / S,
+            (com[0] - self.dog_pos[0]) / D,
             (com[1] - self.dog_pos[1]) / D,
-            (far[0] - self.dog_pos[0]) / D,                  # farthest relative to dog
+            (far[0] - self.dog_pos[0]) / D,
             (far[1] - self.dog_pos[1]) / D,
-            (self.PEN_CENTER[0] - com[0]) / D,               # COM to pen
+            (self.PEN_CENTER[0] - com[0]) / D,
             (self.PEN_CENTER[1] - com[1]) / D,
-            (self.PEN_CENTER[0] - far[0]) / D,               # farthest to pen
+            (self.PEN_CENTER[0] - far[0]) / D,
             (self.PEN_CENTER[1] - far[1]) / D,
-            radius   / D,                                     # flock compactness
-            mean_disp / D,                                    # mean spread
-            active_mask.sum() / self.n_sheep,                 # fraction still active
+            radius          / D,
+            second_far_dist / D,   # replaced mean_disp: 2nd farthest sheep from COM
+            active_mask.sum() / self.n_sheep,
         ], dtype=np.float32)
 
     def _reward(self, n_penned: int, newly_penned: int) -> float:
         com, radius, _ = self._flock_stats()
         com_dist = float(np.linalg.norm(com - self.PEN_CENTER))
 
-        # Progress rewards: positive when state improves
-        drive_progress   = (self._prev_com_dist - com_dist) * self.W_DRIVE
-        collect_progress = (self._prev_radius   - radius)   * self.W_COLLECT
+        drive_delta   = self._prev_com_dist - com_dist
+        collect_delta = self._prev_radius   - radius
 
         self._prev_com_dist = com_dist
         self._prev_radius   = radius
 
-        # Alignment: reward dog for being on the anti-pen side of the flock
-        # COM, gated by proximity so only nearby positioning counts.
-        # +1 = dog directly behind flock, -1 = dog on pen side (wrong).
+        # Alignment: dog on anti-pen side of COM, gated by proximity.
         d_dog_com = float(np.linalg.norm(self.dog_pos - com))
         if d_dog_com > 0.1 and com_dist > 0.1:
-            pen_dir = (self.PEN_CENTER - com) / com_dist       # COM → pen
-            dog_dir = (self.dog_pos    - com) / d_dog_com      # COM → dog
-            cosine    = -float(np.dot(pen_dir, dog_dir))       # +1 when opposite
+            pen_dir   = (self.PEN_CENTER - com) / com_dist
+            dog_dir   = (self.dog_pos    - com) / d_dog_com
+            cosine    = -float(np.dot(pen_dir, dog_dir))
             proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST)
             alignment = cosine * proximity * self.W_ALIGN
         else:
             alignment = 0.0
 
-        reward  = drive_progress + collect_progress + alignment
+        scattered = radius > self.DRIVE_GATE_RADIUS
+
+        # Collect always on; 2× scale when scattered to force collect-first.
+        r_collect = collect_delta * self.W_COLLECT * (2.0 if scattered else 1.0)
+
+        # Drive only fires when flock is compact — prevents rewarding COM movement
+        # while sheep are spread across the field.
+        r_drive   = 0.0 if scattered else drive_delta * self.W_DRIVE
+
+        # Small sustained reward for maintaining a compact flock.
+        r_compact = 0.0 if scattered else self.W_COMPACT_BONUS
+
+        reward  = r_drive + r_collect + r_compact + alignment
         reward += newly_penned * self.W_PEN_BONUS
         reward -= self.W_STEP_COST
         if n_penned == self.n_sheep: