Sheep training flock of 10 fix?

2026-04-24 10:58:36 +01:00
parent 4189cc8dba
commit 17eb25864e
3 changed files with 280 additions and 40 deletions
@@ -51,14 +51,17 @@ class HerdingEnv(gym.Env):
    WALL_MARGIN     = 3.5

    # -----------------------------------------------------------------------
-    # Reward weights  (progress-based potential shaping + sparse bonuses)
+    # Reward weights  (two-phase: collect first, then drive)
    # -----------------------------------------------------------------------
-    W_DRIVE     = 2.0    # progress: flock COM moved toward pen
-    W_COLLECT   = 2.0   # progress: flock radius shrank (was 0.5 — must match W_DRIVE)
-    W_ALIGN     = 0.5   # position: dog on anti-pen side of flock COM
-    W_PEN_BONUS = 10.0  # per sheep penned (was 5.0)
-    W_COMPLETE  = 100.0 # all sheep penned (was 20.0 — must dominate dense rewards)
-    W_STEP_COST = 0.002 # time penalty
+    W_DRIVE          = 2.0    # progress: COM moved toward pen (paid only while radius <= DRIVE_GATE_RADIUS)
+    W_COLLECT        = 4.0    # progress: radius shrank (_reward applies a further 2× while scattered)
+    W_ALIGN          = 0.5    # position: dog on anti-pen side of COM, scaled by proximity to the COM
+    W_COMPACT_BONUS  = 0.1    # flat per-step bonus while radius <= DRIVE_GATE_RADIUS (sustained signal)
+    W_PEN_BONUS      = 10.0   # per newly penned sheep
+    W_COMPLETE       = 100.0  # all sheep penned
+    W_STEP_COST      = 0.002  # time penalty, subtracted on every _reward call
+
+    DRIVE_GATE_RADIUS = 5.0   # radius (m) above which the flock counts as scattered: drive reward and compact bonus are withheld

    def __init__(self, n_sheep: int = 1, max_steps: int = 2000,
                 render_mode: str = None, random_n_sheep: bool = False):
@@ -71,7 +74,7 @@ class HerdingEnv(gym.Env):

        # Fixed 13-dim observation regardless of n_sheep:
        #   dog_pos(2) + rel_com(2) + rel_far(2) + com_to_pen(2)
-        #   + far_to_pen(2) + radius(1) + mean_disp(1) + frac_penned(1)
+        #   + far_to_pen(2) + radius(1) + second_far_dist(1) + frac_active(1)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(13,), dtype=np.float32
        )
@@ -259,60 +262,73 @@ class HerdingEnv(gym.Env):
        return com, float(dists.max()), float(dists.mean())

    def _obs(self) -> np.ndarray:
-        com, radius, mean_disp = self._flock_stats()
+        com, radius, _ = self._flock_stats()   # third return value (formerly mean_disp) is unused here
        active_mask = ~self.penned[:self.n_sheep]

-        # Farthest active sheep from COM (outlier the dog needs to chase)
        if active_mask.any():
            pts   = self.sheep_pos[:self.n_sheep][active_mask]
-            idx   = int(np.argmax(np.linalg.norm(pts - com, axis=1)))
-            far   = pts[idx]
+            dists = np.linalg.norm(pts - com, axis=1)   # distance of each active sheep from the COM
+            sorted_idx = np.argsort(dists)[::-1]   # indices into pts, farthest first
+            far  = pts[sorted_idx[0]]   # farthest active sheep from COM (outlier the dog needs to chase)
+            # 2nd farthest — if only 1 active sheep, reuse the same position. NOTE(review): far2 is never read in _obs; only second_far_dist is observed
+            far2 = pts[sorted_idx[1]] if len(sorted_idx) > 1 else far
+            second_far_dist = float(dists[sorted_idx[1]]) if len(sorted_idx) > 1 else 0.0   # 0.0 when only one sheep is active
        else:
-            far = self.PEN_CENTER.copy()
+            far = far2 = self.PEN_CENTER.copy()   # no active sheep: use the pen centre as the "farthest" point
+            second_far_dist = 0.0

-        S = self.FIELD       # normalisation scale for positions
-        D = 2 * self.FIELD   # for relative vectors that can span the whole field
+        S = self.FIELD       # scale for absolute positions
+        D = 2 * self.FIELD   # scale for relative vectors, which can span the whole field

        return np.array([
-            self.dog_pos[0] / S,  self.dog_pos[1] / S,      # dog abs pos
-            (com[0] - self.dog_pos[0]) / D,                  # COM relative to dog
+            self.dog_pos[0] / S,  self.dog_pos[1] / S,   # dog absolute position
+            (com[0] - self.dog_pos[0]) / D,   # dog → COM
            (com[1] - self.dog_pos[1]) / D,
-            (far[0] - self.dog_pos[0]) / D,                  # farthest relative to dog
+            (far[0] - self.dog_pos[0]) / D,   # dog → farthest active sheep
            (far[1] - self.dog_pos[1]) / D,
-            (self.PEN_CENTER[0] - com[0]) / D,               # COM to pen
+            (self.PEN_CENTER[0] - com[0]) / D,   # COM → pen centre
            (self.PEN_CENTER[1] - com[1]) / D,
-            (self.PEN_CENTER[0] - far[0]) / D,               # farthest to pen
+            (self.PEN_CENTER[0] - far[0]) / D,   # farthest active sheep → pen centre
            (self.PEN_CENTER[1] - far[1]) / D,
-            radius   / D,                                     # flock compactness
-            mean_disp / D,                                    # mean spread
-            active_mask.sum() / self.n_sheep,                 # fraction still active
+            radius          / D,   # flock radius (compactness)
+            second_far_dist / D,   # distance of 2nd-farthest active sheep from COM (replaced mean_disp)
+            active_mask.sum() / self.n_sheep,   # fraction of sheep still active (not penned)
        ], dtype=np.float32)

    def _reward(self, n_penned: int, newly_penned: int) -> float:
        com, radius, _ = self._flock_stats()
        com_dist = float(np.linalg.norm(com - self.PEN_CENTER))

-        # Progress rewards: positive when state improves
-        drive_progress   = (self._prev_com_dist - com_dist) * self.W_DRIVE
-        collect_progress = (self._prev_radius   - radius)   * self.W_COLLECT
+        drive_delta   = self._prev_com_dist - com_dist
+        collect_delta = self._prev_radius   - radius

        self._prev_com_dist = com_dist
        self._prev_radius   = radius

-        # Alignment: reward dog for being on the anti-pen side of the flock
-        # COM, gated by proximity so only nearby positioning counts.
-        # +1 = dog directly behind flock, -1 = dog on pen side (wrong).
+        # Alignment: dog on anti-pen side of COM, gated by proximity.
        d_dog_com = float(np.linalg.norm(self.dog_pos - com))
        if d_dog_com > 0.1 and com_dist > 0.1:
-            pen_dir = (self.PEN_CENTER - com) / com_dist       # COM → pen
-            dog_dir = (self.dog_pos    - com) / d_dog_com      # COM → dog
-            cosine    = -float(np.dot(pen_dir, dog_dir))       # +1 when opposite
+            pen_dir   = (self.PEN_CENTER - com) / com_dist
+            dog_dir   = (self.dog_pos    - com) / d_dog_com
+            cosine    = -float(np.dot(pen_dir, dog_dir))
            proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST)
            alignment = cosine * proximity * self.W_ALIGN
        else:
            alignment = 0.0

-        reward  = drive_progress + collect_progress + alignment
+        scattered = radius > self.DRIVE_GATE_RADIUS
+
+        # Collect always on; 2× scale when scattered to force collect-first.
+        r_collect = collect_delta * self.W_COLLECT * (2.0 if scattered else 1.0)
+
+        # Drive only fires when flock is compact — prevents rewarding COM movement
+        # while sheep are spread across the field.
+        r_drive   = 0.0 if scattered else drive_delta * self.W_DRIVE
+
+        # Small sustained reward for maintaining a compact flock.
+        r_compact = 0.0 if scattered else self.W_COMPACT_BONUS
+
+        reward  = r_drive + r_collect + r_compact + alignment
        reward += newly_penned * self.W_PEN_BONUS
        reward -= self.W_STEP_COST
        if n_penned == self.n_sheep: