Cleanup and new approach

2026-04-26 01:50:01 +01:00
parent b031473758
commit 61f8a7db15
139 changed files with 510 additions and 16170 deletions
@@ -61,18 +61,19 @@ class HerdingEnv(gym.Env):
    W_COMPLETE  = 100.0  # all sheep penned
    W_STEP_COST = 0.02   # time penalty — strong enough to punish doing nothing
    W_COMPACT   = 0.0    # reward for flock-radius reduction (off by default)
-    W_WALL_TOUCH = 0.01  # per-sheep, per-step penalty when an active sheep is
-                         # pinned against the outside of a pen W/E wall. Kept
-                         # small (<step_cost) so the dog isn't incentivised to
-                         # hover above the entrance to avoid the penalty.
-    WALL_TOUCH_BUFFER = 0.3   # metres outside the wall counted as "touching"
+    W_WALL_TOUCH = 0.15  # per-sheep max penalty at wall surface. Linear ramp
+                         # within WALL_TOUCH_BUFFER gives the RL agent a gradient
+                         # signal to avoid pinning sheep against pen walls.
+                         # 0.15 = 7.5× W_STEP_COST (0.02) — strong enough to shape
+                         # behavior without overwhelming the progress reward.
+    WALL_TOUCH_BUFFER = 0.8   # metres from wall where penalty starts ramping
    ALIGN_SHAPE = "standoff"   # "standoff" (peaks at IDEAL) | "near" (peaks at 0)
    ALIGN_GATED = True   # gate alignment on action magnitude
-    ENTRY_AWARE = True   # progress reward targets PEN_ENTRY (entrance face), not
-                         # PEN_CENTER. Stops the wall-corraling exploit: when a
-                         # sheep is shoved south past y=-8 outside the pen x-range,
-                         # distance to PEN_ENTRY grows (since target is at y=-8),
-                         # so progress reward goes negative instead of positive.
+    ENTRY_AWARE = False  # When True, targets PEN_ENTRY (entrance face) instead
+                         # of PEN_CENTER for progress/obs. Intended to fix wall-
+                         # corralling but collapsed n_sheep≥2 success rate.
+                         # The wall-touch gradient penalty handles wall avoidance
+                         # without breaking the core herding signal.

    # Initial sheep spawn: first sheep placed anywhere; rest within CLUSTER_RADIUS
    # of it. Set to None for legacy uniform-scatter behaviour.
@@ -406,16 +407,25 @@ class HerdingEnv(gym.Env):
        else:
            alignment = 0.0

-        # Wall-touch penalty: count active sheep pinned against outside W/E pen walls.
+        # Wall-touch penalty: distance-based gradient for active sheep outside
+        # the 3 solid pen walls (west, east, south). Per sheep, ramps linearly
+        # from 0 at the buffer edge to W_WALL_TOUCH at the nearest wall surface,
+        # giving the agent a smooth signal to avoid pinning sheep against walls.
        if self.W_WALL_TOUCH and active.any():
            pts = self.sheep_pos[:self.n_sheep][active]
            px0, px1 = self.PEN_X
            py0, py1 = self.PEN_Y
-            in_y     = (pts[:, 1] > py0) & (pts[:, 1] < py1)
-            near_w   = (pts[:, 0] < px0) & (pts[:, 0] > px0 - self.WALL_TOUCH_BUFFER)
-            near_e   = (pts[:, 0] > px1) & (pts[:, 0] < px1 + self.WALL_TOUCH_BUFFER)
-            n_touch  = int(((near_w | near_e) & in_y).sum())
-            r_wall_touch = -n_touch * self.W_WALL_TOUCH
+            buf = self.WALL_TOUCH_BUFFER
+            far = buf + 1.0
+            d_w = np.where((pts[:, 0] < px0) & (pts[:, 1] > py0) & (pts[:, 1] < py1),
+                           px0 - pts[:, 0], far)
+            d_e = np.where((pts[:, 0] > px1) & (pts[:, 1] > py0) & (pts[:, 1] < py1),
+                           pts[:, 0] - px1, far)
+            d_s = np.where((pts[:, 1] < py0) & (pts[:, 0] > px0) & (pts[:, 0] < px1),
+                           py0 - pts[:, 1], far)
+            d_min = np.minimum(np.minimum(d_w, d_e), d_s)
+            penalties = np.maximum(0.0, 1.0 - d_min / buf) * self.W_WALL_TOUCH
+            r_wall_touch = -float(penalties.sum())
        else:
            r_wall_touch = 0.0