Behaviour refinement: entry-aware pen target and dog–pen wall collision

2026-04-25 21:35:23 +01:00
parent e302c76886
commit 7b87908410
2 changed files with 61 additions and 7 deletions
@@ -35,6 +35,7 @@ class HerdingEnv(gym.Env):
    PEN_X      = (10.0, 13.0)
    PEN_Y      = (-15.0, -8.0)
    PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
+    PEN_ENTRY  = np.array([11.5,  -8.0], dtype=np.float32)   # north entrance face center

    # -----------------------------------------------------------------------
    # Dynamics — calibrated to match Webots robot specs
@@ -62,6 +63,11 @@ class HerdingEnv(gym.Env):
    W_COMPACT   = 0.0    # reward for flock-radius reduction (off by default)
    ALIGN_SHAPE = "standoff"   # "standoff" (peaks at IDEAL) | "near" (peaks at 0)
    ALIGN_GATED = True   # gate alignment on action magnitude
+    ENTRY_AWARE = True   # reward and observation target PEN_ENTRY (north entrance
+                         # face) instead of PEN_CENTER. Stops the wall-corralling
+                         # exploit: a sheep shoved south past y=-8 outside the pen
+                         # x-range moves away from PEN_ENTRY (which sits at y=-8),
+                         # so its progress reward goes negative, not positive.

    # Initial sheep spawn: first sheep placed anywhere; rest within CLUSTER_RADIUS
    # of it. Set to None for legacy uniform-scatter behaviour.
@@ -182,10 +188,11 @@ class HerdingEnv(gym.Env):

        # Initialise per-sheep pen-distance sum for progress reward
        active = ~self.penned[:self.n_sheep]
+        target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER
        if active.any():
            self._prev_pen_dist_sum = float(
                np.linalg.norm(
-                    self.sheep_pos[:self.n_sheep][active] - self.PEN_CENTER, axis=1
+                    self.sheep_pos[:self.n_sheep][active] - target, axis=1
                ).sum()
            )
            com0 = self.sheep_pos[:self.n_sheep][active].mean(axis=0)
@@ -202,10 +209,26 @@ class HerdingEnv(gym.Env):
        self._step_count += 1

        act = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0)
-        self.dog_pos = np.clip(
+        old_dog = self.dog_pos.copy()
+        new_dog = np.clip(
            self.dog_pos + act * self.DOG_SPEED * self.DT,
            -self.FIELD, self.FIELD
        )
+        # Pen wall collision — mirrors Webots geometry. West (x=PEN_X[0]) and
+        # east (x=PEN_X[1]) walls block the dog within the pen's y-range.
+        # North face (y=PEN_Y[1]=-8) is open. South is the field edge.
+        px0, px1 = self.PEN_X
+        py0, py1 = self.PEN_Y
+        if py0 < new_dog[1] < py1:
+            if old_dog[0] < px0 <= new_dog[0]:
+                new_dog[0] = px0 - 1e-3
+            elif old_dog[0] > px0 >= new_dog[0]:
+                new_dog[0] = px0 + 1e-3
+            if old_dog[0] > px1 >= new_dog[0]:
+                new_dog[0] = px1 + 1e-3
+            elif old_dog[0] < px1 <= new_dog[0]:
+                new_dog[0] = px1 - 1e-3
+        self.dog_pos = new_dog.astype(np.float32)

        for i in range(self.n_sheep):
            if self.penned[i]:
@@ -325,14 +348,18 @@ class HerdingEnv(gym.Env):
        # For 1 sheep: far1-COM = far2-COM = far3-COM = [0,0] → cleanly ignorable.
        # For 3+ sheep: non-zero vectors tell the dog where each straggler is
        # within the group, without conflicting with weights trained on 1 sheep.
+        # Pen reference for the policy. Aligned with the reward target so the
+        # policy isn't forced to learn an implicit offset between what it sees
+        # ("pen is here") and what it's rewarded for ("get sheep close to here").
+        pen_ref = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER
        return np.array([
            self.dog_pos[0] / S,  self.dog_pos[1] / S,
            (com[0]  - self.dog_pos[0]) / D, (com[1]  - self.dog_pos[1]) / D,
            (far1[0] - com[0]) / D, (far1[1] - com[1]) / D,
            (far2[0] - com[0]) / D, (far2[1] - com[1]) / D,
            (far3[0] - com[0]) / D, (far3[1] - com[1]) / D,
-            (self.PEN_CENTER[0] - com[0])  / D, (self.PEN_CENTER[1] - com[1])  / D,
-            (self.PEN_CENTER[0] - far1[0]) / D, (self.PEN_CENTER[1] - far1[1]) / D,
+            (pen_ref[0] - com[0])  / D, (pen_ref[1] - com[1])  / D,
+            (pen_ref[0] - far1[0]) / D, (pen_ref[1] - far1[1]) / D,
            radius / D,
            active_mask.sum() / self.n_sheep,
        ], dtype=np.float32)
@@ -344,9 +371,10 @@ class HerdingEnv(gym.Env):
        # Naturally rewards keeping the flock together and pushing toward pen:
        # dog behind flock → all sheep flee toward pen → all contribute positive reward.
        # Dog from wrong side → sheep scatter away from pen → negative reward.
+        target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER
        if active.any():
            pen_dists = np.linalg.norm(
-                self.sheep_pos[:self.n_sheep][active] - self.PEN_CENTER, axis=1
+                self.sheep_pos[:self.n_sheep][active] - target, axis=1
            )
            cur_sum = float(pen_dists.sum())
            r_progress = (self._prev_pen_dist_sum - cur_sum) * self.W_PER_SHEEP
@@ -355,10 +383,10 @@ class HerdingEnv(gym.Env):
            r_progress = 0.0

        com, _, _ = self._flock_stats()
-        com_dist  = float(np.linalg.norm(com - self.PEN_CENTER))
+        com_dist  = float(np.linalg.norm(com - target))
        d_dog_com = float(np.linalg.norm(self.dog_pos - com))
        if d_dog_com > 0.1 and com_dist > 0.1:
-            pen_dir   = (self.PEN_CENTER - com) / com_dist
+            pen_dir   = (target - com) / com_dist
            dog_dir   = (self.dog_pos    - com) / d_dog_com
            cosine    = -float(np.dot(pen_dir, dog_dir))
            if self.ALIGN_SHAPE == "standoff":