Dog rewarding adjustment

2026-04-23 11:35:15 +01:00
parent 00eaf47d1f
commit f9c5093211
1 changed files with 50 additions and 16 deletions
@@ -52,8 +52,9 @@ class HerdingEnv(gym.Env):
    # -----------------------------------------------------------------------
    # Reward weights
    # -----------------------------------------------------------------------
-    W_APPROACH   = 0.3     # dense: dog distance to nearest active sheep
-    W_SHAPING    = 0.5     # dense: mean sheep distance to pen  (was 0.01)
+    W_ALIGN      = 0.4     # dense: dog on anti-pen side of each active sheep
+    W_SHAPING    = 0.5     # dense: mean sheep distance to pen
+    W_APPROACH   = 0.1     # dense: dog distance to nearest active sheep
    W_PEN_BONUS  = 5.0     # sparse: per sheep successfully penned
    W_COMPLETE   = 20.0    # bonus when ALL active sheep are penned
    W_STEP_COST  = 0.002   # penalty per step (encourages efficiency)
@@ -106,11 +107,8 @@ class HerdingEnv(gym.Env):
        self._step_count  = 0
        self._prev_penned = 0

-        # Dog: random start in the open field (not near the pen)
-        self.dog_pos = self.np_random.uniform(-8.0, 5.0, size=(2,)).astype(np.float32)
-
        # Active sheep (0 .. n_sheep-1): random non-pen positions
-        self.sheep_pos[:] = self.PEN_CENTER   # default all to pen centre
+        self.sheep_pos[:] = self.PEN_CENTER
        self.penned[:]    = True

        placed = 0
@@ -121,6 +119,25 @@ class HerdingEnv(gym.Env):
                self.penned[placed]    = False
                placed += 1

+        # Dog: 50 % of the time start already on the anti-pen side of the
+        # first active sheep, slot 0 (within flee range), so early training gets
+        # aligned starts; the other 50 % is fully random to ensure generalisation.
+        if self.np_random.random() < 0.5:
+            # Place dog behind the first active sheep relative to the pen
+            ref = self.sheep_pos[0]
+            away = ref - self.PEN_CENTER                       # sheep→anti-pen
+            dist = float(np.linalg.norm(away))
+            if dist > 0.1:
+                away = away / dist
+            offset = away * self.np_random.uniform(2.0, self.FLEE_DIST * 0.8)
+            self.dog_pos = np.clip(
+                (ref + offset).astype(np.float32), -self.FIELD, self.FIELD
+            )
+        else:
+            self.dog_pos = self.np_random.uniform(
+                -self.FIELD * 0.8, self.FIELD * 0.8, size=(2,)
+            ).astype(np.float32)
+
        # Inactive slots (n_sheep .. MAX_SHEEP-1): already at pen centre, penned=True

        self.wander_ang = self.np_random.uniform(
@@ -237,20 +254,37 @@ class HerdingEnv(gym.Env):
        active_mask = ~self.penned[:self.n_sheep]
        if active_mask.any():
            active_pos = self.sheep_pos[:self.n_sheep][active_mask]
+            dists_pen  = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1)
+            dists_dog  = np.linalg.norm(active_pos - self.dog_pos, axis=1)

-            # Sheep-to-pen shaping: encourages moving sheep toward pen
-            dists_pen = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1)
-            shaping   = -(dists_pen.mean() / (2 * self.FIELD))  # ∈ [-1, 0]
+            # Sheep-to-pen shaping
+            shaping = -(dists_pen.mean() / (2 * self.FIELD))

-            # Dog-to-nearest-sheep approach: incentivises the dog to stay
-            # within flee range (FLEE_DIST=7m) rather than wandering away
-            dists_dog = np.linalg.norm(active_pos - self.dog_pos, axis=1)
-            approach  = -(dists_dog.min() / (2 * self.FIELD))   # ∈ [-1, 0]
+            # Approach: dog penalised for being far from nearest sheep
+            approach = -(dists_dog.min() / (2 * self.FIELD))
+
+            # Alignment: reward dog for being on the anti-pen side of each sheep.
+            # When the dog is opposite the pen relative to a sheep, that sheep
+            # flees toward the pen.  Score ∈ [-1, 1] per sheep, weighted by
+            # a proximity gate so only nearby dogs count.
+            align_scores = []
+            for s_pos, d_pen, d_dog in zip(active_pos, dists_pen, dists_dog):
+                if d_pen < 0.1 or d_dog < 0.1:
+                    continue
+                pen_dir = (self.PEN_CENTER - s_pos) / d_pen   # sheep → pen
+                dog_dir = (self.dog_pos    - s_pos) / d_dog   # sheep → dog
+                # cos(angle): +1 → dog behind sheep, -1 → dog on pen side
+                cosine    = -float(np.dot(pen_dir, dog_dir))
+                # gate: 1 at the sheep, fading linearly to 0 at FLEE_DIST and beyond
+                proximity = max(0.0, 1.0 - d_dog / self.FLEE_DIST)
+                align_scores.append(cosine * proximity)
+            alignment = float(np.mean(align_scores)) if align_scores else 0.0
        else:
-            shaping = approach = 0.0
+            shaping = approach = alignment = 0.0

-        reward  = shaping  * self.W_SHAPING
-        reward += approach * self.W_APPROACH
+        reward  = shaping   * self.W_SHAPING
+        reward += approach  * self.W_APPROACH
+        reward += alignment * self.W_ALIGN
        reward += newly_penned * self.W_PEN_BONUS
        reward -= self.W_STEP_COST
        if n_penned == self.n_sheep: