Dog reward adjustment

2026-04-23 11:35:15 +01:00
parent 00eaf47d1f
commit f9c5093211
1 changed files with 50 additions and 16 deletions
@@ -52,8 +52,9 @@ class HerdingEnv(gym.Env):
    # -----------------------------------------------------------------------
    # Reward weights
    # -----------------------------------------------------------------------
-    W_APPROACH   = 0.3     # dense: dog distance to nearest active sheep
+    W_ALIGN      = 0.4     # dense: dog on anti-pen side of each active sheep
-    W_SHAPING    = 0.5     # dense: mean sheep distance to pen  (was 0.01)
+    W_SHAPING    = 0.5     # dense: mean sheep distance to pen
    W_APPROACH   = 0.1     # dense: dog within flee range of nearest sheep
    W_PEN_BONUS  = 5.0     # sparse: per sheep successfully penned
    W_COMPLETE   = 20.0    # bonus when ALL active sheep are penned
    W_STEP_COST  = 0.002   # penalty per step (encourages efficiency)
@@ -106,11 +107,8 @@ class HerdingEnv(gym.Env):
        self._step_count  = 0
        self._prev_penned = 0
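        # Dog: random start in the open field (not near the pen).
        # NOTE(review): dog_pos is reassigned after sheep placement below, so
        # this draw appears to matter only for the RNG stream — confirm.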
        self.dog_pos = self.np_random.uniform(-8.0, 5.0, size=(2,)).astype(np.float32)
        # Active sheep (0 .. n_sheep-1): random non-pen positions
-        self.sheep_pos[:] = self.PEN_CENTER   # default all to pen centre
+        self.sheep_pos[:] = self.PEN_CENTER
        self.penned[:]    = True
        placed = 0
@@ -121,6 +119,25 @@ class HerdingEnv(gym.Env):
                self.penned[placed]    = False
                placed += 1
        # Dog: 50 % of the time start already on the anti-pen side of the
        # first active sheep (within flee range) so early training gets aligned
        # starts; the other 50 % is fully random to ensure generalisation.
        if self.np_random.random() < 0.5:
            # Place dog behind the first active sheep relative to the pen
            ref = self.sheep_pos[0]
            away = ref - self.PEN_CENTER                       # sheep→anti-pen
            dist = float(np.linalg.norm(away))
            if dist > 0.1:
                away = away / dist
            offset = away * self.np_random.uniform(2.0, self.FLEE_DIST * 0.8)
            self.dog_pos = np.clip(
                (ref + offset).astype(np.float32), -self.FIELD, self.FIELD
            )
        else:
            self.dog_pos = self.np_random.uniform(
                -self.FIELD * 0.8, self.FIELD * 0.8, size=(2,)
            ).astype(np.float32)
        # Inactive slots (n_sheep .. MAX_SHEEP-1): already at pen centre, penned=True
        self.wander_ang = self.np_random.uniform(
@@ -237,20 +254,37 @@ class HerdingEnv(gym.Env):
        active_mask = ~self.penned[:self.n_sheep]
        if active_mask.any():
            active_pos = self.sheep_pos[:self.n_sheep][active_mask]
            # Sheep-to-pen shaping: encourages moving sheep toward pen
            dists_pen  = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1)
            shaping   = -(dists_pen.mean() / (2 * self.FIELD))  # ∈ [-1, 0]
            # Dog-to-nearest-sheep approach: incentivises the dog to stay
            # within flee range (FLEE_DIST=7m) rather than wandering away
            dists_dog  = np.linalg.norm(active_pos - self.dog_pos, axis=1)
-            approach  = -(dists_dog.min() / (2 * self.FIELD))   # ∈ [-1, 0]
+
            # Sheep-to-pen shaping
            shaping = -(dists_pen.mean() / (2 * self.FIELD))
            # Approach: dog penalised for being far from nearest sheep
            approach = -(dists_dog.min() / (2 * self.FIELD))
            # Alignment: reward dog for being on the anti-pen side of each sheep.
            # When the dog is opposite the pen relative to a sheep, that sheep
            # flees toward the pen.  Score ∈ [-1, 1] per sheep, scaled by a
            # proximity gate so sheep far from the dog contribute little.
            align_scores = []
            for s_pos, d_pen, d_dog in zip(active_pos, dists_pen, dists_dog):
                if d_pen < 0.1 or d_dog < 0.1:
                    continue
                pen_dir = (self.PEN_CENTER - s_pos) / d_pen   # sheep → pen
                dog_dir = (self.dog_pos    - s_pos) / d_dog   # sheep → dog
                # cos(angle): +1 → dog behind sheep, -1 → dog on pen side
                cosine    = -float(np.dot(pen_dir, dog_dir))
                # gate: 1 when the dog is right at the sheep, fading linearly
                # to 0 at FLEE_DIST and staying 0 beyond it
                proximity = max(0.0, 1.0 - d_dog / self.FLEE_DIST)
                align_scores.append(cosine * proximity)
            alignment = float(np.mean(align_scores)) if align_scores else 0.0
        else:
-            shaping = approach = 0.0
+            shaping = approach = alignment = 0.0
        reward  = shaping   * self.W_SHAPING
        reward += approach  * self.W_APPROACH
        reward += alignment * self.W_ALIGN
        reward += newly_penned * self.W_PEN_BONUS
        reward -= self.W_STEP_COST
        if n_penned == self.n_sheep: