diff --git a/training/herding_env.py b/training/herding_env.py index 2a3a1c8..a1d334d 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -52,8 +52,9 @@ class HerdingEnv(gym.Env): # ----------------------------------------------------------------------- # Reward weights # ----------------------------------------------------------------------- - W_APPROACH = 0.3 # dense: dog distance to nearest active sheep - W_SHAPING = 0.5 # dense: mean sheep distance to pen (was 0.01) + W_ALIGN = 0.4 # dense: dog on anti-pen side of each active sheep + W_SHAPING = 0.5 # dense: mean sheep distance to pen + W_APPROACH = 0.1 # dense: dog distance to nearest active sheep W_PEN_BONUS = 5.0 # sparse: per sheep successfully penned W_COMPLETE = 20.0 # bonus when ALL active sheep are penned W_STEP_COST = 0.002 # penalty per step (encourages efficiency) @@ -106,11 +107,8 @@ class HerdingEnv(gym.Env): self._step_count = 0 self._prev_penned = 0 - # Dog: random start in the open field (not near the pen) - self.dog_pos = self.np_random.uniform(-8.0, 5.0, size=(2,)).astype(np.float32) - # Active sheep (0 .. n_sheep-1): random non-pen positions - self.sheep_pos[:] = self.PEN_CENTER # default all to pen centre + self.sheep_pos[:] = self.PEN_CENTER self.penned[:] = True placed = 0 @@ -121,6 +119,25 @@ class HerdingEnv(gym.Env): self.penned[placed] = False placed += 1 + # Dog: 50 % of the time start already on the anti-pen side of the + # first active sheep (within flee range) so early training gets aligned + # starts; the other 50 % is fully random to ensure generalisation. 
+ if self.np_random.random() < 0.5: + # Place dog behind the first active sheep relative to the pen + ref = self.sheep_pos[0] + away = ref - self.PEN_CENTER # sheep→anti-pen + dist = float(np.linalg.norm(away)) + if dist > 0.1: + away = away / dist + offset = away * self.np_random.uniform(2.0, self.FLEE_DIST * 0.8) + self.dog_pos = np.clip( + (ref + offset).astype(np.float32), -self.FIELD, self.FIELD + ) + else: + self.dog_pos = self.np_random.uniform( + -self.FIELD * 0.8, self.FIELD * 0.8, size=(2,) + ).astype(np.float32) + # Inactive slots (n_sheep .. MAX_SHEEP-1): already at pen centre, penned=True self.wander_ang = self.np_random.uniform( @@ -237,20 +254,37 @@ class HerdingEnv(gym.Env): active_mask = ~self.penned[:self.n_sheep] if active_mask.any(): active_pos = self.sheep_pos[:self.n_sheep][active_mask] + dists_pen = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1) + dists_dog = np.linalg.norm(active_pos - self.dog_pos, axis=1) - # Sheep-to-pen shaping: encourages moving sheep toward pen - dists_pen = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1) - shaping = -(dists_pen.mean() / (2 * self.FIELD)) # ∈ [-1, 0] + # Sheep-to-pen shaping + shaping = -(dists_pen.mean() / (2 * self.FIELD)) - # Dog-to-nearest-sheep approach: incentivises the dog to stay - # within flee range (FLEE_DIST=7m) rather than wandering away - dists_dog = np.linalg.norm(active_pos - self.dog_pos, axis=1) - approach = -(dists_dog.min() / (2 * self.FIELD)) # ∈ [-1, 0] + # Approach: dog penalised for being far from nearest sheep + approach = -(dists_dog.min() / (2 * self.FIELD)) + + # Alignment: reward dog for being on the anti-pen side of each sheep. + # When the dog is opposite the pen relative to a sheep, that sheep + # flees toward the pen. Score ∈ [-1, 1] per sheep, weighted by + # a proximity gate so only nearby dogs count. 
+ align_scores = [] + for s_pos, d_pen, d_dog in zip(active_pos, dists_pen, dists_dog): + if d_pen < 0.1 or d_dog < 0.1: + continue + pen_dir = (self.PEN_CENTER - s_pos) / d_pen # sheep → pen + dog_dir = (self.dog_pos - s_pos) / d_dog # sheep → dog + # cos(angle): +1 → dog behind sheep, -1 → dog on pen side + cosine = -float(np.dot(pen_dir, dog_dir)) + # gate: 1 at zero distance, fades linearly to 0 at FLEE_DIST and beyond + proximity = max(0.0, 1.0 - d_dog / self.FLEE_DIST) + align_scores.append(cosine * proximity) + alignment = float(np.mean(align_scores)) if align_scores else 0.0 else: - shaping = approach = 0.0 + shaping = approach = alignment = 0.0 - reward = shaping * self.W_SHAPING - reward += approach * self.W_APPROACH + reward = shaping * self.W_SHAPING + reward += approach * self.W_APPROACH + reward += alignment * self.W_ALIGN reward += newly_penned * self.W_PEN_BONUS reward -= self.W_STEP_COST if n_penned == self.n_sheep: