NOTE(review): line breaks and indentation below were reconstructed from a
whitespace-collapsed copy of this patch; context-line spacing is unverified,
so apply with whitespace tolerance (e.g. `git apply --ignore-whitespace`).

diff --git a/training/herding_env.py b/training/herding_env.py
index 1aa7356..2f6800d 100644
--- a/training/herding_env.py
+++ b/training/herding_env.py
@@ -55,7 +55,7 @@ class HerdingEnv(gym.Env):
     # -----------------------------------------------------------------------
     W_DRIVE = 2.0 # progress: COM moved toward pen (only when compact)
     W_COLLECT = 4.0 # progress: radius shrank (2× stronger when scattered)
-    W_APPROACH_FAR = 1.0 # progress: dog moved toward farthest straggler (scatter only)
+    W_HERD_POS = 1.5 # progress: dog moved toward ideal herding position behind far1 (scatter only)
     W_ALIGN = 0.5 # position: dog on anti-pen side of COM (compact only)
     W_PEN_BONUS = 10.0 # per sheep penned
     W_COMPLETE = 100.0 # all sheep penned
@@ -89,7 +89,7 @@ class HerdingEnv(gym.Env):
         self._prev_penned = 0
         self._prev_com_dist = 0.0
         self._prev_radius = 0.0
-        self._prev_dog_to_far1 = 0.0
+        self._prev_dog_to_ideal = 0.0
         self.dog_pos = np.zeros(2, dtype=np.float32)
         self.sheep_pos = np.zeros((self.MAX_SHEEP, 2), dtype=np.float32)
         self.penned = np.ones(self.MAX_SHEEP, dtype=bool)
@@ -160,9 +160,11 @@ class HerdingEnv(gym.Env):
         if active_mask.any():
             pts = self.sheep_pos[:self.n_sheep][active_mask]
             far1 = pts[int(np.argmax(np.linalg.norm(pts - com, axis=1)))]
-            self._prev_dog_to_far1 = float(np.linalg.norm(self.dog_pos - far1))
+            self._prev_dog_to_ideal = float(
+                np.linalg.norm(self.dog_pos - self._ideal_herd_pos(com, far1))
+            )
         else:
-            self._prev_dog_to_far1 = 0.0
+            self._prev_dog_to_ideal = 0.0
 
         return self._obs(), {}
 
@@ -300,6 +302,24 @@ class HerdingEnv(gym.Env):
             active_mask.sum() / self.n_sheep,
         ], dtype=np.float32)
 
+    def _ideal_herd_pos(self, com: np.ndarray, far1: np.ndarray) -> np.ndarray:
+        """
+        Target position for the dog to push far1 toward COM: 0.8 * FLEE_DIST beyond
+        far1 on the outward radial from COM (anti-pen direction when far1 is within
+        0.5 of COM), clipped to [-FIELD, FIELD]. Approaching from here is meant to drive far1 inward.
+        """
+        d = far1 - com
+        d_norm = float(np.linalg.norm(d))
+        if d_norm > 0.5:
+            direction = d / d_norm
+        else:
+            # Sheep all together — use anti-pen direction instead ((0, -1) if COM is within 0.1 of PEN_CENTER)
+            to_pen = self.PEN_CENTER - com
+            tp = float(np.linalg.norm(to_pen))
+            direction = -(to_pen / tp) if tp > 0.1 else np.array([0.0, -1.0], dtype=np.float32)
+        target = far1 + direction * self.FLEE_DIST * 0.8
+        return np.clip(target, -self.FIELD, self.FIELD).astype(np.float32)
+
     def _reward(self, n_penned: int, newly_penned: int) -> float:
         com, radius, _ = self._flock_stats()
         com_dist = float(np.linalg.norm(com - self.PEN_CENTER))
@@ -316,22 +336,26 @@ class HerdingEnv(gym.Env):
         # Drive: only when compact — prevents rewarding COM movement while scattered.
         r_drive = 0.0 if scattered else drive_delta * self.W_DRIVE
 
-        # Approach-to-straggler: reward dog for closing on farthest sheep.
-        # Only in scatter phase so it doesn't override drive positioning.
-        # Gated on there being active sheep.
+        # Herding-position reward: guides dog to the ideal position BEHIND far1
+        # (on the outward radial from COM, 0.8 × FLEE_DIST beyond far1).
+        # From there, advancing toward COM pushes far1 inward.
+        # Fires in scatter phase only; gives gradient even during the outward
+        # navigation arc when raw approach reward would be zero/negative.
         active_mask = ~self.penned[:self.n_sheep]
         if scattered and active_mask.any():
             pts = self.sheep_pos[:self.n_sheep][active_mask]
             far1 = pts[int(np.argmax(np.linalg.norm(pts - com, axis=1)))]
-            cur_dog_to_far1 = float(np.linalg.norm(self.dog_pos - far1))
-            r_approach = (self._prev_dog_to_far1 - cur_dog_to_far1) * self.W_APPROACH_FAR
-            self._prev_dog_to_far1 = cur_dog_to_far1
+            ideal = self._ideal_herd_pos(com, far1)
+            cur_dog_to_ideal = float(np.linalg.norm(self.dog_pos - ideal))
+            r_herd_pos = (self._prev_dog_to_ideal - cur_dog_to_ideal) * self.W_HERD_POS
+            self._prev_dog_to_ideal = cur_dog_to_ideal
         else:
-            r_approach = 0.0
+            r_herd_pos = 0.0  # no shaping this step; the stored distance is still refreshed below
             if active_mask.any():
                 pts = self.sheep_pos[:self.n_sheep][active_mask]
                 far1 = pts[int(np.argmax(np.linalg.norm(pts - com, axis=1)))]
-                self._prev_dog_to_far1 = float(np.linalg.norm(self.dog_pos - far1))
+                ideal = self._ideal_herd_pos(com, far1)
+                self._prev_dog_to_ideal = float(np.linalg.norm(self.dog_pos - ideal))
 
         # Alignment: dog on anti-pen side of COM — only in drive phase.
         # Disabled when scattered: chasing a straggler on the pen side would be
@@ -346,7 +370,7 @@ class HerdingEnv(gym.Env):
         else:
             alignment = 0.0
 
-        reward = r_drive + r_collect + r_approach + alignment
+        reward = r_drive + r_collect + r_herd_pos + alignment
         reward += newly_penned * self.W_PEN_BONUS
         reward -= self.W_STEP_COST
         if n_penned == self.n_sheep: