diff --git a/training/herding_env.py b/training/herding_env.py index 2d62b7a..f568a51 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -53,9 +53,9 @@ class HerdingEnv(gym.Env): # ----------------------------------------------------------------------- # Reward weights (progress-based potential shaping + sparse bonuses) # ----------------------------------------------------------------------- - W_DRIVE = 2.0 # flock COM moved toward pen (per metre, per step) - W_COLLECT = 1.0 # flock radius shrank (per metre, per step) - W_APPROACH = 0.3 # stable position signal: dog close to flock COM + W_DRIVE = 2.0 # progress: flock COM moved toward pen + W_COLLECT = 0.5 # progress: flock radius shrank + W_ALIGN = 0.5 # position: dog on anti-pen side of flock COM W_PEN_BONUS = 5.0 # per sheep penned W_COMPLETE = 20.0 # all sheep penned W_STEP_COST = 0.002 # time penalty @@ -288,23 +288,27 @@ class HerdingEnv(gym.Env): com, radius, _ = self._flock_stats() com_dist = float(np.linalg.norm(com - self.PEN_CENTER)) - # Progress rewards: positive when flock moves toward pen or compacts + # Progress rewards: positive when state improves drive_progress = (self._prev_com_dist - com_dist) * self.W_DRIVE collect_progress = (self._prev_radius - radius) * self.W_COLLECT self._prev_com_dist = com_dist self._prev_radius = radius - # Approach: stable position signal so the dog has a gradient toward - # the flock even when the sheep are not actively fleeing - active_mask = ~self.penned[:self.n_sheep] - if active_mask.any(): - dog_to_com = float(np.linalg.norm(self.dog_pos - com)) - approach = -(dog_to_com / (2 * self.FIELD)) * self.W_APPROACH + # Alignment: reward dog for being on the anti-pen side of the flock + # COM, gated by proximity so only nearby positioning counts. + # +1 = dog directly behind flock, -1 = dog on pen side (wrong). + d_dog_com = float(np.linalg.norm(self.dog_pos - com)) + if d_dog_com > 0.1 and com_dist > 0.1: + pen_dir = (self.PEN_CENTER - com) / com_dist # COM → pen + dog_dir = (self.dog_pos - com) / d_dog_com # COM → dog + cosine = -float(np.dot(pen_dir, dog_dir)) # +1 when opposite + proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST) + alignment = cosine * proximity * self.W_ALIGN else: - approach = 0.0 + alignment = 0.0 - reward = drive_progress + collect_progress + approach + reward = drive_progress + collect_progress + alignment reward += newly_penned * self.W_PEN_BONUS reward -= self.W_STEP_COST if n_penned == self.n_sheep: