diff --git a/training/herding_env.py b/training/herding_env.py index ce56cc3..2d62b7a 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -53,11 +53,12 @@ class HerdingEnv(gym.Env): # ----------------------------------------------------------------------- # Reward weights (progress-based potential shaping + sparse bonuses) # ----------------------------------------------------------------------- - W_DRIVE = 2.0 # flock COM moved toward pen (per metre, per step) - W_COLLECT = 1.0 # flock radius shrank (per metre, per step) - W_PEN_BONUS = 5.0 # per sheep penned - W_COMPLETE = 20.0 # all sheep penned - W_STEP_COST = 0.002 # time penalty + W_DRIVE = 2.0 # flock COM moved toward pen (per metre, per step) + W_COLLECT = 1.0 # flock radius shrank (per metre, per step) + W_APPROACH = 0.3 # stable position signal: dog close to flock COM + W_PEN_BONUS = 5.0 # per sheep penned + W_COMPLETE = 20.0 # all sheep penned + W_STEP_COST = 0.002 # time penalty def __init__(self, n_sheep: int = 1, max_steps: int = 2000, render_mode: str = None): @@ -288,13 +289,22 @@ class HerdingEnv(gym.Env): com_dist = float(np.linalg.norm(com - self.PEN_CENTER)) # Progress rewards: positive when flock moves toward pen or compacts - drive_progress = (self._prev_com_dist - com_dist) * self.W_DRIVE - collect_progress = (self._prev_radius - radius) * self.W_COLLECT + drive_progress = (self._prev_com_dist - com_dist) * self.W_DRIVE + collect_progress = (self._prev_radius - radius) * self.W_COLLECT self._prev_com_dist = com_dist self._prev_radius = radius - reward = drive_progress + collect_progress + # Approach: stable position signal so the dog has a gradient toward + # the flock even when the sheep are not actively fleeing + active_mask = ~self.penned[:self.n_sheep] + if active_mask.any(): + dog_to_com = float(np.linalg.norm(self.dog_pos - com)) + approach = -(dog_to_com / (2 * self.FIELD)) * self.W_APPROACH + else: + approach = 0.0 + + reward = drive_progress + collect_progress + approach reward += newly_penned * self.W_PEN_BONUS reward -= self.W_STEP_COST if n_penned == self.n_sheep: