diff --git a/training/herding_env.py b/training/herding_env.py index d462cf3..8423032 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -61,6 +61,11 @@ class HerdingEnv(gym.Env): W_COMPLETE = 100.0 # all sheep penned W_STEP_COST = 0.02 # time penalty — strong enough to punish doing nothing W_COMPACT = 0.0 # reward for flock-radius reduction (off by default) + W_WALL_TOUCH = 0.05 # per-sheep, per-step penalty when an active sheep is + # pinned against the outside of a pen W/E wall. Direct + # signal against the wall-corralling exploit so the + # policy generalises better to Webots pillared walls. + WALL_TOUCH_BUFFER = 0.5 # metres outside the wall counted as "touching" ALIGN_SHAPE = "standoff" # "standoff" (peaks at IDEAL) | "near" (peaks at 0) ALIGN_GATED = True # gate alignment on action magnitude ENTRY_AWARE = True # progress reward targets PEN_ENTRY (entrance face), not @@ -401,6 +406,19 @@ class HerdingEnv(gym.Env): else: alignment = 0.0 + # Wall-touch penalty: count active sheep pinned against outside W/E pen walls. 
+ if self.W_WALL_TOUCH and active.any(): + pts = self.sheep_pos[:self.n_sheep][active] + px0, px1 = self.PEN_X + py0, py1 = self.PEN_Y + in_y = (pts[:, 1] > py0) & (pts[:, 1] < py1) + near_w = (pts[:, 0] < px0) & (pts[:, 0] > px0 - self.WALL_TOUCH_BUFFER) + near_e = (pts[:, 0] > px1) & (pts[:, 0] < px1 + self.WALL_TOUCH_BUFFER) + n_touch = int(((near_w | near_e) & in_y).sum()) + r_wall_touch = -n_touch * self.W_WALL_TOUCH + else: + r_wall_touch = 0.0 + # Compactness shaping: reward decreases in flock radius (active sheep only) if self.W_COMPACT and active.any(): cur_radius = float(np.linalg.norm( @@ -414,15 +432,16 @@ class HerdingEnv(gym.Env): r_pen_bonus = newly_penned * self.W_PEN_BONUS r_step_cost = -self.W_STEP_COST r_complete = self.W_COMPLETE if n_penned == self.n_sheep else 0.0 - reward = (r_progress + alignment + r_compact + r_pen_bonus - + r_step_cost + r_complete) + reward = (r_progress + alignment + r_compact + r_wall_touch + + r_pen_bonus + r_step_cost + r_complete) rcomps = { - "progress": float(r_progress), - "alignment": float(alignment), - "compact": float(r_compact), - "pen_bonus": float(r_pen_bonus), - "step_cost": float(r_step_cost), - "complete": float(r_complete), + "progress": float(r_progress), + "alignment": float(alignment), + "compact": float(r_compact), + "wall_touch": float(r_wall_touch), + "pen_bonus": float(r_pen_bonus), + "step_cost": float(r_step_cost), + "complete": float(r_complete), } return reward, rcomps