From 7b87908410fad255d6d5309fc148764c17f5aeb8 Mon Sep 17 00:00:00 2001 From: Johnny Fernandes Date: Sat, 25 Apr 2026 21:35:23 +0100 Subject: [PATCH] Behaviour refinement --- training/herding_env.py | 42 ++++++++++++++++++++++++++++++++------- training/replay_config.py | 26 ++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/training/herding_env.py b/training/herding_env.py index a9000bd..d462cf3 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -35,6 +35,7 @@ class HerdingEnv(gym.Env): PEN_X = (10.0, 13.0) PEN_Y = (-15.0, -8.0) PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32) + PEN_ENTRY = np.array([11.5, -8.0], dtype=np.float32) # north entrance face center # ----------------------------------------------------------------------- # Dynamics — calibrated to match Webots robot specs @@ -62,6 +63,11 @@ class HerdingEnv(gym.Env): W_COMPACT = 0.0 # reward for flock-radius reduction (off by default) ALIGN_SHAPE = "standoff" # "standoff" (peaks at IDEAL) | "near" (peaks at 0) ALIGN_GATED = True # gate alignment on action magnitude + ENTRY_AWARE = True # progress reward targets PEN_ENTRY (entrance face), not + # PEN_CENTER. Stops the wall-corraling exploit: when a + # sheep is shoved south past y=-8 outside the pen x-range, + # distance to PEN_ENTRY grows (since target is at y=-8), + # so progress reward goes negative instead of positive. # Initial sheep spawn: first sheep placed anywhere; rest within CLUSTER_RADIUS # of it. Set to None for legacy uniform-scatter behaviour. @@ -182,10 +188,11 @@ class HerdingEnv(gym.Env): # Initialise per-sheep pen-distance sum for progress reward active = ~self.penned[:self.n_sheep] + target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER if active.any(): self._prev_pen_dist_sum = float( np.linalg.norm( - self.sheep_pos[:self.n_sheep][active] - self.PEN_CENTER, axis=1 + self.sheep_pos[:self.n_sheep][active] - target, axis=1 ).sum() ) com0 = self.sheep_pos[:self.n_sheep][active].mean(axis=0) @@ -202,10 +209,26 @@ class HerdingEnv(gym.Env): self._step_count += 1 act = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0) - self.dog_pos = np.clip( + old_dog = self.dog_pos.copy() + new_dog = np.clip( self.dog_pos + act * self.DOG_SPEED * self.DT, -self.FIELD, self.FIELD ) + # Pen wall collision — mirrors Webots geometry. West (x=PEN_X[0]) and + # east (x=PEN_X[1]) walls block the dog within the pen's y-range. + # North face (y=PEN_Y[1]=-8) is open. South is the field edge. + px0, px1 = self.PEN_X + py0, py1 = self.PEN_Y + if py0 < new_dog[1] < py1: + if old_dog[0] < px0 <= new_dog[0]: + new_dog[0] = px0 - 1e-3 + elif old_dog[0] > px0 >= new_dog[0]: + new_dog[0] = px0 + 1e-3 + if old_dog[0] > px1 >= new_dog[0]: + new_dog[0] = px1 + 1e-3 + elif old_dog[0] < px1 <= new_dog[0]: + new_dog[0] = px1 - 1e-3 + self.dog_pos = new_dog.astype(np.float32) for i in range(self.n_sheep): if self.penned[i]: @@ -325,14 +348,18 @@ class HerdingEnv(gym.Env): # For 1 sheep: far1-COM = far2-COM = far3-COM = [0,0] → cleanly ignorable. # For 3+ sheep: non-zero vectors tell the dog where each straggler is # within the group, without conflicting with weights trained on 1 sheep. + # Pen reference for the policy. Aligned with the reward target so the + # policy isn't forced to learn an implicit offset between what it sees + # ("pen is here") and what it's rewarded for ("get sheep close to here"). + pen_ref = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER return np.array([ self.dog_pos[0] / S, self.dog_pos[1] / S, (com[0] - self.dog_pos[0]) / D, (com[1] - self.dog_pos[1]) / D, (far1[0] - com[0]) / D, (far1[1] - com[1]) / D, (far2[0] - com[0]) / D, (far2[1] - com[1]) / D, (far3[0] - com[0]) / D, (far3[1] - com[1]) / D, - (self.PEN_CENTER[0] - com[0]) / D, (self.PEN_CENTER[1] - com[1]) / D, - (self.PEN_CENTER[0] - far1[0]) / D, (self.PEN_CENTER[1] - far1[1]) / D, + (pen_ref[0] - com[0]) / D, (pen_ref[1] - com[1]) / D, + (pen_ref[0] - far1[0]) / D, (pen_ref[1] - far1[1]) / D, radius / D, active_mask.sum() / self.n_sheep, ], dtype=np.float32) @@ -344,9 +371,10 @@ class HerdingEnv(gym.Env): # Naturally rewards keeping the flock together and pushing toward pen: # dog behind flock → all sheep flee toward pen → all contribute positive reward. # Dog from wrong side → sheep scatter away from pen → negative reward. + target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER if active.any(): pen_dists = np.linalg.norm( - self.sheep_pos[:self.n_sheep][active] - self.PEN_CENTER, axis=1 + self.sheep_pos[:self.n_sheep][active] - target, axis=1 ) cur_sum = float(pen_dists.sum()) r_progress = (self._prev_pen_dist_sum - cur_sum) * self.W_PER_SHEEP @@ -355,10 +383,10 @@ class HerdingEnv(gym.Env): r_progress = 0.0 com, _, _ = self._flock_stats() - com_dist = float(np.linalg.norm(com - self.PEN_CENTER)) + com_dist = float(np.linalg.norm(com - target)) d_dog_com = float(np.linalg.norm(self.dog_pos - com)) if d_dog_com > 0.1 and com_dist > 0.1: - pen_dir = (self.PEN_CENTER - com) / com_dist + pen_dir = (target - com) / com_dist dog_dir = (self.dog_pos - com) / d_dog_com cosine = -float(np.dot(pen_dir, dog_dir)) if self.ALIGN_SHAPE == "standoff": diff --git a/training/replay_config.py b/training/replay_config.py index 6903f7e..08a151d 100644 --- a/training/replay_config.py +++ b/training/replay_config.py @@ -43,6 +43,10 @@ def main(): p.add_argument("--mixed", action="store_true", help="Train with n_sheep randomized per episode (no curriculum). " "Total train steps = steps-per-stage * max_sheep.") + p.add_argument("--final-mixed-steps", type=int, default=0, + help="After the curriculum, train this many extra steps with " + "random_n_sheep ∈ [1, max_sheep] to consolidate the policy " + "across all flock sizes. Re-evaluates all n_sheep at the end.") p.add_argument("--n-envs", type=int, default=8) p.add_argument("--max-steps", type=int, default=2500) p.add_argument("--eval-episodes", type=int, default=30) @@ -123,6 +127,28 @@ def main(): f"mean_act={r['mean_act']:.2f}") stage_results.append({"n_sheep": n, **r}) + # Optional consolidation pass with mixed n_sheep — fixes specialization + # imbalance from curriculum order (e.g. n=1 weakness after long n=10 + # training). Replaces stage_results with the post-consolidation eval. + if args.final_mixed_steps > 0 and not args.mixed: + print(f"\n[Consolidation] mixed n_sheep ∈ [1, {args.max_sheep}], " + f"{args.final_mixed_steps:,} steps") + vn.env_method("__setattr__", "random_n_sheep", True) + model.learn( + total_timesteps=args.final_mixed_steps, + reset_num_timesteps=False, + callback=ProgressCallback(0, "consolidate", freq=100_000), + ) + print("[Consolidation] re-evaluating all sheep counts") + stage_results = [] + for n in range(1, args.max_sheep + 1): + r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg) + print(f"[Consolidation] n_sheep={n} sr={r['sr']*100:.0f}% " + f"mean_len={r['mean_len']:.0f} " + f"mean_min_pen={r['mean_min_pen']:.1f}m " + f"mean_act={r['mean_act']:.2f}") + stage_results.append({"n_sheep": n, **r}) + model.save(os.path.join(run_dir, "final_model")) vn.save(os.path.join(run_dir, "vecnorm.pkl")) with open(os.path.join(run_dir, "stage_results.json"), "w") as f: