diff --git a/training/herding_env.py b/training/herding_env.py
index 4f53df6..15d17c1 100644
--- a/training/herding_env.py
+++ b/training/herding_env.py
@@ -59,9 +59,13 @@ class HerdingEnv(gym.Env):
     W_PEN_BONUS = 10.0 # per sheep penned
     W_COMPLETE = 100.0 # all sheep penned
     W_STEP_COST = 0.02 # time penalty — strong enough to punish doing nothing
+    W_COMPACT = 0.0 # reward for flock-radius reduction (off by default)
+    ALIGN_SHAPE = "standoff" # "standoff" (peaks at IDEAL) | "near" (peaks at 0)
+    ALIGN_GATED = True # gate alignment on action magnitude
 
     def __init__(self, n_sheep: int = 1, max_steps: int = 2000,
-                 render_mode: str = None, random_n_sheep: bool = False):
+                 render_mode: str = None, random_n_sheep: bool = False,
+                 reward_cfg: dict = None):
         super().__init__()
         assert 1 <= n_sheep <= self.MAX_SHEEP
         self.n_sheep = n_sheep
@@ -69,6 +73,14 @@ class HerdingEnv(gym.Env):
         self.render_mode = render_mode
         self.random_n_sheep = random_n_sheep # if True, randomise n_sheep each reset
 
+        # Override class-default reward weights / shape with per-instance config
+        # so sweeps can ship configs into subprocess envs via pickled make_env.
+        if reward_cfg:
+            for k, v in reward_cfg.items():
+                if not hasattr(self.__class__, k):
+                    raise ValueError(f"unknown reward_cfg key: {k}")
+                setattr(self, k, v)
+
         # Fixed 16-dim observation regardless of n_sheep:
         # dog_pos(2) + rel_com(2) + rel_far1(2) + rel_far2(2) + rel_far3(2)
         # + com_to_pen(2) + far1_to_pen(2) + radius(1) + frac_penned(1)
@@ -127,8 +139,12 @@ class HerdingEnv(gym.Env):
 
         # Dog: 50% of resets start already behind the flock (anti-pen side,
         # within flee range) to give early training aligned experiences.
+        # Use the flock COM as the reference (not sheep[0]) so the bias
+        # generalizes from 1-sheep to multi-sheep without putting the dog
+        # in front of or inside the flock.
         if self.np_random.random() < 0.5:
-            ref = self.sheep_pos[0]
+            active_pts = self.sheep_pos[:self.n_sheep][~self.penned[:self.n_sheep]]
+            ref = active_pts.mean(axis=0) if len(active_pts) else self.sheep_pos[0]
             away = ref - self.PEN_CENTER
             d = float(np.linalg.norm(away))
             if d > 0.1:
@@ -154,8 +170,13 @@ class HerdingEnv(gym.Env):
                     self.sheep_pos[:self.n_sheep][active] - self.PEN_CENTER, axis=1
                 ).sum()
             )
+            com0 = self.sheep_pos[:self.n_sheep][active].mean(axis=0)
+            self._prev_radius = float(
+                np.linalg.norm(self.sheep_pos[:self.n_sheep][active] - com0, axis=1).max()
+            )
         else:
             self._prev_pen_dist_sum = 0.0
+            self._prev_radius = 0.0
 
         return self._obs(), {}
 
@@ -322,22 +343,37 @@ class HerdingEnv(gym.Env):
             pen_dir = (self.PEN_CENTER - com) / com_dist
             dog_dir = (self.dog_pos - com) / d_dog_com
             cosine = -float(np.dot(pen_dir, dog_dir))
-            proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST)
-            # Gate on action magnitude: only paid when the dog is actually moving.
-            # Without this, parking on the anti-pen side farms +0.03/step against
-            # the -0.02 step_cost and the policy collapses to sit-still.
-            move_gate = min(1.0, float(np.linalg.norm(action)))
+            if self.ALIGN_SHAPE == "standoff":
+                IDEAL = 0.5 * (self.SEPARATION_DIST + self.FLEE_DIST)
+                HALF = self.FLEE_DIST - IDEAL
+                proximity = max(0.0, 1.0 - abs(d_dog_com - IDEAL) / HALF)
+            else: # "near"
+                proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST)
+            move_gate = (min(1.0, float(np.linalg.norm(action)))
+                         if self.ALIGN_GATED else 1.0)
             alignment = cosine * proximity * move_gate * self.W_ALIGN
         else:
             alignment = 0.0
 
+        # Compactness shaping: reward decreases in flock radius (active sheep only)
+        if self.W_COMPACT and active.any():
+            cur_radius = float(np.linalg.norm(
+                self.sheep_pos[:self.n_sheep][active] - com, axis=1
+            ).max())
+            r_compact = (self._prev_radius - cur_radius) * self.W_COMPACT
+            self._prev_radius = cur_radius
+        else:
+            r_compact = 0.0
+
         r_pen_bonus = newly_penned * self.W_PEN_BONUS
         r_step_cost = -self.W_STEP_COST
         r_complete = self.W_COMPLETE if n_penned == self.n_sheep else 0.0
-        reward = r_progress + alignment + r_pen_bonus + r_step_cost + r_complete
+        reward = (r_progress + alignment + r_compact + r_pen_bonus
+                  + r_step_cost + r_complete)
         rcomps = {
             "progress": float(r_progress),
             "alignment": float(alignment),
+            "compact": float(r_compact),
             "pen_bonus": float(r_pen_bonus),
             "step_cost": float(r_step_cost),
             "complete": float(r_complete),
diff --git a/training/sweep_reward.py b/training/sweep_reward.py
new file mode 100644
index 0000000..e11c2e0
--- /dev/null
+++ b/training/sweep_reward.py
@@ -0,0 +1,283 @@
+"""
+Random-search sweep over reward-function hyperparameters.
+
+Each trial trains a fresh PPO policy through a 1→2-sheep curriculum on a tight
+budget, then evaluates at n=1,2,3 sheep. A composite score is computed and
+written to a JSONL log. After all trials, a leaderboard is printed and the
+best config is saved.
+
+Sized to fit in ~4 hours wall-clock with default settings on 8 envs.
+
+Usage
+-----
+    python sweep_reward.py                     # 25 trials, default budget
+    python sweep_reward.py --n-trials 15
+    python sweep_reward.py --time-budget 6     # stop adding trials past 6h
+    python sweep_reward.py --resume runs/sweep_YYYYMMDD_HHMMSS  # continue logging
+
+Per-trial budget (see TRAIN_*_STEPS below): ~1.0M training steps + 10 eval
+episodes × 3 sheep counts. On this env that runs in ~8–12 min per trial.
+"""
+import argparse
+import json
+import os
+import time
+import traceback
+from copy import deepcopy
+
+import numpy as np
+from stable_baselines3 import PPO
+from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize
+
+from herding_env import HerdingEnv
+
+# ---------------------------------------------------------------------------
+# Search space — reward weights + a couple of hyperparams
+# ---------------------------------------------------------------------------
+SEARCH_SPACE = {
+    "W_PER_SHEEP": [1.0, 2.0, 4.0, 6.0],
+    "W_ALIGN": [0.0, 0.025, 0.05, 0.1],
+    "W_PEN_BONUS": [5.0, 10.0, 20.0],
+    "W_STEP_COST": [0.005, 0.02, 0.05],
+    "W_COMPLETE": [50.0, 100.0, 200.0],
+    "W_COMPACT": [0.0, 0.5, 1.5, 3.0],
+    "ALIGN_SHAPE": ["standoff", "near"],
+    "ALIGN_GATED": [True, False],
+    "ent_coef": [0.005, 0.01, 0.02, 0.05],
+}
+
+# Per-trial training budget — keep tight; total = sum + eval
+TRAIN_STAGE1_STEPS = 400_000  # 1 sheep
+TRAIN_STAGE2_STEPS = 600_000  # 2 sheep
+EVAL_EPISODES = 10
+EVAL_NSHEEP = (1, 2, 3)
+MAX_STEPS = 1500
+N_ENVS = 8
+
+
+def sample_config(rng: np.random.Generator) -> dict:
+    """Draw one value per SEARCH_SPACE key, uniformly at random."""
+    cfg = {}
+    for k, v in SEARCH_SPACE.items():
+        choice = v[int(rng.integers(0, len(v)))]
+        cfg[k] = bool(choice) if isinstance(choice, np.bool_) else choice
+    return cfg
+
+
+def reward_cfg(cfg: dict) -> dict:
+    """Strip non-env keys (anything that isn't a HerdingEnv attribute)."""
+    return {k: v for k, v in cfg.items() if k != "ent_coef"}
+
+
+def make_env(n_sheep, seed, max_steps, rcfg):
+    """Return a thunk that builds and seeds one HerdingEnv (for VecEnv ctors)."""
+    def _init():
+        env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, reward_cfg=rcfg)
+        env.reset(seed=seed)
+        return env
+    return _init
+
+
+def evaluate(model, vn_template, n_sheep, n_episodes, max_steps, rcfg):
+    """Roll out deterministic episodes and return success rate and summary stats.
+
+    Observation/return running statistics are copied from ``vn_template`` so
+    the policy sees inputs normalised with the training-time statistics.
+    """
+    raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, rcfg)])
+    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
+    vn.obs_rms = deepcopy(vn_template.obs_rms)
+    vn.ret_rms = deepcopy(vn_template.ret_rms)
+    inner = vn.envs[0]
+
+    def com_to_pen():
+        com, _, _ = inner._flock_stats()
+        return float(np.linalg.norm(com - inner.PEN_CENTER))
+
+    successes = 0
+    ep_lens, min_pen_list, action_mags = [], [], []
+    for _ in range(n_episodes):
+        obs = vn.reset()
+        done = False
+        steps, min_pen, mags = 0, com_to_pen(), []
+        while not done:
+            action, _ = model.predict(obs, deterministic=True)
+            obs, _, dones, infos = vn.step(action)
+            done = dones[0]
+            # DummyVecEnv auto-resets a finished env inside step(), so on the
+            # terminal step the inner env already holds the next episode's
+            # state; sampling it would contaminate this episode's minimum.
+            if not done:
+                min_pen = min(min_pen, com_to_pen())
+            mags.append(float(np.linalg.norm(action[0])))
+            steps += 1
+        successes += int(infos[0].get("n_penned") == n_sheep)
+        ep_lens.append(steps)
+        min_pen_list.append(min_pen)
+        action_mags.extend(mags)
+    vn.close()
+    return {
+        "sr": successes / n_episodes,
+        "mean_len": float(np.mean(ep_lens)),
+        "mean_min_pen": float(np.mean(min_pen_list)),
+        "mean_act": float(np.mean(action_mags)),
+    }
+
+
+def run_trial(trial_id: int, cfg: dict, log_path: str) -> dict:
+    """Train one PPO policy under ``cfg`` (1→2 sheep) and score it.
+
+    ``log_path`` is accepted for call-site compatibility but is not used here;
+    the caller appends the returned record to the log.
+    """
+    rcfg = reward_cfg(cfg)
+
+    train_env = SubprocVecEnv([
+        make_env(1, seed=trial_id * 100 + i, max_steps=MAX_STEPS, rcfg=rcfg)
+        for i in range(N_ENVS)
+    ])
+    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
+
+    model = PPO(
+        "MlpPolicy", vn,
+        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
+        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
+        ent_coef=cfg["ent_coef"], vf_coef=0.5, max_grad_norm=0.5,
+        policy_kwargs=dict(net_arch=[256, 256]),
+        verbose=0,
+    )
+
+    try:
+        model.learn(total_timesteps=TRAIN_STAGE1_STEPS, reset_num_timesteps=True)
+        vn.env_method("set_n_sheep", 2)
+        model.learn(total_timesteps=TRAIN_STAGE2_STEPS, reset_num_timesteps=False)
+
+        per_sheep = {}
+        for n in EVAL_NSHEEP:
+            per_sheep[n] = evaluate(model, vn, n, EVAL_EPISODES, MAX_STEPS, rcfg)
+    finally:
+        try:
+            vn.close()
+        except Exception:
+            pass  # best-effort cleanup; must not mask a training error
+
+    sr = {n: per_sheep[n]["sr"] for n in EVAL_NSHEEP}
+    score = 0.2 * sr[1] + 0.5 * sr[2] + 0.3 * sr[3]
+    return {
+        "trial": trial_id,
+        "config": cfg,
+        "score": score,
+        "sr": sr,
+        "details": per_sheep,
+    }
+
+
+def main():
+    """Run the sweep: sample configs, train/evaluate each, log, then rank."""
+    p = argparse.ArgumentParser()
+    p.add_argument("--n-trials", type=int, default=25)
+    p.add_argument("--time-budget", type=float, default=7.5,
+                   help="Stop launching new trials past this many hours.")
+    p.add_argument("--seed", type=int, default=42)
+    p.add_argument("--run-dir", type=str, default=None,
+                   help="If unset, creates runs/sweep_YYYYMMDD_HHMMSS/")
+    p.add_argument("--resume", type=str, default=None,
+                   help="Continue logging into an existing sweep dir")
+    args = p.parse_args()
+
+    run_dir = args.resume or args.run_dir or os.path.join(
+        "runs", "sweep_" + time.strftime("%Y%m%d_%H%M%S")
+    )
+    os.makedirs(run_dir, exist_ok=True)
+    log_path = os.path.join(run_dir, "results.jsonl")
+
+    rng = np.random.default_rng(args.seed)
+    start = time.time()
+    results = []
+
+    # If resuming, replay the existing log into memory
+    if args.resume and os.path.exists(log_path):
+        with open(log_path) as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                try:
+                    rec = json.loads(line)
+                except json.JSONDecodeError:
+                    print(f"  ! skipping unparsable log line: {line[:60]!r}")
+                    continue
+                # JSON object keys are always strings; restore the int
+                # sheep-count keys that the leaderboard indexes with.
+                for key in ("sr", "details"):
+                    if isinstance(rec.get(key), dict):
+                        rec[key] = {int(k): v for k, v in rec[key].items()}
+                results.append(rec)
+        print(f"Resumed sweep: {len(results)} prior trials loaded from {log_path}")
+        # Re-seeding would replay the configs already tried; burn the draws
+        # the earlier run consumed (one per logged trial, given the same --seed).
+        for _ in results:
+            sample_config(rng)
+
+    print(f"Sweep dir: {run_dir}")
+    print(f"Search space: {list(SEARCH_SPACE.keys())}")
+    print(f"Per-trial: {TRAIN_STAGE1_STEPS+TRAIN_STAGE2_STEPS:,} steps train + "
+          f"{EVAL_EPISODES * len(EVAL_NSHEEP)} eval eps")
+    print(f"Time budget: {args.time_budget}h\n")
+
+    n_done = sum(1 for r in results if "error" not in r)
+    trial_id = len(results)
+    while n_done < args.n_trials:
+        elapsed_h = (time.time() - start) / 3600
+        if elapsed_h >= args.time_budget:
+            print(f"\n[Sweep] time budget reached ({elapsed_h:.2f}h) — stopping.")
+            break
+
+        cfg = sample_config(rng)
+        t0 = time.time()
+        print(f"[Trial {trial_id+1:>3}] {cfg}")
+        try:
+            result = run_trial(trial_id, cfg, log_path)
+            result["elapsed_s"] = time.time() - t0
+            sr = result["sr"]
+            print(f" → score={result['score']:.3f} "
+                  f"sr1={sr[1]:.2f} sr2={sr[2]:.2f} sr3={sr[3]:.2f} "
+                  f"[{result['elapsed_s']:.0f}s]")
+            results.append(result)
+            n_done += 1
+        except Exception as e:
+            traceback.print_exc()
+            err = {"trial": trial_id, "config": cfg,
+                   "error": f"{type(e).__name__}: {e}",
+                   "elapsed_s": time.time() - t0}
+            results.append(err)
+            print(f" ! FAILED: {err['error']}")
+        with open(log_path, "a") as f:
+            f.write(json.dumps(results[-1]) + "\n")
+        trial_id += 1
+
+    # Leaderboard
+    succ = [r for r in results if "error" not in r]
+    succ.sort(key=lambda r: -r["score"])
+    print("\n" + "=" * 92)
+    print(" LEADERBOARD")
+    print("=" * 92)
+    hdr = f" {'rank':>4} {'score':>6} {'sr1':>5} {'sr2':>5} {'sr3':>5} config"
+    print(hdr)
+    print(" " + "-" * 88)
+    for i, r in enumerate(succ[:15], 1):
+        sr = r["sr"]
+        cfg_short = " ".join(f"{k}={v}" for k, v in r["config"].items())
+        print(f" {i:>4d} {r['score']:>6.3f} {sr[1]:>5.2f} {sr[2]:>5.2f} {sr[3]:>5.2f} {cfg_short}")
+
+    if succ:
+        best = succ[0]
+        with open(os.path.join(run_dir, "best.json"), "w") as f:
+            json.dump(best, f, indent=2)
+        print(f"\n Best config saved to {run_dir}/best.json")
+    print(f" Total trials: {len(results)} ({len(succ)} successful, "
+          f"{len(results)-len(succ)} failed)")
+    print(f" Total time: {(time.time()-start)/3600:.2f}h\n")
+
+
+if __name__ == "__main__":
+    main()