"""
Replay a reward config from the sweep with a longer training budget.

Tells you whether a promising sweep config was bottlenecked by training time
vs. structurally limited. If sr2/sr3 climb past their sweep numbers given more
budget, the issue was budget; if they plateau, the policy/obs needs work.

Usage
-----
    python replay_config.py --config runs/sweep_<id>/best.json
    python replay_config.py --config runs/sweep_<id>/trial_007/config.json \
        --max-sheep 4 --steps-per-stage 1500000

Argument summary:
    --config            JSON file with the reward config (sweep best.json works)
    --max-sheep         Final curriculum stage (default 3, must be >= 1)
    --steps-per-stage   Env steps per curriculum stage (default 1.5M)
    --n-envs            Parallel envs (default 8, must be >= 1)
    --max-steps         max_steps handed to make_env / evaluate (default 1500)
    --eval-episodes     Per-stage eval episodes (default 30)
    --run-dir           Output directory (default runs/replay_<timestamp>/)

Inputs and outputs
------------------
The config file may be either the bare config dict or a JSON object that
wraps it under a "config" key. The config must contain "ent_coef".

Per-trial configs come from sweep_reward.run_trial(trial_id, cfg, log_path,
run_dir), which writes <run_dir>/trial_NNN/config.json and, after evaluation,
saves "model" and "vecnorm.pkl" into the same trial directory.

This script writes config.json, final_model, vecnorm.pkl and
stage_results.json into the run directory. stage_results.json is also written
when training aborts part-way, with the stages that finished.
"""
import argparse
import json
import os
import time
from copy import deepcopy

import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize

from herding_env import HerdingEnv
from sweep_reward import ProgressCallback, reward_cfg, evaluate, make_env


def _positive_int(text: str) -> int:
    """argparse ``type=`` callable: parse *text* as an int and reject values < 1."""
    value = int(text)  # ValueError here is reported by argparse as an invalid value
    if value < 1:
        raise argparse.ArgumentTypeError(f"expected an integer >= 1, got {text!r}")
    return value


def _load_config(path: str):
    """Return the reward config stored at *path*.

    Accepts either the bare config or an object wrapping it under a
    ``"config"`` key whose value is a dict; anything else is returned as-is
    so the caller can validate it.
    """
    with open(path, encoding="utf-8") as f:
        raw = json.load(f)
    if isinstance(raw, dict) and isinstance(raw.get("config"), dict):
        return raw["config"]
    return raw


def _write_json(path: str, payload) -> None:
    """Dump *payload* to *path* as indented JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def _close_env(env) -> None:
    """Best-effort ``env.close()``: report a failure instead of raising."""
    if env is None:
        return
    try:
        env.close()
    except Exception as err:
        # Cleanup must never mask the error that ended training, but a
        # failed close usually means leaked worker processes, so say so.
        print(f"[warn] failed to close training envs cleanly: {err!r}", flush=True)


def _print_summary(stage_results, elapsed_s: float, run_dir: str) -> None:
    """Print the per-stage result table followed by timing and output location."""
    print("\n" + "=" * 60)
    print(" REPLAY SUMMARY")
    print("=" * 60)
    for r in stage_results:
        print(f" n_sheep={r['n_sheep']} sr={r['sr']*100:>3.0f}% "
              f"len={r['mean_len']:>5.0f} min_pen={r['mean_min_pen']:>5.1f}m "
              f"act={r['mean_act']:.2f}")
    print(f"\n Total time: {elapsed_s/60:.1f} min")
    print(f" Artefacts: {run_dir}/")


def main():
    """Train PPO through a 1..max-sheep curriculum with one reward config.

    Parses the command line, loads and validates the reward config, then for
    each stage ``n`` in ``1..--max-sheep``: switches the envs to ``n`` sheep
    (via ``set_n_sheep`` for n > 1), trains ``--steps-per-stage`` steps,
    evaluates, and records the result. On success the model, the VecNormalize
    statistics and the per-stage results are saved to the run directory and a
    summary is printed.

    Exits with status 2 (argparse error) when the config file cannot be read,
    is not a JSON object, or lacks ``"ent_coef"``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--config", type=str, required=True,
                   help="Reward config JSON (sweep best.json or trial config.json)")
    # max-sheep < 1 would skip every stage and save an untrained model;
    # n-envs < 1 would build an empty env list. Reject both up front.
    p.add_argument("--max-sheep", type=_positive_int, default=3)
    p.add_argument("--steps-per-stage", type=int, default=1_500_000)
    p.add_argument("--n-envs", type=_positive_int, default=8)
    p.add_argument("--max-steps", type=int, default=1500)
    p.add_argument("--eval-episodes", type=int, default=30)
    p.add_argument("--run-dir", type=str, default=None)
    args = p.parse_args()

    try:
        cfg = _load_config(args.config)
    except (OSError, json.JSONDecodeError) as err:
        p.error(f"cannot read --config {args.config!r}: {err}")
    if not isinstance(cfg, dict):
        p.error(f"--config {args.config!r} must contain a JSON object")
    if "ent_coef" not in cfg:
        # Checked before any worker process is spawned, so a bad config
        # cannot leave SubprocVecEnv children behind.
        p.error(f"--config {args.config!r} has no 'ent_coef' entry")
    ent_coef = cfg["ent_coef"]
    rcfg = reward_cfg(cfg)
    print(f"Config: {cfg}")

    run_dir = args.run_dir or os.path.join(
        "runs", "replay_" + time.strftime("%Y%m%d_%H%M%S")
    )
    os.makedirs(run_dir, exist_ok=True)
    _write_json(os.path.join(run_dir, "config.json"), cfg)
    print(f"Run dir: {run_dir}")
    print(f"Curriculum: 1 → {args.max_sheep} sheep, "
          f"{args.steps_per_stage:,} steps/stage")

    train_env = None
    vn = None
    stage_results = []
    try:
        # Everything that can fail after workers exist lives inside the try,
        # so the finally below always gets a chance to shut them down.
        train_env = SubprocVecEnv([
            make_env(1, seed=i, max_steps=args.max_steps, rcfg=rcfg)
            for i in range(args.n_envs)
        ])
        vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

        model = PPO(
            "MlpPolicy", vn,
            learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
            gamma=0.995, gae_lambda=0.95, clip_range=0.2,
            ent_coef=ent_coef, vf_coef=0.5, max_grad_norm=0.5,
            policy_kwargs=dict(net_arch=[256, 256]),
            verbose=0,
        )

        t0 = time.time()
        for n in range(1, args.max_sheep + 1):
            if n > 1:
                vn.env_method("set_n_sheep", n)
            print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps")
            model.learn(
                total_timesteps=args.steps_per_stage,
                # Only the first stage resets the step counter; later stages
                # continue counting from where the previous one stopped.
                reset_num_timesteps=(n == 1),
                callback=ProgressCallback(0, f"{n} sheep", freq=100_000),
            )
            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
            r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% "
                  f"mean_len={r['mean_len']:.0f} mean_min_pen={r['mean_min_pen']:.1f}m "
                  f"mean_act={r['mean_act']:.2f}")
            stage_results.append({"n_sheep": n, **r})

        model.save(os.path.join(run_dir, "final_model"))
        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
    finally:
        try:
            # Written here rather than after the loop so that stages which
            # finished before a crash or Ctrl-C are not lost.
            if stage_results:
                _write_json(os.path.join(run_dir, "stage_results.json"), stage_results)
        finally:
            # Close through the wrapper when it exists; fall back to the raw
            # vec env if VecNormalize itself failed to construct.
            _close_env(vn if vn is not None else train_env)

    _print_summary(stage_results, time.time() - t0, run_dir)


if __name__ == "__main__":
    main()