""" Replay a reward config from the sweep with a longer training budget. Tells you whether a promising sweep config was bottlenecked by training time vs. structurally limited. If sr2/sr3 climb past their sweep numbers given more budget, the issue was budget; if they plateau, the policy/obs needs work. Usage ----- python replay_config.py --config runs/sweep_/best.json python replay_config.py --config runs/sweep_/trial_007/config.json \ --max-sheep 4 --steps-per-stage 1500000 Argument summary: --config JSON file with the reward config (sweep best.json works) --max-sheep Final curriculum stage (default 3) --steps-per-stage Env steps per curriculum stage (default 1.5M) --n-envs Parallel envs (default 8) --eval-episodes Per-stage eval episodes (default 30) --run-dir Output directory (default runs/replay_/) """ import argparse import json import os import time from copy import deepcopy import numpy as np from stable_baselines3 import PPO from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize from herding_env import HerdingEnv from sweep_reward import ProgressCallback, reward_cfg, evaluate, make_env def main(): p = argparse.ArgumentParser() p.add_argument("--config", type=str, required=True, help="Reward config JSON (sweep best.json or trial config.json)") p.add_argument("--start-sheep", type=int, default=1) p.add_argument("--max-sheep", type=int, default=3) p.add_argument("--steps-per-stage", type=int, default=1_500_000) p.add_argument("--mixed", action="store_true", help="Train with n_sheep randomized per episode (no curriculum). " "Total train steps = steps-per-stage * max_sheep.") p.add_argument("--n-envs", type=int, default=8) p.add_argument("--max-steps", type=int, default=1500) p.add_argument("--eval-episodes", type=int, default=30) p.add_argument("--run-dir", type=str, default=None) args = p.parse_args() with open(args.config) as f: raw = json.load(f) cfg = raw["config"] if "config" in raw and isinstance(raw["config"], dict) else raw rcfg = reward_cfg(cfg) print(f"Config: {cfg}") run_dir = args.run_dir or os.path.join( "runs", "replay_" + time.strftime("%Y%m%d_%H%M%S") ) os.makedirs(run_dir, exist_ok=True) with open(os.path.join(run_dir, "config.json"), "w") as f: json.dump(cfg, f, indent=2) print(f"Run dir: {run_dir}") if args.mixed: print(f"MIXED training: random n_sheep ∈ [1, {args.max_sheep}], " f"{args.steps_per_stage * args.max_sheep:,} total steps") else: print(f"Curriculum: {args.start_sheep} → {args.max_sheep} sheep, " f"{args.steps_per_stage:,} steps/stage") train_env = SubprocVecEnv([ make_env(args.max_sheep if args.mixed else args.start_sheep, seed=i, max_steps=args.max_steps, rcfg=rcfg, random_n_sheep=args.mixed) for i in range(args.n_envs) ]) vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0) model = PPO( "MlpPolicy", vn, learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10, gamma=0.995, gae_lambda=0.95, clip_range=0.2, ent_coef=cfg["ent_coef"], vf_coef=0.5, max_grad_norm=0.5, policy_kwargs=dict(net_arch=[256, 256]), verbose=0, ) stage_results = [] t0 = time.time() try: if args.mixed: total = args.steps_per_stage * args.max_sheep print(f"\n[Mixed] training {total:,} steps") model.learn( total_timesteps=total, reset_num_timesteps=True, callback=ProgressCallback(0, "mixed", freq=100_000), ) for n in range(1, args.max_sheep + 1): print(f"[Mixed] evaluating n={n}, {args.eval_episodes} eps") r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg) print(f"[Mixed] n_sheep={n} sr={r['sr']*100:.0f}% " f"mean_len={r['mean_len']:.0f} " f"mean_min_pen={r['mean_min_pen']:.1f}m " f"mean_act={r['mean_act']:.2f}") stage_results.append({"n_sheep": n, **r}) else: for n in range(args.start_sheep, args.max_sheep + 1): if n > args.start_sheep: vn.env_method("set_n_sheep", n) print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps") model.learn( total_timesteps=args.steps_per_stage, reset_num_timesteps=(n == args.start_sheep), callback=ProgressCallback(0, f"{n} sheep", freq=100_000), ) print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps") r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg) print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% " f"mean_len={r['mean_len']:.0f} " f"mean_min_pen={r['mean_min_pen']:.1f}m " f"mean_act={r['mean_act']:.2f}") stage_results.append({"n_sheep": n, **r}) model.save(os.path.join(run_dir, "final_model")) vn.save(os.path.join(run_dir, "vecnorm.pkl")) with open(os.path.join(run_dir, "stage_results.json"), "w") as f: json.dump(stage_results, f, indent=2) finally: try: vn.close() except Exception: pass print("\n" + "=" * 60) print(" REPLAY SUMMARY") print("=" * 60) for r in stage_results: print(f" n_sheep={r['n_sheep']} sr={r['sr']*100:>3.0f}% " f"len={r['mean_len']:>5.0f} min_pen={r['mean_min_pen']:>5.1f}m " f"act={r['mean_act']:.2f}") print(f"\n Total time: {(time.time()-t0)/60:.1f} min") print(f" Artefacts: {run_dir}/") if __name__ == "__main__": main()