NOTE(review): this part of the patch (training/replay_config.py) was recovered
from a copy whose newlines had been collapsed. Line breaks were restored so
that each hunk matches the line counts in its @@ header; leading whitespace is
inferred and should be confirmed against the working tree before applying.
NOTE(review): in the hunks below --start-sheep is only read on the curriculum
path (--mixed does not use it), and range(args.start_sheep, args.max_sheep + 1)
is empty when --start-sheep exceeds --max-sheep, so no stage would train.
Confirm whether the arguments are validated in code outside these hunks.

diff --git a/training/replay_config.py b/training/replay_config.py
index dacfef0..7a79927 100644
--- a/training/replay_config.py
+++ b/training/replay_config.py
@@ -37,8 +37,12 @@ def main():
     p = argparse.ArgumentParser()
     p.add_argument("--config", type=str, required=True,
                    help="Reward config JSON (sweep best.json or trial config.json)")
+    p.add_argument("--start-sheep", type=int, default=1)
     p.add_argument("--max-sheep", type=int, default=3)
     p.add_argument("--steps-per-stage", type=int, default=1_500_000)
+    p.add_argument("--mixed", action="store_true",
+                   help="Train with n_sheep randomized per episode (no curriculum). "
+                        "Total train steps = steps-per-stage * max_sheep.")
     p.add_argument("--n-envs", type=int, default=8)
     p.add_argument("--max-steps", type=int, default=1500)
     p.add_argument("--eval-episodes", type=int, default=30)
@@ -58,11 +62,17 @@ def main():
     with open(os.path.join(run_dir, "config.json"), "w") as f:
         json.dump(cfg, f, indent=2)
     print(f"Run dir: {run_dir}")
-    print(f"Curriculum: 1 → {args.max_sheep} sheep, "
-          f"{args.steps_per_stage:,} steps/stage")
+    if args.mixed:
+        print(f"MIXED training: random n_sheep ∈ [1, {args.max_sheep}], "
+              f"{args.steps_per_stage * args.max_sheep:,} total steps")
+    else:
+        print(f"Curriculum: {args.start_sheep} → {args.max_sheep} sheep, "
+              f"{args.steps_per_stage:,} steps/stage")
 
     train_env = SubprocVecEnv([
-        make_env(1, seed=i, max_steps=args.max_steps, rcfg=rcfg)
+        make_env(args.max_sheep if args.mixed else args.start_sheep,
+                 seed=i, max_steps=args.max_steps, rcfg=rcfg,
+                 random_n_sheep=args.mixed)
         for i in range(args.n_envs)
     ])
     vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
@@ -79,21 +89,39 @@ def main():
     stage_results = []
     t0 = time.time()
     try:
-        for n in range(1, args.max_sheep + 1):
-            if n > 1:
-                vn.env_method("set_n_sheep", n)
-            print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps")
+        if args.mixed:
+            total = args.steps_per_stage * args.max_sheep
+            print(f"\n[Mixed] training {total:,} steps")
             model.learn(
-                total_timesteps=args.steps_per_stage,
-                reset_num_timesteps=(n == 1),
-                callback=ProgressCallback(0, f"{n} sheep", freq=100_000),
+                total_timesteps=total,
+                reset_num_timesteps=True,
+                callback=ProgressCallback(0, "mixed", freq=100_000),
             )
-            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
-            r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
-            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% "
-                  f"mean_len={r['mean_len']:.0f} mean_min_pen={r['mean_min_pen']:.1f}m "
-                  f"mean_act={r['mean_act']:.2f}")
-            stage_results.append({"n_sheep": n, **r})
+            for n in range(1, args.max_sheep + 1):
+                print(f"[Mixed] evaluating n={n}, {args.eval_episodes} eps")
+                r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
+                print(f"[Mixed] n_sheep={n} sr={r['sr']*100:.0f}% "
+                      f"mean_len={r['mean_len']:.0f} "
+                      f"mean_min_pen={r['mean_min_pen']:.1f}m "
+                      f"mean_act={r['mean_act']:.2f}")
+                stage_results.append({"n_sheep": n, **r})
+        else:
+            for n in range(args.start_sheep, args.max_sheep + 1):
+                if n > args.start_sheep:
+                    vn.env_method("set_n_sheep", n)
+                print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps")
+                model.learn(
+                    total_timesteps=args.steps_per_stage,
+                    reset_num_timesteps=(n == args.start_sheep),
+                    callback=ProgressCallback(0, f"{n} sheep", freq=100_000),
+                )
+                print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
+                r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
+                print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% "
+                      f"mean_len={r['mean_len']:.0f} "
+                      f"mean_min_pen={r['mean_min_pen']:.1f}m "
+                      f"mean_act={r['mean_act']:.2f}")
+                stage_results.append({"n_sheep": n, **r})
 
         model.save(os.path.join(run_dir, "final_model"))
         vn.save(os.path.join(run_dir, "vecnorm.pkl"))
diff --git a/training/sweep_reward.py b/training/sweep_reward.py
index 02380fe..db84412 100644
--- a/training/sweep_reward.py
+++ b/training/sweep_reward.py
@@ -114,9 +114,10 @@ def reward_cfg(cfg: dict) -> dict:
     return {k: v for k, v in cfg.items() if k != "ent_coef"}
 
 
-def make_env(n_sheep, seed, max_steps, rcfg):
+def make_env(n_sheep, seed, max_steps, rcfg, random_n_sheep=False):
     def _init():
-        env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, reward_cfg=rcfg)
+        env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
+                         reward_cfg=rcfg, random_n_sheep=random_n_sheep)
         env.reset(seed=seed)
         return env
     return _init