# TIR_PROJ/training/replay_config.py

"""
Replay a reward config from the sweep with a longer training budget.

Tells you whether a promising sweep config was bottlenecked by training time
vs. structurally limited. If sr2/sr3 climb past their sweep numbers given more
budget, the issue was budget; if they plateau, the policy/obs needs work.

Usage
-----
    python replay_config.py --config runs/sweep_<ts>/best.json
    python replay_config.py --config runs/sweep_<ts>/trial_007/config.json \
        --max-sheep 4 --steps-per-stage 1500000

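Argument summary:
    --config             JSON file with the reward config (sweep best.json works)
    --start-sheep        First curriculum stage (default 1)
    --max-sheep          Final curriculum stage (default 3)
    --steps-per-stage    Env steps per curriculum stage (default 1.5M)
    --mixed              Randomize n_sheep per episode instead of a curriculum;
                         total train steps = steps-per-stage * max-sheep
    --final-mixed-steps  Extra mixed-n_sheep training steps after the curriculum
                         (default 0 = skip)
    --n-envs             Parallel envs (default 8)
    --max-steps          max_steps value passed to the envs and to evaluation
                         (default 2500)
    --eval-episodes      Per-stage eval episodes (default 30)
    --run-dir            Output directory (default runs/replay_<ts>/)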
"""
import argparse
import json
import os
import time
from copy import deepcopy

import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize

from herding_env import HerdingEnv
from sweep_reward import ProgressCallback, reward_cfg, evaluate, make_env


def _build_parser():
    """Create the command-line parser for the replay script."""
    p = argparse.ArgumentParser()
    p.add_argument("--config", type=str, required=True,
                   help="Reward config JSON (sweep best.json or trial config.json)")
    p.add_argument("--start-sheep", type=int, default=1)
    p.add_argument("--max-sheep", type=int, default=3)
    p.add_argument("--steps-per-stage", type=int, default=1_500_000)
    p.add_argument("--mixed", action="store_true",
                   help="Train with n_sheep randomized per episode (no curriculum). "
                        "Total train steps = steps-per-stage * max_sheep.")
    p.add_argument("--final-mixed-steps", type=int, default=0,
                   help="After the curriculum, train this many extra steps with "
                        "random_n_sheep ∈ [1, max_sheep] to consolidate the policy "
                        "across all flock sizes. Re-evaluates all n_sheep at the end.")
    p.add_argument("--n-envs", type=int, default=8)
    p.add_argument("--max-steps", type=int, default=2500)
    p.add_argument("--eval-episodes", type=int, default=30)
    p.add_argument("--run-dir", type=str, default=None)
    return p


def _validate_args(p, args):
    """Reject argument combinations that would train or evaluate nothing.

    Calls ``p.error`` (which exits with status 2) on the first violation.
    """
    if args.n_envs < 1:
        p.error("--n-envs must be >= 1")
    if args.mixed:
        # Mixed mode trains steps_per_stage * max_sheep steps and evaluates
        # n = 1..max_sheep; both are empty when max_sheep < 1.
        if args.max_sheep < 1:
            p.error("--max-sheep must be >= 1 when --mixed is set")
    elif args.start_sheep > args.max_sheep:
        # range(start_sheep, max_sheep + 1) would be empty, so no stage would
        # run and an untrained model would be saved.
        p.error(f"--start-sheep ({args.start_sheep}) must be <= "
                f"--max-sheep ({args.max_sheep})")


def _load_config(path):
    """Load the reward config dict from ``path``.

    Accepts either a bare config object or a wrapper object holding the config
    under a ``"config"`` key (used when that value is a dict).

    Raises:
        KeyError: if the config has no ``"ent_coef"`` entry, which is required
            to construct the PPO model.
    """
    with open(path, encoding="utf-8") as f:
        raw = json.load(f)
    cfg = raw["config"] if "config" in raw and isinstance(raw["config"], dict) else raw
    if "ent_coef" not in cfg:
        raise KeyError(f"config {path!r} is missing required key 'ent_coef'")
    return cfg


def _evaluate_and_report(prefix, model, vn, n, args, rcfg):
    """Evaluate the model at ``n`` sheep, print one metrics line, return the row.

    ``prefix`` is printed verbatim in front of the metrics. The returned dict is
    the evaluation result with an added ``"n_sheep"`` entry.
    """
    r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
    print(f"{prefix}sr={r['sr']*100:.0f}%  "
          f"mean_len={r['mean_len']:.0f}  "
          f"mean_min_pen={r['mean_min_pen']:.1f}m  "
          f"mean_act={r['mean_act']:.2f}")
    return {"n_sheep": n, **r}


def main(argv=None):
    """Replay a reward config with a longer training budget.

    Trains a PPO policy either stage by stage over increasing sheep counts
    (default) or with the sheep count randomized per episode (``--mixed``),
    evaluates it, and writes ``config.json``, ``final_model``, ``vecnorm.pkl``
    and ``stage_results.json`` into the run directory.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: on invalid command-line arguments.
        KeyError: if the config file lacks ``"ent_coef"``.
    """
    p = _build_parser()
    args = p.parse_args(argv)
    _validate_args(p, args)

    # Fail on a bad config before any worker process is spawned.
    cfg = _load_config(args.config)
    rcfg = reward_cfg(cfg)
    print(f"Config: {cfg}")

    run_dir = args.run_dir or os.path.join(
        "runs", "replay_" + time.strftime("%Y%m%d_%H%M%S")
    )
    os.makedirs(run_dir, exist_ok=True)
    with open(os.path.join(run_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2)
    print(f"Run dir: {run_dir}")
    if args.mixed:
        print(f"MIXED training: random n_sheep ∈ [1, {args.max_sheep}], "
              f"{args.steps_per_stage * args.max_sheep:,} total steps")
        if args.final_mixed_steps > 0:
            # The consolidation pass only runs after a curriculum.
            print("Note: --final-mixed-steps is ignored with --mixed")
    else:
        print(f"Curriculum: {args.start_sheep} → {args.max_sheep} sheep, "
              f"{args.steps_per_stage:,} steps/stage")

    train_env = SubprocVecEnv([
        make_env(args.max_sheep if args.mixed else args.start_sheep,
                 seed=i, max_steps=args.max_steps, rcfg=rcfg,
                 random_n_sheep=args.mixed)
        for i in range(args.n_envs)
    ])
    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    stage_results = []
    try:
        # Built inside the try so the envs are closed if construction fails.
        model = PPO(
            "MlpPolicy", vn,
            learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
            gamma=0.995, gae_lambda=0.95, clip_range=0.2,
            ent_coef=cfg["ent_coef"], vf_coef=0.5, max_grad_norm=0.5,
            policy_kwargs=dict(net_arch=[256, 256]),
            verbose=0,
        )
        t0 = time.time()

        if args.mixed:
            total = args.steps_per_stage * args.max_sheep
            print(f"\n[Mixed] training {total:,} steps")
            model.learn(
                total_timesteps=total,
                reset_num_timesteps=True,
                callback=ProgressCallback(0, "mixed", freq=100_000),
            )
            for n in range(1, args.max_sheep + 1):
                print(f"[Mixed] evaluating n={n}, {args.eval_episodes} eps")
                stage_results.append(_evaluate_and_report(
                    f"[Mixed] n_sheep={n}  ", model, vn, n, args, rcfg))
        else:
            for n in range(args.start_sheep, args.max_sheep + 1):
                # The envs were created with start_sheep; later stages bump it.
                if n > args.start_sheep:
                    vn.env_method("set_n_sheep", n)
                print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps")
                model.learn(
                    total_timesteps=args.steps_per_stage,
                    # Only the first stage resets the timestep counter.
                    reset_num_timesteps=(n == args.start_sheep),
                    callback=ProgressCallback(0, f"{n} sheep", freq=100_000),
                )
                print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
                stage_results.append(_evaluate_and_report(
                    f"[Stage n_sheep={n}] ", model, vn, n, args, rcfg))

        # Optional consolidation pass with mixed n_sheep — fixes specialization
        # imbalance from curriculum order (e.g. n=1 weakness after long n=10
        # training). Replaces stage_results with the post-consolidation eval.
        if args.final_mixed_steps > 0 and not args.mixed:
            print(f"\n[Consolidation] mixed n_sheep ∈ [1, {args.max_sheep}], "
                  f"{args.final_mixed_steps:,} steps")
            # Use the VecEnv attribute API instead of invoking __setattr__
            # through env_method.
            # NOTE(review): whether this reaches the underlying env through any
            # wrapper make_env adds depends on the installed stable-baselines3
            # version — confirm.
            vn.set_attr("random_n_sheep", True)
            model.learn(
                total_timesteps=args.final_mixed_steps,
                reset_num_timesteps=False,
                callback=ProgressCallback(0, "consolidate", freq=100_000),
            )
            print("[Consolidation] re-evaluating all sheep counts")
            stage_results = [
                _evaluate_and_report(
                    f"[Consolidation] n_sheep={n}  ", model, vn, n, args, rcfg)
                for n in range(1, args.max_sheep + 1)
            ]

        model.save(os.path.join(run_dir, "final_model"))
        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
        with open(os.path.join(run_dir, "stage_results.json"), "w",
                  encoding="utf-8") as f:
            json.dump(stage_results, f, indent=2)
    finally:
        # Best-effort cleanup: report a failed close, but never let it mask an
        # exception raised by training above.
        try:
            vn.close()
        except Exception as exc:
            print(f"Warning: failed to close training envs cleanly: {exc!r}")

    print("\n" + "=" * 60)
    print("  REPLAY SUMMARY")
    print("=" * 60)
    for r in stage_results:
        print(f"  n_sheep={r['n_sheep']}  sr={r['sr']*100:>3.0f}%  "
              f"len={r['mean_len']:>5.0f}  min_pen={r['mean_min_pen']:>5.1f}m  "
              f"act={r['mean_act']:.2f}")
    print(f"\n  Total time: {(time.time()-t0)/60:.1f} min")
    print(f"  Artefacts:  {run_dir}/")


# Run the replay only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()