Sheep training flock _ improver
This commit is contained in:
@@ -0,0 +1,118 @@
|
|||||||
|
"""
|
||||||
|
Replay a reward config from the sweep with a longer training budget.
|
||||||
|
|
||||||
|
Tells you whether a promising sweep config was bottlenecked by training time
|
||||||
|
vs. structurally limited. If sr2/sr3 climb past their sweep numbers given more
|
||||||
|
budget, the issue was budget; if they plateau, the policy/obs needs work.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
python replay_config.py --config runs/sweep_<ts>/best.json
|
||||||
|
python replay_config.py --config runs/sweep_<ts>/trial_007/config.json \
|
||||||
|
--max-sheep 4 --steps-per-stage 1500000
|
||||||
|
|
||||||
|
Argument summary:
|
||||||
|
--config JSON file with the reward config (sweep best.json works)
|
||||||
|
--max-sheep Final curriculum stage (default 3)
|
||||||
|
--steps-per-stage Env steps per curriculum stage (default 1.5M)
|
||||||
|
--n-envs Parallel envs (default 8)
|
||||||
|
--max-steps max_steps value passed to each env and to evaluation (default 1500)
--eval-episodes Per-stage eval episodes (default 30)
|
||||||
|
--run-dir Output directory (default runs/replay_<ts>/)
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from stable_baselines3 import PPO
|
||||||
|
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize
|
||||||
|
|
||||||
|
from herding_env import HerdingEnv
|
||||||
|
from sweep_reward import ProgressCallback, reward_cfg, evaluate, make_env
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Replay one reward config through the sheep-count curriculum.

    Parses the command line, loads the reward config JSON (either a bare
    config dict or a wrapper that holds it under a ``"config"`` key), then
    trains a single PPO model through stages ``n_sheep = 1 .. --max-sheep``
    and evaluates it after every stage.

    Artefacts written to the run directory: ``config.json``,
    ``stage_results.json`` (snapshotted after every stage so an interrupted
    run keeps the stages that already finished, then written once more at the
    end), plus the ``final_model`` and ``vecnorm.pkl`` saves.

    Raises:
        SystemExit: on invalid command-line values (raised by argparse).
        KeyError: if the loaded config has no ``ent_coef`` entry.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--config", type=str, required=True,
                   help="Reward config JSON (sweep best.json or trial config.json)")
    p.add_argument("--max-sheep", type=int, default=3)
    p.add_argument("--steps-per-stage", type=int, default=1_500_000)
    p.add_argument("--n-envs", type=int, default=8)
    p.add_argument("--max-steps", type=int, default=1500)
    p.add_argument("--eval-episodes", type=int, default=30)
    p.add_argument("--run-dir", type=str, default=None)
    args = p.parse_args()

    # With --max-sheep < 1 the stage loop below is empty, so the script would
    # save an untrained model and report success. Reject it up front.
    if args.max_sheep < 1:
        p.error("--max-sheep must be >= 1")
    # An empty env list gives the vectorised env nothing to run.
    if args.n_envs < 1:
        p.error("--n-envs must be >= 1")

    with open(args.config, encoding="utf-8") as f:
        raw = json.load(f)
    # Accept both a bare config and a wrapper that nests it under "config".
    cfg = raw["config"] if "config" in raw and isinstance(raw["config"], dict) else raw
    rcfg = reward_cfg(cfg)
    print(f"Config: {cfg}")

    # Look ent_coef up before any worker process is spawned, so a config
    # without it fails immediately and with a message naming the file.
    try:
        ent_coef = cfg["ent_coef"]
    except KeyError:
        raise KeyError(
            f"reward config {args.config!r} has no 'ent_coef' entry"
        ) from None

    run_dir = args.run_dir or os.path.join(
        "runs", "replay_" + time.strftime("%Y%m%d_%H%M%S")
    )
    os.makedirs(run_dir, exist_ok=True)
    with open(os.path.join(run_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2)
    print(f"Run dir: {run_dir}")
    print(f"Curriculum: 1 → {args.max_sheep} sheep, "
          f"{args.steps_per_stage:,} steps/stage")

    results_path = os.path.join(run_dir, "stage_results.json")

    train_env = SubprocVecEnv([
        make_env(1, seed=i, max_steps=args.max_steps, rcfg=rcfg)
        for i in range(args.n_envs)
    ])
    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    stage_results = []
    t0 = time.time()
    try:
        # Built inside the try so the env workers are closed even if model
        # construction fails.
        model = PPO(
            "MlpPolicy", vn,
            learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
            gamma=0.995, gae_lambda=0.95, clip_range=0.2,
            ent_coef=ent_coef, vf_coef=0.5, max_grad_norm=0.5,
            policy_kwargs=dict(net_arch=[256, 256]),
            verbose=0,
        )

        for n in range(1, args.max_sheep + 1):
            # The envs are created with 1 sheep; later stages reconfigure them.
            if n > 1:
                vn.env_method("set_n_sheep", n)
            print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps")
            model.learn(
                total_timesteps=args.steps_per_stage,
                # Only the first stage resets the timestep counter.
                reset_num_timesteps=(n == 1),
                callback=ProgressCallback(0, f"{n} sheep", freq=100_000),
            )
            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
            r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% "
                  f"mean_len={r['mean_len']:.0f} mean_min_pen={r['mean_min_pen']:.1f}m "
                  f"mean_act={r['mean_act']:.2f}")
            stage_results.append({"n_sheep": n, **r})

            # Best-effort snapshot so an interrupted run keeps finished
            # stages. The authoritative write still happens after the loop.
            try:
                with open(results_path, "w", encoding="utf-8") as f:
                    json.dump(stage_results, f, indent=2)
            except (OSError, TypeError, ValueError) as exc:
                print(f"Warning: could not snapshot stage results: {exc!r}")

        model.save(os.path.join(run_dir, "final_model"))
        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(stage_results, f, indent=2)
    finally:
        # Closing stays best-effort, but a failure is reported, not hidden.
        try:
            vn.close()
        except Exception as exc:
            print(f"Warning: closing training envs failed: {exc!r}")

    print("\n" + "=" * 60)
    print(" REPLAY SUMMARY")
    print("=" * 60)
    for r in stage_results:
        print(f" n_sheep={r['n_sheep']} sr={r['sr']*100:>3.0f}% "
              f"len={r['mean_len']:>5.0f} min_pen={r['mean_min_pen']:>5.1f}m "
              f"act={r['mean_act']:.2f}")
    print(f"\n Total time: {(time.time()-t0)/60:.1f} min")
    print(f" Artefacts: {run_dir}/")


if __name__ == "__main__":
    main()
|
||||||
@@ -155,8 +155,12 @@ def evaluate(model, vn_template, n_sheep, n_episodes, max_steps, rcfg):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def run_trial(trial_id: int, cfg: dict, log_path: str) -> dict:
|
def run_trial(trial_id: int, cfg: dict, log_path: str, run_dir: str) -> dict:
|
||||||
rcfg = reward_cfg(cfg)
|
rcfg = reward_cfg(cfg)
|
||||||
|
trial_dir = os.path.join(run_dir, f"trial_{trial_id:03d}")
|
||||||
|
os.makedirs(trial_dir, exist_ok=True)
|
||||||
|
with open(os.path.join(trial_dir, "config.json"), "w") as f:
|
||||||
|
json.dump(cfg, f, indent=2)
|
||||||
|
|
||||||
train_env = SubprocVecEnv([
|
train_env = SubprocVecEnv([
|
||||||
make_env(1, seed=trial_id * 100 + i, max_steps=MAX_STEPS, rcfg=rcfg)
|
make_env(1, seed=trial_id * 100 + i, max_steps=MAX_STEPS, rcfg=rcfg)
|
||||||
@@ -186,6 +190,9 @@ def run_trial(trial_id: int, cfg: dict, log_path: str) -> dict:
|
|||||||
for n in EVAL_NSHEEP:
|
for n in EVAL_NSHEEP:
|
||||||
print(f" ... [trial {trial_id+1} | eval n={n}]", flush=True)
|
print(f" ... [trial {trial_id+1} | eval n={n}]", flush=True)
|
||||||
per_sheep[n] = evaluate(model, vn, n, EVAL_EPISODES, MAX_STEPS, rcfg)
|
per_sheep[n] = evaluate(model, vn, n, EVAL_EPISODES, MAX_STEPS, rcfg)
|
||||||
|
|
||||||
|
model.save(os.path.join(trial_dir, "model"))
|
||||||
|
vn.save(os.path.join(trial_dir, "vecnorm.pkl"))
|
||||||
finally:
|
finally:
|
||||||
try: vn.close()
|
try: vn.close()
|
||||||
except Exception: pass
|
except Exception: pass
|
||||||
@@ -250,7 +257,7 @@ def main():
|
|||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
print(f"[Trial {trial_id+1:>3}] {cfg}")
|
print(f"[Trial {trial_id+1:>3}] {cfg}")
|
||||||
try:
|
try:
|
||||||
result = run_trial(trial_id, cfg, log_path)
|
result = run_trial(trial_id, cfg, log_path, run_dir)
|
||||||
result["elapsed_s"] = time.time() - t0
|
result["elapsed_s"] = time.time() - t0
|
||||||
sr = result["sr"]
|
sr = result["sr"]
|
||||||
print(f" → score={result['score']:.3f} "
|
print(f" → score={result['score']:.3f} "
|
||||||
|
|||||||
Reference in New Issue
Block a user