Sheep training flock _ improver
This commit is contained in:
@@ -37,8 +37,12 @@ def main():
|
|||||||
p = argparse.ArgumentParser()
|
p = argparse.ArgumentParser()
|
||||||
p.add_argument("--config", type=str, required=True,
|
p.add_argument("--config", type=str, required=True,
|
||||||
help="Reward config JSON (sweep best.json or trial config.json)")
|
help="Reward config JSON (sweep best.json or trial config.json)")
|
||||||
|
p.add_argument("--start-sheep", type=int, default=1)
|
||||||
p.add_argument("--max-sheep", type=int, default=3)
|
p.add_argument("--max-sheep", type=int, default=3)
|
||||||
p.add_argument("--steps-per-stage", type=int, default=1_500_000)
|
p.add_argument("--steps-per-stage", type=int, default=1_500_000)
|
||||||
|
p.add_argument("--mixed", action="store_true",
|
||||||
|
help="Train with n_sheep randomized per episode (no curriculum). "
|
||||||
|
"Total train steps = steps-per-stage * max_sheep.")
|
||||||
p.add_argument("--n-envs", type=int, default=8)
|
p.add_argument("--n-envs", type=int, default=8)
|
||||||
p.add_argument("--max-steps", type=int, default=1500)
|
p.add_argument("--max-steps", type=int, default=1500)
|
||||||
p.add_argument("--eval-episodes", type=int, default=30)
|
p.add_argument("--eval-episodes", type=int, default=30)
|
||||||
@@ -58,11 +62,17 @@ def main():
|
|||||||
with open(os.path.join(run_dir, "config.json"), "w") as f:
|
with open(os.path.join(run_dir, "config.json"), "w") as f:
|
||||||
json.dump(cfg, f, indent=2)
|
json.dump(cfg, f, indent=2)
|
||||||
print(f"Run dir: {run_dir}")
|
print(f"Run dir: {run_dir}")
|
||||||
print(f"Curriculum: 1 → {args.max_sheep} sheep, "
|
if args.mixed:
|
||||||
|
print(f"MIXED training: random n_sheep ∈ [1, {args.max_sheep}], "
|
||||||
|
f"{args.steps_per_stage * args.max_sheep:,} total steps")
|
||||||
|
else:
|
||||||
|
print(f"Curriculum: {args.start_sheep} → {args.max_sheep} sheep, "
|
||||||
f"{args.steps_per_stage:,} steps/stage")
|
f"{args.steps_per_stage:,} steps/stage")
|
||||||
|
|
||||||
train_env = SubprocVecEnv([
|
train_env = SubprocVecEnv([
|
||||||
make_env(1, seed=i, max_steps=args.max_steps, rcfg=rcfg)
|
make_env(args.max_sheep if args.mixed else args.start_sheep,
|
||||||
|
seed=i, max_steps=args.max_steps, rcfg=rcfg,
|
||||||
|
random_n_sheep=args.mixed)
|
||||||
for i in range(args.n_envs)
|
for i in range(args.n_envs)
|
||||||
])
|
])
|
||||||
vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
|
vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
|
||||||
@@ -79,19 +89,37 @@ def main():
|
|||||||
stage_results = []
|
stage_results = []
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
try:
|
try:
|
||||||
|
if args.mixed:
|
||||||
|
total = args.steps_per_stage * args.max_sheep
|
||||||
|
print(f"\n[Mixed] training {total:,} steps")
|
||||||
|
model.learn(
|
||||||
|
total_timesteps=total,
|
||||||
|
reset_num_timesteps=True,
|
||||||
|
callback=ProgressCallback(0, "mixed", freq=100_000),
|
||||||
|
)
|
||||||
for n in range(1, args.max_sheep + 1):
|
for n in range(1, args.max_sheep + 1):
|
||||||
if n > 1:
|
print(f"[Mixed] evaluating n={n}, {args.eval_episodes} eps")
|
||||||
|
r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
|
||||||
|
print(f"[Mixed] n_sheep={n} sr={r['sr']*100:.0f}% "
|
||||||
|
f"mean_len={r['mean_len']:.0f} "
|
||||||
|
f"mean_min_pen={r['mean_min_pen']:.1f}m "
|
||||||
|
f"mean_act={r['mean_act']:.2f}")
|
||||||
|
stage_results.append({"n_sheep": n, **r})
|
||||||
|
else:
|
||||||
|
for n in range(args.start_sheep, args.max_sheep + 1):
|
||||||
|
if n > args.start_sheep:
|
||||||
vn.env_method("set_n_sheep", n)
|
vn.env_method("set_n_sheep", n)
|
||||||
print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps")
|
print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps")
|
||||||
model.learn(
|
model.learn(
|
||||||
total_timesteps=args.steps_per_stage,
|
total_timesteps=args.steps_per_stage,
|
||||||
reset_num_timesteps=(n == 1),
|
reset_num_timesteps=(n == args.start_sheep),
|
||||||
callback=ProgressCallback(0, f"{n} sheep", freq=100_000),
|
callback=ProgressCallback(0, f"{n} sheep", freq=100_000),
|
||||||
)
|
)
|
||||||
print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
|
print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
|
||||||
r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
|
r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
|
||||||
print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% "
|
print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% "
|
||||||
f"mean_len={r['mean_len']:.0f} mean_min_pen={r['mean_min_pen']:.1f}m "
|
f"mean_len={r['mean_len']:.0f} "
|
||||||
|
f"mean_min_pen={r['mean_min_pen']:.1f}m "
|
||||||
f"mean_act={r['mean_act']:.2f}")
|
f"mean_act={r['mean_act']:.2f}")
|
||||||
stage_results.append({"n_sheep": n, **r})
|
stage_results.append({"n_sheep": n, **r})
|
||||||
|
|
||||||
|
|||||||
@@ -114,9 +114,10 @@ def reward_cfg(cfg: dict) -> dict:
|
|||||||
return {k: v for k, v in cfg.items() if k != "ent_coef"}
|
return {k: v for k, v in cfg.items() if k != "ent_coef"}
|
||||||
|
|
||||||
|
|
||||||
def make_env(n_sheep, seed, max_steps, rcfg):
|
def make_env(n_sheep, seed, max_steps, rcfg, random_n_sheep=False):
|
||||||
def _init():
|
def _init():
|
||||||
env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, reward_cfg=rcfg)
|
env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
|
||||||
|
reward_cfg=rcfg, random_n_sheep=random_n_sheep)
|
||||||
env.reset(seed=seed)
|
env.reset(seed=seed)
|
||||||
return env
|
return env
|
||||||
return _init
|
return _init
|
||||||
|
|||||||
Reference in New Issue
Block a user