Sheep training flock _ improver
This commit is contained in:
+44
-8
@@ -59,9 +59,13 @@ class HerdingEnv(gym.Env):
|
|||||||
W_PEN_BONUS = 10.0 # per sheep penned
|
W_PEN_BONUS = 10.0 # per sheep penned
|
||||||
W_COMPLETE = 100.0 # all sheep penned
|
W_COMPLETE = 100.0 # all sheep penned
|
||||||
W_STEP_COST = 0.02 # time penalty — strong enough to punish doing nothing
|
W_STEP_COST = 0.02 # time penalty — strong enough to punish doing nothing
|
||||||
|
W_COMPACT = 0.0 # reward for flock-radius reduction (off by default)
|
||||||
|
ALIGN_SHAPE = "standoff" # "standoff" (peaks at IDEAL) | "near" (peaks at 0)
|
||||||
|
ALIGN_GATED = True # gate alignment on action magnitude
|
||||||
|
|
||||||
def __init__(self, n_sheep: int = 1, max_steps: int = 2000,
|
def __init__(self, n_sheep: int = 1, max_steps: int = 2000,
|
||||||
render_mode: str = None, random_n_sheep: bool = False):
|
render_mode: str = None, random_n_sheep: bool = False,
|
||||||
|
reward_cfg: dict = None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
assert 1 <= n_sheep <= self.MAX_SHEEP
|
assert 1 <= n_sheep <= self.MAX_SHEEP
|
||||||
self.n_sheep = n_sheep
|
self.n_sheep = n_sheep
|
||||||
@@ -69,6 +73,14 @@ class HerdingEnv(gym.Env):
|
|||||||
self.render_mode = render_mode
|
self.render_mode = render_mode
|
||||||
self.random_n_sheep = random_n_sheep # if True, randomise n_sheep each reset
|
self.random_n_sheep = random_n_sheep # if True, randomise n_sheep each reset
|
||||||
|
|
||||||
|
# Override class-default reward weights / shape with per-instance config
|
||||||
|
# so sweeps can ship configs into subprocess envs via pickled make_env.
|
||||||
|
if reward_cfg:
|
||||||
|
for k, v in reward_cfg.items():
|
||||||
|
if not hasattr(self.__class__, k):
|
||||||
|
raise ValueError(f"unknown reward_cfg key: {k}")
|
||||||
|
setattr(self, k, v)
|
||||||
|
|
||||||
# Fixed 16-dim observation regardless of n_sheep:
|
# Fixed 16-dim observation regardless of n_sheep:
|
||||||
# dog_pos(2) + rel_com(2) + rel_far1(2) + rel_far2(2) + rel_far3(2)
|
# dog_pos(2) + rel_com(2) + rel_far1(2) + rel_far2(2) + rel_far3(2)
|
||||||
# + com_to_pen(2) + far1_to_pen(2) + radius(1) + frac_penned(1)
|
# + com_to_pen(2) + far1_to_pen(2) + radius(1) + frac_penned(1)
|
||||||
@@ -127,8 +139,12 @@ class HerdingEnv(gym.Env):
|
|||||||
|
|
||||||
# Dog: 50% of resets start already behind the flock (anti-pen side,
|
# Dog: 50% of resets start already behind the flock (anti-pen side,
|
||||||
# within flee range) to give early training aligned experiences.
|
# within flee range) to give early training aligned experiences.
|
||||||
|
# Use the flock COM as the reference (not sheep[0]) so the bias
|
||||||
|
# generalizes from 1-sheep to multi-sheep without putting the dog
|
||||||
|
# in front of or inside the flock.
|
||||||
if self.np_random.random() < 0.5:
|
if self.np_random.random() < 0.5:
|
||||||
ref = self.sheep_pos[0]
|
active_pts = self.sheep_pos[:self.n_sheep][~self.penned[:self.n_sheep]]
|
||||||
|
ref = active_pts.mean(axis=0) if len(active_pts) else self.sheep_pos[0]
|
||||||
away = ref - self.PEN_CENTER
|
away = ref - self.PEN_CENTER
|
||||||
d = float(np.linalg.norm(away))
|
d = float(np.linalg.norm(away))
|
||||||
if d > 0.1:
|
if d > 0.1:
|
||||||
@@ -154,8 +170,13 @@ class HerdingEnv(gym.Env):
|
|||||||
self.sheep_pos[:self.n_sheep][active] - self.PEN_CENTER, axis=1
|
self.sheep_pos[:self.n_sheep][active] - self.PEN_CENTER, axis=1
|
||||||
).sum()
|
).sum()
|
||||||
)
|
)
|
||||||
|
com0 = self.sheep_pos[:self.n_sheep][active].mean(axis=0)
|
||||||
|
self._prev_radius = float(
|
||||||
|
np.linalg.norm(self.sheep_pos[:self.n_sheep][active] - com0, axis=1).max()
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
self._prev_pen_dist_sum = 0.0
|
self._prev_pen_dist_sum = 0.0
|
||||||
|
self._prev_radius = 0.0
|
||||||
|
|
||||||
return self._obs(), {}
|
return self._obs(), {}
|
||||||
|
|
||||||
@@ -322,22 +343,37 @@ class HerdingEnv(gym.Env):
|
|||||||
pen_dir = (self.PEN_CENTER - com) / com_dist
|
pen_dir = (self.PEN_CENTER - com) / com_dist
|
||||||
dog_dir = (self.dog_pos - com) / d_dog_com
|
dog_dir = (self.dog_pos - com) / d_dog_com
|
||||||
cosine = -float(np.dot(pen_dir, dog_dir))
|
cosine = -float(np.dot(pen_dir, dog_dir))
|
||||||
proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST)
|
if self.ALIGN_SHAPE == "standoff":
|
||||||
# Gate on action magnitude: only paid when the dog is actually moving.
|
IDEAL = 0.5 * (self.SEPARATION_DIST + self.FLEE_DIST)
|
||||||
# Without this, parking on the anti-pen side farms +0.03/step against
|
HALF = self.FLEE_DIST - IDEAL
|
||||||
# the -0.02 step_cost and the policy collapses to sit-still.
|
proximity = max(0.0, 1.0 - abs(d_dog_com - IDEAL) / HALF)
|
||||||
move_gate = min(1.0, float(np.linalg.norm(action)))
|
else: # "near"
|
||||||
|
proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST)
|
||||||
|
move_gate = (min(1.0, float(np.linalg.norm(action)))
|
||||||
|
if self.ALIGN_GATED else 1.0)
|
||||||
alignment = cosine * proximity * move_gate * self.W_ALIGN
|
alignment = cosine * proximity * move_gate * self.W_ALIGN
|
||||||
else:
|
else:
|
||||||
alignment = 0.0
|
alignment = 0.0
|
||||||
|
|
||||||
|
# Compactness shaping: reward decreases in flock radius (active sheep only)
|
||||||
|
if self.W_COMPACT and active.any():
|
||||||
|
cur_radius = float(np.linalg.norm(
|
||||||
|
self.sheep_pos[:self.n_sheep][active] - com, axis=1
|
||||||
|
).max())
|
||||||
|
r_compact = (self._prev_radius - cur_radius) * self.W_COMPACT
|
||||||
|
self._prev_radius = cur_radius
|
||||||
|
else:
|
||||||
|
r_compact = 0.0
|
||||||
|
|
||||||
r_pen_bonus = newly_penned * self.W_PEN_BONUS
|
r_pen_bonus = newly_penned * self.W_PEN_BONUS
|
||||||
r_step_cost = -self.W_STEP_COST
|
r_step_cost = -self.W_STEP_COST
|
||||||
r_complete = self.W_COMPLETE if n_penned == self.n_sheep else 0.0
|
r_complete = self.W_COMPLETE if n_penned == self.n_sheep else 0.0
|
||||||
reward = r_progress + alignment + r_pen_bonus + r_step_cost + r_complete
|
reward = (r_progress + alignment + r_compact + r_pen_bonus
|
||||||
|
+ r_step_cost + r_complete)
|
||||||
rcomps = {
|
rcomps = {
|
||||||
"progress": float(r_progress),
|
"progress": float(r_progress),
|
||||||
"alignment": float(alignment),
|
"alignment": float(alignment),
|
||||||
|
"compact": float(r_compact),
|
||||||
"pen_bonus": float(r_pen_bonus),
|
"pen_bonus": float(r_pen_bonus),
|
||||||
"step_cost": float(r_step_cost),
|
"step_cost": float(r_step_cost),
|
||||||
"complete": float(r_complete),
|
"complete": float(r_complete),
|
||||||
|
|||||||
@@ -0,0 +1,245 @@
|
|||||||
|
"""
|
||||||
|
Random-search sweep over reward-function hyperparameters.
|
||||||
|
|
||||||
|
Each trial trains a fresh PPO policy through a 1→2-sheep curriculum on a tight
|
||||||
|
budget, then evaluates at n=1,2,3 sheep. A composite score is computed and
|
||||||
|
written to a JSONL log. After all trials, a leaderboard is printed and the
|
||||||
|
best config is saved.
|
||||||
|
|
||||||
|
Sized to fit in ~4 hours wall-clock with default settings on 8 envs.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
python sweep_reward.py # 25 trials, default budget
|
||||||
|
python sweep_reward.py --n-trials 15
|
||||||
|
python sweep_reward.py --time-budget 6 # stop adding trials past 6h
|
||||||
|
python sweep_reward.py --resume runs/sweep_<timestamp> # continue logging
|
||||||
|
|
||||||
|
Per-trial budget (see TRAIN_*_STEPS below): ~1.0M training steps + 10 eval
|
||||||
|
episodes × 3 sheep counts. On this env that runs in ~8–12 min per trial.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from stable_baselines3 import PPO
|
||||||
|
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize
|
||||||
|
|
||||||
|
from herding_env import HerdingEnv
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Search space — reward weights + a couple of hyperparams
# ---------------------------------------------------------------------------
# Every key except "ent_coef" is forwarded to HerdingEnv through reward_cfg;
# "ent_coef" is consumed by the PPO constructor. Each trial picks one value
# per key, uniformly at random.
SEARCH_SPACE = {
    # Reward weights
    "W_PER_SHEEP": [1.0, 2.0, 4.0, 6.0],
    "W_ALIGN": [0.0, 0.025, 0.05, 0.1],
    "W_PEN_BONUS": [5.0, 10.0, 20.0],
    "W_STEP_COST": [0.005, 0.02, 0.05],
    "W_COMPLETE": [50.0, 100.0, 200.0],
    "W_COMPACT": [0.0, 0.5, 1.5, 3.0],
    # Shape / gating of the alignment term
    "ALIGN_SHAPE": ["standoff", "near"],
    "ALIGN_GATED": [True, False],
    # PPO entropy coefficient (not an env attribute)
    "ent_coef": [0.005, 0.01, 0.02, 0.05],
}
|
||||||
|
|
||||||
|
# Per-trial training budget — keep tight; total = sum + eval
TRAIN_STAGE1_STEPS = 400_000  # curriculum stage 1: 1 sheep
TRAIN_STAGE2_STEPS = 600_000  # curriculum stage 2: 2 sheep

# Evaluation: EVAL_EPISODES episodes for each flock size in EVAL_NSHEEP
EVAL_EPISODES = 10
EVAL_NSHEEP = (1, 2, 3)

# Episode step cap handed to every env, and number of parallel training envs
MAX_STEPS = 1500
N_ENVS = 8
|
||||||
|
|
||||||
|
|
||||||
|
def sample_config(rng: np.random.Generator) -> dict:
    """Draw one random configuration from SEARCH_SPACE.

    For every key, in SEARCH_SPACE's iteration order, one option is picked
    uniformly with a single ``rng.integers`` call. NumPy booleans are
    converted to plain ``bool``; every other value is returned as-is.
    """
    sampled = {}
    for name, options in SEARCH_SPACE.items():
        index = int(rng.integers(0, len(options)))
        picked = options[index]
        if isinstance(picked, np.bool_):
            picked = bool(picked)
        sampled[name] = picked
    return sampled
|
||||||
|
|
||||||
|
|
||||||
|
def reward_cfg(cfg: dict) -> dict:
    """Return a copy of *cfg* without the ``"ent_coef"`` entry.

    ``"ent_coef"`` is the only key dropped; all other keys are kept in
    their original order. The input dict is not modified.
    """
    env_only = dict(cfg)
    env_only.pop("ent_coef", None)
    return env_only
|
||||||
|
|
||||||
|
|
||||||
|
def make_env(n_sheep, seed, max_steps, rcfg):
    """Build a zero-argument factory for a seeded HerdingEnv.

    The returned callable constructs ``HerdingEnv`` with the given sheep
    count, step cap and reward config, resets it once with *seed*, and
    returns the env. Nothing is constructed until the factory is called.
    """
    env_kwargs = dict(n_sheep=n_sheep, max_steps=max_steps, reward_cfg=rcfg)

    def _init():
        herding_env = HerdingEnv(**env_kwargs)
        herding_env.reset(seed=seed)
        return herding_env

    return _init
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(model, vn_template, n_sheep, n_episodes, max_steps, rcfg):
    """Run deterministic evaluation episodes and summarise them.

    A single-env ``DummyVecEnv`` (seed 9999) is wrapped in a non-training
    ``VecNormalize`` that receives deep copies of the running statistics of
    *vn_template*, so observations are normalised as during training.

    Args:
        model: policy exposing ``predict(obs, deterministic=True)``.
        vn_template: ``VecNormalize`` whose ``obs_rms`` / ``ret_rms`` are copied.
        n_sheep: flock size to evaluate on.
        n_episodes: number of episodes to run (must be >= 1).
        max_steps: per-episode step cap passed to the env.
        rcfg: reward config dict passed to the env.

    Returns:
        dict with ``sr`` (fraction of episodes whose final info reports
        ``n_penned == n_sheep``), ``mean_len`` (mean episode length in steps),
        ``mean_min_pen`` (mean over episodes of the smallest flock-COM to
        pen-centre distance seen) and ``mean_act`` (mean action norm over all
        steps).

    Raises:
        ValueError: if *n_episodes* is less than 1.
    """
    if n_episodes < 1:
        # Previously surfaced as a bare ZeroDivisionError after a wasted run.
        raise ValueError(f"n_episodes must be >= 1, got {n_episodes}")

    def _pen_dist(env):
        # Distance from the flock centre of mass to the pen centre.
        com, _, _ = env._flock_stats()
        return float(np.linalg.norm(com - env.PEN_CENTER))

    raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, rcfg)])
    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
    try:
        vn.obs_rms = deepcopy(vn_template.obs_rms)
        vn.ret_rms = deepcopy(vn_template.ret_rms)
        successes = 0
        ep_lens, min_pen_list, action_mags = [], [], []
        for _ in range(n_episodes):
            obs = vn.reset()
            inner = vn.envs[0]
            done = False
            # Seed the minimum from the start state so it is always finite.
            steps, min_pen, mags = 0, _pen_dist(inner), []
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, _, dones, infos = vn.step(action)
                done = bool(dones[0])
                if not done:
                    # DummyVecEnv resets the env automatically on the terminal
                    # step, so reading it then would measure the *next*
                    # episode's start state, not this episode.
                    min_pen = min(min_pen, _pen_dist(inner))
                mags.append(float(np.linalg.norm(action[0])))
                steps += 1
            # infos still holds the terminal step's info dict.
            successes += int(infos[0].get("n_penned") == n_sheep)
            ep_lens.append(steps)
            min_pen_list.append(min_pen)
            action_mags.extend(mags)
    finally:
        # Always release the env, even if predict/step raised.
        vn.close()
    return {
        "sr": successes / n_episodes,
        "mean_len": float(np.mean(ep_lens)),
        "mean_min_pen": float(np.mean(min_pen_list)),
        "mean_act": float(np.mean(action_mags)),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def run_trial(trial_id: int, cfg: dict, log_path: str) -> dict:
    """Train one PPO policy with *cfg* and evaluate it.

    Training runs on ``N_ENVS`` subprocess envs: ``TRAIN_STAGE1_STEPS`` steps
    with 1 sheep, then ``set_n_sheep(2)`` is called on every env and training
    continues for ``TRAIN_STAGE2_STEPS`` steps. The policy is then evaluated
    for each flock size in ``EVAL_NSHEEP``.

    Args:
        trial_id: trial index; also offsets the per-env seeds
            (``trial_id * 100 + i``).
        cfg: sampled configuration; ``"ent_coef"`` goes to PPO, the rest to
            the env as reward config.
        log_path: accepted for interface compatibility; not used here.

    Returns:
        dict with ``trial``, ``config``, ``score`` (0.2*sr1 + 0.5*sr2 +
        0.3*sr3), ``sr`` (success rate per flock size) and ``details`` (full
        evaluation dict per flock size).
    """
    rcfg = reward_cfg(cfg)

    train_env = SubprocVecEnv([
        make_env(1, seed=trial_id * 100 + i, max_steps=MAX_STEPS, rcfg=rcfg)
        for i in range(N_ENVS)
    ])
    # Whatever currently owns the worker processes; closed in ``finally``.
    closable = train_env
    try:
        # Wrapper and model are built inside the try so that a constructor
        # failure still shuts down the subprocess workers.
        vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
        closable = vn

        model = PPO(
            "MlpPolicy", vn,
            learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
            gamma=0.995, gae_lambda=0.95, clip_range=0.2,
            ent_coef=cfg["ent_coef"], vf_coef=0.5, max_grad_norm=0.5,
            policy_kwargs=dict(net_arch=[256, 256]),
            verbose=0,
        )

        model.learn(total_timesteps=TRAIN_STAGE1_STEPS, reset_num_timesteps=True)
        vn.env_method("set_n_sheep", 2)
        model.learn(total_timesteps=TRAIN_STAGE2_STEPS, reset_num_timesteps=False)

        per_sheep = {}
        for n in EVAL_NSHEEP:
            per_sheep[n] = evaluate(model, vn, n, EVAL_EPISODES, MAX_STEPS, rcfg)
    finally:
        # Best-effort shutdown: a failing close must not mask the trial's
        # result or the exception that is already propagating.
        try:
            closable.close()
        except Exception:
            pass

    sr = {n: per_sheep[n]["sr"] for n in EVAL_NSHEEP}
    score = 0.2 * sr[1] + 0.5 * sr[2] + 0.3 * sr[3]
    return {
        "trial": trial_id,
        "config": cfg,
        "score": score,
        "sr": sr,
        "details": per_sheep,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _restore_int_keys(mapping: dict) -> dict:
    """Return a copy of *mapping* with digit-only string keys turned into ints."""
    return {
        int(key) if isinstance(key, str) and key.isdigit() else key: value
        for key, value in mapping.items()
    }


def _load_prior_results(log_path: str) -> list:
    """Read a results JSONL log, skipping blank or malformed lines.

    JSON stores the integer sheep-count keys of ``sr`` and ``details`` as
    strings; they are converted back so loaded records look like fresh ones.
    """
    loaded = []
    with open(log_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # A truncated last line (e.g. killed mid-write) is expected.
                continue
            if not isinstance(record, dict):
                continue
            for field in ("sr", "details"):
                if isinstance(record.get(field), dict):
                    record[field] = _restore_int_keys(record[field])
            loaded.append(record)
    return loaded


def main():
    """Run the random-search sweep from the command line.

    Samples configs, runs one trial per config until ``--n-trials``
    successful trials exist or ``--time-budget`` hours have elapsed, appends
    every result (or failure) to ``results.jsonl`` in the run directory,
    prints a leaderboard of the top 15 and writes the best record to
    ``best.json``. With ``--resume`` the existing log is loaded first and
    counted towards the trial total.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--n-trials", type=int, default=25)
    p.add_argument("--time-budget", type=float, default=7.5,
                   help="Stop launching new trials past this many hours.")
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--run-dir", type=str, default=None,
                   help="If unset, creates runs/sweep_<timestamp>/")
    p.add_argument("--resume", type=str, default=None,
                   help="Continue logging into an existing sweep dir")
    args = p.parse_args()

    run_dir = args.resume or args.run_dir or os.path.join(
        "runs", "sweep_" + time.strftime("%Y%m%d_%H%M%S")
    )
    os.makedirs(run_dir, exist_ok=True)
    log_path = os.path.join(run_dir, "results.jsonl")

    rng = np.random.default_rng(args.seed)
    start = time.time()
    results = []

    # If resuming, replay the existing log into memory
    if args.resume and os.path.exists(log_path):
        results = _load_prior_results(log_path)
        # Every logged trial consumed one sample_config draw. Skip those draws
        # so a resumed sweep continues the sequence instead of re-running the
        # configs it already tried.
        for _ in results:
            sample_config(rng)
        print(f"Resumed sweep: {len(results)} prior trials loaded from {log_path}")

    print(f"Sweep dir: {run_dir}")
    print(f"Search space: {list(SEARCH_SPACE.keys())}")
    print(f"Per-trial: {TRAIN_STAGE1_STEPS+TRAIN_STAGE2_STEPS:,} steps train + "
          f"{EVAL_EPISODES * len(EVAL_NSHEEP)} eval eps")
    print(f"Time budget: {args.time_budget}h\n")

    # Only successful trials count towards --n-trials; failures are logged
    # but replaced by further attempts.
    n_done = sum(1 for r in results if "error" not in r)
    trial_id = len(results)
    while n_done < args.n_trials:
        elapsed_h = (time.time() - start) / 3600
        if elapsed_h >= args.time_budget:
            print(f"\n[Sweep] time budget reached ({elapsed_h:.2f}h) — stopping.")
            break

        cfg = sample_config(rng)
        t0 = time.time()
        print(f"[Trial {trial_id+1:>3}] {cfg}")
        try:
            result = run_trial(trial_id, cfg, log_path)
            result["elapsed_s"] = time.time() - t0
            sr = result["sr"]
            print(f" → score={result['score']:.3f} "
                  f"sr1={sr[1]:.2f} sr2={sr[2]:.2f} sr3={sr[3]:.2f} "
                  f"[{result['elapsed_s']:.0f}s]")
            results.append(result)
            n_done += 1
        except Exception as e:
            # Top-level boundary: one failed trial must not abort the sweep.
            traceback.print_exc()
            err = {"trial": trial_id, "config": cfg,
                   "error": f"{type(e).__name__}: {e}",
                   "elapsed_s": time.time() - t0}
            results.append(err)
            print(f" ! FAILED: {err['error']}")
        # Append immediately so a crash loses at most the running trial.
        with open(log_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(results[-1]) + "\n")
        trial_id += 1

    # Leaderboard
    succ = [r for r in results if "error" not in r]
    succ.sort(key=lambda r: -r["score"])
    print("\n" + "=" * 92)
    print(" LEADERBOARD")
    print("=" * 92)
    hdr = f" {'rank':>4} {'score':>6} {'sr1':>5} {'sr2':>5} {'sr3':>5} config"
    print(hdr)
    print(" " + "-" * 88)
    for i, r in enumerate(succ[:15], 1):
        sr = r["sr"]
        cfg_short = " ".join(f"{k}={v}" for k, v in r["config"].items())
        print(f" {i:>4d} {r['score']:>6.3f} {sr[1]:>5.2f} {sr[2]:>5.2f} {sr[3]:>5.2f} {cfg_short}")

    if succ:
        best = succ[0]
        with open(os.path.join(run_dir, "best.json"), "w", encoding="utf-8") as f:
            json.dump(best, f, indent=2)
        print(f"\n Best config saved to {run_dir}/best.json")
    print(f" Total trials: {len(results)} ({len(succ)} successful, "
          f"{len(results)-len(succ)} failed)")
    print(f" Total time: {(time.time()-start)/3600:.2f}h\n")


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user