diff --git a/.gitignore b/.gitignore index 996a8ec..035b6d1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Stuff #_example/ +.claude/ # Python __pycache__/ diff --git a/training/config.json b/training/config.json new file mode 100644 index 0000000..3c0774e --- /dev/null +++ b/training/config.json @@ -0,0 +1,14 @@ +{ + "W_PER_SHEEP": 2.0, + "W_ALIGN": 0.05, + "W_PEN_BONUS": 10.0, + "W_COMPLETE": 100.0, + "W_STEP_COST": 0.02, + "W_COMPACT": 0.0, + "W_WALL_TOUCH": 0.15, + "WALL_TOUCH_BUFFER": 0.8, + "ALIGN_SHAPE": "standoff", + "ALIGN_GATED": true, + "ENTRY_AWARE": false, + "ent_coef": 0.02 +} diff --git a/training/debug_plots/10sheep/ep001_SUCCESS.png b/training/debug_plots/10sheep/ep001_SUCCESS.png deleted file mode 100644 index 93beee7..0000000 Binary files a/training/debug_plots/10sheep/ep001_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep002_SUCCESS.png b/training/debug_plots/10sheep/ep002_SUCCESS.png deleted file mode 100644 index 4fe5075..0000000 Binary files a/training/debug_plots/10sheep/ep002_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep003_SUCCESS.png b/training/debug_plots/10sheep/ep003_SUCCESS.png deleted file mode 100644 index 54a413a..0000000 Binary files a/training/debug_plots/10sheep/ep003_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep004_SUCCESS.png b/training/debug_plots/10sheep/ep004_SUCCESS.png deleted file mode 100644 index 8ad143b..0000000 Binary files a/training/debug_plots/10sheep/ep004_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep005_SUCCESS.png b/training/debug_plots/10sheep/ep005_SUCCESS.png deleted file mode 100644 index 52e2394..0000000 Binary files a/training/debug_plots/10sheep/ep005_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep006_SUCCESS.png b/training/debug_plots/10sheep/ep006_SUCCESS.png deleted file mode 100644 index fef269e..0000000 Binary files a/training/debug_plots/10sheep/ep006_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep007_SUCCESS.png b/training/debug_plots/10sheep/ep007_SUCCESS.png deleted file mode 100644 index 1efed9d..0000000 Binary files a/training/debug_plots/10sheep/ep007_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep008_SUCCESS.png b/training/debug_plots/10sheep/ep008_SUCCESS.png deleted file mode 100644 index 94c048a..0000000 Binary files a/training/debug_plots/10sheep/ep008_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep009_SUCCESS.png b/training/debug_plots/10sheep/ep009_SUCCESS.png deleted file mode 100644 index 16888d1..0000000 Binary files a/training/debug_plots/10sheep/ep009_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep010_SUCCESS.png b/training/debug_plots/10sheep/ep010_SUCCESS.png deleted file mode 100644 index cfb2fdc..0000000 Binary files a/training/debug_plots/10sheep/ep010_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep011_SUCCESS.png b/training/debug_plots/10sheep/ep011_SUCCESS.png deleted file mode 100644 index 5aa1fe7..0000000 Binary files a/training/debug_plots/10sheep/ep011_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep012_SUCCESS.png b/training/debug_plots/10sheep/ep012_SUCCESS.png deleted file mode 100644 index 138eead..0000000 Binary files a/training/debug_plots/10sheep/ep012_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep013_SUCCESS.png b/training/debug_plots/10sheep/ep013_SUCCESS.png deleted file mode 100644 index b13ecf9..0000000 Binary files a/training/debug_plots/10sheep/ep013_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep014_SUCCESS.png b/training/debug_plots/10sheep/ep014_SUCCESS.png deleted file mode 100644 index e6ca7e6..0000000 Binary files a/training/debug_plots/10sheep/ep014_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep015_SUCCESS.png b/training/debug_plots/10sheep/ep015_SUCCESS.png deleted file mode 100644 index c0cf257..0000000 Binary files a/training/debug_plots/10sheep/ep015_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep016_SUCCESS.png b/training/debug_plots/10sheep/ep016_SUCCESS.png deleted file mode 100644 index e0e0e5e..0000000 Binary files a/training/debug_plots/10sheep/ep016_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep017_SUCCESS.png b/training/debug_plots/10sheep/ep017_SUCCESS.png deleted file mode 100644 index 390bcaa..0000000 Binary files a/training/debug_plots/10sheep/ep017_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep018_SUCCESS.png b/training/debug_plots/10sheep/ep018_SUCCESS.png deleted file mode 100644 index b9c0ccf..0000000 Binary files a/training/debug_plots/10sheep/ep018_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep019_SUCCESS.png b/training/debug_plots/10sheep/ep019_SUCCESS.png deleted file mode 100644 index dd1bbf4..0000000 Binary files a/training/debug_plots/10sheep/ep019_SUCCESS.png and /dev/null differ diff --git a/training/debug_plots/10sheep/ep020_SUCCESS.png b/training/debug_plots/10sheep/ep020_SUCCESS.png deleted file mode 100644 index d31d1ef..0000000 Binary files a/training/debug_plots/10sheep/ep020_SUCCESS.png and /dev/null differ diff --git a/training/diagnose.py b/training/diagnose.py deleted file mode 100644 index 59022a1..0000000 --- a/training/diagnose.py +++ /dev/null @@ -1,223 +0,0 @@ -""" -Episode-level diagnostics for the herding policy. - -Runs N episodes and for each one tracks: - - flock radius over time - - COM-to-pen distance over time - - dog position over time - - when (if ever) the flock first became compact - - failure mode classification - -Then produces: - 1. Console summary of failure modes - 2. Per-episode time-series plots (radius + com_dist) - 3. Optional rendered playback of the worst episodes - -Usage ------ - python diagnose.py --model runs/ppo_consolidation/final_model.zip \ - --vecnorm runs/ppo_consolidation/vecnorm.pkl \ - --n-sheep 5 --episodes 20 - - # Watch the policy live (first episode rendered): - python diagnose.py ... --render - - # Save plots to a directory instead of showing interactively: - python diagnose.py ... --plot-dir debug_plots/ -""" - -import argparse -import os -import numpy as np -import matplotlib -import matplotlib.pyplot as plt -import matplotlib.patches as mpatches - -from stable_baselines3 import PPO -from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize -from herding_env import HerdingEnv - - -# ── failure mode constants ──────────────────────────────────────────────────── - -COMPACT_RADIUS = 5.0 # must match DRIVE_GATE_RADIUS in herding_env.py - - -def classify_failure(ep_radius, ep_com_dist, n_penned, n_sheep, success): - if success: - return "SUCCESS" - if min(ep_radius) > COMPACT_RADIUS: - return "NEVER_COMPACT" # flock was always too scattered - first_compact = next(i for i, r in enumerate(ep_radius) if r <= COMPACT_RADIUS) - min_com_after = min(ep_com_dist[first_compact:]) - pen_close = 3.0 # COM within 3m of pen counts as "got close" - if min_com_after > pen_close: - return "COMPACT_CANT_DRIVE" # compacted but never drove to pen - if n_penned == 0: - return "DROVE_NO_SHEEP" # got near pen, nothing went in - return f"PARTIAL_{n_penned}of{n_sheep}" # some in, not all - - -# ── main ───────────────────────────────────────────────────────────────────── - -def parse_args(): - p = argparse.ArgumentParser() - p.add_argument("--model", required=True) - p.add_argument("--vecnorm", default=None) - p.add_argument("--n-sheep", type=int, default=5) - p.add_argument("--episodes", type=int, default=20) - p.add_argument("--max-steps", type=int, default=4000) - p.add_argument("--render", action="store_true", - help="Show matplotlib animation of the first episode") - p.add_argument("--plot-dir", default=None, - help="Save time-series plots here (one per episode)") - p.add_argument("--seed", type=int, default=0) - return p.parse_args() - - -def make_env(n_sheep, max_steps, render_mode=None): - def _init(): - return HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, - render_mode=render_mode) - return _init - - -def main(): - args = parse_args() - - if args.plot_dir: - os.makedirs(args.plot_dir, exist_ok=True) - matplotlib.use("Agg") - - render_mode = "human" if args.render else None - raw_env = DummyVecEnv([make_env(args.n_sheep, args.max_steps, render_mode)]) - - if args.vecnorm: - env = VecNormalize.load(args.vecnorm, raw_env) - env.training = False - env.norm_reward = False - else: - env = raw_env - - model = PPO.load(args.model, env=env) - - failure_counts = {} - all_ep_data = [] - - for ep in range(args.episodes): - obs = env.reset() - done = False - step = 0 - - ep_radius = [] - ep_com_dist = [] - ep_dog_x = [] - ep_dog_y = [] - ep_n_penned = [] - - while not done: - action, _ = model.predict(obs, deterministic=True) - obs, _, dones, infos = env.step(action) - done = dones[0] - step += 1 - - inner = env.envs[0] if hasattr(env, "envs") else env.venv.envs[0] - com, radius, _ = inner._flock_stats() - com_dist = float(np.linalg.norm(com - inner.PEN_CENTER)) - n_penned = int(inner.penned[:inner.n_sheep].sum()) - - ep_radius.append(radius) - ep_com_dist.append(com_dist) - ep_dog_x.append(float(inner.dog_pos[0])) - ep_dog_y.append(float(inner.dog_pos[1])) - ep_n_penned.append(n_penned) - - info = infos[0] - n_pen = info.get("n_penned", 0) - n_sheep = info.get("n_sheep", args.n_sheep) - success = n_pen == n_sheep - mode = classify_failure(ep_radius, ep_com_dist, n_pen, n_sheep, success) - - failure_counts[mode] = failure_counts.get(mode, 0) + 1 - - compact_step = next((i for i, r in enumerate(ep_radius) - if r <= COMPACT_RADIUS), None) - min_radius = min(ep_radius) - min_com_dist = min(ep_com_dist) - - print(f" ep {ep+1:>3} steps={step:>5} penned={n_pen}/{n_sheep}" - f" min_r={min_radius:.1f}m" - f" min_com={min_com_dist:.1f}m" - f" compact@step={compact_step if compact_step is not None else 'NEVER'}" - f" [{mode}]") - - all_ep_data.append(dict( - ep=ep, radius=ep_radius, com_dist=ep_com_dist, - dog_x=ep_dog_x, dog_y=ep_dog_y, n_penned=ep_n_penned, - steps=step, mode=mode, success=success, - )) - - # ── per-episode time-series plot ────────────────────────────────── - if args.plot_dir or (not args.render and ep < 5): - fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True) - t = np.arange(len(ep_radius)) - - axes[0].plot(t, ep_radius, color="steelblue", label="flock radius (m)") - axes[0].axhline(COMPACT_RADIUS, color="orange", linestyle="--", - label=f"compact threshold ({COMPACT_RADIUS}m)") - if compact_step is not None: - axes[0].axvline(compact_step, color="green", linestyle=":", - alpha=0.6, label=f"first compact (step {compact_step})") - axes[0].set_ylabel("radius (m)") - axes[0].legend(fontsize=8) - axes[0].set_title(f"ep {ep+1} | n_sheep={n_sheep} | {mode}") - - axes[1].plot(t, ep_com_dist, color="tomato", label="COM-to-pen dist (m)") - axes[1].set_ylabel("COM-to-pen (m)") - axes[1].set_xlabel("step") - axes[1].legend(fontsize=8) - - plt.tight_layout() - if args.plot_dir: - fig.savefig(os.path.join(args.plot_dir, f"ep{ep+1:03d}_{mode}.png"), - dpi=100) - plt.close(fig) - else: - plt.show(block=False) - plt.pause(0.5) - - env.close() - - # ── summary ────────────────────────────────────────────────────────────── - print("\n" + "=" * 55) - print(f" Model : {args.model}") - print(f" n_sheep : {args.n_sheep} episodes : {args.episodes}") - print("-" * 55) - total = sum(failure_counts.values()) - for mode, cnt in sorted(failure_counts.items(), key=lambda x: -x[1]): - bar = "█" * cnt - print(f" {mode:<26} {cnt:>3}/{total} {bar}") - print("-" * 55) - - never_compact = failure_counts.get("NEVER_COMPACT", 0) - cant_drive = failure_counts.get("COMPACT_CANT_DRIVE", 0) - partial = sum(v for k, v in failure_counts.items() if k.startswith("PARTIAL")) - successes = failure_counts.get("SUCCESS", 0) - - print(f"\n Diagnosis:") - if never_compact / total > 0.5: - print(" ► COLLECT problem: dog rarely compacts the flock.") - print(" → Phase-gate W_DRIVE, increase W_COLLECT, check alignment reward.") - if cant_drive / total > 0.3: - print(" ► DRIVE problem: flock compacts but doesn't reach pen.") - print(" → Check dog alignment, pen direction, W_DRIVE magnitude.") - if partial / total > 0.3: - print(" ► PARTIAL problem: some sheep penned, stragglers remain.") - print(" → Flock splits; need better straggler-chasing behavior.") - if successes / total > 0.5: - print(" ► Mostly working! Fine-tune for consistency.") - print("=" * 55) - - -if __name__ == "__main__": - main() diff --git a/training/eval_per_sheep.py b/training/eval_per_sheep.py deleted file mode 100644 index 90779a3..0000000 --- a/training/eval_per_sheep.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -Load a saved run and evaluate the policy at every n_sheep from 1..N. -Tells you exactly where the curriculum stopped working. - -Usage: - python eval_per_sheep.py --run-dir runs/ppo_v3 - python eval_per_sheep.py --run-dir runs/ppo_v3 --max-sheep 10 --episodes 20 - python eval_per_sheep.py --model runs/ppo_v3/final_model.zip \ - --vecnorm runs/ppo_v3/vecnorm.pkl -""" -import argparse -import os -from copy import deepcopy - -import numpy as np -from stable_baselines3 import PPO -from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize - -from herding_env import HerdingEnv -from train import _classify, COMPACT_RADIUS - - -def evaluate(model, vn_template, n_sheep, n_episodes, max_steps): - raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep, max_steps=max_steps)]) - vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False) - vn.obs_rms = deepcopy(vn_template.obs_rms) - vn.ret_rms = deepcopy(vn_template.ret_rms) - - failure = {} - successes = 0 - act_mags, min_radii, min_dog_com, min_pen = [], [], [], [] - - for _ in range(n_episodes): - obs = vn.reset() - done = False - ep_radius, ep_com_dist, ep_dog_com, ep_act = [], [], [], [] - while not done: - action, _ = model.predict(obs, deterministic=True) - obs, _, dones, infos = vn.step(action) - done = dones[0] - inner = vn.envs[0] - com, radius, _ = inner._flock_stats() - ep_radius.append(radius) - ep_com_dist.append(float(np.linalg.norm(com - inner.PEN_CENTER))) - ep_dog_com.append(float(np.linalg.norm(inner.dog_pos - com))) - ep_act.append(float(np.linalg.norm(action[0]))) - npen = infos[0].get("n_penned", 0) - success = npen == n_sheep - successes += int(success) - mode = _classify(ep_radius, ep_com_dist, npen, n_sheep, success) - failure[mode] = failure.get(mode, 0) + 1 - act_mags.extend(ep_act) - min_radii.append(min(ep_radius)) - min_dog_com.append(min(ep_dog_com)) - min_pen.append(min(ep_com_dist)) - vn.close() - - return { - "n_sheep": n_sheep, - "success_rate": successes / n_episodes, - "failure": failure, - "mean_action": float(np.mean(act_mags)), - "mean_min_radius": float(np.mean(min_radii)), - "mean_min_dog_com": float(np.mean(min_dog_com)), - "mean_min_pen": float(np.mean(min_pen)), - } - - -def main(): - p = argparse.ArgumentParser() - p.add_argument("--run-dir", type=str, default=None) - p.add_argument("--model", type=str, default=None) - p.add_argument("--vecnorm", type=str, default=None) - p.add_argument("--max-sheep", type=int, default=10) - p.add_argument("--episodes", type=int, default=10) - p.add_argument("--max-steps", type=int, default=2000) - args = p.parse_args() - - if args.run_dir: - model_path = os.path.join(args.run_dir, "final_model.zip") - if not os.path.exists(model_path): - model_path = os.path.join(args.run_dir, "best_model", "best_model.zip") - vn_path = os.path.join(args.run_dir, "vecnorm.pkl") - else: - model_path = args.model - vn_path = args.vecnorm - - print(f"Loading model: {model_path}") - print(f"Loading vecnorm: {vn_path}\n") - model = PPO.load(model_path, device="cpu") - raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=1, max_steps=args.max_steps)]) - vn_template = VecNormalize.load(vn_path, raw) - - print(f"{'n_sheep':>7} {'success':>8} {'act':>6} {'min_r':>7} " - f"{'dog→com':>8} {'com→pen':>8} failure breakdown") - print("-" * 90) - for n in range(1, args.max_sheep + 1): - r = evaluate(model, vn_template, n, args.episodes, args.max_steps) - fb = " ".join(f"{m}={c}" for m, c in - sorted(r["failure"].items(), key=lambda x: -x[1])) - print(f"{n:>7d} {r['success_rate']*100:>6.0f}% " - f"{r['mean_action']:>6.2f} " - f"{r['mean_min_radius']:>6.2f}m " - f"{r['mean_min_dog_com']:>7.2f}m " - f"{r['mean_min_pen']:>7.2f}m {fb}") - - -if __name__ == "__main__": - main() diff --git a/training/evaluate.py b/training/evaluate.py deleted file mode 100644 index 6fe7560..0000000 --- a/training/evaluate.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Evaluation script for a trained herding policy. - -Runs N episodes and reports the three project metrics: - 1. Success rate — fraction of episodes where all sheep are penned - 2. Time-to-pen — mean steps across successful episodes (per sheep) - 3. Flock dispersion — mean pairwise distance among active sheep, averaged - over all timesteps (lower = tighter herding) - -Usage ------ - python evaluate.py --model runs/ppo_herding/best_model/best_model.zip \ - --vecnorm runs/ppo_herding/vecnorm.pkl \ - --n-sheep 5 --episodes 100 - -Add --render to watch the first episode in a matplotlib window. -""" - -import argparse - -import numpy as np -from stable_baselines3 import PPO -from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize - -from herding_env import HerdingEnv - - -def make_single_env(n_sheep: int, max_steps: int, render_mode: str = None): - def _init(): - return HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, - render_mode=render_mode) - return _init - - -def pairwise_mean(positions: np.ndarray, n_active: int) -> float: - """Mean pairwise distance among the first n_active sheep.""" - if n_active < 2: - return 0.0 - pts = positions[:n_active] - dists = [] - for i in range(n_active): - for j in range(i + 1, n_active): - dists.append(float(np.linalg.norm(pts[i] - pts[j]))) - return float(np.mean(dists)) - - -def parse_args(): - p = argparse.ArgumentParser() - p.add_argument("--model", required=True, - help="Path to saved model .zip") - p.add_argument("--vecnorm", default=None, - help="Path to VecNormalize stats .pkl (optional)") - p.add_argument("--n-sheep", type=int, default=1) - p.add_argument("--episodes", type=int, default=50) - p.add_argument("--max-steps", type=int, default=2000) - p.add_argument("--render", action="store_true", - help="Render first episode in matplotlib") - p.add_argument("--seed", type=int, default=42) - return p.parse_args() - - -def main(): - args = parse_args() - - render_mode = "human" if args.render else None - raw_env = DummyVecEnv([make_single_env(args.n_sheep, args.max_steps, - render_mode)]) - if args.vecnorm: - env = VecNormalize.load(args.vecnorm, raw_env) - env.training = False - env.norm_reward = False - else: - env = raw_env - - model = PPO.load(args.model, env=env) - - successes = [] - steps_to_pen = [] # steps for successful episodes - dispersions = [] # per-episode mean flock dispersion - - for ep in range(args.episodes): - obs = env.reset() - done = False - ep_steps = 0 - ep_dispersion = [] - first_ep = ep == 0 - - while not done: - action, _ = model.predict(obs, deterministic=True) - obs, _, dones, infos = env.step(action) - done = dones[0] - ep_steps += 1 - - # Access the underlying HerdingEnv for dispersion calculation - inner = env.envs[0] if hasattr(env, "envs") else env.venv.envs[0] - if not inner.penned[:inner.n_sheep].all(): - _, radius, _ = inner._flock_stats() - ep_dispersion.append(radius) - - if first_ep and render_mode == "human": - pass # render() is called inside step() - - info = infos[0] - n_penned = info.get("n_penned", 0) - n_sheep = info.get("n_sheep", args.n_sheep) - success = n_penned == n_sheep - - successes.append(int(success)) - if success: - steps_to_pen.append(ep_steps / n_sheep) - if ep_dispersion: - dispersions.append(float(np.mean(ep_dispersion))) - - if (ep + 1) % 10 == 0: - print(f" Episode {ep + 1:>4}/{args.episodes} " - f"success={int(success)} steps={ep_steps}") - - env.close() - - # ----------------------------------------------------------------------- - # Report - # ----------------------------------------------------------------------- - success_rate = float(np.mean(successes)) - mean_ttp = float(np.mean(steps_to_pen)) if steps_to_pen else float("nan") - mean_disp = float(np.mean(dispersions)) if dispersions else float("nan") - - print("\n" + "=" * 50) - print(f" Model : {args.model}") - print(f" Sheep : {args.n_sheep}") - print(f" Episodes : {args.episodes}") - print("-" * 50) - print(f" Success rate : {success_rate * 100:.1f}%" - f" ({sum(successes)}/{args.episodes})") - print(f" Time-to-pen : {mean_ttp:.1f} steps/sheep" - f" (successful episodes only)") - print(f" Flock radius : {mean_disp:.2f} m" - f" (max sheep-to-COM distance while active)") - print("=" * 50) - - -if __name__ == "__main__": - main() diff --git a/training/herding_env.py b/training/herding_env.py index b038551..6665906 100644 --- a/training/herding_env.py +++ b/training/herding_env.py @@ -61,18 +61,19 @@ class HerdingEnv(gym.Env): W_COMPLETE = 100.0 # all sheep penned W_STEP_COST = 0.02 # time penalty — strong enough to punish doing nothing W_COMPACT = 0.0 # reward for flock-radius reduction (off by default) - W_WALL_TOUCH = 0.01 # per-sheep, per-step penalty when an active sheep is - # pinned against the outside of a pen W/E wall. Kept - # small ( py0) & (pts[:, 1] < py1) - near_w = (pts[:, 0] < px0) & (pts[:, 0] > px0 - self.WALL_TOUCH_BUFFER) - near_e = (pts[:, 0] > px1) & (pts[:, 0] < px1 + self.WALL_TOUCH_BUFFER) - n_touch = int(((near_w | near_e) & in_y).sum()) - r_wall_touch = -n_touch * self.W_WALL_TOUCH + buf = self.WALL_TOUCH_BUFFER + far = buf + 1.0 + d_w = np.where((pts[:, 0] < px0) & (pts[:, 1] > py0) & (pts[:, 1] < py1), + px0 - pts[:, 0], far) + d_e = np.where((pts[:, 0] > px1) & (pts[:, 1] > py0) & (pts[:, 1] < py1), + pts[:, 0] - px1, far) + d_s = np.where((pts[:, 1] < py0) & (pts[:, 0] > px0) & (pts[:, 0] < px1), + py0 - pts[:, 1], far) + d_min = np.minimum(np.minimum(d_w, d_e), d_s) + penalties = np.maximum(0.0, 1.0 - d_min / buf) * self.W_WALL_TOUCH + r_wall_touch = -float(penalties.sum()) else: r_wall_touch = 0.0 diff --git a/training/replay_config.py b/training/replay_config.py deleted file mode 100644 index 08a151d..0000000 --- a/training/replay_config.py +++ /dev/null @@ -1,172 +0,0 @@ -""" -Replay a reward config from the sweep with a longer training budget. - -Tells you whether a promising sweep config was bottlenecked by training time -vs. structurally limited. If sr2/sr3 climb past their sweep numbers given more -budget, the issue was budget; if they plateau, the policy/obs needs work. - -Usage ------ - python replay_config.py --config runs/sweep_/best.json - python replay_config.py --config runs/sweep_/trial_007/config.json \ - --max-sheep 4 --steps-per-stage 1500000 - -Argument summary: - --config JSON file with the reward config (sweep best.json works) - --max-sheep Final curriculum stage (default 3) - --steps-per-stage Env steps per curriculum stage (default 1.5M) - --n-envs Parallel envs (default 8) - --eval-episodes Per-stage eval episodes (default 30) - --run-dir Output directory (default runs/replay_/) -""" -import argparse -import json -import os -import time -from copy import deepcopy - -import numpy as np -from stable_baselines3 import PPO -from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize - -from herding_env import HerdingEnv -from sweep_reward import ProgressCallback, reward_cfg, evaluate, make_env - - -def main(): - p = argparse.ArgumentParser() - p.add_argument("--config", type=str, required=True, - help="Reward config JSON (sweep best.json or trial config.json)") - p.add_argument("--start-sheep", type=int, default=1) - p.add_argument("--max-sheep", type=int, default=3) - p.add_argument("--steps-per-stage", type=int, default=1_500_000) - p.add_argument("--mixed", action="store_true", - help="Train with n_sheep randomized per episode (no curriculum). " - "Total train steps = steps-per-stage * max_sheep.") - p.add_argument("--final-mixed-steps", type=int, default=0, - help="After the curriculum, train this many extra steps with " - "random_n_sheep ∈ [1, max_sheep] to consolidate the policy " - "across all flock sizes. Re-evaluates all n_sheep at the end.") - p.add_argument("--n-envs", type=int, default=8) - p.add_argument("--max-steps", type=int, default=2500) - p.add_argument("--eval-episodes", type=int, default=30) - p.add_argument("--run-dir", type=str, default=None) - args = p.parse_args() - - with open(args.config) as f: - raw = json.load(f) - cfg = raw["config"] if "config" in raw and isinstance(raw["config"], dict) else raw - rcfg = reward_cfg(cfg) - print(f"Config: {cfg}") - - run_dir = args.run_dir or os.path.join( - "runs", "replay_" + time.strftime("%Y%m%d_%H%M%S") - ) - os.makedirs(run_dir, exist_ok=True) - with open(os.path.join(run_dir, "config.json"), "w") as f: - json.dump(cfg, f, indent=2) - print(f"Run dir: {run_dir}") - if args.mixed: - print(f"MIXED training: random n_sheep ∈ [1, {args.max_sheep}], " - f"{args.steps_per_stage * args.max_sheep:,} total steps") - else: - print(f"Curriculum: {args.start_sheep} → {args.max_sheep} sheep, " - f"{args.steps_per_stage:,} steps/stage") - - train_env = SubprocVecEnv([ - make_env(args.max_sheep if args.mixed else args.start_sheep, - seed=i, max_steps=args.max_steps, rcfg=rcfg, - random_n_sheep=args.mixed) - for i in range(args.n_envs) - ]) - vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0) - - model = PPO( - "MlpPolicy", vn, - learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10, - gamma=0.995, gae_lambda=0.95, clip_range=0.2, - ent_coef=cfg["ent_coef"], vf_coef=0.5, max_grad_norm=0.5, - policy_kwargs=dict(net_arch=[256, 256]), - verbose=0, - ) - - stage_results = [] - t0 = time.time() - try: - if args.mixed: - total = args.steps_per_stage * args.max_sheep - print(f"\n[Mixed] training {total:,} steps") - model.learn( - total_timesteps=total, - reset_num_timesteps=True, - callback=ProgressCallback(0, "mixed", freq=100_000), - ) - for n in range(1, args.max_sheep + 1): - print(f"[Mixed] evaluating n={n}, {args.eval_episodes} eps") - r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg) - print(f"[Mixed] n_sheep={n} sr={r['sr']*100:.0f}% " - f"mean_len={r['mean_len']:.0f} " - f"mean_min_pen={r['mean_min_pen']:.1f}m " - f"mean_act={r['mean_act']:.2f}") - stage_results.append({"n_sheep": n, **r}) - else: - for n in range(args.start_sheep, args.max_sheep + 1): - if n > args.start_sheep: - vn.env_method("set_n_sheep", n) - print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps") - model.learn( - total_timesteps=args.steps_per_stage, - reset_num_timesteps=(n == args.start_sheep), - callback=ProgressCallback(0, f"{n} sheep", freq=100_000), - ) - print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps") - r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg) - print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% " - f"mean_len={r['mean_len']:.0f} " - f"mean_min_pen={r['mean_min_pen']:.1f}m " - f"mean_act={r['mean_act']:.2f}") - stage_results.append({"n_sheep": n, **r}) - - # Optional consolidation pass with mixed n_sheep — fixes specialization - # imbalance from curriculum order (e.g. n=1 weakness after long n=10 - # training). Replaces stage_results with the post-consolidation eval. - if args.final_mixed_steps > 0 and not args.mixed: - print(f"\n[Consolidation] mixed n_sheep ∈ [1, {args.max_sheep}], " - f"{args.final_mixed_steps:,} steps") - vn.env_method("__setattr__", "random_n_sheep", True) - model.learn( - total_timesteps=args.final_mixed_steps, - reset_num_timesteps=False, - callback=ProgressCallback(0, "consolidate", freq=100_000), - ) - print("[Consolidation] re-evaluating all sheep counts") - stage_results = [] - for n in range(1, args.max_sheep + 1): - r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg) - print(f"[Consolidation] n_sheep={n} sr={r['sr']*100:.0f}% " - f"mean_len={r['mean_len']:.0f} " - f"mean_min_pen={r['mean_min_pen']:.1f}m " - f"mean_act={r['mean_act']:.2f}") - stage_results.append({"n_sheep": n, **r}) - - model.save(os.path.join(run_dir, "final_model")) - vn.save(os.path.join(run_dir, "vecnorm.pkl")) - with open(os.path.join(run_dir, "stage_results.json"), "w") as f: - json.dump(stage_results, f, indent=2) - finally: - try: vn.close() - except Exception: pass - - print("\n" + "=" * 60) - print(" REPLAY SUMMARY") - print("=" * 60) - for r in stage_results: - print(f" n_sheep={r['n_sheep']} sr={r['sr']*100:>3.0f}% " - f"len={r['mean_len']:>5.0f} min_pen={r['mean_min_pen']:>5.1f}m " - f"act={r['mean_act']:.2f}") - print(f"\n Total time: {(time.time()-t0)/60:.1f} min") - print(f" Artefacts: {run_dir}/") - - -if __name__ == "__main__": - main() diff --git a/training/runs/expA_fresh2.log b/training/runs/expA_fresh2.log deleted file mode 100644 index 6ea1c89..0000000 --- a/training/runs/expA_fresh2.log +++ /dev/null @@ -1,35 +0,0 @@ -Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02} -Run dir: runs/expA_fresh2 -Curriculum: 2 → 2 sheep, 2,000,000 steps/stage - -[Stage n_sheep=2] training 2,000,000 steps - ... [trial 1 | 2 sheep | 100,000 steps | ret(last 50)=-13.44 sr=0%] - ... [trial 1 | 2 sheep | 200,000 steps | ret(last 50)=-14.60 sr=0%] - ... [trial 1 | 2 sheep | 300,000 steps | ret(last 50)=-17.36 sr=0%] - ... [trial 1 | 2 sheep | 400,000 steps | ret(last 50)=-17.36 sr=0%] - ... [trial 1 | 2 sheep | 500,000 steps | ret(last 50)=-17.92 sr=0%] - ... [trial 1 | 2 sheep | 600,000 steps | ret(last 50)=-15.65 sr=0%] - ... [trial 1 | 2 sheep | 700,000 steps | ret(last 50)=-17.69 sr=2%] - ... [trial 1 | 2 sheep | 800,000 steps | ret(last 50)=-14.61 sr=2%] - ... [trial 1 | 2 sheep | 900,000 steps | ret(last 50)=-17.36 sr=0%] - ... [trial 1 | 2 sheep | 1,000,000 steps | ret(last 50)=-17.44 sr=0%] - ... [trial 1 | 2 sheep | 1,100,000 steps | ret(last 50)=-15.91 sr=2%] - ... [trial 1 | 2 sheep | 1,200,000 steps | ret(last 50)=-16.08 sr=0%] - ... [trial 1 | 2 sheep | 1,300,000 steps | ret(last 50)=-14.34 sr=0%] - ... [trial 1 | 2 sheep | 1,400,000 steps | ret(last 50)=-17.00 sr=2%] - ... [trial 1 | 2 sheep | 1,500,000 steps | ret(last 50)=-18.52 sr=0%] - ... [trial 1 | 2 sheep | 1,600,000 steps | ret(last 50)=-16.68 sr=0%] - ... [trial 1 | 2 sheep | 1,700,000 steps | ret(last 50)=-17.52 sr=0%] - ... [trial 1 | 2 sheep | 1,800,000 steps | ret(last 50)=-17.33 sr=0%] - ... [trial 1 | 2 sheep | 1,900,000 steps | ret(last 50)=-14.96 sr=2%] - ... [trial 1 | 2 sheep | 2,000,000 steps | ret(last 50)=-15.59 sr=0%] -[Stage n_sheep=2] evaluating 30 eps -[Stage n_sheep=2] sr=0% mean_len=1500 mean_min_pen=13.2m mean_act=0.96 - -============================================================ - REPLAY SUMMARY -============================================================ - n_sheep=2 sr= 0% len= 1500 min_pen= 13.2m act=0.96 - - Total time: 10.7 min - Artefacts: runs/expA_fresh2/ diff --git a/training/runs/expA_fresh2/config.json b/training/runs/expA_fresh2/config.json deleted file mode 100644 index b2d15fe..0000000 --- a/training/runs/expA_fresh2/config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "W_PER_SHEEP": 1.0, - "W_ALIGN": 0.0, - "W_PEN_BONUS": 5.0, - "W_STEP_COST": 0.02, - "W_COMPLETE": 200.0, - "W_COMPACT": 1.5, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": false, - "ent_coef": 0.02 -} \ No newline at end of file diff --git a/training/runs/expA_fresh2/final_model.zip b/training/runs/expA_fresh2/final_model.zip deleted file mode 100644 index 3d8a3e3..0000000 Binary files a/training/runs/expA_fresh2/final_model.zip and /dev/null differ diff --git a/training/runs/expA_fresh2/stage_results.json b/training/runs/expA_fresh2/stage_results.json deleted file mode 100644 index 323888a..0000000 --- a/training/runs/expA_fresh2/stage_results.json +++ /dev/null @@ -1,9 +0,0 @@ -[ - { - "n_sheep": 2, - "sr": 0.0, - "mean_len": 1500.0, - "mean_min_pen": 13.171057415008544, - "mean_act": 0.960968065615257 - } -] \ No newline at end of file diff --git a/training/runs/expA_fresh2/vecnorm.pkl b/training/runs/expA_fresh2/vecnorm.pkl deleted file mode 100644 index 5e15a6d..0000000 Binary files a/training/runs/expA_fresh2/vecnorm.pkl and /dev/null differ diff --git a/training/runs/expB_mixed.log b/training/runs/expB_mixed.log deleted file mode 100644 index 02c2b65..0000000 --- a/training/runs/expB_mixed.log +++ /dev/null @@ -1,51 +0,0 @@ -Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02} -Run dir: runs/expB_mixed -MIXED training: random n_sheep ∈ [1, 3], 3,000,000 total steps - -[Mixed] training 3,000,000 steps - ... [trial 1 | mixed | 100,000 steps | ret(last 50)=-13.68 sr=2%] - ... [trial 1 | mixed | 200,000 steps | ret(last 50)=-14.08 sr=0%] - ... [trial 1 | mixed | 300,000 steps | ret(last 50)=-9.80 sr=0%] - ... [trial 1 | mixed | 400,000 steps | ret(last 50)=-11.20 sr=0%] - ... [trial 1 | mixed | 500,000 steps | ret(last 50)=-10.61 sr=0%] - ... [trial 1 | mixed | 600,000 steps | ret(last 50)=-11.19 sr=0%] - ... [trial 1 | mixed | 700,000 steps | ret(last 50)=-14.22 sr=0%] - ... [trial 1 | mixed | 800,000 steps | ret(last 50)=-6.31 sr=0%] - ... [trial 1 | mixed | 900,000 steps | ret(last 50)=-12.68 sr=0%] - ... [trial 1 | mixed | 1,000,000 steps | ret(last 50)=-11.06 sr=0%] - ... [trial 1 | mixed | 1,100,000 steps | ret(last 50)=-13.39 sr=0%] - ... [trial 1 | mixed | 1,200,000 steps | ret(last 50)=-14.20 sr=0%] - ... [trial 1 | mixed | 1,300,000 steps | ret(last 50)=-11.33 sr=0%] - ... [trial 1 | mixed | 1,400,000 steps | ret(last 50)=-10.73 sr=0%] - ... [trial 1 | mixed | 1,500,000 steps | ret(last 50)=-10.91 sr=0%] - ... [trial 1 | mixed | 1,600,000 steps | ret(last 50)=-10.44 sr=0%] - ... [trial 1 | mixed | 1,700,000 steps | ret(last 50)=-10.56 sr=0%] - ... [trial 1 | mixed | 1,800,000 steps | ret(last 50)=-15.74 sr=0%] - ... [trial 1 | mixed | 1,900,000 steps | ret(last 50)=-13.46 sr=0%] - ... [trial 1 | mixed | 2,000,000 steps | ret(last 50)=-9.86 sr=0%] - ... [trial 1 | mixed | 2,100,000 steps | ret(last 50)=-13.07 sr=0%] - ... [trial 1 | mixed | 2,200,000 steps | ret(last 50)=-9.86 sr=0%] - ... [trial 1 | mixed | 2,300,000 steps | ret(last 50)=-9.73 sr=2%] - ... [trial 1 | mixed | 2,400,000 steps | ret(last 50)=-12.21 sr=0%] - ... [trial 1 | mixed | 2,500,000 steps | ret(last 50)=-14.27 sr=0%] - ... [trial 1 | mixed | 2,600,000 steps | ret(last 50)=-10.90 sr=2%] - ... [trial 1 | mixed | 2,700,000 steps | ret(last 50)=-9.67 sr=0%] - ... [trial 1 | mixed | 2,800,000 steps | ret(last 50)=-14.29 sr=0%] - ... [trial 1 | mixed | 2,900,000 steps | ret(last 50)=-9.08 sr=0%] - ... [trial 1 | mixed | 3,000,000 steps | ret(last 50)=-11.62 sr=6%] -[Mixed] evaluating n=1, 30 eps -[Mixed] n_sheep=1 sr=0% mean_len=1500 mean_min_pen=12.1m mean_act=0.64 -[Mixed] evaluating n=2, 30 eps -[Mixed] n_sheep=2 sr=0% mean_len=1500 mean_min_pen=13.6m mean_act=1.12 -[Mixed] evaluating n=3, 30 eps -[Mixed] n_sheep=3 sr=0% mean_len=1500 mean_min_pen=13.3m mean_act=1.02 - -============================================================ - REPLAY SUMMARY -============================================================ - n_sheep=1 sr= 0% len= 1500 min_pen= 12.1m act=0.64 - n_sheep=2 sr= 0% len= 1500 min_pen= 13.6m act=1.12 - n_sheep=3 sr= 0% len= 1500 min_pen= 13.3m act=1.02 - - Total time: 20.6 min - Artefacts: runs/expB_mixed/ diff --git a/training/runs/expB_mixed/config.json b/training/runs/expB_mixed/config.json deleted file mode 100644 index b2d15fe..0000000 --- a/training/runs/expB_mixed/config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "W_PER_SHEEP": 1.0, - "W_ALIGN": 0.0, - "W_PEN_BONUS": 5.0, - "W_STEP_COST": 0.02, - "W_COMPLETE": 200.0, - "W_COMPACT": 1.5, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": false, - "ent_coef": 0.02 -} \ No newline at end of file diff --git a/training/runs/expB_mixed/final_model.zip b/training/runs/expB_mixed/final_model.zip deleted file mode 100644 index 211707c..0000000 Binary files a/training/runs/expB_mixed/final_model.zip and /dev/null differ diff --git a/training/runs/expB_mixed/stage_results.json b/training/runs/expB_mixed/stage_results.json deleted file mode 100644 index 735c94e..0000000 --- a/training/runs/expB_mixed/stage_results.json +++ /dev/null @@ -1,23 +0,0 @@ -[ - { - "n_sheep": 1, - "sr": 0.0, - "mean_len": 1500.0, - "mean_min_pen": 12.136781152089437, - "mean_act": 0.6380681545449439 - }, - { - "n_sheep": 2, - "sr": 0.0, - "mean_len": 1500.0, - "mean_min_pen": 13.609641806284587, - "mean_act": 1.1225489819858792 - }, - { - "n_sheep": 3, - "sr": 0.0, - "mean_len": 1500.0, - "mean_min_pen": 13.337443319956462, - "mean_act": 1.0186407331574738 - } -] \ No newline at end of file diff --git a/training/runs/expB_mixed/vecnorm.pkl b/training/runs/expB_mixed/vecnorm.pkl deleted file mode 100644 index 9bb6497..0000000 Binary files a/training/runs/expB_mixed/vecnorm.pkl and /dev/null differ diff --git a/training/runs/expC_clustered.log b/training/runs/expC_clustered.log deleted file mode 100644 index 424303f..0000000 --- a/training/runs/expC_clustered.log +++ /dev/null @@ -1,57 +0,0 @@ -Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02} -Run dir: runs/expC_clustered -Curriculum: 1 → 3 sheep, 1,000,000 steps/stage - -[Stage n_sheep=1] training 1,000,000 steps - ... [trial 1 | 1 sheep | 100,000 steps | ret(last 50)=-17.04 sr=6%] - ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-17.39 sr=4%] - ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=-15.50 sr=4%] - ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=-2.07 sr=26%] - ... [trial 1 | 1 sheep | 500,000 steps | ret(last 50)=+3.81 sr=52%] - ... [trial 1 | 1 sheep | 600,000 steps | ret(last 50)=+8.03 sr=76%] - ... [trial 1 | 1 sheep | 700,000 steps | ret(last 50)=+9.49 sr=86%] - ... [trial 1 | 1 sheep | 800,000 steps | ret(last 50)=+9.42 sr=88%] - ... [trial 1 | 1 sheep | 900,000 steps | ret(last 50)=+9.49 sr=88%] - ... [trial 1 | 1 sheep | 1,000,000 steps | ret(last 50)=+10.34 sr=94%] -[Stage n_sheep=1] evaluating 30 eps -[Stage n_sheep=1] sr=83% mean_len=519 mean_min_pen=3.5m mean_act=0.25 - -[Stage n_sheep=2] training 1,000,000 steps - ... [trial 1 | 2 sheep | 1,015,816 steps | ret(last 0)=+nan sr=nan%] - ... [trial 1 | 2 sheep | 1,115,816 steps | ret(last 50)=-0.13 sr=10%] - ... [trial 1 | 2 sheep | 1,215,816 steps | ret(last 50)=-1.23 sr=10%] - ... [trial 1 | 2 sheep | 1,315,816 steps | ret(last 50)=-0.10 sr=6%] - ... [trial 1 | 2 sheep | 1,415,816 steps | ret(last 50)=+4.10 sr=28%] - ... [trial 1 | 2 sheep | 1,515,816 steps | ret(last 50)=+6.24 sr=32%] - ... [trial 1 | 2 sheep | 1,615,816 steps | ret(last 50)=+8.48 sr=52%] - ... [trial 1 | 2 sheep | 1,715,816 steps | ret(last 50)=+14.14 sr=98%] - ... [trial 1 | 2 sheep | 1,815,816 steps | ret(last 50)=+14.33 sr=98%] - ... [trial 1 | 2 sheep | 1,915,816 steps | ret(last 50)=+14.02 sr=100%] - ... [trial 1 | 2 sheep | 2,015,816 steps | ret(last 50)=+14.05 sr=100%] -[Stage n_sheep=2] evaluating 30 eps -[Stage n_sheep=2] sr=100% mean_len=695 mean_min_pen=3.4m mean_act=0.58 - -[Stage n_sheep=3] training 1,000,000 steps - ... [trial 1 | 3 sheep | 2,031,624 steps | ret(last 0)=+nan sr=nan%] - ... [trial 1 | 3 sheep | 2,131,624 steps | ret(last 50)=+10.43 sr=56%] - ... [trial 1 | 3 sheep | 2,231,624 steps | ret(last 50)=+13.91 sr=74%] - ... [trial 1 | 3 sheep | 2,331,624 steps | ret(last 50)=+13.98 sr=76%] - ... [trial 1 | 3 sheep | 2,431,624 steps | ret(last 50)=+12.67 sr=68%] - ... [trial 1 | 3 sheep | 2,531,624 steps | ret(last 50)=+15.79 sr=90%] - ... [trial 1 | 3 sheep | 2,631,624 steps | ret(last 50)=+16.29 sr=94%] - ... [trial 1 | 3 sheep | 2,731,624 steps | ret(last 50)=+15.47 sr=90%] - ... [trial 1 | 3 sheep | 2,831,624 steps | ret(last 50)=+16.67 sr=96%] - ... [trial 1 | 3 sheep | 2,931,624 steps | ret(last 50)=+17.50 sr=100%] - ... [trial 1 | 3 sheep | 3,031,624 steps | ret(last 50)=+16.49 sr=96%] -[Stage n_sheep=3] evaluating 30 eps -[Stage n_sheep=3] sr=90% mean_len=794 mean_min_pen=3.7m mean_act=0.47 - -============================================================ - REPLAY SUMMARY -============================================================ - n_sheep=1 sr= 83% len= 519 min_pen= 3.5m act=0.25 - n_sheep=2 sr=100% len= 695 min_pen= 3.4m act=0.58 - n_sheep=3 sr= 90% len= 794 min_pen= 3.7m act=0.47 - - Total time: 15.1 min - Artefacts: runs/expC_clustered/ diff --git a/training/runs/expC_clustered/config.json b/training/runs/expC_clustered/config.json deleted file mode 100644 index b2d15fe..0000000 --- a/training/runs/expC_clustered/config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "W_PER_SHEEP": 1.0, - "W_ALIGN": 0.0, - "W_PEN_BONUS": 5.0, - "W_STEP_COST": 0.02, - "W_COMPLETE": 200.0, - "W_COMPACT": 1.5, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": false, - "ent_coef": 0.02 -} \ No newline at end of file diff --git a/training/runs/expC_clustered/final_model.zip b/training/runs/expC_clustered/final_model.zip deleted file mode 100644 index 86d9208..0000000 Binary files a/training/runs/expC_clustered/final_model.zip and /dev/null differ diff --git a/training/runs/expC_clustered/stage_results.json b/training/runs/expC_clustered/stage_results.json deleted file mode 100644 index 7614958..0000000 --- a/training/runs/expC_clustered/stage_results.json +++ /dev/null @@ -1,23 +0,0 @@ -[ - { - "n_sheep": 1, - "sr": 0.8333333333333334, - "mean_len": 518.5333333333333, - "mean_min_pen": 3.5244259238243103, - "mean_act": 0.25044742608759274 - }, - { - "n_sheep": 2, - "sr": 1.0, - "mean_len": 694.9, - "mean_min_pen": 3.4314632336298625, - "mean_act": 0.5796192060058971 - }, - { - "n_sheep": 3, - "sr": 0.9, - "mean_len": 794.1333333333333, - "mean_min_pen": 3.6645382324854534, - "mean_act": 0.46590614892287907 - } -] \ No newline at end of file diff --git a/training/runs/expC_clustered/vecnorm.pkl b/training/runs/expC_clustered/vecnorm.pkl deleted file mode 100644 index 0cffe9b..0000000 Binary files a/training/runs/expC_clustered/vecnorm.pkl and /dev/null differ diff --git a/training/runs/final_v2.log b/training/runs/final_v2.log deleted file mode 100644 index 39cf38e..0000000 --- a/training/runs/final_v2.log +++ /dev/null @@ -1,219 +0,0 @@ -Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02} -Run dir: runs/final_v2 -Curriculum: 1 → 10 sheep, 1,500,000 steps/stage - -[Stage n_sheep=1] training 1,500,000 steps - ... [trial 1 | 1 sheep | 100,000 steps | ret(last 41)=-38.49 win_sr=10% cum_sr=10%] - ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-32.87 win_sr=8% cum_sr=9%] - ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=-33.60 win_sr=4% cum_sr=7%] - ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=-34.78 win_sr=8% cum_sr=7%] - ... [trial 1 | 1 sheep | 500,000 steps | ret(last 50)=-31.25 win_sr=12% cum_sr=8%] - ... [trial 1 | 1 sheep | 600,000 steps | ret(last 50)=-32.87 win_sr=2% cum_sr=7%] - ... [trial 1 | 1 sheep | 700,000 steps | ret(last 50)=-33.25 win_sr=6% cum_sr=7%] - ... [trial 1 | 1 sheep | 800,000 steps | ret(last 50)=-27.80 win_sr=16% cum_sr=8%] - ... [trial 1 | 1 sheep | 900,000 steps | ret(last 50)=-27.44 win_sr=14% cum_sr=9%] - ... [trial 1 | 1 sheep | 1,000,000 steps | ret(last 50)=-30.52 win_sr=6% cum_sr=9%] - ... [trial 1 | 1 sheep | 1,100,000 steps | ret(last 50)=-24.75 win_sr=20% cum_sr=10%] - ... [trial 1 | 1 sheep | 1,200,000 steps | ret(last 50)=-29.94 win_sr=4% cum_sr=10%] - ... [trial 1 | 1 sheep | 1,300,000 steps | ret(last 50)=-22.72 win_sr=22% cum_sr=11%] - ... [trial 1 | 1 sheep | 1,400,000 steps | ret(last 50)=-9.84 win_sr=46% cum_sr=14%] - ... [trial 1 | 1 sheep | 1,500,000 steps | ret(last 50)=+10.01 win_sr=96% cum_sr=24%] -[Stage n_sheep=1] evaluating 30 eps -[Stage n_sheep=1] sr=97% mean_len=351 mean_min_pen=3.9m mean_act=0.28 - -[Stage n_sheep=2] training 1,500,000 steps - ... [trial 1 | 2 sheep | 1,507,336 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 2 sheep | 1,607,336 steps | ret(last 43)=-4.11 win_sr=33% cum_sr=33%] - ... [trial 1 | 2 sheep | 1,707,336 steps | ret(last 50)=-0.34 win_sr=36% cum_sr=34%] - ... [trial 1 | 2 sheep | 1,807,336 steps | ret(last 50)=+14.73 win_sr=92% cum_sr=62%] - ... [trial 1 | 2 sheep | 1,907,336 steps | ret(last 50)=+17.38 win_sr=100% cum_sr=76%] - ... [trial 1 | 2 sheep | 2,007,336 steps | ret(last 50)=+16.80 win_sr=100% cum_sr=83%] - ... [trial 1 | 2 sheep | 2,107,336 steps | ret(last 50)=+15.67 win_sr=100% cum_sr=87%] - ... [trial 1 | 2 sheep | 2,207,336 steps | ret(last 50)=+15.39 win_sr=100% cum_sr=90%] - ... [trial 1 | 2 sheep | 2,307,336 steps | ret(last 50)=+15.58 win_sr=100% cum_sr=92%] - ... [trial 1 | 2 sheep | 2,407,336 steps | ret(last 50)=+15.01 win_sr=100% cum_sr=93%] - ... [trial 1 | 2 sheep | 2,507,336 steps | ret(last 50)=+15.50 win_sr=100% cum_sr=94%] - ... [trial 1 | 2 sheep | 2,607,336 steps | ret(last 50)=+15.21 win_sr=100% cum_sr=95%] - ... [trial 1 | 2 sheep | 2,707,336 steps | ret(last 50)=+15.22 win_sr=100% cum_sr=95%] - ... [trial 1 | 2 sheep | 2,807,336 steps | ret(last 50)=+15.05 win_sr=100% cum_sr=96%] - ... [trial 1 | 2 sheep | 2,907,336 steps | ret(last 50)=+14.37 win_sr=100% cum_sr=96%] - ... [trial 1 | 2 sheep | 3,007,336 steps | ret(last 50)=+14.70 win_sr=100% cum_sr=97%] -[Stage n_sheep=2] evaluating 30 eps -[Stage n_sheep=2] sr=100% mean_len=421 mean_min_pen=3.5m mean_act=1.01 - -[Stage n_sheep=3] training 1,500,000 steps - ... [trial 1 | 3 sheep | 3,014,664 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 3 sheep | 3,114,664 steps | ret(last 50)=+16.52 win_sr=100% cum_sr=99%] - ... [trial 1 | 3 sheep | 3,214,664 steps | ret(last 50)=+16.74 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,314,664 steps | ret(last 50)=+17.09 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,414,664 steps | ret(last 50)=+16.90 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,514,664 steps | ret(last 50)=+16.97 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,614,664 steps | ret(last 50)=+17.20 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,714,664 steps | ret(last 50)=+17.09 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,814,664 steps | ret(last 50)=+17.12 win_sr=98% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,914,664 steps | ret(last 50)=+17.17 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,014,664 steps | ret(last 50)=+16.25 win_sr=98% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,114,664 steps | ret(last 50)=+17.04 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,214,664 steps | ret(last 50)=+16.31 win_sr=98% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,314,664 steps | ret(last 50)=+16.82 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,414,664 steps | ret(last 50)=+16.49 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,514,664 steps | ret(last 50)=+16.54 win_sr=100% cum_sr=100%] -[Stage n_sheep=3] evaluating 30 eps -[Stage n_sheep=3] sr=100% mean_len=608 mean_min_pen=3.5m mean_act=1.06 - -[Stage n_sheep=4] training 1,500,000 steps - ... [trial 1 | 4 sheep | 4,521,992 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 4 sheep | 4,621,992 steps | ret(last 50)=+18.55 win_sr=98% cum_sr=94%] - ... [trial 1 | 4 sheep | 4,721,992 steps | ret(last 50)=+19.17 win_sr=100% cum_sr=97%] - ... [trial 1 | 4 sheep | 4,821,992 steps | ret(last 50)=+18.64 win_sr=100% cum_sr=98%] - ... [trial 1 | 4 sheep | 4,921,992 steps | ret(last 50)=+19.06 win_sr=100% cum_sr=99%] - ... [trial 1 | 4 sheep | 5,021,992 steps | ret(last 50)=+19.01 win_sr=100% cum_sr=99%] - ... [trial 1 | 4 sheep | 5,121,992 steps | ret(last 50)=+19.23 win_sr=100% cum_sr=99%] - ... [trial 1 | 4 sheep | 5,221,992 steps | ret(last 50)=+18.71 win_sr=100% cum_sr=99%] - ... [trial 1 | 4 sheep | 5,321,992 steps | ret(last 50)=+18.81 win_sr=100% cum_sr=99%] - ... [trial 1 | 4 sheep | 5,421,992 steps | ret(last 50)=+19.51 win_sr=100% cum_sr=99%] - ... [trial 1 | 4 sheep | 5,521,992 steps | ret(last 50)=+19.01 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,621,992 steps | ret(last 50)=+19.21 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,721,992 steps | ret(last 50)=+18.62 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,821,992 steps | ret(last 50)=+18.57 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,921,992 steps | ret(last 50)=+19.22 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 6,021,992 steps | ret(last 50)=+18.73 win_sr=100% cum_sr=100%] -[Stage n_sheep=4] evaluating 30 eps -[Stage n_sheep=4] sr=100% mean_len=874 mean_min_pen=3.3m mean_act=1.23 - -[Stage n_sheep=5] training 1,500,000 steps - ... [trial 1 | 5 sheep | 6,029,320 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 5 sheep | 6,129,320 steps | ret(last 50)=+22.70 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,229,320 steps | ret(last 50)=+20.82 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,329,320 steps | ret(last 50)=+20.84 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,429,320 steps | ret(last 50)=+21.70 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,529,320 steps | ret(last 50)=+21.25 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,629,320 steps | ret(last 50)=+20.61 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,729,320 steps | ret(last 50)=+21.10 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,829,320 steps | ret(last 50)=+21.42 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,929,320 steps | ret(last 50)=+21.39 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,029,320 steps | ret(last 50)=+20.80 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,129,320 steps | ret(last 50)=+21.19 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,229,320 steps | ret(last 50)=+20.92 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,329,320 steps | ret(last 50)=+20.97 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,429,320 steps | ret(last 50)=+20.48 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,529,320 steps | ret(last 50)=+21.36 win_sr=100% cum_sr=100%] -[Stage n_sheep=5] evaluating 30 eps -[Stage n_sheep=5] sr=97% mean_len=945 mean_min_pen=3.4m mean_act=1.33 - -[Stage n_sheep=6] training 1,500,000 steps - ... [trial 1 | 6 sheep | 7,536,648 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 6 sheep | 7,636,648 steps | ret(last 50)=+22.41 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 7,736,648 steps | ret(last 50)=+23.84 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 7,836,648 steps | ret(last 50)=+22.95 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 7,936,648 steps | ret(last 50)=+23.97 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,036,648 steps | ret(last 50)=+24.02 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,136,648 steps | ret(last 50)=+23.42 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,236,648 steps | ret(last 50)=+24.15 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,336,648 steps | ret(last 50)=+23.32 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,436,648 steps | ret(last 50)=+23.46 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,536,648 steps | ret(last 50)=+23.80 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,636,648 steps | ret(last 50)=+24.41 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,736,648 steps | ret(last 50)=+23.86 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,836,648 steps | ret(last 50)=+23.57 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,936,648 steps | ret(last 50)=+23.74 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 9,036,648 steps | ret(last 50)=+22.87 win_sr=100% cum_sr=100%] -[Stage n_sheep=6] evaluating 30 eps -[Stage n_sheep=6] sr=100% mean_len=1162 mean_min_pen=3.1m mean_act=1.36 - -[Stage n_sheep=7] training 1,500,000 steps - ... [trial 1 | 7 sheep | 9,043,976 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 7 sheep | 9,143,976 steps | ret(last 50)=+24.46 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,243,976 steps | ret(last 50)=+25.47 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,343,976 steps | ret(last 50)=+25.10 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,443,976 steps | ret(last 50)=+24.85 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,543,976 steps | ret(last 50)=+26.01 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,643,976 steps | ret(last 50)=+26.26 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,743,976 steps | ret(last 50)=+26.44 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,843,976 steps | ret(last 50)=+26.08 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,943,976 steps | ret(last 50)=+25.00 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,043,976 steps | ret(last 50)=+26.22 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,143,976 steps | ret(last 50)=+24.79 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,243,976 steps | ret(last 50)=+26.33 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,343,976 steps | ret(last 50)=+26.36 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,443,976 steps | ret(last 50)=+25.68 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,543,976 steps | ret(last 50)=+26.75 win_sr=100% cum_sr=100%] -[Stage n_sheep=7] evaluating 30 eps -[Stage n_sheep=7] sr=100% mean_len=1253 mean_min_pen=2.7m mean_act=1.38 - -[Stage n_sheep=8] training 1,500,000 steps - ... [trial 1 | 8 sheep | 10,551,304 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 8 sheep | 10,651,304 steps | ret(last 50)=+28.19 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 10,751,304 steps | ret(last 50)=+28.80 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 10,851,304 steps | ret(last 50)=+27.81 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 10,951,304 steps | ret(last 50)=+27.31 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,051,304 steps | ret(last 50)=+27.67 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,151,304 steps | ret(last 50)=+27.14 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,251,304 steps | ret(last 50)=+29.60 win_sr=98% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,351,304 steps | ret(last 50)=+28.81 win_sr=98% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,451,304 steps | ret(last 50)=+27.76 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,551,304 steps | ret(last 50)=+27.28 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,651,304 steps | ret(last 50)=+29.04 win_sr=98% cum_sr=99%] - ... [trial 1 | 8 sheep | 11,751,304 steps | ret(last 50)=+28.75 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,851,304 steps | ret(last 50)=+29.04 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,951,304 steps | ret(last 50)=+28.27 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 12,051,304 steps | ret(last 50)=+27.90 win_sr=100% cum_sr=100%] -[Stage n_sheep=8] evaluating 30 eps -[Stage n_sheep=8] sr=93% mean_len=1495 mean_min_pen=2.6m mean_act=1.39 - -[Stage n_sheep=9] training 1,500,000 steps - ... [trial 1 | 9 sheep | 12,058,632 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 9 sheep | 12,158,632 steps | ret(last 50)=+30.67 win_sr=98% cum_sr=98%] - ... [trial 1 | 9 sheep | 12,258,632 steps | ret(last 50)=+28.78 win_sr=100% cum_sr=99%] - ... [trial 1 | 9 sheep | 12,358,632 steps | ret(last 50)=+30.08 win_sr=100% cum_sr=99%] - ... [trial 1 | 9 sheep | 12,458,632 steps | ret(last 50)=+29.61 win_sr=100% cum_sr=99%] - ... [trial 1 | 9 sheep | 12,558,632 steps | ret(last 50)=+30.34 win_sr=98% cum_sr=99%] - ... [trial 1 | 9 sheep | 12,658,632 steps | ret(last 50)=+29.48 win_sr=98% cum_sr=99%] - ... [trial 1 | 9 sheep | 12,758,632 steps | ret(last 50)=+29.92 win_sr=98% cum_sr=99%] - ... [trial 1 | 9 sheep | 12,858,632 steps | ret(last 50)=+29.26 win_sr=100% cum_sr=99%] - ... [trial 1 | 9 sheep | 12,958,632 steps | ret(last 50)=+30.36 win_sr=96% cum_sr=98%] - ... [trial 1 | 9 sheep | 13,058,632 steps | ret(last 50)=+30.19 win_sr=100% cum_sr=98%] - ... [trial 1 | 9 sheep | 13,158,632 steps | ret(last 50)=+29.24 win_sr=100% cum_sr=99%] - ... [trial 1 | 9 sheep | 13,258,632 steps | ret(last 50)=+30.40 win_sr=100% cum_sr=99%] - ... [trial 1 | 9 sheep | 13,358,632 steps | ret(last 50)=+31.65 win_sr=100% cum_sr=99%] - ... [trial 1 | 9 sheep | 13,458,632 steps | ret(last 50)=+30.77 win_sr=98% cum_sr=99%] - ... [trial 1 | 9 sheep | 13,558,632 steps | ret(last 50)=+30.21 win_sr=94% cum_sr=98%] -[Stage n_sheep=9] evaluating 30 eps -[Stage n_sheep=9] sr=97% mean_len=1625 mean_min_pen=2.1m mean_act=1.39 - -[Stage n_sheep=10] training 1,500,000 steps - ... [trial 1 | 10 sheep | 13,565,960 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 10 sheep | 13,665,960 steps | ret(last 50)=+30.13 win_sr=90% cum_sr=92%] - ... [trial 1 | 10 sheep | 13,765,960 steps | ret(last 50)=+31.84 win_sr=96% cum_sr=92%] - ... [trial 1 | 10 sheep | 13,865,960 steps | ret(last 50)=+32.66 win_sr=88% cum_sr=91%] - ... [trial 1 | 10 sheep | 13,965,960 steps | ret(last 50)=+32.56 win_sr=90% cum_sr=91%] - ... [trial 1 | 10 sheep | 14,065,960 steps | ret(last 50)=+31.29 win_sr=98% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,165,960 steps | ret(last 50)=+32.72 win_sr=94% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,265,960 steps | ret(last 50)=+32.42 win_sr=96% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,365,960 steps | ret(last 50)=+33.96 win_sr=92% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,465,960 steps | ret(last 50)=+33.17 win_sr=98% cum_sr=94%] - ... [trial 1 | 10 sheep | 14,565,960 steps | ret(last 50)=+31.48 win_sr=96% cum_sr=94%] - ... [trial 1 | 10 sheep | 14,665,960 steps | ret(last 50)=+31.19 win_sr=90% cum_sr=94%] - ... [trial 1 | 10 sheep | 14,765,960 steps | ret(last 50)=+32.87 win_sr=98% cum_sr=94%] - ... [trial 1 | 10 sheep | 14,865,960 steps | ret(last 50)=+32.36 win_sr=94% cum_sr=94%] - ... [trial 1 | 10 sheep | 14,965,960 steps | ret(last 50)=+31.14 win_sr=94% cum_sr=94%] - ... [trial 1 | 10 sheep | 15,065,960 steps | ret(last 50)=+32.18 win_sr=96% cum_sr=94%] -[Stage n_sheep=10] evaluating 30 eps -[Stage n_sheep=10] sr=97% mean_len=1816 mean_min_pen=2.0m mean_act=1.39 - -============================================================ - REPLAY SUMMARY -============================================================ - n_sheep=1 sr= 97% len= 351 min_pen= 3.9m act=0.28 - n_sheep=2 sr=100% len= 421 min_pen= 3.5m act=1.01 - n_sheep=3 sr=100% len= 608 min_pen= 3.5m act=1.06 - n_sheep=4 sr=100% len= 874 min_pen= 3.3m act=1.23 - n_sheep=5 sr= 97% len= 945 min_pen= 3.4m act=1.33 - n_sheep=6 sr=100% len= 1162 min_pen= 3.1m act=1.36 - n_sheep=7 sr=100% len= 1253 min_pen= 2.7m act=1.38 - n_sheep=8 sr= 93% len= 1495 min_pen= 2.6m act=1.39 - n_sheep=9 sr= 97% len= 1625 min_pen= 2.1m act=1.39 - n_sheep=10 sr= 97% len= 1816 min_pen= 2.0m act=1.39 - - Total time: 90.3 min - Artefacts: runs/final_v2/ diff --git a/training/runs/final_v2/config.json b/training/runs/final_v2/config.json deleted file mode 100644 index b2d15fe..0000000 --- a/training/runs/final_v2/config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "W_PER_SHEEP": 1.0, - "W_ALIGN": 0.0, - "W_PEN_BONUS": 5.0, - "W_STEP_COST": 0.02, - "W_COMPLETE": 200.0, - "W_COMPACT": 1.5, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": false, - "ent_coef": 0.02 -} \ No newline at end of file diff --git a/training/runs/final_v2/final_model.zip b/training/runs/final_v2/final_model.zip deleted file mode 100644 index 41dc86d..0000000 Binary files a/training/runs/final_v2/final_model.zip and /dev/null differ diff --git a/training/runs/final_v2/stage_results.json b/training/runs/final_v2/stage_results.json deleted file mode 100644 index a8f3266..0000000 --- a/training/runs/final_v2/stage_results.json +++ /dev/null @@ -1,72 +0,0 @@ -[ - { - "n_sheep": 1, - "sr": 0.9666666666666667, - "mean_len": 350.96666666666664, - "mean_min_pen": 3.913520161310832, - "mean_act": 0.2797267940386975 - }, - { - "n_sheep": 2, - "sr": 1.0, - "mean_len": 421.46666666666664, - "mean_min_pen": 3.485754116376241, - "mean_act": 1.0053067604365706 - }, - { - "n_sheep": 3, - "sr": 1.0, - "mean_len": 608.5, - "mean_min_pen": 3.52824010848999, - "mean_act": 1.0576287743527575 - }, - { - "n_sheep": 4, - "sr": 1.0, - "mean_len": 874.1333333333333, - "mean_min_pen": 3.2648465514183043, - "mean_act": 1.2302308682249101 - }, - { - "n_sheep": 5, - "sr": 0.9666666666666667, - "mean_len": 945.1333333333333, - "mean_min_pen": 3.390091093381246, - "mean_act": 1.328577256075333 - }, - { - "n_sheep": 6, - "sr": 1.0, - "mean_len": 1162.1, - "mean_min_pen": 3.0996540347735086, - "mean_act": 1.3581346810990618 - }, - { - "n_sheep": 7, - "sr": 1.0, - "mean_len": 1252.6, - "mean_min_pen": 2.6753984689712524, - "mean_act": 1.3753795162019462 - }, - { - "n_sheep": 8, - "sr": 0.9333333333333333, - "mean_len": 1495.2333333333333, - "mean_min_pen": 2.560386610031128, - "mean_act": 1.3861974064434042 - }, - { - "n_sheep": 9, - "sr": 0.9666666666666667, - "mean_len": 1624.9, - "mean_min_pen": 2.130835851033529, - "mean_act": 1.387693840600181 - }, - { - "n_sheep": 10, - "sr": 0.9666666666666667, - "mean_len": 1816.5, - "mean_min_pen": 1.9940622925758362, - "mean_act": 1.3946097864970635 - } -] \ No newline at end of file diff --git a/training/runs/final_v2/vecnorm.pkl b/training/runs/final_v2/vecnorm.pkl deleted file mode 100644 index 44319c8..0000000 Binary files a/training/runs/final_v2/vecnorm.pkl and /dev/null differ diff --git a/training/runs/final_v3.log b/training/runs/final_v3.log deleted file mode 100644 index 385c3ed..0000000 --- a/training/runs/final_v3.log +++ /dev/null @@ -1,253 +0,0 @@ -Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02} -Run dir: runs/final_v3 -Curriculum: 1 → 10 sheep, 1,500,000 steps/stage - -[Stage n_sheep=1] training 1,500,000 steps - ... [trial 1 | 1 sheep | 100,000 steps | ret(last 40)=-28.61 win_sr=10% cum_sr=10%] - ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-29.25 win_sr=12% cum_sr=11%] - ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=-31.55 win_sr=6% cum_sr=9%] - ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=-30.74 win_sr=10% cum_sr=9%] - ... [trial 1 | 1 sheep | 500,000 steps | ret(last 50)=-32.89 win_sr=4% cum_sr=8%] - ... [trial 1 | 1 sheep | 600,000 steps | ret(last 50)=-34.66 win_sr=4% cum_sr=7%] - ... [trial 1 | 1 sheep | 700,000 steps | ret(last 50)=-31.44 win_sr=12% cum_sr=8%] - ... [trial 1 | 1 sheep | 800,000 steps | ret(last 50)=-32.70 win_sr=6% cum_sr=8%] - ... [trial 1 | 1 sheep | 900,000 steps | ret(last 50)=-35.48 win_sr=2% cum_sr=7%] - ... [trial 1 | 1 sheep | 1,000,000 steps | ret(last 50)=-31.81 win_sr=10% cum_sr=8%] - ... [trial 1 | 1 sheep | 1,100,000 steps | ret(last 50)=-28.53 win_sr=10% cum_sr=8%] - ... [trial 1 | 1 sheep | 1,200,000 steps | ret(last 50)=-5.61 win_sr=62% cum_sr=13%] - ... [trial 1 | 1 sheep | 1,300,000 steps | ret(last 50)=+11.97 win_sr=100% cum_sr=34%] - ... [trial 1 | 1 sheep | 1,400,000 steps | ret(last 50)=+10.92 win_sr=96% cum_sr=50%] - ... [trial 1 | 1 sheep | 1,500,000 steps | ret(last 50)=+11.97 win_sr=100% cum_sr=63%] -[Stage n_sheep=1] evaluating 30 eps -[Stage n_sheep=1] sr=100% mean_len=249 mean_min_pen=3.7m mean_act=0.41 - -[Stage n_sheep=2] training 1,500,000 steps - ... [trial 1 | 2 sheep | 1,507,336 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 2 sheep | 1,607,336 steps | ret(last 47)=-1.11 win_sr=45% cum_sr=45%] - ... [trial 1 | 2 sheep | 1,707,336 steps | ret(last 50)=-8.90 win_sr=8% cum_sr=27%] - ... [trial 1 | 2 sheep | 1,807,336 steps | ret(last 50)=-5.28 win_sr=16% cum_sr=24%] - ... [trial 1 | 2 sheep | 1,907,336 steps | ret(last 50)=+3.16 win_sr=58% cum_sr=33%] - ... [trial 1 | 2 sheep | 2,007,336 steps | ret(last 50)=+10.26 win_sr=84% cum_sr=48%] - ... [trial 1 | 2 sheep | 2,107,336 steps | ret(last 50)=+14.27 win_sr=100% cum_sr=64%] - ... [trial 1 | 2 sheep | 2,207,336 steps | ret(last 50)=+14.08 win_sr=100% cum_sr=72%] - ... [trial 1 | 2 sheep | 2,307,336 steps | ret(last 50)=+14.38 win_sr=100% cum_sr=77%] - ... [trial 1 | 2 sheep | 2,407,336 steps | ret(last 50)=+14.27 win_sr=100% cum_sr=81%] - ... [trial 1 | 2 sheep | 2,507,336 steps | ret(last 50)=+14.37 win_sr=100% cum_sr=84%] - ... [trial 1 | 2 sheep | 2,607,336 steps | ret(last 50)=+14.33 win_sr=100% cum_sr=86%] - ... [trial 1 | 2 sheep | 2,707,336 steps | ret(last 50)=+14.04 win_sr=100% cum_sr=87%] - ... [trial 1 | 2 sheep | 2,807,336 steps | ret(last 50)=+14.25 win_sr=100% cum_sr=89%] - ... [trial 1 | 2 sheep | 2,907,336 steps | ret(last 50)=+14.61 win_sr=100% cum_sr=90%] - ... [trial 1 | 2 sheep | 3,007,336 steps | ret(last 50)=+13.98 win_sr=98% cum_sr=91%] -[Stage n_sheep=2] evaluating 30 eps -[Stage n_sheep=2] sr=100% mean_len=548 mean_min_pen=3.5m mean_act=0.92 - -[Stage n_sheep=3] training 1,500,000 steps - ... [trial 1 | 3 sheep | 3,014,664 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 3 sheep | 3,114,664 steps | ret(last 50)=+16.10 win_sr=100% cum_sr=99%] - ... [trial 1 | 3 sheep | 3,214,664 steps | ret(last 50)=+17.27 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,314,664 steps | ret(last 50)=+16.86 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,414,664 steps | ret(last 50)=+16.86 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,514,664 steps | ret(last 50)=+17.46 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,614,664 steps | ret(last 50)=+17.43 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,714,664 steps | ret(last 50)=+16.76 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,814,664 steps | ret(last 50)=+16.97 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 3,914,664 steps | ret(last 50)=+16.97 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,014,664 steps | ret(last 50)=+17.19 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,114,664 steps | ret(last 50)=+17.23 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,214,664 steps | ret(last 50)=+16.45 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,314,664 steps | ret(last 50)=+17.18 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,414,664 steps | ret(last 50)=+16.42 win_sr=100% cum_sr=100%] - ... [trial 1 | 3 sheep | 4,514,664 steps | ret(last 50)=+16.32 win_sr=100% cum_sr=100%] -[Stage n_sheep=3] evaluating 30 eps -[Stage n_sheep=3] sr=100% mean_len=640 mean_min_pen=3.5m mean_act=1.06 - -[Stage n_sheep=4] training 1,500,000 steps - ... [trial 1 | 4 sheep | 4,521,992 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 4 sheep | 4,621,992 steps | ret(last 50)=+18.61 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 4,721,992 steps | ret(last 50)=+18.82 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 4,821,992 steps | ret(last 50)=+18.91 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 4,921,992 steps | ret(last 50)=+18.55 win_sr=98% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,021,992 steps | ret(last 50)=+18.99 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,121,992 steps | ret(last 50)=+18.76 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,221,992 steps | ret(last 50)=+18.46 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,321,992 steps | ret(last 50)=+19.21 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,421,992 steps | ret(last 50)=+17.86 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,521,992 steps | ret(last 50)=+19.19 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,621,992 steps | ret(last 50)=+18.83 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,721,992 steps | ret(last 50)=+18.51 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,821,992 steps | ret(last 50)=+18.38 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 5,921,992 steps | ret(last 50)=+18.56 win_sr=100% cum_sr=100%] - ... [trial 1 | 4 sheep | 6,021,992 steps | ret(last 50)=+18.82 win_sr=100% cum_sr=100%] -[Stage n_sheep=4] evaluating 30 eps -[Stage n_sheep=4] sr=100% mean_len=762 mean_min_pen=3.5m mean_act=1.26 - -[Stage n_sheep=5] training 1,500,000 steps - ... [trial 1 | 5 sheep | 6,029,320 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 5 sheep | 6,129,320 steps | ret(last 50)=+20.46 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,229,320 steps | ret(last 50)=+20.41 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,329,320 steps | ret(last 50)=+20.58 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,429,320 steps | ret(last 50)=+21.10 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,529,320 steps | ret(last 50)=+20.48 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,629,320 steps | ret(last 50)=+20.56 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,729,320 steps | ret(last 50)=+20.51 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,829,320 steps | ret(last 50)=+20.70 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 6,929,320 steps | ret(last 50)=+20.83 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,029,320 steps | ret(last 50)=+21.52 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,129,320 steps | ret(last 50)=+21.62 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,229,320 steps | ret(last 50)=+21.22 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,329,320 steps | ret(last 50)=+21.17 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,429,320 steps | ret(last 50)=+21.00 win_sr=100% cum_sr=100%] - ... [trial 1 | 5 sheep | 7,529,320 steps | ret(last 50)=+20.48 win_sr=100% cum_sr=100%] -[Stage n_sheep=5] evaluating 30 eps -[Stage n_sheep=5] sr=100% mean_len=931 mean_min_pen=3.6m mean_act=1.31 - -[Stage n_sheep=6] training 1,500,000 steps - ... [trial 1 | 6 sheep | 7,536,648 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 6 sheep | 7,636,648 steps | ret(last 50)=+21.89 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 7,736,648 steps | ret(last 50)=+22.98 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 7,836,648 steps | ret(last 50)=+22.66 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 7,936,648 steps | ret(last 50)=+23.23 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,036,648 steps | ret(last 50)=+22.83 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,136,648 steps | ret(last 50)=+22.65 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,236,648 steps | ret(last 50)=+22.22 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,336,648 steps | ret(last 50)=+22.45 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,436,648 steps | ret(last 50)=+22.55 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,536,648 steps | ret(last 50)=+22.99 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,636,648 steps | ret(last 50)=+21.99 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,736,648 steps | ret(last 50)=+22.30 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,836,648 steps | ret(last 50)=+23.06 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 8,936,648 steps | ret(last 50)=+23.32 win_sr=100% cum_sr=100%] - ... [trial 1 | 6 sheep | 9,036,648 steps | ret(last 50)=+21.80 win_sr=100% cum_sr=100%] -[Stage n_sheep=6] evaluating 30 eps -[Stage n_sheep=6] sr=100% mean_len=1082 mean_min_pen=3.6m mean_act=1.35 - -[Stage n_sheep=7] training 1,500,000 steps - ... [trial 1 | 7 sheep | 9,043,976 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 7 sheep | 9,143,976 steps | ret(last 50)=+25.57 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,243,976 steps | ret(last 50)=+24.76 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,343,976 steps | ret(last 50)=+24.69 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,443,976 steps | ret(last 50)=+26.12 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,543,976 steps | ret(last 50)=+25.53 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,643,976 steps | ret(last 50)=+25.39 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,743,976 steps | ret(last 50)=+24.45 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,843,976 steps | ret(last 50)=+26.45 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 9,943,976 steps | ret(last 50)=+24.51 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,043,976 steps | ret(last 50)=+24.80 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,143,976 steps | ret(last 50)=+25.56 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,243,976 steps | ret(last 50)=+25.75 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,343,976 steps | ret(last 50)=+25.64 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,443,976 steps | ret(last 50)=+26.45 win_sr=100% cum_sr=100%] - ... [trial 1 | 7 sheep | 10,543,976 steps | ret(last 50)=+25.19 win_sr=100% cum_sr=100%] -[Stage n_sheep=7] evaluating 30 eps -[Stage n_sheep=7] sr=100% mean_len=1081 mean_min_pen=3.5m mean_act=1.37 - -[Stage n_sheep=8] training 1,500,000 steps - ... [trial 1 | 8 sheep | 10,551,304 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 8 sheep | 10,651,304 steps | ret(last 50)=+26.63 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 10,751,304 steps | ret(last 50)=+27.63 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 10,851,304 steps | ret(last 50)=+27.53 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 10,951,304 steps | ret(last 50)=+27.43 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,051,304 steps | ret(last 50)=+27.70 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,151,304 steps | ret(last 50)=+26.53 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,251,304 steps | ret(last 50)=+27.24 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,351,304 steps | ret(last 50)=+27.14 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,451,304 steps | ret(last 50)=+27.43 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,551,304 steps | ret(last 50)=+27.25 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,651,304 steps | ret(last 50)=+27.40 win_sr=98% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,751,304 steps | ret(last 50)=+27.35 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,851,304 steps | ret(last 50)=+26.33 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 11,951,304 steps | ret(last 50)=+26.89 win_sr=100% cum_sr=100%] - ... [trial 1 | 8 sheep | 12,051,304 steps | ret(last 50)=+27.86 win_sr=100% cum_sr=100%] -[Stage n_sheep=8] evaluating 30 eps -[Stage n_sheep=8] sr=100% mean_len=1311 mean_min_pen=3.5m mean_act=1.38 - -[Stage n_sheep=9] training 1,500,000 steps - ... [trial 1 | 9 sheep | 12,058,632 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 9 sheep | 12,158,632 steps | ret(last 50)=+29.62 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 12,258,632 steps | ret(last 50)=+31.32 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 12,358,632 steps | ret(last 50)=+30.30 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 12,458,632 steps | ret(last 50)=+29.33 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 12,558,632 steps | ret(last 50)=+28.83 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 12,658,632 steps | ret(last 50)=+29.02 win_sr=98% cum_sr=100%] - ... [trial 1 | 9 sheep | 12,758,632 steps | ret(last 50)=+29.60 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 12,858,632 steps | ret(last 50)=+29.88 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 12,958,632 steps | ret(last 50)=+30.12 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 13,058,632 steps | ret(last 50)=+28.80 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 13,158,632 steps | ret(last 50)=+30.33 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 13,258,632 steps | ret(last 50)=+27.85 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 13,358,632 steps | ret(last 50)=+28.21 win_sr=96% cum_sr=100%] - ... [trial 1 | 9 sheep | 13,458,632 steps | ret(last 50)=+29.88 win_sr=100% cum_sr=100%] - ... [trial 1 | 9 sheep | 13,558,632 steps | ret(last 50)=+29.06 win_sr=98% cum_sr=100%] -[Stage n_sheep=9] evaluating 30 eps -[Stage n_sheep=9] sr=100% mean_len=1435 mean_min_pen=3.6m mean_act=1.39 - -[Stage n_sheep=10] training 1,500,000 steps - ... [trial 1 | 10 sheep | 13,565,960 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | 10 sheep | 13,665,960 steps | ret(last 50)=+30.42 win_sr=96% cum_sr=96%] - ... [trial 1 | 10 sheep | 13,765,960 steps | ret(last 50)=+29.97 win_sr=92% cum_sr=95%] - ... [trial 1 | 10 sheep | 13,865,960 steps | ret(last 50)=+30.45 win_sr=82% cum_sr=90%] - ... [trial 1 | 10 sheep | 13,965,960 steps | ret(last 50)=+29.82 win_sr=90% cum_sr=91%] - ... [trial 1 | 10 sheep | 14,065,960 steps | ret(last 50)=+29.66 win_sr=90% cum_sr=91%] - ... [trial 1 | 10 sheep | 14,165,960 steps | ret(last 50)=+31.57 win_sr=98% cum_sr=92%] - ... [trial 1 | 10 sheep | 14,265,960 steps | ret(last 50)=+31.71 win_sr=96% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,365,960 steps | ret(last 50)=+31.75 win_sr=94% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,465,960 steps | ret(last 50)=+29.46 win_sr=88% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,565,960 steps | ret(last 50)=+29.62 win_sr=94% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,665,960 steps | ret(last 50)=+31.64 win_sr=98% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,765,960 steps | ret(last 50)=+30.86 win_sr=90% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,865,960 steps | ret(last 50)=+31.65 win_sr=90% cum_sr=93%] - ... [trial 1 | 10 sheep | 14,965,960 steps | ret(last 50)=+31.75 win_sr=92% cum_sr=93%] - ... [trial 1 | 10 sheep | 15,065,960 steps | ret(last 50)=+30.24 win_sr=100% cum_sr=93%] -[Stage n_sheep=10] evaluating 30 eps -[Stage n_sheep=10] sr=90% mean_len=1841 mean_min_pen=3.6m mean_act=1.39 - -[Consolidation] mixed n_sheep ∈ [1, 10], 2,000,000 steps - ... [trial 1 | consolidate | 15,073,288 steps | ret(last 0)=+nan win_sr=nan% cum_sr=nan%] - ... [trial 1 | consolidate | 15,173,288 steps | ret(last 50)=+20.69 win_sr=94% cum_sr=95%] - ... [trial 1 | consolidate | 15,273,288 steps | ret(last 50)=+20.62 win_sr=90% cum_sr=92%] - ... [trial 1 | consolidate | 15,373,288 steps | ret(last 50)=+20.25 win_sr=94% cum_sr=93%] - ... [trial 1 | consolidate | 15,473,288 steps | ret(last 50)=+19.82 win_sr=96% cum_sr=94%] - ... [trial 1 | consolidate | 15,573,288 steps | ret(last 50)=+20.56 win_sr=94% cum_sr=94%] - ... [trial 1 | consolidate | 15,673,288 steps | ret(last 50)=+20.56 win_sr=92% cum_sr=94%] - ... [trial 1 | consolidate | 15,773,288 steps | ret(last 50)=+19.43 win_sr=94% cum_sr=95%] - ... [trial 1 | consolidate | 15,873,288 steps | ret(last 50)=+21.85 win_sr=98% cum_sr=95%] - ... [trial 1 | consolidate | 15,973,288 steps | ret(last 50)=+21.84 win_sr=94% cum_sr=95%] - ... [trial 1 | consolidate | 16,073,288 steps | ret(last 50)=+22.13 win_sr=98% cum_sr=95%] - ... [trial 1 | consolidate | 16,173,288 steps | ret(last 50)=+21.89 win_sr=94% cum_sr=95%] - ... [trial 1 | consolidate | 16,273,288 steps | ret(last 50)=+21.88 win_sr=98% cum_sr=95%] - ... [trial 1 | consolidate | 16,373,288 steps | ret(last 50)=+20.81 win_sr=94% cum_sr=95%] - ... [trial 1 | consolidate | 16,473,288 steps | ret(last 50)=+20.91 win_sr=98% cum_sr=95%] - ... [trial 1 | consolidate | 16,573,288 steps | ret(last 50)=+21.13 win_sr=98% cum_sr=95%] - ... [trial 1 | consolidate | 16,673,288 steps | ret(last 50)=+19.85 win_sr=100% cum_sr=95%] - ... [trial 1 | consolidate | 16,773,288 steps | ret(last 50)=+22.30 win_sr=92% cum_sr=95%] - ... [trial 1 | consolidate | 16,873,288 steps | ret(last 50)=+20.61 win_sr=96% cum_sr=95%] - ... [trial 1 | consolidate | 16,973,288 steps | ret(last 50)=+21.93 win_sr=98% cum_sr=96%] - ... [trial 1 | consolidate | 17,073,288 steps | ret(last 50)=+21.86 win_sr=98% cum_sr=96%] -[Consolidation] re-evaluating all sheep counts -[Consolidation] n_sheep=1 sr=97% mean_len=377 mean_min_pen=3.5m mean_act=1.39 -[Consolidation] n_sheep=2 sr=47% mean_len=1718 mean_min_pen=2.4m mean_act=1.39 -[Consolidation] n_sheep=3 sr=93% mean_len=970 mean_min_pen=3.2m mean_act=1.39 -[Consolidation] n_sheep=4 sr=97% mean_len=1008 mean_min_pen=3.3m mean_act=1.39 -[Consolidation] n_sheep=5 sr=100% mean_len=1176 mean_min_pen=3.3m mean_act=1.39 -[Consolidation] n_sheep=6 sr=100% mean_len=1305 mean_min_pen=3.3m mean_act=1.39 -[Consolidation] n_sheep=7 sr=100% mean_len=1300 mean_min_pen=3.4m mean_act=1.39 -[Consolidation] n_sheep=8 sr=100% mean_len=1461 mean_min_pen=3.5m mean_act=1.39 -[Consolidation] n_sheep=9 sr=87% mean_len=1607 mean_min_pen=3.8m mean_act=1.39 -[Consolidation] n_sheep=10 sr=80% mean_len=1801 mean_min_pen=3.7m mean_act=1.39 - -============================================================ - REPLAY SUMMARY -============================================================ - n_sheep=1 sr= 97% len= 377 min_pen= 3.5m act=1.39 - n_sheep=2 sr= 47% len= 1718 min_pen= 2.4m act=1.39 - n_sheep=3 sr= 93% len= 970 min_pen= 3.2m act=1.39 - n_sheep=4 sr= 97% len= 1008 min_pen= 3.3m act=1.39 - n_sheep=5 sr=100% len= 1176 min_pen= 3.3m act=1.39 - n_sheep=6 sr=100% len= 1305 min_pen= 3.3m act=1.39 - n_sheep=7 sr=100% len= 1300 min_pen= 3.4m act=1.39 - n_sheep=8 sr=100% len= 1461 min_pen= 3.5m act=1.39 - n_sheep=9 sr= 87% len= 1607 min_pen= 3.8m act=1.39 - n_sheep=10 sr= 80% len= 1801 min_pen= 3.7m act=1.39 - - Total time: 110.1 min - Artefacts: runs/final_v3/ diff --git a/training/runs/final_v3/config.json b/training/runs/final_v3/config.json deleted file mode 100644 index b2d15fe..0000000 --- a/training/runs/final_v3/config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "W_PER_SHEEP": 1.0, - "W_ALIGN": 0.0, - "W_PEN_BONUS": 5.0, - "W_STEP_COST": 0.02, - "W_COMPLETE": 200.0, - "W_COMPACT": 1.5, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": false, - "ent_coef": 0.02 -} \ No newline at end of file diff --git a/training/runs/final_v3/final_model.zip b/training/runs/final_v3/final_model.zip deleted file mode 100644 index abad0d3..0000000 Binary files a/training/runs/final_v3/final_model.zip and /dev/null differ diff --git a/training/runs/final_v3/stage_results.json b/training/runs/final_v3/stage_results.json deleted file mode 100644 index e44f037..0000000 --- a/training/runs/final_v3/stage_results.json +++ /dev/null @@ -1,72 +0,0 @@ -[ - { - "n_sheep": 1, - "sr": 0.9666666666666667, - "mean_len": 377.3666666666667, - "mean_min_pen": 3.5389957586924234, - "mean_act": 1.3908841227086732 - }, - { - "n_sheep": 2, - "sr": 0.4666666666666667, - "mean_len": 1717.6333333333334, - "mean_min_pen": 2.4164488633473713, - "mean_act": 1.3922284740020803 - }, - { - "n_sheep": 3, - "sr": 0.9333333333333333, - "mean_len": 970.2666666666667, - "mean_min_pen": 3.203955141703288, - "mean_act": 1.3945290882248416 - }, - { - "n_sheep": 4, - "sr": 0.9666666666666667, - "mean_len": 1008.0, - "mean_min_pen": 3.279213563601176, - "mean_act": 1.3918021049325862 - }, - { - "n_sheep": 5, - "sr": 1.0, - "mean_len": 1175.8666666666666, - "mean_min_pen": 3.3209743976593016, - "mean_act": 1.3925684957666513 - }, - { - "n_sheep": 6, - "sr": 1.0, - "mean_len": 1305.0, - "mean_min_pen": 3.312229561805725, - "mean_act": 1.391130207932886 - }, - { - "n_sheep": 7, - "sr": 1.0, - "mean_len": 1300.0, - "mean_min_pen": 3.363971138000488, - "mean_act": 1.392986050516367 - }, - { - "n_sheep": 8, - "sr": 1.0, - "mean_len": 1461.3666666666666, - "mean_min_pen": 3.4741388003031415, - "mean_act": 1.392040583461347 - }, - { - "n_sheep": 9, - "sr": 0.8666666666666667, - "mean_len": 1606.7333333333333, - "mean_min_pen": 3.835897175470988, - "mean_act": 1.3907199496534952 - }, - { - "n_sheep": 10, - "sr": 0.8, - "mean_len": 1800.9666666666667, - "mean_min_pen": 3.741190282503764, - "mean_act": 1.392501896076031 - } -] \ No newline at end of file diff --git a/training/runs/final_v3/vecnorm.pkl b/training/runs/final_v3/vecnorm.pkl deleted file mode 100644 index 983de47..0000000 Binary files a/training/runs/final_v3/vecnorm.pkl and /dev/null differ diff --git a/training/runs/ppo_debug.log b/training/runs/ppo_debug.log deleted file mode 100644 index 81b308e..0000000 --- a/training/runs/ppo_debug.log +++ /dev/null @@ -1,5569 +0,0 @@ -Using cpu device -Logging to runs/ppo_debug/ppo_1 ------------------------------- -| time/ | | -| fps | 5496 | -| iterations | 1 | -| time_elapsed | 2 | -| total_timesteps | 16384 | ------------------------------- ------------------------------------------- -| time/ | | -| fps | 4317 | -| iterations | 2 | -| time_elapsed | 7 | -| total_timesteps | 32768 | -| train/ | | -| approx_kl | 0.0036917897 | -| clip_fraction | 0.0212 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.352 | -| learning_rate | 0.0003 | -| loss | -0.0118 | -| n_updates | 10 | -| policy_gradient_loss | -0.000544 | -| std | 0.999 | -| value_loss | 0.0658 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 3946 | -| iterations | 3 | -| time_elapsed | 12 | -| total_timesteps | 49152 | -| train/ | | -| approx_kl | 0.0033213054 | -| clip_fraction | 0.0266 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.502 | -| learning_rate | 0.0003 | -| loss | -0.0255 | -| n_updates | 20 | -| policy_gradient_loss | -0.00158 | -| std | 0.997 | -| value_loss | 0.08 | ------------------------------------------- -/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. - warnings.warn( -Eval num_timesteps=50000, episode_reward=-32.92 +/- 15.12 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -32.9 | -| time/ | | -| total_timesteps | 50000 | -| train/ | | -| approx_kl | 0.005147726 | -| clip_fraction | 0.0478 | -| clip_range | 0.2 | -| entropy_loss | -2.84 | -| explained_variance | 0.893 | -| learning_rate | 0.0003 | -| loss | -0.0145 | -| n_updates | 30 | -| policy_gradient_loss | -0.00318 | -| std | 1 | -| value_loss | 0.0194 | ------------------------------------------ -New best mean reward! ------------------------------- -| time/ | | -| fps | 2231 | -| iterations | 4 | -| time_elapsed | 29 | -| total_timesteps | 65536 | ------------------------------- ------------------------------------------- -| time/ | | -| fps | 2444 | -| iterations | 5 | -| time_elapsed | 33 | -| total_timesteps | 81920 | -| train/ | | -| approx_kl | 0.0054671075 | -| clip_fraction | 0.0529 | -| clip_range | 0.2 | -| entropy_loss | -2.84 | -| explained_variance | 0.914 | -| learning_rate | 0.0003 | -| loss | -0.021 | -| n_updates | 40 | -| policy_gradient_loss | -0.00416 | -| std | 1 | -| value_loss | 0.0247 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2616 | -| iterations | 6 | -| time_elapsed | 37 | -| total_timesteps | 98304 | -| train/ | | -| approx_kl | 0.004603466 | -| clip_fraction | 0.0379 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0193 | -| n_updates | 50 | -| policy_gradient_loss | -0.00284 | -| std | 0.995 | -| value_loss | 0.0171 | ------------------------------------------ -/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. - warnings.warn( -Eval num_timesteps=100000, episode_reward=-27.45 +/- 49.10 -Episode length: 1973.15 +/- 86.14 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.97e+03 | -| mean_reward | -27.4 | -| time/ | | -| total_timesteps | 100000 | -| train/ | | -| approx_kl | 0.0053039393 | -| clip_fraction | 0.0564 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.878 | -| learning_rate | 0.0003 | -| loss | -0.0325 | -| n_updates | 60 | -| policy_gradient_loss | -0.00404 | -| std | 0.998 | -| value_loss | 0.0118 | ------------------------------------------- -New best mean reward! -------------------------------- -| time/ | | -| fps | 2212 | -| iterations | 7 | -| time_elapsed | 51 | -| total_timesteps | 114688 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2332 | -| iterations | 8 | -| time_elapsed | 56 | -| total_timesteps | 131072 | -| train/ | | -| approx_kl | 0.0048020086 | -| clip_fraction | 0.0449 | -| clip_range | 0.2 | -| entropy_loss | -2.84 | -| explained_variance | 0.839 | -| learning_rate | 0.0003 | -| loss | -0.0375 | -| n_updates | 70 | -| policy_gradient_loss | -0.00359 | -| std | 1 | -| value_loss | 0.0102 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2429 | -| iterations | 9 | -| time_elapsed | 60 | -| total_timesteps | 147456 | -| train/ | | -| approx_kl | 0.004460754 | -| clip_fraction | 0.0349 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.874 | -| learning_rate | 0.0003 | -| loss | -0.0293 | -| n_updates | 80 | -| policy_gradient_loss | -0.00294 | -| std | 1.01 | -| value_loss | 0.0132 | ------------------------------------------ -Eval num_timesteps=150000, episode_reward=-33.46 +/- 39.53 -Episode length: 1990.60 +/- 40.97 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.99e+03 | -| mean_reward | -33.5 | -| time/ | | -| total_timesteps | 150000 | -| train/ | | -| approx_kl | 0.003831089 | -| clip_fraction | 0.0196 | -| clip_range | 0.2 | -| entropy_loss | -2.82 | -| explained_variance | 0.381 | -| learning_rate | 0.0003 | -| loss | -0.0191 | -| n_updates | 90 | -| policy_gradient_loss | -0.00202 | -| std | 0.984 | -| value_loss | 0.104 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 2147 | -| iterations | 10 | -| time_elapsed | 76 | -| total_timesteps | 163840 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2218 | -| iterations | 11 | -| time_elapsed | 81 | -| total_timesteps | 180224 | -| train/ | | -| approx_kl | 0.0032510734 | -| clip_fraction | 0.0246 | -| clip_range | 0.2 | -| entropy_loss | -2.82 | -| explained_variance | 0.887 | -| learning_rate | 0.0003 | -| loss | -0.0279 | -| n_updates | 100 | -| policy_gradient_loss | -0.00207 | -| std | 0.993 | -| value_loss | 0.045 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2289 | -| iterations | 12 | -| time_elapsed | 85 | -| total_timesteps | 196608 | -| train/ | | -| approx_kl | 0.0047060847 | -| clip_fraction | 0.0387 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.896 | -| learning_rate | 0.0003 | -| loss | 0.00931 | -| n_updates | 110 | -| policy_gradient_loss | -0.00305 | -| std | 0.994 | -| value_loss | 0.0489 | ------------------------------------------- -Eval num_timesteps=200000, episode_reward=-18.47 +/- 55.53 -Episode length: 1938.95 +/- 147.97 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.94e+03 | -| mean_reward | -18.5 | -| time/ | | -| total_timesteps | 200000 | -| train/ | | -| approx_kl | 0.0047602034 | -| clip_fraction | 0.0421 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.968 | -| learning_rate | 0.0003 | -| loss | -0.0301 | -| n_updates | 120 | -| policy_gradient_loss | -0.00281 | -| std | 1.01 | -| value_loss | 0.0094 | ------------------------------------------- -New best mean reward! - -[Diag @ 200,000 | n_sheep=1 | success=5%] - COMPACT_CANT_DRIVE 18/20 - DROVE_NO_SHEEP 1/20 - SUCCESS 1/20 - action_mag mean=0.269 p10=0.129 p90=0.447 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=3.86m best=1.91m (FLEE_DIST=7m) - min_com_to_pen mean=11.22m best=2.44m - reward/step (mean): progress=-0.0022 alignment=+0.0006 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0026 -------------------------------- -| time/ | | -| fps | 1964 | -| iterations | 13 | -| time_elapsed | 108 | -| total_timesteps | 212992 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2034 | -| iterations | 14 | -| time_elapsed | 112 | -| total_timesteps | 229376 | -| train/ | | -| approx_kl | 0.0041663316 | -| clip_fraction | 0.0373 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.901 | -| learning_rate | 0.0003 | -| loss | -0.0251 | -| n_updates | 130 | -| policy_gradient_loss | -0.00223 | -| std | 1.03 | -| value_loss | 0.00752 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2102 | -| iterations | 15 | -| time_elapsed | 116 | -| total_timesteps | 245760 | -| train/ | | -| approx_kl | 0.0042076977 | -| clip_fraction | 0.032 | -| clip_range | 0.2 | -| entropy_loss | -2.91 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0333 | -| n_updates | 140 | -| policy_gradient_loss | -0.00281 | -| std | 1.04 | -| value_loss | 0.00934 | ------------------------------------------- -Eval num_timesteps=250000, episode_reward=-37.07 +/- 35.02 -Episode length: 1938.20 +/- 269.38 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.94e+03 | -| mean_reward | -37.1 | -| time/ | | -| total_timesteps | 250000 | -| train/ | | -| approx_kl | 0.0028561926 | -| clip_fraction | 0.0171 | -| clip_range | 0.2 | -| entropy_loss | -2.92 | -| explained_variance | 0.822 | -| learning_rate | 0.0003 | -| loss | -0.0292 | -| n_updates | 150 | -| policy_gradient_loss | -0.00113 | -| std | 1.04 | -| value_loss | 0.0473 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1990 | -| iterations | 16 | -| time_elapsed | 131 | -| total_timesteps | 262144 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2042 | -| iterations | 17 | -| time_elapsed | 136 | -| total_timesteps | 278528 | -| train/ | | -| approx_kl | 0.0054259067 | -| clip_fraction | 0.0468 | -| clip_range | 0.2 | -| entropy_loss | -2.91 | -| explained_variance | 0.891 | -| learning_rate | 0.0003 | -| loss | -0.032 | -| n_updates | 160 | -| policy_gradient_loss | -0.00597 | -| std | 1.03 | -| value_loss | 0.0128 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2085 | -| iterations | 18 | -| time_elapsed | 141 | -| total_timesteps | 294912 | -| train/ | | -| approx_kl | 0.004205579 | -| clip_fraction | 0.0291 | -| clip_range | 0.2 | -| entropy_loss | -2.91 | -| explained_variance | 0.834 | -| learning_rate | 0.0003 | -| loss | -0.0364 | -| n_updates | 170 | -| policy_gradient_loss | -0.00307 | -| std | 1.03 | -| value_loss | 0.0107 | ------------------------------------------ -Eval num_timesteps=300000, episode_reward=-25.41 +/- 48.70 -Episode length: 1886.45 +/- 435.99 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.89e+03 | -| mean_reward | -25.4 | -| time/ | | -| total_timesteps | 300000 | -| train/ | | -| approx_kl | 0.0045948992 | -| clip_fraction | 0.0354 | -| clip_range | 0.2 | -| entropy_loss | -2.9 | -| explained_variance | 0.806 | -| learning_rate | 0.0003 | -| loss | -0.0242 | -| n_updates | 180 | -| policy_gradient_loss | -0.00236 | -| std | 1.03 | -| value_loss | 0.0371 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1981 | -| iterations | 19 | -| time_elapsed | 157 | -| total_timesteps | 311296 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2024 | -| iterations | 20 | -| time_elapsed | 161 | -| total_timesteps | 327680 | -| train/ | | -| approx_kl | 0.005344864 | -| clip_fraction | 0.0442 | -| clip_range | 0.2 | -| entropy_loss | -2.91 | -| explained_variance | 0.877 | -| learning_rate | 0.0003 | -| loss | -0.0369 | -| n_updates | 190 | -| policy_gradient_loss | -0.00344 | -| std | 1.04 | -| value_loss | 0.0104 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2066 | -| iterations | 21 | -| time_elapsed | 166 | -| total_timesteps | 344064 | -| train/ | | -| approx_kl | 0.007574372 | -| clip_fraction | 0.0753 | -| clip_range | 0.2 | -| entropy_loss | -2.92 | -| explained_variance | 0.903 | -| learning_rate | 0.0003 | -| loss | -0.0272 | -| n_updates | 200 | -| policy_gradient_loss | -0.00726 | -| std | 1.04 | -| value_loss | 0.0113 | ------------------------------------------ -Eval num_timesteps=350000, episode_reward=-21.14 +/- 37.01 -Episode length: 1959.80 +/- 175.23 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.96e+03 | -| mean_reward | -21.1 | -| time/ | | -| total_timesteps | 350000 | -| train/ | | -| approx_kl | 0.0061714016 | -| clip_fraction | 0.0569 | -| clip_range | 0.2 | -| entropy_loss | -2.91 | -| explained_variance | 0.917 | -| learning_rate | 0.0003 | -| loss | -0.022 | -| n_updates | 210 | -| policy_gradient_loss | -0.00598 | -| std | 1.04 | -| value_loss | 0.0231 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1984 | -| iterations | 22 | -| time_elapsed | 181 | -| total_timesteps | 360448 | -------------------------------- ----------------------------------------- -| time/ | | -| fps | 2026 | -| iterations | 23 | -| time_elapsed | 185 | -| total_timesteps | 376832 | -| train/ | | -| approx_kl | 0.00587913 | -| clip_fraction | 0.0501 | -| clip_range | 0.2 | -| entropy_loss | -2.92 | -| explained_variance | 0.932 | -| learning_rate | 0.0003 | -| loss | -0.0415 | -| n_updates | 220 | -| policy_gradient_loss | -0.00484 | -| std | 1.04 | -| value_loss | 0.0242 | ----------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2064 | -| iterations | 24 | -| time_elapsed | 190 | -| total_timesteps | 393216 | -| train/ | | -| approx_kl | 0.006933649 | -| clip_fraction | 0.081 | -| clip_range | 0.2 | -| entropy_loss | -2.91 | -| explained_variance | 0.918 | -| learning_rate | 0.0003 | -| loss | -0.032 | -| n_updates | 230 | -| policy_gradient_loss | -0.00773 | -| std | 1.03 | -| value_loss | 0.0233 | ------------------------------------------ -Eval num_timesteps=400000, episode_reward=-2.75 +/- 37.08 -Episode length: 1998.55 +/- 6.32 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -2.75 | -| time/ | | -| total_timesteps | 400000 | -| train/ | | -| approx_kl | 0.0064436095 | -| clip_fraction | 0.0647 | -| clip_range | 0.2 | -| entropy_loss | -2.9 | -| explained_variance | 0.853 | -| learning_rate | 0.0003 | -| loss | 0.0633 | -| n_updates | 240 | -| policy_gradient_loss | -0.00551 | -| std | 1.03 | -| value_loss | 0.128 | ------------------------------------------- -New best mean reward! - -[Diag @ 400,000 | n_sheep=1 | success=0%] - DROVE_NO_SHEEP 13/20 - COMPACT_CANT_DRIVE 7/20 - action_mag mean=0.316 p10=0.057 p90=0.512 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=1.86m best=0.95m (FLEE_DIST=7m) - min_com_to_pen mean=3.19m best=1.50m - reward/step (mean): progress=+0.0093 alignment=+0.0040 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 -------------------------------- -| time/ | | -| fps | 1925 | -| iterations | 25 | -| time_elapsed | 212 | -| total_timesteps | 409600 | -------------------------------- ----------------------------------------- -| time/ | | -| fps | 1961 | -| iterations | 26 | -| time_elapsed | 217 | -| total_timesteps | 425984 | -| train/ | | -| approx_kl | 0.00806847 | -| clip_fraction | 0.1 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.933 | -| learning_rate | 0.0003 | -| loss | -0.0254 | -| n_updates | 250 | -| policy_gradient_loss | -0.00871 | -| std | 1.02 | -| value_loss | 0.0264 | ----------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1997 | -| iterations | 27 | -| time_elapsed | 221 | -| total_timesteps | 442368 | -| train/ | | -| approx_kl | 0.005784355 | -| clip_fraction | 0.0531 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.878 | -| learning_rate | 0.0003 | -| loss | 0.00996 | -| n_updates | 260 | -| policy_gradient_loss | -0.00485 | -| std | 1 | -| value_loss | 0.0868 | ------------------------------------------ -Eval num_timesteps=450000, episode_reward=51.79 +/- 20.61 -Episode length: 1912.30 +/- 382.28 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.91e+03 | -| mean_reward | 51.8 | -| time/ | | -| total_timesteps | 450000 | -| train/ | | -| approx_kl | 0.005881632 | -| clip_fraction | 0.0639 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.952 | -| learning_rate | 0.0003 | -| loss | -0.0187 | -| n_updates | 270 | -| policy_gradient_loss | -0.00655 | -| std | 0.991 | -| value_loss | 0.0226 | ------------------------------------------ -New best mean reward! -------------------------------- -| time/ | | -| fps | 1936 | -| iterations | 28 | -| time_elapsed | 236 | -| total_timesteps | 458752 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1965 | -| iterations | 29 | -| time_elapsed | 241 | -| total_timesteps | 475136 | -| train/ | | -| approx_kl | 0.009020726 | -| clip_fraction | 0.0982 | -| clip_range | 0.2 | -| entropy_loss | -2.81 | -| explained_variance | 0.87 | -| learning_rate | 0.0003 | -| loss | 0.0218 | -| n_updates | 280 | -| policy_gradient_loss | -0.0061 | -| std | 0.984 | -| value_loss | 0.209 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1999 | -| iterations | 30 | -| time_elapsed | 245 | -| total_timesteps | 491520 | -| train/ | | -| approx_kl | 0.011525536 | -| clip_fraction | 0.136 | -| clip_range | 0.2 | -| entropy_loss | -2.79 | -| explained_variance | 0.92 | -| learning_rate | 0.0003 | -| loss | 0.0306 | -| n_updates | 290 | -| policy_gradient_loss | -0.00896 | -| std | 0.97 | -| value_loss | 0.0903 | ------------------------------------------ -Eval num_timesteps=500000, episode_reward=87.01 +/- 42.12 -Episode length: 1359.85 +/- 815.95 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.36e+03 | -| mean_reward | 87 | -| time/ | | -| total_timesteps | 500000 | -| train/ | | -| approx_kl | 0.012545023 | -| clip_fraction | 0.171 | -| clip_range | 0.2 | -| entropy_loss | -2.78 | -| explained_variance | 0.956 | -| learning_rate | 0.0003 | -| loss | -0.0369 | -| n_updates | 300 | -| policy_gradient_loss | -0.0069 | -| std | 0.972 | -| value_loss | 0.034 | ------------------------------------------ -New best mean reward! -------------------------------- -| time/ | | -| fps | 1968 | -| iterations | 31 | -| time_elapsed | 258 | -| total_timesteps | 507904 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1996 | -| iterations | 32 | -| time_elapsed | 262 | -| total_timesteps | 524288 | -| train/ | | -| approx_kl | 0.008305798 | -| clip_fraction | 0.102 | -| clip_range | 0.2 | -| entropy_loss | -2.78 | -| explained_variance | 0.975 | -| learning_rate | 0.0003 | -| loss | -0.0285 | -| n_updates | 310 | -| policy_gradient_loss | -0.00343 | -| std | 0.972 | -| value_loss | 0.0162 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 2021 | -| iterations | 33 | -| time_elapsed | 267 | -| total_timesteps | 540672 | -| train/ | | -| approx_kl | 0.0074599315 | -| clip_fraction | 0.0925 | -| clip_range | 0.2 | -| entropy_loss | -2.81 | -| explained_variance | 0.976 | -| learning_rate | 0.0003 | -| loss | -0.0282 | -| n_updates | 320 | -| policy_gradient_loss | -0.0028 | -| std | 0.989 | -| value_loss | 0.0136 | ------------------------------------------- -Eval num_timesteps=550000, episode_reward=113.42 +/- 48.33 -Episode length: 926.05 +/- 792.99 ------------------------------------------ -| eval/ | | -| mean_ep_length | 926 | -| mean_reward | 113 | -| time/ | | -| total_timesteps | 550000 | -| train/ | | -| approx_kl | 0.010888291 | -| clip_fraction | 0.136 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.981 | -| learning_rate | 0.0003 | -| loss | -0.0226 | -| n_updates | 330 | -| policy_gradient_loss | -0.00266 | -| std | 1 | -| value_loss | 0.00643 | ------------------------------------------ -New best mean reward! -------------------------------- -| time/ | | -| fps | 2005 | -| iterations | 34 | -| time_elapsed | 277 | -| total_timesteps | 557056 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2030 | -| iterations | 35 | -| time_elapsed | 282 | -| total_timesteps | 573440 | -| train/ | | -| approx_kl | 0.009418717 | -| clip_fraction | 0.121 | -| clip_range | 0.2 | -| entropy_loss | -2.84 | -| explained_variance | 0.975 | -| learning_rate | 0.0003 | -| loss | -0.0234 | -| n_updates | 340 | -| policy_gradient_loss | -0.00417 | -| std | 1 | -| value_loss | 0.0219 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2054 | -| iterations | 36 | -| time_elapsed | 287 | -| total_timesteps | 589824 | -| train/ | | -| approx_kl | 0.009153167 | -| clip_fraction | 0.132 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.972 | -| learning_rate | 0.0003 | -| loss | 0.00458 | -| n_updates | 350 | -| policy_gradient_loss | -0.00925 | -| std | 1.01 | -| value_loss | 0.0644 | ------------------------------------------ -Eval num_timesteps=600000, episode_reward=142.43 +/- 15.10 -Episode length: 292.00 +/- 114.85 ------------------------------------------- -| eval/ | | -| mean_ep_length | 292 | -| mean_reward | 142 | -| time/ | | -| total_timesteps | 600000 | -| train/ | | -| approx_kl | 0.0073751104 | -| clip_fraction | 0.0817 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.967 | -| learning_rate | 0.0003 | -| loss | 0.0205 | -| n_updates | 360 | -| policy_gradient_loss | -0.0078 | -| std | 1.01 | -| value_loss | 0.0854 | ------------------------------------------- -New best mean reward! - -[Diag @ 600,000 | n_sheep=1 | success=100%] - SUCCESS 20/20 - action_mag mean=0.339 p10=0.246 p90=0.609 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=1.68m best=0.23m (FLEE_DIST=7m) - min_com_to_pen mean=3.54m best=2.70m - reward/step (mean): progress=+0.0996 alignment=+0.0271 pen_bonus=+0.0302 step_cost=-0.0200 complete=+0.3022 -------------------------------- -| time/ | | -| fps | 2059 | -| iterations | 37 | -| time_elapsed | 294 | -| total_timesteps | 606208 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2069 | -| iterations | 38 | -| time_elapsed | 300 | -| total_timesteps | 622592 | -| train/ | | -| approx_kl | 0.006348365 | -| clip_fraction | 0.0685 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.954 | -| learning_rate | 0.0003 | -| loss | -0.0107 | -| n_updates | 370 | -| policy_gradient_loss | -0.00403 | -| std | 1 | -| value_loss | 0.0629 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 2085 | -| iterations | 39 | -| time_elapsed | 306 | -| total_timesteps | 638976 | -| train/ | | -| approx_kl | 0.0073653567 | -| clip_fraction | 0.089 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.976 | -| learning_rate | 0.0003 | -| loss | -0.0379 | -| n_updates | 380 | -| policy_gradient_loss | -0.00635 | -| std | 0.993 | -| value_loss | 0.0213 | ------------------------------------------- -Eval num_timesteps=650000, episode_reward=148.63 +/- 11.08 -Episode length: 312.15 +/- 83.52 ------------------------------------------- -| eval/ | | -| mean_ep_length | 312 | -| mean_reward | 149 | -| time/ | | -| total_timesteps | 650000 | -| train/ | | -| approx_kl | 0.0064217458 | -| clip_fraction | 0.0662 | -| clip_range | 0.2 | -| entropy_loss | -2.81 | -| explained_variance | 0.977 | -| learning_rate | 0.0003 | -| loss | -0.0177 | -| n_updates | 390 | -| policy_gradient_loss | -0.00451 | -| std | 0.983 | -| value_loss | 0.0325 | ------------------------------------------- -New best mean reward! -------------------------------- -| time/ | | -| fps | 2092 | -| iterations | 40 | -| time_elapsed | 313 | -| total_timesteps | 655360 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2107 | -| iterations | 41 | -| time_elapsed | 318 | -| total_timesteps | 671744 | -| train/ | | -| approx_kl | 0.007330196 | -| clip_fraction | 0.0823 | -| clip_range | 0.2 | -| entropy_loss | -2.79 | -| explained_variance | 0.985 | -| learning_rate | 0.0003 | -| loss | -0.0257 | -| n_updates | 400 | -| policy_gradient_loss | -0.00559 | -| std | 0.971 | -| value_loss | 0.0108 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 2123 | -| iterations | 42 | -| time_elapsed | 323 | -| total_timesteps | 688128 | -| train/ | | -| approx_kl | 0.0076610697 | -| clip_fraction | 0.0876 | -| clip_range | 0.2 | -| entropy_loss | -2.77 | -| explained_variance | 0.99 | -| learning_rate | 0.0003 | -| loss | -0.037 | -| n_updates | 410 | -| policy_gradient_loss | -0.00581 | -| std | 0.966 | -| value_loss | 0.00623 | ------------------------------------------- -Eval num_timesteps=700000, episode_reward=137.38 +/- 18.54 -Episode length: 255.10 +/- 119.47 ------------------------------------------- -| eval/ | | -| mean_ep_length | 255 | -| mean_reward | 137 | -| time/ | | -| total_timesteps | 700000 | -| train/ | | -| approx_kl | 0.0072219693 | -| clip_fraction | 0.0734 | -| clip_range | 0.2 | -| entropy_loss | -2.76 | -| explained_variance | 0.989 | -| learning_rate | 0.0003 | -| loss | -0.0383 | -| n_updates | 420 | -| policy_gradient_loss | -0.00416 | -| std | 0.961 | -| value_loss | 0.00951 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 2128 | -| iterations | 43 | -| time_elapsed | 331 | -| total_timesteps | 704512 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2144 | -| iterations | 44 | -| time_elapsed | 336 | -| total_timesteps | 720896 | -| train/ | | -| approx_kl | 0.0075956425 | -| clip_fraction | 0.0895 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.993 | -| learning_rate | 0.0003 | -| loss | -0.0433 | -| n_updates | 430 | -| policy_gradient_loss | -0.00475 | -| std | 0.953 | -| value_loss | 0.00343 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2160 | -| iterations | 45 | -| time_elapsed | 341 | -| total_timesteps | 737280 | -| train/ | | -| approx_kl | 0.0062526334 | -| clip_fraction | 0.0699 | -| clip_range | 0.2 | -| entropy_loss | -2.72 | -| explained_variance | 0.99 | -| learning_rate | 0.0003 | -| loss | -0.0329 | -| n_updates | 440 | -| policy_gradient_loss | -0.00355 | -| std | 0.942 | -| value_loss | 0.0113 | ------------------------------------------- -Eval num_timesteps=750000, episode_reward=145.04 +/- 16.56 -Episode length: 291.10 +/- 132.25 ------------------------------------------- -| eval/ | | -| mean_ep_length | 291 | -| mean_reward | 145 | -| time/ | | -| total_timesteps | 750000 | -| train/ | | -| approx_kl | 0.0058749127 | -| clip_fraction | 0.0607 | -| clip_range | 0.2 | -| entropy_loss | -2.71 | -| explained_variance | 0.993 | -| learning_rate | 0.0003 | -| loss | -0.0281 | -| n_updates | 450 | -| policy_gradient_loss | -0.00324 | -| std | 0.934 | -| value_loss | 0.00811 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 2161 | -| iterations | 46 | -| time_elapsed | 348 | -| total_timesteps | 753664 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2176 | -| iterations | 47 | -| time_elapsed | 353 | -| total_timesteps | 770048 | -| train/ | | -| approx_kl | 0.0070656985 | -| clip_fraction | 0.0763 | -| clip_range | 0.2 | -| entropy_loss | -2.68 | -| explained_variance | 0.996 | -| learning_rate | 0.0003 | -| loss | -0.0322 | -| n_updates | 460 | -| policy_gradient_loss | -0.00485 | -| std | 0.92 | -| value_loss | 0.00234 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2193 | -| iterations | 48 | -| time_elapsed | 358 | -| total_timesteps | 786432 | -| train/ | | -| approx_kl | 0.008987564 | -| clip_fraction | 0.112 | -| clip_range | 0.2 | -| entropy_loss | -2.66 | -| explained_variance | 0.997 | -| learning_rate | 0.0003 | -| loss | -0.0471 | -| n_updates | 470 | -| policy_gradient_loss | -0.00864 | -| std | 0.909 | -| value_loss | 0.00178 | ------------------------------------------ -Eval num_timesteps=800000, episode_reward=141.03 +/- 13.75 -Episode length: 256.90 +/- 100.39 ------------------------------------------ -| eval/ | | -| mean_ep_length | 257 | -| mean_reward | 141 | -| time/ | | -| total_timesteps | 800000 | -| train/ | | -| approx_kl | 0.008297143 | -| clip_fraction | 0.0945 | -| clip_range | 0.2 | -| entropy_loss | -2.67 | -| explained_variance | 0.989 | -| learning_rate | 0.0003 | -| loss | -0.0173 | -| n_updates | 480 | -| policy_gradient_loss | -0.00352 | -| std | 0.921 | -| value_loss | 0.00934 | ------------------------------------------ - -[Diag @ 800,000 | n_sheep=1 | success=100%] - SUCCESS 20/20 - action_mag mean=0.333 p10=0.244 p90=0.332 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=1.40m best=0.75m (FLEE_DIST=7m) - min_com_to_pen mean=3.47m best=1.58m - reward/step (mean): progress=+0.1108 alignment=+0.0328 pen_bonus=+0.0366 step_cost=-0.0200 complete=+0.3664 - -[Curriculum] leaving stage n_sheep=1 after 800,000 steps | training success rate (last 100 eps) = 100% -[Curriculum] → 2 sheep at step 800,000 - -------------------------------- -| time/ | | -| fps | 2187 | -| iterations | 49 | -| time_elapsed | 367 | -| total_timesteps | 802816 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2201 | -| iterations | 50 | -| time_elapsed | 372 | -| total_timesteps | 819200 | -| train/ | | -| approx_kl | 0.006534174 | -| clip_fraction | 0.0754 | -| clip_range | 0.2 | -| entropy_loss | -2.7 | -| explained_variance | 0.968 | -| learning_rate | 0.0003 | -| loss | -0.0252 | -| n_updates | 490 | -| policy_gradient_loss | 0.00248 | -| std | 0.942 | -| value_loss | 0.021 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2213 | -| iterations | 51 | -| time_elapsed | 377 | -| total_timesteps | 835584 | -| train/ | | -| approx_kl | 0.012509884 | -| clip_fraction | 0.182 | -| clip_range | 0.2 | -| entropy_loss | -2.73 | -| explained_variance | 0.51 | -| learning_rate | 0.0003 | -| loss | -0.0127 | -| n_updates | 500 | -| policy_gradient_loss | 0.00321 | -| std | 0.953 | -| value_loss | 0.0093 | ------------------------------------------ -Eval num_timesteps=850000, episode_reward=-30.43 +/- 29.94 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -30.4 | -| time/ | | -| total_timesteps | 850000 | -| train/ | | -| approx_kl | 0.009752454 | -| clip_fraction | 0.146 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.865 | -| learning_rate | 0.0003 | -| loss | -0.0289 | -| n_updates | 510 | -| policy_gradient_loss | 0.00274 | -| std | 0.95 | -| value_loss | 0.0117 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 2153 | -| iterations | 52 | -| time_elapsed | 395 | -| total_timesteps | 851968 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2166 | -| iterations | 53 | -| time_elapsed | 400 | -| total_timesteps | 868352 | -| train/ | | -| approx_kl | 0.011746319 | -| clip_fraction | 0.133 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.953 | -| learning_rate | 0.0003 | -| loss | -0.0316 | -| n_updates | 520 | -| policy_gradient_loss | 0.00116 | -| std | 0.958 | -| value_loss | 0.00603 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2179 | -| iterations | 54 | -| time_elapsed | 405 | -| total_timesteps | 884736 | -| train/ | | -| approx_kl | 0.008340008 | -| clip_fraction | 0.111 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.959 | -| learning_rate | 0.0003 | -| loss | -0.0317 | -| n_updates | 530 | -| policy_gradient_loss | 0.000628 | -| std | 0.955 | -| value_loss | 0.00663 | ------------------------------------------ -Eval num_timesteps=900000, episode_reward=-21.80 +/- 34.98 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -21.8 | -| time/ | | -| total_timesteps | 900000 | -| train/ | | -| approx_kl | 0.010461532 | -| clip_fraction | 0.13 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.88 | -| learning_rate | 0.0003 | -| loss | -0.00905 | -| n_updates | 540 | -| policy_gradient_loss | -0.000256 | -| std | 0.951 | -| value_loss | 0.00567 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 2128 | -| iterations | 55 | -| time_elapsed | 423 | -| total_timesteps | 901120 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2139 | -| iterations | 56 | -| time_elapsed | 428 | -| total_timesteps | 917504 | -| train/ | | -| approx_kl | 0.0071650296 | -| clip_fraction | 0.0988 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0294 | -| n_updates | 550 | -| policy_gradient_loss | -0.000672 | -| std | 0.957 | -| value_loss | 0.00545 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2152 | -| iterations | 57 | -| time_elapsed | 433 | -| total_timesteps | 933888 | -| train/ | | -| approx_kl | 0.009678386 | -| clip_fraction | 0.112 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.927 | -| learning_rate | 0.0003 | -| loss | -0.0308 | -| n_updates | 560 | -| policy_gradient_loss | -0.000959 | -| std | 0.953 | -| value_loss | 0.00409 | ------------------------------------------ -Eval num_timesteps=950000, episode_reward=-34.37 +/- 35.50 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -34.4 | -| time/ | | -| total_timesteps | 950000 | -| train/ | | -| approx_kl | 0.008903094 | -| clip_fraction | 0.111 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0259 | -| n_updates | 570 | -| policy_gradient_loss | -0.000299 | -| std | 0.955 | -| value_loss | 0.00432 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 2108 | -| iterations | 58 | -| time_elapsed | 450 | -| total_timesteps | 950272 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2117 | -| iterations | 59 | -| time_elapsed | 456 | -| total_timesteps | 966656 | -| train/ | | -| approx_kl | 0.008592881 | -| clip_fraction | 0.0954 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0173 | -| n_updates | 580 | -| policy_gradient_loss | 0.00103 | -| std | 0.95 | -| value_loss | 0.00265 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2129 | -| iterations | 60 | -| time_elapsed | 461 | -| total_timesteps | 983040 | -| train/ | | -| approx_kl | 0.010225108 | -| clip_fraction | 0.108 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.972 | -| learning_rate | 0.0003 | -| loss | -0.0135 | -| n_updates | 590 | -| policy_gradient_loss | -0.000738 | -| std | 0.954 | -| value_loss | 0.0029 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2137 | -| iterations | 61 | -| time_elapsed | 467 | -| total_timesteps | 999424 | -| train/ | | -| approx_kl | 0.008312117 | -| clip_fraction | 0.0887 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.898 | -| learning_rate | 0.0003 | -| loss | -0.0262 | -| n_updates | 600 | -| policy_gradient_loss | -0.000497 | -| std | 0.958 | -| value_loss | 0.00511 | ------------------------------------------ -Eval num_timesteps=1000000, episode_reward=-32.64 +/- 38.38 -Episode length: 2000.00 +/- 0.00 ----------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -32.6 | -| time/ | | -| total_timesteps | 1000000 | -| train/ | | -| approx_kl | 0.00942917 | -| clip_fraction | 0.105 | -| clip_range | 0.2 | -| entropy_loss | -2.76 | -| explained_variance | 0.961 | -| learning_rate | 0.0003 | -| loss | -0.0331 | -| n_updates | 610 | -| policy_gradient_loss | -0.0023 | -| std | 0.966 | -| value_loss | 0.00282 | ----------------------------------------- - -[Diag @ 1,000,000 | n_sheep=2 | success=0%] - COMPACT_CANT_DRIVE 14/20 - NEVER_COMPACT 6/20 - action_mag mean=0.216 p10=0.000 p90=0.805 (0=stopped, 1=full speed) - min_flock_radius mean=3.39m best=0.00m (target <5m to compact) - min_dog_to_com mean=1.18m best=0.11m (FLEE_DIST=7m) - min_com_to_pen mean=13.11m best=7.44m - reward/step (mean): progress=-0.0011 alignment=+0.0106 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 2057 | -| iterations | 62 | -| time_elapsed | 493 | -| total_timesteps | 1015808 | --------------------------------- ---------------------------------------- -| time/ | | -| fps | 2067 | -| iterations | 63 | -| time_elapsed | 499 | -| total_timesteps | 1032192 | -| train/ | | -| approx_kl | 0.008683 | -| clip_fraction | 0.0967 | -| clip_range | 0.2 | -| entropy_loss | -2.77 | -| explained_variance | 0.93 | -| learning_rate | 0.0003 | -| loss | -0.029 | -| n_updates | 620 | -| policy_gradient_loss | -0.000765 | -| std | 0.965 | -| value_loss | 0.00446 | ---------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2077 | -| iterations | 64 | -| time_elapsed | 504 | -| total_timesteps | 1048576 | -| train/ | | -| approx_kl | 0.009014329 | -| clip_fraction | 0.113 | -| clip_range | 0.2 | -| entropy_loss | -2.76 | -| explained_variance | 0.984 | -| learning_rate | 0.0003 | -| loss | -0.0279 | -| n_updates | 630 | -| policy_gradient_loss | -0.00211 | -| std | 0.962 | -| value_loss | 0.00312 | ------------------------------------------ -Eval num_timesteps=1050000, episode_reward=-31.51 +/- 42.52 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -31.5 | -| time/ | | -| total_timesteps | 1050000 | -| train/ | | -| approx_kl | 0.008500135 | -| clip_fraction | 0.105 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.968 | -| learning_rate | 0.0003 | -| loss | -0.0306 | -| n_updates | 640 | -| policy_gradient_loss | -0.00312 | -| std | 0.955 | -| value_loss | 0.00288 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 2042 | -| iterations | 65 | -| time_elapsed | 521 | -| total_timesteps | 1064960 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 2056 | -| iterations | 66 | -| time_elapsed | 525 | -| total_timesteps | 1081344 | -| train/ | | -| approx_kl | 0.0069593494 | -| clip_fraction | 0.0923 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.835 | -| learning_rate | 0.0003 | -| loss | -0.0291 | -| n_updates | 650 | -| policy_gradient_loss | -0.000469 | -| std | 0.952 | -| value_loss | 0.00186 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2064 | -| iterations | 67 | -| time_elapsed | 531 | -| total_timesteps | 1097728 | -| train/ | | -| approx_kl | 0.007817726 | -| clip_fraction | 0.0933 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.922 | -| learning_rate | 0.0003 | -| loss | -0.0206 | -| n_updates | 660 | -| policy_gradient_loss | -0.00208 | -| std | 0.953 | -| value_loss | 0.00234 | ------------------------------------------ -Eval num_timesteps=1100000, episode_reward=-22.82 +/- 33.61 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -22.8 | -| time/ | | -| total_timesteps | 1100000 | -| train/ | | -| approx_kl | 0.006177975 | -| clip_fraction | 0.0806 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.026 | -| n_updates | 670 | -| policy_gradient_loss | -5.8e-05 | -| std | 0.951 | -| value_loss | 0.00184 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 2035 | -| iterations | 68 | -| time_elapsed | 547 | -| total_timesteps | 1114112 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 2048 | -| iterations | 69 | -| time_elapsed | 551 | -| total_timesteps | 1130496 | -| train/ | | -| approx_kl | 0.009605391 | -| clip_fraction | 0.102 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.0344 | -| n_updates | 680 | -| policy_gradient_loss | -0.0022 | -| std | 0.957 | -| value_loss | 0.00221 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 2060 | -| iterations | 70 | -| time_elapsed | 556 | -| total_timesteps | 1146880 | -| train/ | | -| approx_kl | 0.0064521013 | -| clip_fraction | 0.0953 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.898 | -| learning_rate | 0.0003 | -| loss | -0.0348 | -| n_updates | 690 | -| policy_gradient_loss | -0.00112 | -| std | 0.96 | -| value_loss | 0.00221 | ------------------------------------------- -Eval num_timesteps=1150000, episode_reward=-26.36 +/- 35.49 -Episode length: 2000.00 +/- 0.00 ----------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -26.4 | -| time/ | | -| total_timesteps | 1150000 | -| train/ | | -| approx_kl | 0.00777065 | -| clip_fraction | 0.0837 | -| clip_range | 0.2 | -| entropy_loss | -2.76 | -| explained_variance | 0.907 | -| learning_rate | 0.0003 | -| loss | -0.0198 | -| n_updates | 700 | -| policy_gradient_loss | -0.000371 | -| std | 0.963 | -| value_loss | 0.00182 | ----------------------------------------- --------------------------------- -| time/ | | -| fps | 2031 | -| iterations | 71 | -| time_elapsed | 572 | -| total_timesteps | 1163264 | --------------------------------- ---------------------------------------- -| time/ | | -| fps | 2044 | -| iterations | 72 | -| time_elapsed | 577 | -| total_timesteps | 1179648 | -| train/ | | -| approx_kl | 0.006344 | -| clip_fraction | 0.0719 | -| clip_range | 0.2 | -| entropy_loss | -2.76 | -| explained_variance | 0.908 | -| learning_rate | 0.0003 | -| loss | -0.0347 | -| n_updates | 710 | -| policy_gradient_loss | -0.000455 | -| std | 0.961 | -| value_loss | 0.00145 | ---------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2054 | -| iterations | 73 | -| time_elapsed | 582 | -| total_timesteps | 1196032 | -| train/ | | -| approx_kl | 0.0060829036 | -| clip_fraction | 0.0854 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.896 | -| learning_rate | 0.0003 | -| loss | -0.0232 | -| n_updates | 720 | -| policy_gradient_loss | -0.00108 | -| std | 0.957 | -| value_loss | 0.00152 | ------------------------------------------- -Eval num_timesteps=1200000, episode_reward=-14.33 +/- 30.83 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -14.3 | -| time/ | | -| total_timesteps | 1200000 | -| train/ | | -| approx_kl | 0.0073732347 | -| clip_fraction | 0.0783 | -| clip_range | 0.2 | -| entropy_loss | -2.76 | -| explained_variance | 0.948 | -| learning_rate | 0.0003 | -| loss | -0.0267 | -| n_updates | 730 | -| policy_gradient_loss | -0.00212 | -| std | 0.968 | -| value_loss | 0.00253 | ------------------------------------------- - -[Diag @ 1,200,000 | n_sheep=2 | success=0%] - COMPACT_CANT_DRIVE 15/20 - NEVER_COMPACT 5/20 - action_mag mean=0.273 p10=0.004 p90=1.008 (0=stopped, 1=full speed) - min_flock_radius mean=3.94m best=0.97m (target <5m to compact) - min_dog_to_com mean=1.16m best=0.35m (FLEE_DIST=7m) - min_com_to_pen mean=13.54m best=4.20m - reward/step (mean): progress=+0.0001 alignment=+0.0121 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1998 | -| iterations | 74 | -| time_elapsed | 606 | -| total_timesteps | 1212416 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 2008 | -| iterations | 75 | -| time_elapsed | 611 | -| total_timesteps | 1228800 | -| train/ | | -| approx_kl | 0.006109112 | -| clip_fraction | 0.0814 | -| clip_range | 0.2 | -| entropy_loss | -2.78 | -| explained_variance | 0.86 | -| learning_rate | 0.0003 | -| loss | -0.0205 | -| n_updates | 740 | -| policy_gradient_loss | -0.000541 | -| std | 0.973 | -| value_loss | 0.00171 | ------------------------------------------ ----------------------------------------- -| time/ | | -| fps | 2016 | -| iterations | 76 | -| time_elapsed | 617 | -| total_timesteps | 1245184 | -| train/ | | -| approx_kl | 0.00703271 | -| clip_fraction | 0.0781 | -| clip_range | 0.2 | -| entropy_loss | -2.78 | -| explained_variance | 0.934 | -| learning_rate | 0.0003 | -| loss | -0.0394 | -| n_updates | 750 | -| policy_gradient_loss | -0.00105 | -| std | 0.975 | -| value_loss | 0.00168 | ----------------------------------------- -Eval num_timesteps=1250000, episode_reward=-18.12 +/- 39.82 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -18.1 | -| time/ | | -| total_timesteps | 1250000 | -| train/ | | -| approx_kl | 0.0064994176 | -| clip_fraction | 0.0698 | -| clip_range | 0.2 | -| entropy_loss | -2.8 | -| explained_variance | 0.919 | -| learning_rate | 0.0003 | -| loss | -0.0166 | -| n_updates | 760 | -| policy_gradient_loss | -0.000919 | -| std | 0.985 | -| value_loss | 0.000832 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1989 | -| iterations | 77 | -| time_elapsed | 634 | -| total_timesteps | 1261568 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 2001 | -| iterations | 78 | -| time_elapsed | 638 | -| total_timesteps | 1277952 | -| train/ | | -| approx_kl | 0.008321709 | -| clip_fraction | 0.0902 | -| clip_range | 0.2 | -| entropy_loss | -2.81 | -| explained_variance | 0.874 | -| learning_rate | 0.0003 | -| loss | -0.0295 | -| n_updates | 770 | -| policy_gradient_loss | -0.00219 | -| std | 0.991 | -| value_loss | 0.00127 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2010 | -| iterations | 79 | -| time_elapsed | 643 | -| total_timesteps | 1294336 | -| train/ | | -| approx_kl | 0.009220061 | -| clip_fraction | 0.112 | -| clip_range | 0.2 | -| entropy_loss | -2.82 | -| explained_variance | 0.952 | -| learning_rate | 0.0003 | -| loss | -0.0379 | -| n_updates | 780 | -| policy_gradient_loss | -0.00411 | -| std | 0.994 | -| value_loss | 0.00295 | ------------------------------------------ -Eval num_timesteps=1300000, episode_reward=-22.41 +/- 35.57 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -22.4 | -| time/ | | -| total_timesteps | 1300000 | -| train/ | | -| approx_kl | 0.0071307076 | -| clip_fraction | 0.0826 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.948 | -| learning_rate | 0.0003 | -| loss | -0.0281 | -| n_updates | 790 | -| policy_gradient_loss | -0.00178 | -| std | 0.995 | -| value_loss | 0.00169 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1986 | -| iterations | 80 | -| time_elapsed | 659 | -| total_timesteps | 1310720 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1996 | -| iterations | 81 | -| time_elapsed | 664 | -| total_timesteps | 1327104 | -| train/ | | -| approx_kl | 0.008566003 | -| clip_fraction | 0.0857 | -| clip_range | 0.2 | -| entropy_loss | -2.84 | -| explained_variance | 0.904 | -| learning_rate | 0.0003 | -| loss | -0.0369 | -| n_updates | 800 | -| policy_gradient_loss | -0.00199 | -| std | 1.01 | -| value_loss | 0.00203 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 2006 | -| iterations | 82 | -| time_elapsed | 669 | -| total_timesteps | 1343488 | -| train/ | | -| approx_kl | 0.0082352655 | -| clip_fraction | 0.0989 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.918 | -| learning_rate | 0.0003 | -| loss | -0.0297 | -| n_updates | 810 | -| policy_gradient_loss | -0.0023 | -| std | 1.01 | -| value_loss | 0.00203 | ------------------------------------------- -Eval num_timesteps=1350000, episode_reward=-14.21 +/- 38.53 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -14.2 | -| time/ | | -| total_timesteps | 1350000 | -| train/ | | -| approx_kl | 0.0066830693 | -| clip_fraction | 0.0831 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.923 | -| learning_rate | 0.0003 | -| loss | -0.0331 | -| n_updates | 820 | -| policy_gradient_loss | -0.00226 | -| std | 1.01 | -| value_loss | 0.00125 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1983 | -| iterations | 83 | -| time_elapsed | 685 | -| total_timesteps | 1359872 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1991 | -| iterations | 84 | -| time_elapsed | 691 | -| total_timesteps | 1376256 | -| train/ | | -| approx_kl | 0.008341949 | -| clip_fraction | 0.101 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.928 | -| learning_rate | 0.0003 | -| loss | -0.0156 | -| n_updates | 830 | -| policy_gradient_loss | -0.00132 | -| std | 1.01 | -| value_loss | 0.00407 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1999 | -| iterations | 85 | -| time_elapsed | 696 | -| total_timesteps | 1392640 | -| train/ | | -| approx_kl | 0.010089031 | -| clip_fraction | 0.109 | -| clip_range | 0.2 | -| entropy_loss | -2.84 | -| explained_variance | 0.914 | -| learning_rate | 0.0003 | -| loss | -0.0249 | -| n_updates | 840 | -| policy_gradient_loss | -0.00202 | -| std | 0.999 | -| value_loss | 0.00555 | ------------------------------------------ -Eval num_timesteps=1400000, episode_reward=-5.74 +/- 37.76 -Episode length: 2000.00 +/- 0.00 ----------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -5.74 | -| time/ | | -| total_timesteps | 1400000 | -| train/ | | -| approx_kl | 0.00840036 | -| clip_fraction | 0.112 | -| clip_range | 0.2 | -| entropy_loss | -2.84 | -| explained_variance | 0.915 | -| learning_rate | 0.0003 | -| loss | -0.0267 | -| n_updates | 850 | -| policy_gradient_loss | -0.00422 | -| std | 1 | -| value_loss | 0.0017 | ----------------------------------------- - -[Diag @ 1,400,000 | n_sheep=2 | success=0%] - COMPACT_CANT_DRIVE 16/20 - NEVER_COMPACT 4/20 - action_mag mean=0.258 p10=0.000 p90=1.004 (0=stopped, 1=full speed) - min_flock_radius mean=3.30m best=0.61m (target <5m to compact) - min_dog_to_com mean=0.76m best=0.22m (FLEE_DIST=7m) - min_com_to_pen mean=12.16m best=4.08m - reward/step (mean): progress=+0.0035 alignment=+0.0165 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1954 | -| iterations | 86 | -| time_elapsed | 720 | -| total_timesteps | 1409024 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1964 | -| iterations | 87 | -| time_elapsed | 725 | -| total_timesteps | 1425408 | -| train/ | | -| approx_kl | 0.007908808 | -| clip_fraction | 0.0839 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.755 | -| learning_rate | 0.0003 | -| loss | -0.018 | -| n_updates | 860 | -| policy_gradient_loss | -0.00223 | -| std | 1.01 | -| value_loss | 0.00248 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1972 | -| iterations | 88 | -| time_elapsed | 730 | -| total_timesteps | 1441792 | -| train/ | | -| approx_kl | 0.007957449 | -| clip_fraction | 0.0864 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.868 | -| learning_rate | 0.0003 | -| loss | -0.0315 | -| n_updates | 870 | -| policy_gradient_loss | -0.00288 | -| std | 1.01 | -| value_loss | 0.00145 | ------------------------------------------ -Eval num_timesteps=1450000, episode_reward=-13.10 +/- 29.51 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -13.1 | -| time/ | | -| total_timesteps | 1450000 | -| train/ | | -| approx_kl | 0.007803983 | -| clip_fraction | 0.083 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.83 | -| learning_rate | 0.0003 | -| loss | -0.0212 | -| n_updates | 880 | -| policy_gradient_loss | -0.00119 | -| std | 1.01 | -| value_loss | 0.00191 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1952 | -| iterations | 89 | -| time_elapsed | 746 | -| total_timesteps | 1458176 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1961 | -| iterations | 90 | -| time_elapsed | 751 | -| total_timesteps | 1474560 | -| train/ | | -| approx_kl | 0.010021031 | -| clip_fraction | 0.097 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.902 | -| learning_rate | 0.0003 | -| loss | -0.0221 | -| n_updates | 890 | -| policy_gradient_loss | -0.00294 | -| std | 1.02 | -| value_loss | 0.00136 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1970 | -| iterations | 91 | -| time_elapsed | 756 | -| total_timesteps | 1490944 | -| train/ | | -| approx_kl | 0.0076614916 | -| clip_fraction | 0.0963 | -| clip_range | 0.2 | -| entropy_loss | -2.89 | -| explained_variance | 0.945 | -| learning_rate | 0.0003 | -| loss | -0.0273 | -| n_updates | 900 | -| policy_gradient_loss | -0.00355 | -| std | 1.03 | -| value_loss | 0.00181 | ------------------------------------------- -Eval num_timesteps=1500000, episode_reward=5.01 +/- 34.23 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 5.01 | -| time/ | | -| total_timesteps | 1500000 | -| train/ | | -| approx_kl | 0.005815446 | -| clip_fraction | 0.0675 | -| clip_range | 0.2 | -| entropy_loss | -2.9 | -| explained_variance | 0.934 | -| learning_rate | 0.0003 | -| loss | -0.0316 | -| n_updates | 910 | -| policy_gradient_loss | -0.00215 | -| std | 1.03 | -| value_loss | 0.00162 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1950 | -| iterations | 92 | -| time_elapsed | 772 | -| total_timesteps | 1507328 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1959 | -| iterations | 93 | -| time_elapsed | 777 | -| total_timesteps | 1523712 | -| train/ | | -| approx_kl | 0.0071218535 | -| clip_fraction | 0.0897 | -| clip_range | 0.2 | -| entropy_loss | -2.9 | -| explained_variance | 0.937 | -| learning_rate | 0.0003 | -| loss | -0.0219 | -| n_updates | 920 | -| policy_gradient_loss | -0.00225 | -| std | 1.03 | -| value_loss | 0.00463 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1967 | -| iterations | 94 | -| time_elapsed | 782 | -| total_timesteps | 1540096 | -| train/ | | -| approx_kl | 0.006857206 | -| clip_fraction | 0.0809 | -| clip_range | 0.2 | -| entropy_loss | -2.89 | -| explained_variance | 0.933 | -| learning_rate | 0.0003 | -| loss | -0.0252 | -| n_updates | 930 | -| policy_gradient_loss | -0.00219 | -| std | 1.02 | -| value_loss | 0.00436 | ------------------------------------------ -Eval num_timesteps=1550000, episode_reward=-4.04 +/- 33.69 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -4.04 | -| time/ | | -| total_timesteps | 1550000 | -| train/ | | -| approx_kl | 0.006146897 | -| clip_fraction | 0.0821 | -| clip_range | 0.2 | -| entropy_loss | -2.87 | -| explained_variance | 0.913 | -| learning_rate | 0.0003 | -| loss | -0.0352 | -| n_updates | 940 | -| policy_gradient_loss | -0.00258 | -| std | 1.02 | -| value_loss | 0.00325 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1948 | -| iterations | 95 | -| time_elapsed | 798 | -| total_timesteps | 1556480 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1958 | -| iterations | 96 | -| time_elapsed | 803 | -| total_timesteps | 1572864 | -| train/ | | -| approx_kl | 0.0069321445 | -| clip_fraction | 0.0778 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.94 | -| learning_rate | 0.0003 | -| loss | -0.013 | -| n_updates | 950 | -| policy_gradient_loss | -0.00214 | -| std | 1.01 | -| value_loss | 0.00162 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1965 | -| iterations | 97 | -| time_elapsed | 808 | -| total_timesteps | 1589248 | -| train/ | | -| approx_kl | 0.0066491435 | -| clip_fraction | 0.0714 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.941 | -| learning_rate | 0.0003 | -| loss | -0.0304 | -| n_updates | 960 | -| policy_gradient_loss | -0.00212 | -| std | 1.03 | -| value_loss | 0.0011 | ------------------------------------------- -Eval num_timesteps=1600000, episode_reward=12.65 +/- 31.73 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 12.6 | -| time/ | | -| total_timesteps | 1600000 | -| train/ | | -| approx_kl | 0.0050257677 | -| clip_fraction | 0.0588 | -| clip_range | 0.2 | -| entropy_loss | -2.9 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0359 | -| n_updates | 970 | -| policy_gradient_loss | -0.0013 | -| std | 1.04 | -| value_loss | 0.00201 | ------------------------------------------- - -[Diag @ 1,600,000 | n_sheep=2 | success=0%] - COMPACT_CANT_DRIVE 13/20 - NEVER_COMPACT 7/20 - action_mag mean=0.252 p10=0.004 p90=0.980 (0=stopped, 1=full speed) - min_flock_radius mean=4.30m best=0.92m (target <5m to compact) - min_dog_to_com mean=0.74m best=0.38m (FLEE_DIST=7m) - min_com_to_pen mean=13.76m best=5.49m - reward/step (mean): progress=-0.0006 alignment=+0.0287 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 - -[Curriculum] leaving stage n_sheep=2 after 800,000 steps | training success rate (last 100 eps) = 0% -[Curriculum] → 3 sheep at step 1,600,000 - --------------------------------- -| time/ | | -| fps | 1930 | -| iterations | 98 | -| time_elapsed | 831 | -| total_timesteps | 1605632 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1937 | -| iterations | 99 | -| time_elapsed | 837 | -| total_timesteps | 1622016 | -| train/ | | -| approx_kl | 0.0085028205 | -| clip_fraction | 0.0905 | -| clip_range | 0.2 | -| entropy_loss | -2.89 | -| explained_variance | 0.909 | -| learning_rate | 0.0003 | -| loss | -0.0346 | -| n_updates | 980 | -| policy_gradient_loss | -0.00245 | -| std | 1.02 | -| value_loss | 0.00492 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1945 | -| iterations | 100 | -| time_elapsed | 842 | -| total_timesteps | 1638400 | -| train/ | | -| approx_kl | 0.009084044 | -| clip_fraction | 0.118 | -| clip_range | 0.2 | -| entropy_loss | -2.91 | -| explained_variance | 0.964 | -| learning_rate | 0.0003 | -| loss | -0.0416 | -| n_updates | 990 | -| policy_gradient_loss | 0.0025 | -| std | 1.04 | -| value_loss | 0.00194 | ------------------------------------------ -Eval num_timesteps=1650000, episode_reward=3.05 +/- 36.42 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 3.05 | -| time/ | | -| total_timesteps | 1650000 | -| train/ | | -| approx_kl | 0.009275759 | -| clip_fraction | 0.108 | -| clip_range | 0.2 | -| entropy_loss | -2.92 | -| explained_variance | 0.965 | -| learning_rate | 0.0003 | -| loss | -0.0336 | -| n_updates | 1000 | -| policy_gradient_loss | 0.000149 | -| std | 1.04 | -| value_loss | 0.00185 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1926 | -| iterations | 101 | -| time_elapsed | 859 | -| total_timesteps | 1654784 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1934 | -| iterations | 102 | -| time_elapsed | 864 | -| total_timesteps | 1671168 | -| train/ | | -| approx_kl | 0.008650862 | -| clip_fraction | 0.117 | -| clip_range | 0.2 | -| entropy_loss | -2.92 | -| explained_variance | 0.938 | -| learning_rate | 0.0003 | -| loss | -0.0279 | -| n_updates | 1010 | -| policy_gradient_loss | -0.000545 | -| std | 1.04 | -| value_loss | 0.00611 | ------------------------------------------ ---------------------------------------- -| time/ | | -| fps | 1939 | -| iterations | 103 | -| time_elapsed | 869 | -| total_timesteps | 1687552 | -| train/ | | -| approx_kl | 0.0080826 | -| clip_fraction | 0.0992 | -| clip_range | 0.2 | -| entropy_loss | -2.93 | -| explained_variance | 0.952 | -| learning_rate | 0.0003 | -| loss | -0.0415 | -| n_updates | 1020 | -| policy_gradient_loss | -0.00201 | -| std | 1.05 | -| value_loss | 0.00251 | ---------------------------------------- -Eval num_timesteps=1700000, episode_reward=-4.66 +/- 36.05 -Episode length: 2000.00 +/- 0.00 ----------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -4.66 | -| time/ | | -| total_timesteps | 1700000 | -| train/ | | -| approx_kl | 0.00786162 | -| clip_fraction | 0.0921 | -| clip_range | 0.2 | -| entropy_loss | -2.95 | -| explained_variance | 0.893 | -| learning_rate | 0.0003 | -| loss | -0.0301 | -| n_updates | 1030 | -| policy_gradient_loss | -0.000631 | -| std | 1.06 | -| value_loss | 0.00158 | ----------------------------------------- --------------------------------- -| time/ | | -| fps | 1922 | -| iterations | 104 | -| time_elapsed | 886 | -| total_timesteps | 1703936 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1930 | -| iterations | 105 | -| time_elapsed | 891 | -| total_timesteps | 1720320 | -| train/ | | -| approx_kl | 0.008055547 | -| clip_fraction | 0.0842 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.918 | -| learning_rate | 0.0003 | -| loss | -0.027 | -| n_updates | 1040 | -| policy_gradient_loss | -6.56e-05 | -| std | 1.07 | -| value_loss | 0.00193 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1937 | -| iterations | 106 | -| time_elapsed | 896 | -| total_timesteps | 1736704 | -| train/ | | -| approx_kl | 0.008067045 | -| clip_fraction | 0.087 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.878 | -| learning_rate | 0.0003 | -| loss | -0.0281 | -| n_updates | 1050 | -| policy_gradient_loss | -0.00194 | -| std | 1.07 | -| value_loss | 0.0082 | ------------------------------------------ -Eval num_timesteps=1750000, episode_reward=-0.31 +/- 42.66 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -0.309 | -| time/ | | -| total_timesteps | 1750000 | -| train/ | | -| approx_kl | 0.0066514863 | -| clip_fraction | 0.0808 | -| clip_range | 0.2 | -| entropy_loss | -2.99 | -| explained_variance | 0.888 | -| learning_rate | 0.0003 | -| loss | -0.0335 | -| n_updates | 1060 | -| policy_gradient_loss | -0.00108 | -| std | 1.08 | -| value_loss | 0.00303 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1921 | -| iterations | 107 | -| time_elapsed | 912 | -| total_timesteps | 1753088 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1927 | -| iterations | 108 | -| time_elapsed | 917 | -| total_timesteps | 1769472 | -| train/ | | -| approx_kl | 0.008252729 | -| clip_fraction | 0.093 | -| clip_range | 0.2 | -| entropy_loss | -3 | -| explained_variance | 0.959 | -| learning_rate | 0.0003 | -| loss | -0.0413 | -| n_updates | 1070 | -| policy_gradient_loss | -0.00241 | -| std | 1.09 | -| value_loss | 0.00122 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1935 | -| iterations | 109 | -| time_elapsed | 922 | -| total_timesteps | 1785856 | -| train/ | | -| approx_kl | 0.0073527684 | -| clip_fraction | 0.0822 | -| clip_range | 0.2 | -| entropy_loss | -3.01 | -| explained_variance | 0.883 | -| learning_rate | 0.0003 | -| loss | -0.018 | -| n_updates | 1080 | -| policy_gradient_loss | -0.00172 | -| std | 1.1 | -| value_loss | 0.00172 | ------------------------------------------- -Eval num_timesteps=1800000, episode_reward=8.99 +/- 39.35 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 8.99 | -| time/ | | -| total_timesteps | 1800000 | -| train/ | | -| approx_kl | 0.006149094 | -| clip_fraction | 0.0771 | -| clip_range | 0.2 | -| entropy_loss | -3.03 | -| explained_variance | 0.911 | -| learning_rate | 0.0003 | -| loss | -0.0315 | -| n_updates | 1090 | -| policy_gradient_loss | -0.000744 | -| std | 1.1 | -| value_loss | 0.00456 | ------------------------------------------ - -[Diag @ 1,800,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 19/20 - COMPACT_CANT_DRIVE 1/20 - action_mag mean=0.049 p10=0.007 p90=0.049 (0=stopped, 1=full speed) - min_flock_radius mean=7.79m best=4.73m (target <5m to compact) - min_dog_to_com mean=0.92m best=0.25m (FLEE_DIST=7m) - min_com_to_pen mean=14.27m best=7.54m - reward/step (mean): progress=-0.0043 alignment=+0.0208 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1899 | -| iterations | 110 | -| time_elapsed | 948 | -| total_timesteps | 1802240 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1906 | -| iterations | 111 | -| time_elapsed | 953 | -| total_timesteps | 1818624 | -| train/ | | -| approx_kl | 0.007161974 | -| clip_fraction | 0.0871 | -| clip_range | 0.2 | -| entropy_loss | -3.03 | -| explained_variance | 0.914 | -| learning_rate | 0.0003 | -| loss | -0.0359 | -| n_updates | 1100 | -| policy_gradient_loss | -0.00186 | -| std | 1.1 | -| value_loss | 0.00214 | ------------------------------------------ ----------------------------------------- -| time/ | | -| fps | 1914 | -| iterations | 112 | -| time_elapsed | 958 | -| total_timesteps | 1835008 | -| train/ | | -| approx_kl | 0.00886854 | -| clip_fraction | 0.103 | -| clip_range | 0.2 | -| entropy_loss | -3.04 | -| explained_variance | 0.94 | -| learning_rate | 0.0003 | -| loss | -0.04 | -| n_updates | 1110 | -| policy_gradient_loss | -0.00333 | -| std | 1.11 | -| value_loss | 0.00456 | ----------------------------------------- -Eval num_timesteps=1850000, episode_reward=14.49 +/- 36.35 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 14.5 | -| time/ | | -| total_timesteps | 1850000 | -| train/ | | -| approx_kl | 0.0058414284 | -| clip_fraction | 0.0642 | -| clip_range | 0.2 | -| entropy_loss | -3.05 | -| explained_variance | 0.871 | -| learning_rate | 0.0003 | -| loss | -0.033 | -| n_updates | 1120 | -| policy_gradient_loss | -0.000891 | -| std | 1.11 | -| value_loss | 0.00394 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1898 | -| iterations | 113 | -| time_elapsed | 975 | -| total_timesteps | 1851392 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1906 | -| iterations | 114 | -| time_elapsed | 979 | -| total_timesteps | 1867776 | -| train/ | | -| approx_kl | 0.008916938 | -| clip_fraction | 0.0916 | -| clip_range | 0.2 | -| entropy_loss | -3.05 | -| explained_variance | 0.937 | -| learning_rate | 0.0003 | -| loss | -0.0334 | -| n_updates | 1130 | -| policy_gradient_loss | -0.00257 | -| std | 1.12 | -| value_loss | 0.00285 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1913 | -| iterations | 115 | -| time_elapsed | 984 | -| total_timesteps | 1884160 | -| train/ | | -| approx_kl | 0.008523149 | -| clip_fraction | 0.0907 | -| clip_range | 0.2 | -| entropy_loss | -3.06 | -| explained_variance | 0.954 | -| learning_rate | 0.0003 | -| loss | -0.0339 | -| n_updates | 1140 | -| policy_gradient_loss | -0.0034 | -| std | 1.12 | -| value_loss | 0.00209 | ------------------------------------------ -Eval num_timesteps=1900000, episode_reward=9.85 +/- 42.18 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 9.85 | -| time/ | | -| total_timesteps | 1900000 | -| train/ | | -| approx_kl | 0.0075978916 | -| clip_fraction | 0.0819 | -| clip_range | 0.2 | -| entropy_loss | -3.06 | -| explained_variance | 0.96 | -| learning_rate | 0.0003 | -| loss | -0.0313 | -| n_updates | 1150 | -| policy_gradient_loss | -0.00272 | -| std | 1.12 | -| value_loss | 0.00332 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1896 | -| iterations | 116 | -| time_elapsed | 1002 | -| total_timesteps | 1900544 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1902 | -| iterations | 117 | -| time_elapsed | 1007 | -| total_timesteps | 1916928 | -| train/ | | -| approx_kl | 0.008376695 | -| clip_fraction | 0.0935 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.964 | -| learning_rate | 0.0003 | -| loss | -0.0392 | -| n_updates | 1160 | -| policy_gradient_loss | -0.00354 | -| std | 1.12 | -| value_loss | 0.00203 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1909 | -| iterations | 118 | -| time_elapsed | 1012 | -| total_timesteps | 1933312 | -| train/ | | -| approx_kl | 0.0077100536 | -| clip_fraction | 0.0854 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.933 | -| learning_rate | 0.0003 | -| loss | -0.0467 | -| n_updates | 1170 | -| policy_gradient_loss | -0.00421 | -| std | 1.12 | -| value_loss | 0.00132 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1915 | -| iterations | 119 | -| time_elapsed | 1018 | -| total_timesteps | 1949696 | -| train/ | | -| approx_kl | 0.006848542 | -| clip_fraction | 0.0674 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.959 | -| learning_rate | 0.0003 | -| loss | -0.0335 | -| n_updates | 1180 | -| policy_gradient_loss | -0.00229 | -| std | 1.13 | -| value_loss | 0.00138 | ------------------------------------------ -Eval num_timesteps=1950000, episode_reward=29.72 +/- 38.42 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 29.7 | -| time/ | | -| total_timesteps | 1950000 | -| train/ | | -| approx_kl | 0.007300608 | -| clip_fraction | 0.0824 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.977 | -| learning_rate | 0.0003 | -| loss | -0.0358 | -| n_updates | 1190 | -| policy_gradient_loss | -0.00364 | -| std | 1.12 | -| value_loss | 0.00159 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1899 | -| iterations | 120 | -| time_elapsed | 1034 | -| total_timesteps | 1966080 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1906 | -| iterations | 121 | -| time_elapsed | 1040 | -| total_timesteps | 1982464 | -| train/ | | -| approx_kl | 0.0072772675 | -| clip_fraction | 0.0703 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.882 | -| learning_rate | 0.0003 | -| loss | -0.0357 | -| n_updates | 1200 | -| policy_gradient_loss | -0.00163 | -| std | 1.13 | -| value_loss | 0.00471 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1912 | -| iterations | 122 | -| time_elapsed | 1045 | -| total_timesteps | 1998848 | -| train/ | | -| approx_kl | 0.007866079 | -| clip_fraction | 0.0898 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.962 | -| learning_rate | 0.0003 | -| loss | -0.0304 | -| n_updates | 1210 | -| policy_gradient_loss | -0.0052 | -| std | 1.13 | -| value_loss | 0.0014 | ------------------------------------------ -Eval num_timesteps=2000000, episode_reward=14.20 +/- 34.02 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 14.2 | -| time/ | | -| total_timesteps | 2000000 | -| train/ | | -| approx_kl | 0.0073383995 | -| clip_fraction | 0.083 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.95 | -| learning_rate | 0.0003 | -| loss | -0.0369 | -| n_updates | 1220 | -| policy_gradient_loss | -0.00296 | -| std | 1.12 | -| value_loss | 0.00336 | ------------------------------------------- - -[Diag @ 2,000,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 12/20 - COMPACT_CANT_DRIVE 8/20 - action_mag mean=0.076 p10=0.007 p90=0.097 (0=stopped, 1=full speed) - min_flock_radius mean=5.33m best=0.00m (target <5m to compact) - min_dog_to_com mean=1.01m best=0.16m (FLEE_DIST=7m) - min_com_to_pen mean=12.40m best=6.50m - reward/step (mean): progress=+0.0041 alignment=+0.0263 pen_bonus=+0.0013 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1881 | -| iterations | 123 | -| time_elapsed | 1071 | -| total_timesteps | 2015232 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1887 | -| iterations | 124 | -| time_elapsed | 1076 | -| total_timesteps | 2031616 | -| train/ | | -| approx_kl | 0.0060287267 | -| clip_fraction | 0.0716 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.902 | -| learning_rate | 0.0003 | -| loss | -0.0402 | -| n_updates | 1230 | -| policy_gradient_loss | -0.00308 | -| std | 1.13 | -| value_loss | 0.00475 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1894 | -| iterations | 125 | -| time_elapsed | 1081 | -| total_timesteps | 2048000 | -| train/ | | -| approx_kl | 0.0073304214 | -| clip_fraction | 0.08 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.95 | -| learning_rate | 0.0003 | -| loss | -0.0436 | -| n_updates | 1240 | -| policy_gradient_loss | -0.00373 | -| std | 1.13 | -| value_loss | 0.00138 | ------------------------------------------- -Eval num_timesteps=2050000, episode_reward=18.68 +/- 36.20 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 18.7 | -| time/ | | -| total_timesteps | 2050000 | -| train/ | | -| approx_kl | 0.0068036346 | -| clip_fraction | 0.0768 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.897 | -| learning_rate | 0.0003 | -| loss | -0.0461 | -| n_updates | 1250 | -| policy_gradient_loss | -0.00392 | -| std | 1.13 | -| value_loss | 0.0013 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1880 | -| iterations | 126 | -| time_elapsed | 1097 | -| total_timesteps | 2064384 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1886 | -| iterations | 127 | -| time_elapsed | 1102 | -| total_timesteps | 2080768 | -| train/ | | -| approx_kl | 0.006960577 | -| clip_fraction | 0.0689 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.917 | -| learning_rate | 0.0003 | -| loss | -0.0302 | -| n_updates | 1260 | -| policy_gradient_loss | -0.00248 | -| std | 1.12 | -| value_loss | 0.00841 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1892 | -| iterations | 128 | -| time_elapsed | 1108 | -| total_timesteps | 2097152 | -| train/ | | -| approx_kl | 0.007300884 | -| clip_fraction | 0.0705 | -| clip_range | 0.2 | -| entropy_loss | -3.09 | -| explained_variance | 0.915 | -| learning_rate | 0.0003 | -| loss | -0.0338 | -| n_updates | 1270 | -| policy_gradient_loss | -0.00351 | -| std | 1.14 | -| value_loss | 0.00336 | ------------------------------------------ -Eval num_timesteps=2100000, episode_reward=37.33 +/- 41.91 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 37.3 | -| time/ | | -| total_timesteps | 2100000 | -| train/ | | -| approx_kl | 0.007571588 | -| clip_fraction | 0.076 | -| clip_range | 0.2 | -| entropy_loss | -3.1 | -| explained_variance | 0.907 | -| learning_rate | 0.0003 | -| loss | -0.0278 | -| n_updates | 1280 | -| policy_gradient_loss | -0.00336 | -| std | 1.14 | -| value_loss | 0.00228 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1878 | -| iterations | 129 | -| time_elapsed | 1124 | -| total_timesteps | 2113536 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1884 | -| iterations | 130 | -| time_elapsed | 1130 | -| total_timesteps | 2129920 | -| train/ | | -| approx_kl | 0.007885255 | -| clip_fraction | 0.088 | -| clip_range | 0.2 | -| entropy_loss | -3.11 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0388 | -| n_updates | 1290 | -| policy_gradient_loss | -0.00498 | -| std | 1.15 | -| value_loss | 0.00231 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1890 | -| iterations | 131 | -| time_elapsed | 1135 | -| total_timesteps | 2146304 | -| train/ | | -| approx_kl | 0.0073760273 | -| clip_fraction | 0.0769 | -| clip_range | 0.2 | -| entropy_loss | -3.11 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.0277 | -| n_updates | 1300 | -| policy_gradient_loss | -0.00306 | -| std | 1.15 | -| value_loss | 0.00294 | ------------------------------------------- -Eval num_timesteps=2150000, episode_reward=31.84 +/- 38.92 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 31.8 | -| time/ | | -| total_timesteps | 2150000 | -| train/ | | -| approx_kl | 0.006736047 | -| clip_fraction | 0.0685 | -| clip_range | 0.2 | -| entropy_loss | -3.12 | -| explained_variance | 0.913 | -| learning_rate | 0.0003 | -| loss | -0.0302 | -| n_updates | 1310 | -| policy_gradient_loss | -0.0021 | -| std | 1.16 | -| value_loss | 0.00422 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1872 | -| iterations | 132 | -| time_elapsed | 1155 | -| total_timesteps | 2162688 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1878 | -| iterations | 133 | -| time_elapsed | 1160 | -| total_timesteps | 2179072 | -| train/ | | -| approx_kl | 0.006166819 | -| clip_fraction | 0.0668 | -| clip_range | 0.2 | -| entropy_loss | -3.13 | -| explained_variance | 0.956 | -| learning_rate | 0.0003 | -| loss | -0.0473 | -| n_updates | 1320 | -| policy_gradient_loss | -0.00364 | -| std | 1.16 | -| value_loss | 0.00158 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1884 | -| iterations | 134 | -| time_elapsed | 1165 | -| total_timesteps | 2195456 | -| train/ | | -| approx_kl | 0.0075986157 | -| clip_fraction | 0.0769 | -| clip_range | 0.2 | -| entropy_loss | -3.14 | -| explained_variance | 0.966 | -| learning_rate | 0.0003 | -| loss | -0.0317 | -| n_updates | 1330 | -| policy_gradient_loss | -0.00398 | -| std | 1.17 | -| value_loss | 0.00307 | ------------------------------------------- -Eval num_timesteps=2200000, episode_reward=26.98 +/- 37.84 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 27 | -| time/ | | -| total_timesteps | 2200000 | -| train/ | | -| approx_kl | 0.008170303 | -| clip_fraction | 0.0981 | -| clip_range | 0.2 | -| entropy_loss | -3.14 | -| explained_variance | 0.964 | -| learning_rate | 0.0003 | -| loss | -0.0326 | -| n_updates | 1340 | -| policy_gradient_loss | -0.00415 | -| std | 1.16 | -| value_loss | 0.00349 | ------------------------------------------ - -[Diag @ 2,200,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 16/20 - COMPACT_CANT_DRIVE 4/20 - action_mag mean=0.067 p10=0.003 p90=0.067 (0=stopped, 1=full speed) - min_flock_radius mean=7.25m best=1.61m (target <5m to compact) - min_dog_to_com mean=0.97m best=0.20m (FLEE_DIST=7m) - min_com_to_pen mean=13.28m best=5.53m - reward/step (mean): progress=+0.0007 alignment=+0.0353 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1832 | -| iterations | 135 | -| time_elapsed | 1206 | -| total_timesteps | 2211840 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1805 | -| iterations | 136 | -| time_elapsed | 1234 | -| total_timesteps | 2228224 | -| train/ | | -| approx_kl | 0.006131858 | -| clip_fraction | 0.067 | -| clip_range | 0.2 | -| entropy_loss | -3.13 | -| explained_variance | 0.927 | -| learning_rate | 0.0003 | -| loss | -0.0328 | -| n_updates | 1350 | -| policy_gradient_loss | -0.0022 | -| std | 1.16 | -| value_loss | 0.000981 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1811 | -| iterations | 137 | -| time_elapsed | 1239 | -| total_timesteps | 2244608 | -| train/ | | -| approx_kl | 0.0071705403 | -| clip_fraction | 0.0699 | -| clip_range | 0.2 | -| entropy_loss | -3.12 | -| explained_variance | 0.913 | -| learning_rate | 0.0003 | -| loss | -0.0391 | -| n_updates | 1360 | -| policy_gradient_loss | -0.0032 | -| std | 1.15 | -| value_loss | 0.00639 | ------------------------------------------- -Eval num_timesteps=2250000, episode_reward=28.55 +/- 29.67 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 28.5 | -| time/ | | -| total_timesteps | 2250000 | -| train/ | | -| approx_kl | 0.007929602 | -| clip_fraction | 0.0812 | -| clip_range | 0.2 | -| entropy_loss | -3.14 | -| explained_variance | 0.933 | -| learning_rate | 0.0003 | -| loss | -0.0592 | -| n_updates | 1370 | -| policy_gradient_loss | -0.00434 | -| std | 1.17 | -| value_loss | 0.00337 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1800 | -| iterations | 138 | -| time_elapsed | 1255 | -| total_timesteps | 2260992 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1806 | -| iterations | 139 | -| time_elapsed | 1260 | -| total_timesteps | 2277376 | -| train/ | | -| approx_kl | 0.0062256474 | -| clip_fraction | 0.0592 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.935 | -| learning_rate | 0.0003 | -| loss | -0.0368 | -| n_updates | 1380 | -| policy_gradient_loss | -0.00242 | -| std | 1.17 | -| value_loss | 0.00787 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1812 | -| iterations | 140 | -| time_elapsed | 1265 | -| total_timesteps | 2293760 | -| train/ | | -| approx_kl | 0.0075241774 | -| clip_fraction | 0.0885 | -| clip_range | 0.2 | -| entropy_loss | -3.14 | -| explained_variance | 0.948 | -| learning_rate | 0.0003 | -| loss | -0.0385 | -| n_updates | 1390 | -| policy_gradient_loss | -0.00346 | -| std | 1.16 | -| value_loss | 0.00172 | ------------------------------------------- -Eval num_timesteps=2300000, episode_reward=43.34 +/- 34.73 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 43.3 | -| time/ | | -| total_timesteps | 2300000 | -| train/ | | -| approx_kl | 0.0073855575 | -| clip_fraction | 0.0753 | -| clip_range | 0.2 | -| entropy_loss | -3.12 | -| explained_variance | 0.911 | -| learning_rate | 0.0003 | -| loss | -0.0377 | -| n_updates | 1400 | -| policy_gradient_loss | -0.0034 | -| std | 1.15 | -| value_loss | 0.00645 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1801 | -| iterations | 141 | -| time_elapsed | 1282 | -| total_timesteps | 2310144 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1806 | -| iterations | 142 | -| time_elapsed | 1287 | -| total_timesteps | 2326528 | -| train/ | | -| approx_kl | 0.007232903 | -| clip_fraction | 0.0845 | -| clip_range | 0.2 | -| entropy_loss | -3.13 | -| explained_variance | 0.956 | -| learning_rate | 0.0003 | -| loss | -0.0346 | -| n_updates | 1410 | -| policy_gradient_loss | -0.003 | -| std | 1.16 | -| value_loss | 0.00134 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1812 | -| iterations | 143 | -| time_elapsed | 1292 | -| total_timesteps | 2342912 | -| train/ | | -| approx_kl | 0.007283367 | -| clip_fraction | 0.0785 | -| clip_range | 0.2 | -| entropy_loss | -3.14 | -| explained_variance | 0.913 | -| learning_rate | 0.0003 | -| loss | -0.0306 | -| n_updates | 1420 | -| policy_gradient_loss | -0.00368 | -| std | 1.17 | -| value_loss | 0.00385 | ------------------------------------------ -Eval num_timesteps=2350000, episode_reward=33.49 +/- 34.79 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 33.5 | -| time/ | | -| total_timesteps | 2350000 | -| train/ | | -| approx_kl | 0.006632698 | -| clip_fraction | 0.0647 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.934 | -| learning_rate | 0.0003 | -| loss | -0.0469 | -| n_updates | 1430 | -| policy_gradient_loss | -0.00327 | -| std | 1.17 | -| value_loss | 0.00793 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1800 | -| iterations | 144 | -| time_elapsed | 1310 | -| total_timesteps | 2359296 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1805 | -| iterations | 145 | -| time_elapsed | 1315 | -| total_timesteps | 2375680 | -| train/ | | -| approx_kl | 0.008364577 | -| clip_fraction | 0.089 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.957 | -| learning_rate | 0.0003 | -| loss | -0.0464 | -| n_updates | 1440 | -| policy_gradient_loss | -0.00453 | -| std | 1.17 | -| value_loss | 0.00507 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1810 | -| iterations | 146 | -| time_elapsed | 1321 | -| total_timesteps | 2392064 | -| train/ | | -| approx_kl | 0.007854694 | -| clip_fraction | 0.0927 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.953 | -| learning_rate | 0.0003 | -| loss | -0.0436 | -| n_updates | 1450 | -| policy_gradient_loss | -0.00519 | -| std | 1.17 | -| value_loss | 0.00289 | ------------------------------------------ -Eval num_timesteps=2400000, episode_reward=34.64 +/- 37.27 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 34.6 | -| time/ | | -| total_timesteps | 2400000 | -| train/ | | -| approx_kl | 0.0076201856 | -| clip_fraction | 0.0844 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.945 | -| learning_rate | 0.0003 | -| loss | -0.0431 | -| n_updates | 1460 | -| policy_gradient_loss | -0.00554 | -| std | 1.17 | -| value_loss | 0.00196 | ------------------------------------------- - -[Diag @ 2,400,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 15/20 - COMPACT_CANT_DRIVE 5/20 - action_mag mean=0.058 p10=0.006 p90=0.053 (0=stopped, 1=full speed) - min_flock_radius mean=6.68m best=0.96m (target <5m to compact) - min_dog_to_com mean=0.92m best=0.16m (FLEE_DIST=7m) - min_com_to_pen mean=12.18m best=5.62m - reward/step (mean): progress=+0.0034 alignment=+0.0352 pen_bonus=+0.0010 step_cost=-0.0200 complete=+0.0000 - -[Curriculum] leaving stage n_sheep=3 after 800,000 steps | training success rate (last 100 eps) = 0% -[Curriculum] → 4 sheep at step 2,400,000 - --------------------------------- -| time/ | | -| fps | 1788 | -| iterations | 147 | -| time_elapsed | 1346 | -| total_timesteps | 2408448 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1794 | -| iterations | 148 | -| time_elapsed | 1351 | -| total_timesteps | 2424832 | -| train/ | | -| approx_kl | 0.006801254 | -| clip_fraction | 0.0797 | -| clip_range | 0.2 | -| entropy_loss | -3.16 | -| explained_variance | 0.922 | -| learning_rate | 0.0003 | -| loss | -0.0313 | -| n_updates | 1470 | -| policy_gradient_loss | -0.00418 | -| std | 1.18 | -| value_loss | 0.00724 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1798 | -| iterations | 149 | -| time_elapsed | 1357 | -| total_timesteps | 2441216 | -| train/ | | -| approx_kl | 0.007604986 | -| clip_fraction | 0.0758 | -| clip_range | 0.2 | -| entropy_loss | -3.18 | -| explained_variance | 0.937 | -| learning_rate | 0.0003 | -| loss | -0.0354 | -| n_updates | 1480 | -| policy_gradient_loss | -0.00189 | -| std | 1.19 | -| value_loss | 0.00591 | ------------------------------------------ -Eval num_timesteps=2450000, episode_reward=27.82 +/- 47.76 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 27.8 | -| time/ | | -| total_timesteps | 2450000 | -| train/ | | -| approx_kl | 0.0070674624 | -| clip_fraction | 0.0749 | -| clip_range | 0.2 | -| entropy_loss | -3.2 | -| explained_variance | 0.893 | -| learning_rate | 0.0003 | -| loss | -0.0327 | -| n_updates | 1490 | -| policy_gradient_loss | -0.00322 | -| std | 1.2 | -| value_loss | 0.0105 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1788 | -| iterations | 150 | -| time_elapsed | 1374 | -| total_timesteps | 2457600 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1792 | -| iterations | 151 | -| time_elapsed | 1380 | -| total_timesteps | 2473984 | -| train/ | | -| approx_kl | 0.008372683 | -| clip_fraction | 0.0874 | -| clip_range | 0.2 | -| entropy_loss | -3.21 | -| explained_variance | 0.932 | -| learning_rate | 0.0003 | -| loss | -0.0381 | -| n_updates | 1500 | -| policy_gradient_loss | -0.00471 | -| std | 1.21 | -| value_loss | 0.00563 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1796 | -| iterations | 152 | -| time_elapsed | 1385 | -| total_timesteps | 2490368 | -| train/ | | -| approx_kl | 0.007761459 | -| clip_fraction | 0.0794 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0345 | -| n_updates | 1510 | -| policy_gradient_loss | -0.00402 | -| std | 1.22 | -| value_loss | 0.00736 | ------------------------------------------ -Eval num_timesteps=2500000, episode_reward=25.79 +/- 28.60 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 25.8 | -| time/ | | -| total_timesteps | 2500000 | -| train/ | | -| approx_kl | 0.0070840344 | -| clip_fraction | 0.0711 | -| clip_range | 0.2 | -| entropy_loss | -3.22 | -| explained_variance | 0.9 | -| learning_rate | 0.0003 | -| loss | -0.0322 | -| n_updates | 1520 | -| policy_gradient_loss | -0.00397 | -| std | 1.21 | -| value_loss | 0.00517 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1785 | -| iterations | 153 | -| time_elapsed | 1404 | -| total_timesteps | 2506752 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1788 | -| iterations | 154 | -| time_elapsed | 1410 | -| total_timesteps | 2523136 | -| train/ | | -| approx_kl | 0.0062630484 | -| clip_fraction | 0.069 | -| clip_range | 0.2 | -| entropy_loss | -3.22 | -| explained_variance | 0.93 | -| learning_rate | 0.0003 | -| loss | -0.0363 | -| n_updates | 1530 | -| policy_gradient_loss | -0.00382 | -| std | 1.21 | -| value_loss | 0.00546 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1792 | -| iterations | 155 | -| time_elapsed | 1416 | -| total_timesteps | 2539520 | -| train/ | | -| approx_kl | 0.007609036 | -| clip_fraction | 0.0815 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.832 | -| learning_rate | 0.0003 | -| loss | -0.0404 | -| n_updates | 1540 | -| policy_gradient_loss | -0.00347 | -| std | 1.22 | -| value_loss | 0.00902 | ------------------------------------------ -Eval num_timesteps=2550000, episode_reward=26.76 +/- 38.76 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 26.8 | -| time/ | | -| total_timesteps | 2550000 | -| train/ | | -| approx_kl | 0.0070117847 | -| clip_fraction | 0.0808 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.863 | -| learning_rate | 0.0003 | -| loss | -0.0357 | -| n_updates | 1550 | -| policy_gradient_loss | -0.00279 | -| std | 1.22 | -| value_loss | 0.0114 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1780 | -| iterations | 156 | -| time_elapsed | 1435 | -| total_timesteps | 2555904 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1786 | -| iterations | 157 | -| time_elapsed | 1440 | -| total_timesteps | 2572288 | -| train/ | | -| approx_kl | 0.0070258966 | -| clip_fraction | 0.0817 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.941 | -| learning_rate | 0.0003 | -| loss | -0.039 | -| n_updates | 1560 | -| policy_gradient_loss | -0.00488 | -| std | 1.22 | -| value_loss | 0.00696 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1791 | -| iterations | 158 | -| time_elapsed | 1445 | -| total_timesteps | 2588672 | -| train/ | | -| approx_kl | 0.007600763 | -| clip_fraction | 0.0842 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.912 | -| learning_rate | 0.0003 | -| loss | -0.0363 | -| n_updates | 1570 | -| policy_gradient_loss | -0.00544 | -| std | 1.22 | -| value_loss | 0.00556 | ------------------------------------------ -Eval num_timesteps=2600000, episode_reward=19.53 +/- 46.34 -Episode length: 2000.00 +/- 0.00 ----------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 19.5 | -| time/ | | -| total_timesteps | 2600000 | -| train/ | | -| approx_kl | 0.00714178 | -| clip_fraction | 0.0783 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.92 | -| learning_rate | 0.0003 | -| loss | -0.0352 | -| n_updates | 1580 | -| policy_gradient_loss | -0.00468 | -| std | 1.22 | -| value_loss | 0.00364 | ----------------------------------------- - -[Diag @ 2,600,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 20/20 - action_mag mean=0.061 p10=0.006 p90=0.047 (0=stopped, 1=full speed) - min_flock_radius mean=7.84m best=5.75m (target <5m to compact) - min_dog_to_com mean=0.66m best=0.09m (FLEE_DIST=7m) - min_com_to_pen mean=12.60m best=6.52m - reward/step (mean): progress=-0.0028 alignment=+0.0337 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1768 | -| iterations | 159 | -| time_elapsed | 1473 | -| total_timesteps | 2605056 | --------------------------------- ----------------------------------------- -| time/ | | -| fps | 1771 | -| iterations | 160 | -| time_elapsed | 1479 | -| total_timesteps | 2621440 | -| train/ | | -| approx_kl | 0.00681924 | -| clip_fraction | 0.0779 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.946 | -| learning_rate | 0.0003 | -| loss | -0.0409 | -| n_updates | 1590 | -| policy_gradient_loss | -0.00346 | -| std | 1.22 | -| value_loss | 0.00377 | ----------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1775 | -| iterations | 161 | -| time_elapsed | 1485 | -| total_timesteps | 2637824 | -| train/ | | -| approx_kl | 0.008016385 | -| clip_fraction | 0.0888 | -| clip_range | 0.2 | -| entropy_loss | -3.24 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0311 | -| n_updates | 1600 | -| policy_gradient_loss | -0.00526 | -| std | 1.22 | -| value_loss | 0.00681 | ------------------------------------------ -Eval num_timesteps=2650000, episode_reward=28.98 +/- 40.07 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 29 | -| time/ | | -| total_timesteps | 2650000 | -| train/ | | -| approx_kl | 0.006836592 | -| clip_fraction | 0.0778 | -| clip_range | 0.2 | -| entropy_loss | -3.24 | -| explained_variance | 0.9 | -| learning_rate | 0.0003 | -| loss | -0.0304 | -| n_updates | 1610 | -| policy_gradient_loss | -0.00255 | -| std | 1.23 | -| value_loss | 0.00574 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1765 | -| iterations | 162 | -| time_elapsed | 1503 | -| total_timesteps | 2654208 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1770 | -| iterations | 163 | -| time_elapsed | 1508 | -| total_timesteps | 2670592 | -| train/ | | -| approx_kl | 0.0072684484 | -| clip_fraction | 0.0764 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.948 | -| learning_rate | 0.0003 | -| loss | -0.0295 | -| n_updates | 1620 | -| policy_gradient_loss | -0.00325 | -| std | 1.22 | -| value_loss | 0.00254 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1775 | -| iterations | 164 | -| time_elapsed | 1513 | -| total_timesteps | 2686976 | -| train/ | | -| approx_kl | 0.007457966 | -| clip_fraction | 0.0845 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.919 | -| learning_rate | 0.0003 | -| loss | -0.0473 | -| n_updates | 1630 | -| policy_gradient_loss | -0.00505 | -| std | 1.22 | -| value_loss | 0.004 | ------------------------------------------ -Eval num_timesteps=2700000, episode_reward=33.96 +/- 32.11 -Episode length: 2000.00 +/- 0.00 ----------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 34 | -| time/ | | -| total_timesteps | 2700000 | -| train/ | | -| approx_kl | 0.00796853 | -| clip_fraction | 0.0782 | -| clip_range | 0.2 | -| entropy_loss | -3.22 | -| explained_variance | 0.959 | -| learning_rate | 0.0003 | -| loss | -0.0336 | -| n_updates | 1640 | -| policy_gradient_loss | -0.00288 | -| std | 1.21 | -| value_loss | 0.00235 | ----------------------------------------- --------------------------------- -| time/ | | -| fps | 1761 | -| iterations | 165 | -| time_elapsed | 1534 | -| total_timesteps | 2703360 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1764 | -| iterations | 166 | -| time_elapsed | 1541 | -| total_timesteps | 2719744 | -| train/ | | -| approx_kl | 0.0073700505 | -| clip_fraction | 0.0857 | -| clip_range | 0.2 | -| entropy_loss | -3.21 | -| explained_variance | 0.875 | -| learning_rate | 0.0003 | -| loss | -0.0255 | -| n_updates | 1650 | -| policy_gradient_loss | -0.00495 | -| std | 1.21 | -| value_loss | 0.00846 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1768 | -| iterations | 167 | -| time_elapsed | 1546 | -| total_timesteps | 2736128 | -| train/ | | -| approx_kl | 0.007965144 | -| clip_fraction | 0.0858 | -| clip_range | 0.2 | -| entropy_loss | -3.22 | -| explained_variance | 0.898 | -| learning_rate | 0.0003 | -| loss | -0.0451 | -| n_updates | 1660 | -| policy_gradient_loss | -0.00518 | -| std | 1.22 | -| value_loss | 0.00395 | ------------------------------------------ -Eval num_timesteps=2750000, episode_reward=23.58 +/- 34.37 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 23.6 | -| time/ | | -| total_timesteps | 2750000 | -| train/ | | -| approx_kl | 0.0065765316 | -| clip_fraction | 0.0682 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.934 | -| learning_rate | 0.0003 | -| loss | -0.0429 | -| n_updates | 1670 | -| policy_gradient_loss | -0.00379 | -| std | 1.23 | -| value_loss | 0.00677 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1756 | -| iterations | 168 | -| time_elapsed | 1566 | -| total_timesteps | 2752512 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1761 | -| iterations | 169 | -| time_elapsed | 1571 | -| total_timesteps | 2768896 | -| train/ | | -| approx_kl | 0.0066236854 | -| clip_fraction | 0.0619 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.935 | -| learning_rate | 0.0003 | -| loss | -0.0365 | -| n_updates | 1680 | -| policy_gradient_loss | -0.00239 | -| std | 1.23 | -| value_loss | 0.00922 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1766 | -| iterations | 170 | -| time_elapsed | 1576 | -| total_timesteps | 2785280 | -| train/ | | -| approx_kl | 0.007887056 | -| clip_fraction | 0.0836 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.899 | -| learning_rate | 0.0003 | -| loss | -0.0353 | -| n_updates | 1690 | -| policy_gradient_loss | -0.0053 | -| std | 1.24 | -| value_loss | 0.00635 | ------------------------------------------ -Eval num_timesteps=2800000, episode_reward=33.57 +/- 35.56 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 33.6 | -| time/ | | -| total_timesteps | 2800000 | -| train/ | | -| approx_kl | 0.0067548407 | -| clip_fraction | 0.0804 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.887 | -| learning_rate | 0.0003 | -| loss | -0.0408 | -| n_updates | 1700 | -| policy_gradient_loss | -0.00444 | -| std | 1.24 | -| value_loss | 0.0101 | ------------------------------------------- - -[Diag @ 2,800,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 19/20 - COMPACT_CANT_DRIVE 1/20 - action_mag mean=0.050 p10=0.003 p90=0.039 (0=stopped, 1=full speed) - min_flock_radius mean=8.42m best=4.84m (target <5m to compact) - min_dog_to_com mean=0.73m best=0.12m (FLEE_DIST=7m) - min_com_to_pen mean=14.29m best=7.66m - reward/step (mean): progress=-0.0027 alignment=+0.0365 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1746 | -| iterations | 171 | -| time_elapsed | 1604 | -| total_timesteps | 2801664 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1750 | -| iterations | 172 | -| time_elapsed | 1609 | -| total_timesteps | 2818048 | -| train/ | | -| approx_kl | 0.0069283517 | -| clip_fraction | 0.0847 | -| clip_range | 0.2 | -| entropy_loss | -3.24 | -| explained_variance | 0.899 | -| learning_rate | 0.0003 | -| loss | -0.0476 | -| n_updates | 1710 | -| policy_gradient_loss | -0.00499 | -| std | 1.23 | -| value_loss | 0.00708 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1754 | -| iterations | 173 | -| time_elapsed | 1615 | -| total_timesteps | 2834432 | -| train/ | | -| approx_kl | 0.008303071 | -| clip_fraction | 0.082 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.911 | -| learning_rate | 0.0003 | -| loss | -0.0484 | -| n_updates | 1720 | -| policy_gradient_loss | -0.00388 | -| std | 1.23 | -| value_loss | 0.0061 | ------------------------------------------ -Eval num_timesteps=2850000, episode_reward=34.42 +/- 32.01 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 34.4 | -| time/ | | -| total_timesteps | 2850000 | -| train/ | | -| approx_kl | 0.0063731004 | -| clip_fraction | 0.069 | -| clip_range | 0.2 | -| entropy_loss | -3.26 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.029 | -| n_updates | 1730 | -| policy_gradient_loss | -0.00384 | -| std | 1.25 | -| value_loss | 0.00528 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1745 | -| iterations | 174 | -| time_elapsed | 1633 | -| total_timesteps | 2850816 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1749 | -| iterations | 175 | -| time_elapsed | 1638 | -| total_timesteps | 2867200 | -| train/ | | -| approx_kl | 0.008163793 | -| clip_fraction | 0.0812 | -| clip_range | 0.2 | -| entropy_loss | -3.28 | -| explained_variance | 0.935 | -| learning_rate | 0.0003 | -| loss | -0.0374 | -| n_updates | 1740 | -| policy_gradient_loss | -0.0032 | -| std | 1.26 | -| value_loss | 0.00432 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1754 | -| iterations | 176 | -| time_elapsed | 1643 | -| total_timesteps | 2883584 | -| train/ | | -| approx_kl | 0.0063439216 | -| clip_fraction | 0.0743 | -| clip_range | 0.2 | -| entropy_loss | -3.29 | -| explained_variance | 0.89 | -| learning_rate | 0.0003 | -| loss | -0.0372 | -| n_updates | 1750 | -| policy_gradient_loss | -0.00403 | -| std | 1.26 | -| value_loss | 0.00654 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1759 | -| iterations | 177 | -| time_elapsed | 1648 | -| total_timesteps | 2899968 | -| train/ | | -| approx_kl | 0.006967159 | -| clip_fraction | 0.0761 | -| clip_range | 0.2 | -| entropy_loss | -3.29 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0462 | -| n_updates | 1760 | -| policy_gradient_loss | -0.00382 | -| std | 1.26 | -| value_loss | 0.00381 | ------------------------------------------ -Eval num_timesteps=2900000, episode_reward=40.78 +/- 43.99 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 40.8 | -| time/ | | -| total_timesteps | 2900000 | -| train/ | | -| approx_kl | 0.0075211767 | -| clip_fraction | 0.0727 | -| clip_range | 0.2 | -| entropy_loss | -3.29 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.0178 | -| n_updates | 1770 | -| policy_gradient_loss | -0.00285 | -| std | 1.27 | -| value_loss | 0.00798 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1751 | -| iterations | 178 | -| time_elapsed | 1664 | -| total_timesteps | 2916352 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1756 | -| iterations | 179 | -| time_elapsed | 1669 | -| total_timesteps | 2932736 | -| train/ | | -| approx_kl | 0.006763531 | -| clip_fraction | 0.0678 | -| clip_range | 0.2 | -| entropy_loss | -3.3 | -| explained_variance | 0.91 | -| learning_rate | 0.0003 | -| loss | -0.0349 | -| n_updates | 1780 | -| policy_gradient_loss | -0.00361 | -| std | 1.27 | -| value_loss | 0.00528 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1760 | -| iterations | 180 | -| time_elapsed | 1675 | -| total_timesteps | 2949120 | -| train/ | | -| approx_kl | 0.0067441636 | -| clip_fraction | 0.0732 | -| clip_range | 0.2 | -| entropy_loss | -3.3 | -| explained_variance | 0.888 | -| learning_rate | 0.0003 | -| loss | -0.0261 | -| n_updates | 1790 | -| policy_gradient_loss | -0.00291 | -| std | 1.27 | -| value_loss | 0.00582 | ------------------------------------------- -Eval num_timesteps=2950000, episode_reward=48.39 +/- 31.91 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 48.4 | -| time/ | | -| total_timesteps | 2950000 | -| train/ | | -| approx_kl | 0.0076025603 | -| clip_fraction | 0.0858 | -| clip_range | 0.2 | -| entropy_loss | -3.31 | -| explained_variance | 0.92 | -| learning_rate | 0.0003 | -| loss | -0.0394 | -| n_updates | 1800 | -| policy_gradient_loss | -0.00443 | -| std | 1.27 | -| value_loss | 0.00647 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1751 | -| iterations | 181 | -| time_elapsed | 1693 | -| total_timesteps | 2965504 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1754 | -| iterations | 182 | -| time_elapsed | 1699 | -| total_timesteps | 2981888 | -| train/ | | -| approx_kl | 0.008041672 | -| clip_fraction | 0.0795 | -| clip_range | 0.2 | -| entropy_loss | -3.3 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0344 | -| n_updates | 1810 | -| policy_gradient_loss | -0.00456 | -| std | 1.27 | -| value_loss | 0.00404 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1758 | -| iterations | 183 | -| time_elapsed | 1704 | -| total_timesteps | 2998272 | -| train/ | | -| approx_kl | 0.0066829836 | -| clip_fraction | 0.0712 | -| clip_range | 0.2 | -| entropy_loss | -3.32 | -| explained_variance | 0.921 | -| learning_rate | 0.0003 | -| loss | -0.0361 | -| n_updates | 1820 | -| policy_gradient_loss | -0.00379 | -| std | 1.28 | -| value_loss | 0.00818 | ------------------------------------------- -Eval num_timesteps=3000000, episode_reward=33.06 +/- 47.57 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 33.1 | -| time/ | | -| total_timesteps | 3000000 | -| train/ | | -| approx_kl | 0.006152373 | -| clip_fraction | 0.0633 | -| clip_range | 0.2 | -| entropy_loss | -3.33 | -| explained_variance | 0.912 | -| learning_rate | 0.0003 | -| loss | -0.0316 | -| n_updates | 1830 | -| policy_gradient_loss | -0.00335 | -| std | 1.29 | -| value_loss | 0.00404 | ------------------------------------------ - -[Diag @ 3,000,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 20/20 - action_mag mean=0.049 p10=0.005 p90=0.046 (0=stopped, 1=full speed) - min_flock_radius mean=8.21m best=5.29m (target <5m to compact) - min_dog_to_com mean=0.76m best=0.22m (FLEE_DIST=7m) - min_com_to_pen mean=12.62m best=4.77m - reward/step (mean): progress=+0.0089 alignment=+0.0386 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1740 | -| iterations | 184 | -| time_elapsed | 1731 | -| total_timesteps | 3014656 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1745 | -| iterations | 185 | -| time_elapsed | 1736 | -| total_timesteps | 3031040 | -| train/ | | -| approx_kl | 0.006385569 | -| clip_fraction | 0.0703 | -| clip_range | 0.2 | -| entropy_loss | -3.34 | -| explained_variance | 0.919 | -| learning_rate | 0.0003 | -| loss | -0.0313 | -| n_updates | 1840 | -| policy_gradient_loss | -0.00274 | -| std | 1.3 | -| value_loss | 0.00503 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1748 | -| iterations | 186 | -| time_elapsed | 1743 | -| total_timesteps | 3047424 | -| train/ | | -| approx_kl | 0.007695101 | -| clip_fraction | 0.0784 | -| clip_range | 0.2 | -| entropy_loss | -3.36 | -| explained_variance | 0.935 | -| learning_rate | 0.0003 | -| loss | -0.0244 | -| n_updates | 1850 | -| policy_gradient_loss | -0.00342 | -| std | 1.31 | -| value_loss | 0.0051 | ------------------------------------------ -Eval num_timesteps=3050000, episode_reward=45.25 +/- 31.57 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 45.2 | -| time/ | | -| total_timesteps | 3050000 | -| train/ | | -| approx_kl | 0.0067556566 | -| clip_fraction | 0.082 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.868 | -| learning_rate | 0.0003 | -| loss | -0.0349 | -| n_updates | 1860 | -| policy_gradient_loss | -0.00353 | -| std | 1.31 | -| value_loss | 0.00931 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1738 | -| iterations | 187 | -| time_elapsed | 1762 | -| total_timesteps | 3063808 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1741 | -| iterations | 188 | -| time_elapsed | 1768 | -| total_timesteps | 3080192 | -| train/ | | -| approx_kl | 0.008263266 | -| clip_fraction | 0.0792 | -| clip_range | 0.2 | -| entropy_loss | -3.36 | -| explained_variance | 0.924 | -| learning_rate | 0.0003 | -| loss | -0.0411 | -| n_updates | 1870 | -| policy_gradient_loss | -0.00382 | -| std | 1.31 | -| value_loss | 0.00429 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1746 | -| iterations | 189 | -| time_elapsed | 1773 | -| total_timesteps | 3096576 | -| train/ | | -| approx_kl | 0.008488305 | -| clip_fraction | 0.08 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.925 | -| learning_rate | 0.0003 | -| loss | -0.0292 | -| n_updates | 1880 | -| policy_gradient_loss | -0.00441 | -| std | 1.31 | -| value_loss | 0.00748 | ------------------------------------------ -Eval num_timesteps=3100000, episode_reward=30.63 +/- 33.70 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 30.6 | -| time/ | | -| total_timesteps | 3100000 | -| train/ | | -| approx_kl | 0.0065515246 | -| clip_fraction | 0.0736 | -| clip_range | 0.2 | -| entropy_loss | -3.35 | -| explained_variance | 0.932 | -| learning_rate | 0.0003 | -| loss | 0.00192 | -| n_updates | 1890 | -| policy_gradient_loss | -0.00334 | -| std | 1.3 | -| value_loss | 0.00902 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1737 | -| iterations | 190 | -| time_elapsed | 1791 | -| total_timesteps | 3112960 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1741 | -| iterations | 191 | -| time_elapsed | 1796 | -| total_timesteps | 3129344 | -| train/ | | -| approx_kl | 0.0068135276 | -| clip_fraction | 0.0721 | -| clip_range | 0.2 | -| entropy_loss | -3.35 | -| explained_variance | 0.933 | -| learning_rate | 0.0003 | -| loss | -0.036 | -| n_updates | 1900 | -| policy_gradient_loss | -0.00403 | -| std | 1.29 | -| value_loss | 0.00616 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1731 | -| iterations | 192 | -| time_elapsed | 1817 | -| total_timesteps | 3145728 | -| train/ | | -| approx_kl | 0.0061126407 | -| clip_fraction | 0.0615 | -| clip_range | 0.2 | -| entropy_loss | -3.35 | -| explained_variance | 0.921 | -| learning_rate | 0.0003 | -| loss | -0.0355 | -| n_updates | 1910 | -| policy_gradient_loss | -0.00318 | -| std | 1.3 | -| value_loss | 0.0104 | ------------------------------------------- -Eval num_timesteps=3150000, episode_reward=33.88 +/- 34.31 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 33.9 | -| time/ | | -| total_timesteps | 3150000 | -| train/ | | -| approx_kl | 0.007734685 | -| clip_fraction | 0.0778 | -| clip_range | 0.2 | -| entropy_loss | -3.35 | -| explained_variance | 0.899 | -| learning_rate | 0.0003 | -| loss | -0.0323 | -| n_updates | 1920 | -| policy_gradient_loss | -0.00432 | -| std | 1.3 | -| value_loss | 0.0091 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1714 | -| iterations | 193 | -| time_elapsed | 1844 | -| total_timesteps | 3162112 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1717 | -| iterations | 194 | -| time_elapsed | 1850 | -| total_timesteps | 3178496 | -| train/ | | -| approx_kl | 0.007997783 | -| clip_fraction | 0.0782 | -| clip_range | 0.2 | -| entropy_loss | -3.35 | -| explained_variance | 0.91 | -| learning_rate | 0.0003 | -| loss | -0.0525 | -| n_updates | 1930 | -| policy_gradient_loss | -0.00523 | -| std | 1.3 | -| value_loss | 0.00283 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1720 | -| iterations | 195 | -| time_elapsed | 1857 | -| total_timesteps | 3194880 | -| train/ | | -| approx_kl | 0.007701534 | -| clip_fraction | 0.0712 | -| clip_range | 0.2 | -| entropy_loss | -3.34 | -| explained_variance | 0.927 | -| learning_rate | 0.0003 | -| loss | -0.0367 | -| n_updates | 1940 | -| policy_gradient_loss | -0.00288 | -| std | 1.3 | -| value_loss | 0.0126 | ------------------------------------------ -Eval num_timesteps=3200000, episode_reward=46.55 +/- 34.01 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 46.6 | -| time/ | | -| total_timesteps | 3200000 | -| train/ | | -| approx_kl | 0.006747664 | -| clip_fraction | 0.0766 | -| clip_range | 0.2 | -| entropy_loss | -3.35 | -| explained_variance | 0.93 | -| learning_rate | 0.0003 | -| loss | -0.0411 | -| n_updates | 1950 | -| policy_gradient_loss | -0.00404 | -| std | 1.3 | -| value_loss | 0.00409 | ------------------------------------------ - -[Diag @ 3,200,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 20/20 - action_mag mean=0.078 p10=0.005 p90=0.057 (0=stopped, 1=full speed) - min_flock_radius mean=8.76m best=6.32m (target <5m to compact) - min_dog_to_com mean=0.81m best=0.36m (FLEE_DIST=7m) - min_com_to_pen mean=13.75m best=6.91m - reward/step (mean): progress=-0.0020 alignment=+0.0384 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 - -[Curriculum] leaving stage n_sheep=4 after 800,000 steps | training success rate (last 100 eps) = 0% -[Curriculum] → 5 sheep at step 3,200,000 - --------------------------------- -| time/ | | -| fps | 1704 | -| iterations | 196 | -| time_elapsed | 1884 | -| total_timesteps | 3211264 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1707 | -| iterations | 197 | -| time_elapsed | 1889 | -| total_timesteps | 3227648 | -| train/ | | -| approx_kl | 0.0068222135 | -| clip_fraction | 0.0816 | -| clip_range | 0.2 | -| entropy_loss | -3.36 | -| explained_variance | 0.922 | -| learning_rate | 0.0003 | -| loss | -0.0386 | -| n_updates | 1960 | -| policy_gradient_loss | -0.00374 | -| std | 1.31 | -| value_loss | 0.0112 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1711 | -| iterations | 198 | -| time_elapsed | 1895 | -| total_timesteps | 3244032 | -| train/ | | -| approx_kl | 0.006939999 | -| clip_fraction | 0.0829 | -| clip_range | 0.2 | -| entropy_loss | -3.36 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.0439 | -| n_updates | 1970 | -| policy_gradient_loss | -0.00433 | -| std | 1.31 | -| value_loss | 0.00895 | ------------------------------------------ -Eval num_timesteps=3250000, episode_reward=21.19 +/- 37.18 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 21.2 | -| time/ | | -| total_timesteps | 3250000 | -| train/ | | -| approx_kl | 0.007944042 | -| clip_fraction | 0.0812 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.925 | -| learning_rate | 0.0003 | -| loss | -0.0379 | -| n_updates | 1980 | -| policy_gradient_loss | -0.00306 | -| std | 1.31 | -| value_loss | 0.00578 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1702 | -| iterations | 199 | -| time_elapsed | 1914 | -| total_timesteps | 3260416 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1706 | -| iterations | 200 | -| time_elapsed | 1920 | -| total_timesteps | 3276800 | -| train/ | | -| approx_kl | 0.007009124 | -| clip_fraction | 0.0786 | -| clip_range | 0.2 | -| entropy_loss | -3.36 | -| explained_variance | 0.945 | -| learning_rate | 0.0003 | -| loss | -0.0398 | -| n_updates | 1990 | -| policy_gradient_loss | -0.00469 | -| std | 1.31 | -| value_loss | 0.00344 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1709 | -| iterations | 201 | -| time_elapsed | 1926 | -| total_timesteps | 3293184 | -| train/ | | -| approx_kl | 0.007446406 | -| clip_fraction | 0.0736 | -| clip_range | 0.2 | -| entropy_loss | -3.36 | -| explained_variance | 0.957 | -| learning_rate | 0.0003 | -| loss | -0.0493 | -| n_updates | 2000 | -| policy_gradient_loss | -0.00431 | -| std | 1.31 | -| value_loss | 0.00262 | ------------------------------------------ -Eval num_timesteps=3300000, episode_reward=18.42 +/- 36.17 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 18.4 | -| time/ | | -| total_timesteps | 3300000 | -| train/ | | -| approx_kl | 0.007855328 | -| clip_fraction | 0.0783 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.0381 | -| n_updates | 2010 | -| policy_gradient_loss | -0.00422 | -| std | 1.32 | -| value_loss | 0.00379 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1701 | -| iterations | 202 | -| time_elapsed | 1945 | -| total_timesteps | 3309568 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1704 | -| iterations | 203 | -| time_elapsed | 1951 | -| total_timesteps | 3325952 | -| train/ | | -| approx_kl | 0.0073990654 | -| clip_fraction | 0.0773 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.89 | -| learning_rate | 0.0003 | -| loss | -0.0319 | -| n_updates | 2020 | -| policy_gradient_loss | -0.00507 | -| std | 1.32 | -| value_loss | 0.0165 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1707 | -| iterations | 204 | -| time_elapsed | 1956 | -| total_timesteps | 3342336 | -| train/ | | -| approx_kl | 0.0076738494 | -| clip_fraction | 0.0913 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.914 | -| learning_rate | 0.0003 | -| loss | -0.0326 | -| n_updates | 2030 | -| policy_gradient_loss | -0.00611 | -| std | 1.32 | -| value_loss | 0.00854 | ------------------------------------------- -Eval num_timesteps=3350000, episode_reward=39.75 +/- 38.09 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 39.8 | -| time/ | | -| total_timesteps | 3350000 | -| train/ | | -| approx_kl | 0.007704767 | -| clip_fraction | 0.0813 | -| clip_range | 0.2 | -| entropy_loss | -3.39 | -| explained_variance | 0.822 | -| learning_rate | 0.0003 | -| loss | -0.0351 | -| n_updates | 2040 | -| policy_gradient_loss | -0.0056 | -| std | 1.33 | -| value_loss | 0.0095 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1700 | -| iterations | 205 | -| time_elapsed | 1974 | -| total_timesteps | 3358720 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1703 | -| iterations | 206 | -| time_elapsed | 1980 | -| total_timesteps | 3375104 | -| train/ | | -| approx_kl | 0.006841295 | -| clip_fraction | 0.0682 | -| clip_range | 0.2 | -| entropy_loss | -3.39 | -| explained_variance | 0.973 | -| learning_rate | 0.0003 | -| loss | -0.04 | -| n_updates | 2050 | -| policy_gradient_loss | -0.00457 | -| std | 1.33 | -| value_loss | 0.00456 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1707 | -| iterations | 207 | -| time_elapsed | 1986 | -| total_timesteps | 3391488 | -| train/ | | -| approx_kl | 0.0063885115 | -| clip_fraction | 0.0749 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.962 | -| learning_rate | 0.0003 | -| loss | -0.041 | -| n_updates | 2060 | -| policy_gradient_loss | -0.00455 | -| std | 1.34 | -| value_loss | 0.00373 | ------------------------------------------- -Eval num_timesteps=3400000, episode_reward=26.62 +/- 43.12 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 26.6 | -| time/ | | -| total_timesteps | 3400000 | -| train/ | | -| approx_kl | 0.006273965 | -| clip_fraction | 0.0709 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.956 | -| learning_rate | 0.0003 | -| loss | -0.0465 | -| n_updates | 2070 | -| policy_gradient_loss | -0.00249 | -| std | 1.33 | -| value_loss | 0.00679 | ------------------------------------------ - -[Diag @ 3,400,000 | n_sheep=5 | success=0%] - NEVER_COMPACT 20/20 - action_mag mean=0.089 p10=0.005 p90=0.074 (0=stopped, 1=full speed) - min_flock_radius mean=9.14m best=5.59m (target <5m to compact) - min_dog_to_com mean=0.69m best=0.10m (FLEE_DIST=7m) - min_com_to_pen mean=12.77m best=5.15m - reward/step (mean): progress=-0.0015 alignment=+0.0368 pen_bonus=+0.0020 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1691 | -| iterations | 208 | -| time_elapsed | 2014 | -| total_timesteps | 3407872 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1695 | -| iterations | 209 | -| time_elapsed | 2019 | -| total_timesteps | 3424256 | -| train/ | | -| approx_kl | 0.006433293 | -| clip_fraction | 0.0727 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.932 | -| learning_rate | 0.0003 | -| loss | -0.0268 | -| n_updates | 2080 | -| policy_gradient_loss | -0.00365 | -| std | 1.33 | -| value_loss | 0.00657 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1698 | -| iterations | 210 | -| time_elapsed | 2025 | -| total_timesteps | 3440640 | -| train/ | | -| approx_kl | 0.007235542 | -| clip_fraction | 0.0839 | -| clip_range | 0.2 | -| entropy_loss | -3.39 | -| explained_variance | 0.935 | -| learning_rate | 0.0003 | -| loss | -0.0344 | -| n_updates | 2090 | -| policy_gradient_loss | -0.00417 | -| std | 1.32 | -| value_loss | 0.0137 | ------------------------------------------ -Eval num_timesteps=3450000, episode_reward=35.54 +/- 43.01 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 35.5 | -| time/ | | -| total_timesteps | 3450000 | -| train/ | | -| approx_kl | 0.007782845 | -| clip_fraction | 0.0859 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.924 | -| learning_rate | 0.0003 | -| loss | -0.044 | -| n_updates | 2100 | -| policy_gradient_loss | -0.00561 | -| std | 1.34 | -| value_loss | 0.0043 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1690 | -| iterations | 211 | -| time_elapsed | 2044 | -| total_timesteps | 3457024 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1693 | -| iterations | 212 | -| time_elapsed | 2050 | -| total_timesteps | 3473408 | -| train/ | | -| approx_kl | 0.0075765867 | -| clip_fraction | 0.0746 | -| clip_range | 0.2 | -| entropy_loss | -3.41 | -| explained_variance | 0.896 | -| learning_rate | 0.0003 | -| loss | -0.0293 | -| n_updates | 2110 | -| policy_gradient_loss | -0.00406 | -| std | 1.33 | -| value_loss | 0.011 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1696 | -| iterations | 213 | -| time_elapsed | 2056 | -| total_timesteps | 3489792 | -| train/ | | -| approx_kl | 0.0072322125 | -| clip_fraction | 0.071 | -| clip_range | 0.2 | -| entropy_loss | -3.41 | -| explained_variance | 0.949 | -| learning_rate | 0.0003 | -| loss | -0.0498 | -| n_updates | 2120 | -| policy_gradient_loss | -0.00421 | -| std | 1.34 | -| value_loss | 0.006 | ------------------------------------------- -Eval num_timesteps=3500000, episode_reward=54.69 +/- 47.39 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 54.7 | -| time/ | | -| total_timesteps | 3500000 | -| train/ | | -| approx_kl | 0.0073479656 | -| clip_fraction | 0.0778 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.824 | -| learning_rate | 0.0003 | -| loss | -0.0408 | -| n_updates | 2130 | -| policy_gradient_loss | -0.00465 | -| std | 1.32 | -| value_loss | 0.00657 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1688 | -| iterations | 214 | -| time_elapsed | 2076 | -| total_timesteps | 3506176 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1692 | -| iterations | 215 | -| time_elapsed | 2081 | -| total_timesteps | 3522560 | -| train/ | | -| approx_kl | 0.007274649 | -| clip_fraction | 0.0798 | -| clip_range | 0.2 | -| entropy_loss | -3.39 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.0356 | -| n_updates | 2140 | -| policy_gradient_loss | -0.00383 | -| std | 1.33 | -| value_loss | 0.00355 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1695 | -| iterations | 216 | -| time_elapsed | 2087 | -| total_timesteps | 3538944 | -| train/ | | -| approx_kl | 0.0068056686 | -| clip_fraction | 0.0726 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.0428 | -| n_updates | 2150 | -| policy_gradient_loss | -0.00356 | -| std | 1.32 | -| value_loss | 0.00378 | ------------------------------------------- -Eval num_timesteps=3550000, episode_reward=8.69 +/- 39.03 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 8.69 | -| time/ | | -| total_timesteps | 3550000 | -| train/ | | -| approx_kl | 0.008211401 | -| clip_fraction | 0.0801 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.972 | -| learning_rate | 0.0003 | -| loss | -0.0366 | -| n_updates | 2160 | -| policy_gradient_loss | -0.00453 | -| std | 1.32 | -| value_loss | 0.00445 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1687 | -| iterations | 217 | -| time_elapsed | 2106 | -| total_timesteps | 3555328 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1690 | -| iterations | 218 | -| time_elapsed | 2112 | -| total_timesteps | 3571712 | -| train/ | | -| approx_kl | 0.008278061 | -| clip_fraction | 0.0871 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0324 | -| n_updates | 2170 | -| policy_gradient_loss | -0.00486 | -| std | 1.32 | -| value_loss | 0.00377 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1693 | -| iterations | 219 | -| time_elapsed | 2119 | -| total_timesteps | 3588096 | -| train/ | | -| approx_kl | 0.007908824 | -| clip_fraction | 0.0777 | -| clip_range | 0.2 | -| entropy_loss | -3.39 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.0353 | -| n_updates | 2180 | -| policy_gradient_loss | -0.00318 | -| std | 1.32 | -| value_loss | 0.00768 | ------------------------------------------ -Eval num_timesteps=3600000, episode_reward=26.00 +/- 35.20 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 26 | -| time/ | | -| total_timesteps | 3600000 | -| train/ | | -| approx_kl | 0.0068260087 | -| clip_fraction | 0.0761 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.946 | -| learning_rate | 0.0003 | -| loss | -0.0257 | -| n_updates | 2190 | -| policy_gradient_loss | -0.00375 | -| std | 1.32 | -| value_loss | 0.00745 | ------------------------------------------- - -[Diag @ 3,600,000 | n_sheep=5 | success=0%] - NEVER_COMPACT 20/20 - action_mag mean=0.114 p10=0.006 p90=0.281 (0=stopped, 1=full speed) - min_flock_radius mean=9.62m best=5.04m (target <5m to compact) - min_dog_to_com mean=0.77m best=0.40m (FLEE_DIST=7m) - min_com_to_pen mean=13.31m best=6.37m - reward/step (mean): progress=+0.0071 alignment=+0.0385 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1677 | -| iterations | 220 | -| time_elapsed | 2148 | -| total_timesteps | 3604480 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1680 | -| iterations | 221 | -| time_elapsed | 2154 | -| total_timesteps | 3620864 | -| train/ | | -| approx_kl | 0.0084966235 | -| clip_fraction | 0.0849 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.936 | -| learning_rate | 0.0003 | -| loss | -0.0498 | -| n_updates | 2200 | -| policy_gradient_loss | -0.00478 | -| std | 1.32 | -| value_loss | 0.00856 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1683 | -| iterations | 222 | -| time_elapsed | 2160 | -| total_timesteps | 3637248 | -| train/ | | -| approx_kl | 0.007236682 | -| clip_fraction | 0.072 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.956 | -| learning_rate | 0.0003 | -| loss | -0.0436 | -| n_updates | 2210 | -| policy_gradient_loss | -0.0054 | -| std | 1.31 | -| value_loss | 0.00748 | ------------------------------------------ -Eval num_timesteps=3650000, episode_reward=48.26 +/- 45.24 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 48.3 | -| time/ | | -| total_timesteps | 3650000 | -| train/ | | -| approx_kl | 0.0076099336 | -| clip_fraction | 0.0694 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | -0.037 | -| n_updates | 2220 | -| policy_gradient_loss | -0.00369 | -| std | 1.31 | -| value_loss | 0.00888 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1676 | -| iterations | 223 | -| time_elapsed | 2179 | -| total_timesteps | 3653632 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1679 | -| iterations | 224 | -| time_elapsed | 2185 | -| total_timesteps | 3670016 | -| train/ | | -| approx_kl | 0.007888832 | -| clip_fraction | 0.0783 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.914 | -| learning_rate | 0.0003 | -| loss | -0.0298 | -| n_updates | 2230 | -| policy_gradient_loss | -0.00449 | -| std | 1.32 | -| value_loss | 0.00867 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1682 | -| iterations | 225 | -| time_elapsed | 2190 | -| total_timesteps | 3686400 | -| train/ | | -| approx_kl | 0.0069514583 | -| clip_fraction | 0.0791 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.946 | -| learning_rate | 0.0003 | -| loss | -0.0283 | -| n_updates | 2240 | -| policy_gradient_loss | -0.00427 | -| std | 1.32 | -| value_loss | 0.00382 | ------------------------------------------- -Eval num_timesteps=3700000, episode_reward=19.29 +/- 50.45 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 19.3 | -| time/ | | -| total_timesteps | 3700000 | -| train/ | | -| approx_kl | 0.008142319 | -| clip_fraction | 0.0865 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.92 | -| learning_rate | 0.0003 | -| loss | -0.0467 | -| n_updates | 2250 | -| policy_gradient_loss | -0.00506 | -| std | 1.31 | -| value_loss | 0.00547 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1674 | -| iterations | 226 | -| time_elapsed | 2210 | -| total_timesteps | 3702784 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1677 | -| iterations | 227 | -| time_elapsed | 2216 | -| total_timesteps | 3719168 | -| train/ | | -| approx_kl | 0.0077144434 | -| clip_fraction | 0.0783 | -| clip_range | 0.2 | -| entropy_loss | -3.36 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0331 | -| n_updates | 2260 | -| policy_gradient_loss | -0.00529 | -| std | 1.31 | -| value_loss | 0.00486 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1667 | -| iterations | 228 | -| time_elapsed | 2239 | -| total_timesteps | 3735552 | -| train/ | | -| approx_kl | 0.007820845 | -| clip_fraction | 0.087 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.95 | -| learning_rate | 0.0003 | -| loss | -0.0321 | -| n_updates | 2270 | -| policy_gradient_loss | -0.00493 | -| std | 1.31 | -| value_loss | 0.00531 | ------------------------------------------ -Eval num_timesteps=3750000, episode_reward=35.91 +/- 47.57 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 35.9 | -| time/ | | -| total_timesteps | 3750000 | -| train/ | | -| approx_kl | 0.008380983 | -| clip_fraction | 0.0868 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.927 | -| learning_rate | 0.0003 | -| loss | -0.0318 | -| n_updates | 2280 | -| policy_gradient_loss | -0.0046 | -| std | 1.32 | -| value_loss | 0.00684 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1639 | -| iterations | 229 | -| time_elapsed | 2289 | -| total_timesteps | 3751936 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1642 | -| iterations | 230 | -| time_elapsed | 2294 | -| total_timesteps | 3768320 | -| train/ | | -| approx_kl | 0.007415652 | -| clip_fraction | 0.0758 | -| clip_range | 0.2 | -| entropy_loss | -3.37 | -| explained_variance | 0.953 | -| learning_rate | 0.0003 | -| loss | -0.0354 | -| n_updates | 2290 | -| policy_gradient_loss | -0.00557 | -| std | 1.31 | -| value_loss | 0.0122 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1646 | -| iterations | 231 | -| time_elapsed | 2299 | -| total_timesteps | 3784704 | -| train/ | | -| approx_kl | 0.0071868873 | -| clip_fraction | 0.0736 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.954 | -| learning_rate | 0.0003 | -| loss | -0.0457 | -| n_updates | 2300 | -| policy_gradient_loss | -0.00442 | -| std | 1.33 | -| value_loss | 0.0201 | ------------------------------------------- -Eval num_timesteps=3800000, episode_reward=31.58 +/- 50.62 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 31.6 | -| time/ | | -| total_timesteps | 3800000 | -| train/ | | -| approx_kl | 0.0074889637 | -| clip_fraction | 0.0805 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.95 | -| learning_rate | 0.0003 | -| loss | -0.0355 | -| n_updates | 2310 | -| policy_gradient_loss | -0.00474 | -| std | 1.33 | -| value_loss | 0.00892 | ------------------------------------------- - -[Diag @ 3,800,000 | n_sheep=5 | success=0%] - NEVER_COMPACT 19/20 - COMPACT_CANT_DRIVE 1/20 - action_mag mean=0.128 p10=0.005 p90=0.475 (0=stopped, 1=full speed) - min_flock_radius mean=8.35m best=4.80m (target <5m to compact) - min_dog_to_com mean=0.71m best=0.23m (FLEE_DIST=7m) - min_com_to_pen mean=13.72m best=8.54m - reward/step (mean): progress=+0.0063 alignment=+0.0388 pen_bonus=+0.0010 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1633 | -| iterations | 232 | -| time_elapsed | 2326 | -| total_timesteps | 3801088 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1636 | -| iterations | 233 | -| time_elapsed | 2332 | -| total_timesteps | 3817472 | -| train/ | | -| approx_kl | 0.0070604184 | -| clip_fraction | 0.0765 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.953 | -| learning_rate | 0.0003 | -| loss | -0.0398 | -| n_updates | 2320 | -| policy_gradient_loss | -0.00453 | -| std | 1.33 | -| value_loss | 0.00675 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1640 | -| iterations | 234 | -| time_elapsed | 2336 | -| total_timesteps | 3833856 | -| train/ | | -| approx_kl | 0.007709453 | -| clip_fraction | 0.0816 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.943 | -| learning_rate | 0.0003 | -| loss | -0.0359 | -| n_updates | 2330 | -| policy_gradient_loss | -0.00423 | -| std | 1.34 | -| value_loss | 0.00754 | ------------------------------------------ -Eval num_timesteps=3850000, episode_reward=42.98 +/- 33.36 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 43 | -| time/ | | -| total_timesteps | 3850000 | -| train/ | | -| approx_kl | 0.007679659 | -| clip_fraction | 0.0858 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.961 | -| learning_rate | 0.0003 | -| loss | -0.032 | -| n_updates | 2340 | -| policy_gradient_loss | -0.00716 | -| std | 1.33 | -| value_loss | 0.00907 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1635 | -| iterations | 235 | -| time_elapsed | 2354 | -| total_timesteps | 3850240 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1638 | -| iterations | 236 | -| time_elapsed | 2360 | -| total_timesteps | 3866624 | -| train/ | | -| approx_kl | 0.0077598644 | -| clip_fraction | 0.0848 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.96 | -| learning_rate | 0.0003 | -| loss | -0.0468 | -| n_updates | 2350 | -| policy_gradient_loss | -0.005 | -| std | 1.33 | -| value_loss | 0.0101 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1641 | -| iterations | 237 | -| time_elapsed | 2366 | -| total_timesteps | 3883008 | -| train/ | | -| approx_kl | 0.0068941545 | -| clip_fraction | 0.0673 | -| clip_range | 0.2 | -| entropy_loss | -3.39 | -| explained_variance | 0.96 | -| learning_rate | 0.0003 | -| loss | -0.0398 | -| n_updates | 2360 | -| policy_gradient_loss | -0.0047 | -| std | 1.33 | -| value_loss | 0.0113 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1643 | -| iterations | 238 | -| time_elapsed | 2372 | -| total_timesteps | 3899392 | -| train/ | | -| approx_kl | 0.0073663425 | -| clip_fraction | 0.0785 | -| clip_range | 0.2 | -| entropy_loss | -3.41 | -| explained_variance | 0.963 | -| learning_rate | 0.0003 | -| loss | -0.0319 | -| n_updates | 2370 | -| policy_gradient_loss | -0.00458 | -| std | 1.35 | -| value_loss | 0.0036 | ------------------------------------------- -Eval num_timesteps=3900000, episode_reward=33.74 +/- 40.96 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 33.7 | -| time/ | | -| total_timesteps | 3900000 | -| train/ | | -| approx_kl | 0.007122398 | -| clip_fraction | 0.0759 | -| clip_range | 0.2 | -| entropy_loss | -3.41 | -| explained_variance | 0.972 | -| learning_rate | 0.0003 | -| loss | -0.0383 | -| n_updates | 2380 | -| policy_gradient_loss | -0.00446 | -| std | 1.35 | -| value_loss | 0.00445 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1637 | -| iterations | 239 | -| time_elapsed | 2391 | -| total_timesteps | 3915776 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1640 | -| iterations | 240 | -| time_elapsed | 2396 | -| total_timesteps | 3932160 | -| train/ | | -| approx_kl | 0.008265208 | -| clip_fraction | 0.0845 | -| clip_range | 0.2 | -| entropy_loss | -3.41 | -| explained_variance | 0.926 | -| learning_rate | 0.0003 | -| loss | -0.0361 | -| n_updates | 2390 | -| policy_gradient_loss | -0.00536 | -| std | 1.34 | -| value_loss | 0.00846 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1643 | -| iterations | 241 | -| time_elapsed | 2402 | -| total_timesteps | 3948544 | -| train/ | | -| approx_kl | 0.008583728 | -| clip_fraction | 0.0893 | -| clip_range | 0.2 | -| entropy_loss | -3.42 | -| explained_variance | 0.915 | -| learning_rate | 0.0003 | -| loss | -0.0297 | -| n_updates | 2400 | -| policy_gradient_loss | -0.00592 | -| std | 1.35 | -| value_loss | 0.0068 | ------------------------------------------ -Eval num_timesteps=3950000, episode_reward=46.06 +/- 34.67 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 46.1 | -| time/ | | -| total_timesteps | 3950000 | -| train/ | | -| approx_kl | 0.0060660206 | -| clip_fraction | 0.0654 | -| clip_range | 0.2 | -| entropy_loss | -3.42 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | -0.0359 | -| n_updates | 2410 | -| policy_gradient_loss | -0.0038 | -| std | 1.35 | -| value_loss | 0.00296 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1637 | -| iterations | 242 | -| time_elapsed | 2421 | -| total_timesteps | 3964928 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1639 | -| iterations | 243 | -| time_elapsed | 2427 | -| total_timesteps | 3981312 | -| train/ | | -| approx_kl | 0.007591601 | -| clip_fraction | 0.0808 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.964 | -| learning_rate | 0.0003 | -| loss | -0.0386 | -| n_updates | 2420 | -| policy_gradient_loss | -0.00575 | -| std | 1.34 | -| value_loss | 0.00714 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1642 | -| iterations | 244 | -| time_elapsed | 2433 | -| total_timesteps | 3997696 | -| train/ | | -| approx_kl | 0.006255053 | -| clip_fraction | 0.0663 | -| clip_range | 0.2 | -| entropy_loss | -3.41 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0304 | -| n_updates | 2430 | -| policy_gradient_loss | -0.00497 | -| std | 1.35 | -| value_loss | 0.00585 | ------------------------------------------ -Eval num_timesteps=4000000, episode_reward=19.52 +/- 38.43 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | 19.5 | -| time/ | | -| total_timesteps | 4000000 | -| train/ | | -| approx_kl | 0.008279499 | -| clip_fraction | 0.0814 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.958 | -| learning_rate | 0.0003 | -| loss | -0.0379 | -| n_updates | 2440 | -| policy_gradient_loss | -0.00568 | -| std | 1.34 | -| value_loss | 0.00469 | ------------------------------------------ - -[Diag @ 4,000,000 | n_sheep=5 | success=0%] - NEVER_COMPACT 20/20 - action_mag mean=0.158 p10=0.006 p90=0.744 (0=stopped, 1=full speed) - min_flock_radius mean=8.94m best=6.34m (target <5m to compact) - min_dog_to_com mean=0.82m best=0.49m (FLEE_DIST=7m) - min_com_to_pen mean=13.86m best=7.80m - reward/step (mean): progress=+0.0029 alignment=+0.0397 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1630 | -| iterations | 245 | -| time_elapsed | 2461 | -| total_timesteps | 4014080 | --------------------------------- - -Training complete. Artefacts saved to runs/ppo_debug/ diff --git a/training/runs/ppo_debug/best_model/best_model.zip b/training/runs/ppo_debug/best_model/best_model.zip deleted file mode 100644 index 2618c2c..0000000 Binary files a/training/runs/ppo_debug/best_model/best_model.zip and /dev/null differ diff --git a/training/runs/ppo_debug/evaluations.npz b/training/runs/ppo_debug/evaluations.npz deleted file mode 100644 index 84fd19d..0000000 Binary files a/training/runs/ppo_debug/evaluations.npz and /dev/null differ diff --git a/training/runs/ppo_debug/final_model.zip b/training/runs/ppo_debug/final_model.zip deleted file mode 100644 index e3be97e..0000000 Binary files a/training/runs/ppo_debug/final_model.zip and /dev/null differ diff --git a/training/runs/ppo_debug/vecnorm.pkl b/training/runs/ppo_debug/vecnorm.pkl deleted file mode 100644 index c17b706..0000000 Binary files a/training/runs/ppo_debug/vecnorm.pkl and /dev/null differ diff --git a/training/runs/ppo_fix_check.log b/training/runs/ppo_fix_check.log deleted file mode 100644 index 39ace5a..0000000 --- a/training/runs/ppo_fix_check.log +++ /dev/null @@ -1,3388 +0,0 @@ -Using cpu device -Logging to runs/ppo_fix_check/ppo_1 ------------------------------- -| time/ | | -| fps | 5021 | -| iterations | 1 | -| time_elapsed | 3 | -| total_timesteps | 16384 | ------------------------------- ------------------------------------------- -| time/ | | -| fps | 4241 | -| iterations | 2 | -| time_elapsed | 7 | -| total_timesteps | 32768 | -| train/ | | -| approx_kl | 0.0047510993 | -| clip_fraction | 0.0344 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.786 | -| learning_rate | 0.0003 | -| loss | -0.00995 | -| n_updates | 10 | -| policy_gradient_loss | -0.00156 | -| std | 1.01 | -| value_loss | 0.0657 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 4026 | -| iterations | 3 | -| time_elapsed | 12 | -| total_timesteps | 49152 | -| train/ | | -| approx_kl | 0.0032065492 | -| clip_fraction | 0.0328 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.868 | -| learning_rate | 0.0003 | -| loss | -0.0327 | -| n_updates | 20 | -| policy_gradient_loss | -0.00152 | -| std | 1.02 | -| value_loss | 0.0172 | ------------------------------------------- -/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. - warnings.warn( -Eval num_timesteps=50000, episode_reward=-25.33 +/- 56.30 -Episode length: 1859.00 +/- 393.69 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.86e+03 | -| mean_reward | -25.3 | -| time/ | | -| total_timesteps | 50000 | -| train/ | | -| approx_kl | 0.0038272792 | -| clip_fraction | 0.0312 | -| clip_range | 0.2 | -| entropy_loss | -2.89 | -| explained_variance | 0.891 | -| learning_rate | 0.0003 | -| loss | -0.0224 | -| n_updates | 30 | -| policy_gradient_loss | -0.0019 | -| std | 1.02 | -| value_loss | 0.0227 | ------------------------------------------- -New best mean reward! ------------------------------- -| time/ | | -| fps | 2387 | -| iterations | 4 | -| time_elapsed | 27 | -| total_timesteps | 65536 | ------------------------------- ------------------------------------------- -| time/ | | -| fps | 2563 | -| iterations | 5 | -| time_elapsed | 31 | -| total_timesteps | 81920 | -| train/ | | -| approx_kl | 0.0040233894 | -| clip_fraction | 0.0323 | -| clip_range | 0.2 | -| entropy_loss | -2.87 | -| explained_variance | 0.878 | -| learning_rate | 0.0003 | -| loss | -0.0251 | -| n_updates | 40 | -| policy_gradient_loss | -0.00247 | -| std | 1.01 | -| value_loss | 0.0169 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2719 | -| iterations | 6 | -| time_elapsed | 36 | -| total_timesteps | 98304 | -| train/ | | -| approx_kl | 0.003573698 | -| clip_fraction | 0.0316 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.865 | -| learning_rate | 0.0003 | -| loss | -0.0219 | -| n_updates | 50 | -| policy_gradient_loss | -0.0019 | -| std | 1.01 | -| value_loss | 0.022 | ------------------------------------------ -/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. - warnings.warn( -Eval num_timesteps=100000, episode_reward=-29.60 +/- 36.59 -Episode length: 1939.35 +/- 264.37 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.94e+03 | -| mean_reward | -29.6 | -| time/ | | -| total_timesteps | 100000 | -| train/ | | -| approx_kl | 0.0046861977 | -| clip_fraction | 0.039 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.815 | -| learning_rate | 0.0003 | -| loss | -0.0257 | -| n_updates | 60 | -| policy_gradient_loss | -0.00203 | -| std | 1.01 | -| value_loss | 0.0201 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 2191 | -| iterations | 7 | -| time_elapsed | 52 | -| total_timesteps | 114688 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2314 | -| iterations | 8 | -| time_elapsed | 56 | -| total_timesteps | 131072 | -| train/ | | -| approx_kl | 0.005258695 | -| clip_fraction | 0.0503 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.807 | -| learning_rate | 0.0003 | -| loss | -0.0211 | -| n_updates | 70 | -| policy_gradient_loss | -0.00398 | -| std | 1.01 | -| value_loss | 0.0164 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 2359 | -| iterations | 9 | -| time_elapsed | 62 | -| total_timesteps | 147456 | -| train/ | | -| approx_kl | 0.0043328116 | -| clip_fraction | 0.0332 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.811 | -| learning_rate | 0.0003 | -| loss | -0.0259 | -| n_updates | 80 | -| policy_gradient_loss | -0.00173 | -| std | 1.01 | -| value_loss | 0.0121 | ------------------------------------------- -Eval num_timesteps=150000, episode_reward=-33.97 +/- 37.15 -Episode length: 1954.85 +/- 196.80 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.95e+03 | -| mean_reward | -34 | -| time/ | | -| total_timesteps | 150000 | -| train/ | | -| approx_kl | 0.005169191 | -| clip_fraction | 0.0506 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.649 | -| learning_rate | 0.0003 | -| loss | -0.0287 | -| n_updates | 90 | -| policy_gradient_loss | -0.00384 | -| std | 1 | -| value_loss | 0.0162 | ------------------------------------------ - -[Diag @ 150,000 | n_sheep=1 | success=15%] - COMPACT_CANT_DRIVE 16/20 - SUCCESS 3/20 - DROVE_NO_SHEEP 1/20 - action_mag mean=0.239 p10=0.071 p90=0.433 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=4.80m best=1.70m (FLEE_DIST=7m) - min_com_to_pen mean=10.22m best=1.50m - reward/step (mean): progress=+0.0013 alignment=+0.0000 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0078 -------------------------------- -| time/ | | -| fps | 1935 | -| iterations | 10 | -| time_elapsed | 84 | -| total_timesteps | 163840 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2014 | -| iterations | 11 | -| time_elapsed | 89 | -| total_timesteps | 180224 | -| train/ | | -| approx_kl | 0.0039950563 | -| clip_fraction | 0.0276 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.623 | -| learning_rate | 0.0003 | -| loss | -0.0128 | -| n_updates | 100 | -| policy_gradient_loss | -0.00208 | -| std | 0.995 | -| value_loss | 0.0959 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2093 | -| iterations | 12 | -| time_elapsed | 93 | -| total_timesteps | 196608 | -| train/ | | -| approx_kl | 0.0036244316 | -| clip_fraction | 0.0299 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.916 | -| learning_rate | 0.0003 | -| loss | -0.0251 | -| n_updates | 110 | -| policy_gradient_loss | -0.00229 | -| std | 0.991 | -| value_loss | 0.0118 | ------------------------------------------- -Eval num_timesteps=200000, episode_reward=-36.37 +/- 39.41 -Episode length: 1950.95 +/- 213.80 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.95e+03 | -| mean_reward | -36.4 | -| time/ | | -| total_timesteps | 200000 | -| train/ | | -| approx_kl | 0.003325508 | -| clip_fraction | 0.0223 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.858 | -| learning_rate | 0.0003 | -| loss | -0.0279 | -| n_updates | 120 | -| policy_gradient_loss | -0.0007 | -| std | 0.999 | -| value_loss | 0.0493 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 1964 | -| iterations | 13 | -| time_elapsed | 108 | -| total_timesteps | 212992 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2034 | -| iterations | 14 | -| time_elapsed | 112 | -| total_timesteps | 229376 | -| train/ | | -| approx_kl | 0.004660043 | -| clip_fraction | 0.0403 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.719 | -| learning_rate | 0.0003 | -| loss | 0.128 | -| n_updates | 130 | -| policy_gradient_loss | -0.00265 | -| std | 1.01 | -| value_loss | 0.073 | ------------------------------------------ ----------------------------------------- -| time/ | | -| fps | 2103 | -| iterations | 15 | -| time_elapsed | 116 | -| total_timesteps | 245760 | -| train/ | | -| approx_kl | 0.00501227 | -| clip_fraction | 0.0499 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.847 | -| learning_rate | 0.0003 | -| loss | -0.0237 | -| n_updates | 140 | -| policy_gradient_loss | -0.00264 | -| std | 1.02 | -| value_loss | 0.0415 | ----------------------------------------- -Eval num_timesteps=250000, episode_reward=-44.92 +/- 15.63 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -44.9 | -| time/ | | -| total_timesteps | 250000 | -| train/ | | -| approx_kl | 0.0055294414 | -| clip_fraction | 0.06 | -| clip_range | 0.2 | -| entropy_loss | -2.89 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.0274 | -| n_updates | 150 | -| policy_gradient_loss | -0.00491 | -| std | 1.03 | -| value_loss | 0.014 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1999 | -| iterations | 16 | -| time_elapsed | 131 | -| total_timesteps | 262144 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2051 | -| iterations | 17 | -| time_elapsed | 135 | -| total_timesteps | 278528 | -| train/ | | -| approx_kl | 0.0051201656 | -| clip_fraction | 0.0301 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.941 | -| learning_rate | 0.0003 | -| loss | 0.148 | -| n_updates | 160 | -| policy_gradient_loss | -0.00199 | -| std | 1.02 | -| value_loss | 0.099 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2096 | -| iterations | 18 | -| time_elapsed | 140 | -| total_timesteps | 294912 | -| train/ | | -| approx_kl | 0.004261789 | -| clip_fraction | 0.0328 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | -0.0314 | -| n_updates | 170 | -| policy_gradient_loss | -0.00243 | -| std | 1.02 | -| value_loss | 0.0117 | ------------------------------------------ -Eval num_timesteps=300000, episode_reward=-44.79 +/- 17.68 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -44.8 | -| time/ | | -| total_timesteps | 300000 | -| train/ | | -| approx_kl | 0.004783842 | -| clip_fraction | 0.0296 | -| clip_range | 0.2 | -| entropy_loss | -2.87 | -| explained_variance | 0.892 | -| learning_rate | 0.0003 | -| loss | -0.0219 | -| n_updates | 180 | -| policy_gradient_loss | -0.00159 | -| std | 1.01 | -| value_loss | 0.0497 | ------------------------------------------ - -[Diag @ 300,000 | n_sheep=1 | success=0%] - COMPACT_CANT_DRIVE 17/20 - DROVE_NO_SHEEP 3/20 - action_mag mean=0.241 p10=0.109 p90=0.389 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=4.77m best=2.12m (FLEE_DIST=7m) - min_com_to_pen mean=9.31m best=1.50m - reward/step (mean): progress=+0.0016 alignment=+0.0000 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 -------------------------------- -| time/ | | -| fps | 1905 | -| iterations | 19 | -| time_elapsed | 163 | -| total_timesteps | 311296 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 1949 | -| iterations | 20 | -| time_elapsed | 168 | -| total_timesteps | 327680 | -| train/ | | -| approx_kl | 0.0033368056 | -| clip_fraction | 0.0258 | -| clip_range | 0.2 | -| entropy_loss | -2.87 | -| explained_variance | 0.794 | -| learning_rate | 0.0003 | -| loss | -0.0211 | -| n_updates | 190 | -| policy_gradient_loss | -0.00105 | -| std | 1.02 | -| value_loss | 0.0769 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1992 | -| iterations | 21 | -| time_elapsed | 172 | -| total_timesteps | 344064 | -| train/ | | -| approx_kl | 0.0046488494 | -| clip_fraction | 0.0352 | -| clip_range | 0.2 | -| entropy_loss | -2.87 | -| explained_variance | 0.927 | -| learning_rate | 0.0003 | -| loss | -0.0274 | -| n_updates | 200 | -| policy_gradient_loss | -0.00331 | -| std | 1.02 | -| value_loss | 0.0165 | ------------------------------------------- -Eval num_timesteps=350000, episode_reward=-24.90 +/- 50.25 -Episode length: 1976.75 +/- 82.03 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.98e+03 | -| mean_reward | -24.9 | -| time/ | | -| total_timesteps | 350000 | -| train/ | | -| approx_kl | 0.0041725934 | -| clip_fraction | 0.0299 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.944 | -| learning_rate | 0.0003 | -| loss | -0.026 | -| n_updates | 210 | -| policy_gradient_loss | -0.0026 | -| std | 1.02 | -| value_loss | 0.00665 | ------------------------------------------- -New best mean reward! -------------------------------- -| time/ | | -| fps | 1921 | -| iterations | 22 | -| time_elapsed | 187 | -| total_timesteps | 360448 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1963 | -| iterations | 23 | -| time_elapsed | 191 | -| total_timesteps | 376832 | -| train/ | | -| approx_kl | 0.005180447 | -| clip_fraction | 0.0532 | -| clip_range | 0.2 | -| entropy_loss | -2.87 | -| explained_variance | 0.956 | -| learning_rate | 0.0003 | -| loss | -0.0255 | -| n_updates | 220 | -| policy_gradient_loss | -0.00352 | -| std | 1.02 | -| value_loss | 0.0142 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1990 | -| iterations | 24 | -| time_elapsed | 197 | -| total_timesteps | 393216 | -| train/ | | -| approx_kl | 0.004661506 | -| clip_fraction | 0.0443 | -| clip_range | 0.2 | -| entropy_loss | -2.87 | -| explained_variance | 0.967 | -| learning_rate | 0.0003 | -| loss | -0.0331 | -| n_updates | 230 | -| policy_gradient_loss | -0.00441 | -| std | 1.02 | -| value_loss | 0.0112 | ------------------------------------------ -Eval num_timesteps=400000, episode_reward=-26.04 +/- 47.69 -Episode length: 1890.85 +/- 367.20 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.89e+03 | -| mean_reward | -26 | -| time/ | | -| total_timesteps | 400000 | -| train/ | | -| approx_kl | 0.005491742 | -| clip_fraction | 0.0538 | -| clip_range | 0.2 | -| entropy_loss | -2.89 | -| explained_variance | 0.941 | -| learning_rate | 0.0003 | -| loss | -0.042 | -| n_updates | 240 | -| policy_gradient_loss | -0.00297 | -| std | 1.03 | -| value_loss | 0.00877 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 1927 | -| iterations | 25 | -| time_elapsed | 212 | -| total_timesteps | 409600 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 1966 | -| iterations | 26 | -| time_elapsed | 216 | -| total_timesteps | 425984 | -| train/ | | -| approx_kl | 0.0045445506 | -| clip_fraction | 0.0385 | -| clip_range | 0.2 | -| entropy_loss | -2.91 | -| explained_variance | 0.941 | -| learning_rate | 0.0003 | -| loss | -0.0343 | -| n_updates | 250 | -| policy_gradient_loss | -0.00307 | -| std | 1.04 | -| value_loss | 0.00818 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2004 | -| iterations | 27 | -| time_elapsed | 220 | -| total_timesteps | 442368 | -| train/ | | -| approx_kl | 0.0045271795 | -| clip_fraction | 0.0373 | -| clip_range | 0.2 | -| entropy_loss | -2.94 | -| explained_variance | 0.97 | -| learning_rate | 0.0003 | -| loss | -0.0361 | -| n_updates | 260 | -| policy_gradient_loss | -0.00236 | -| std | 1.05 | -| value_loss | 0.0091 | ------------------------------------------- -Eval num_timesteps=450000, episode_reward=-24.58 +/- 48.73 -Episode length: 1907.85 +/- 276.46 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.91e+03 | -| mean_reward | -24.6 | -| time/ | | -| total_timesteps | 450000 | -| train/ | | -| approx_kl | 0.0052676853 | -| clip_fraction | 0.0498 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.948 | -| learning_rate | 0.0003 | -| loss | -0.0261 | -| n_updates | 270 | -| policy_gradient_loss | -0.00236 | -| std | 1.07 | -| value_loss | 0.0286 | ------------------------------------------- -New best mean reward! - -[Diag @ 450,000 | n_sheep=1 | success=5%] - COMPACT_CANT_DRIVE 18/20 - DROVE_NO_SHEEP 1/20 - SUCCESS 1/20 - action_mag mean=0.272 p10=0.139 p90=0.407 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=4.81m best=1.54m (FLEE_DIST=7m) - min_com_to_pen mean=12.36m best=1.96m - reward/step (mean): progress=+0.0012 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0025 -------------------------------- -| time/ | | -| fps | 1893 | -| iterations | 28 | -| time_elapsed | 242 | -| total_timesteps | 458752 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1928 | -| iterations | 29 | -| time_elapsed | 246 | -| total_timesteps | 475136 | -| train/ | | -| approx_kl | 0.004465497 | -| clip_fraction | 0.0376 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.948 | -| learning_rate | 0.0003 | -| loss | -0.0307 | -| n_updates | 280 | -| policy_gradient_loss | -0.00259 | -| std | 1.07 | -| value_loss | 0.0213 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1961 | -| iterations | 30 | -| time_elapsed | 250 | -| total_timesteps | 491520 | -| train/ | | -| approx_kl | 0.0054338034 | -| clip_fraction | 0.0512 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.967 | -| learning_rate | 0.0003 | -| loss | -0.021 | -| n_updates | 290 | -| policy_gradient_loss | -0.00296 | -| std | 1.07 | -| value_loss | 0.0138 | ------------------------------------------- -Eval num_timesteps=500000, episode_reward=-44.13 +/- 20.75 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -44.1 | -| time/ | | -| total_timesteps | 500000 | -| train/ | | -| approx_kl | 0.006292434 | -| clip_fraction | 0.0572 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.937 | -| learning_rate | 0.0003 | -| loss | -0.0398 | -| n_updates | 300 | -| policy_gradient_loss | -0.00516 | -| std | 1.07 | -| value_loss | 0.00832 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 1913 | -| iterations | 31 | -| time_elapsed | 265 | -| total_timesteps | 507904 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 1940 | -| iterations | 32 | -| time_elapsed | 270 | -| total_timesteps | 524288 | -| train/ | | -| approx_kl | 0.0063960385 | -| clip_fraction | 0.0702 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | -0.0341 | -| n_updates | 310 | -| policy_gradient_loss | -0.00436 | -| std | 1.06 | -| value_loss | 0.0189 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1968 | -| iterations | 33 | -| time_elapsed | 274 | -| total_timesteps | 540672 | -| train/ | | -| approx_kl | 0.0070166546 | -| clip_fraction | 0.0888 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.0376 | -| n_updates | 320 | -| policy_gradient_loss | -0.00631 | -| std | 1.06 | -| value_loss | 0.00861 | ------------------------------------------- -Eval num_timesteps=550000, episode_reward=-38.60 +/- 14.53 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -38.6 | -| time/ | | -| total_timesteps | 550000 | -| train/ | | -| approx_kl | 0.0068266992 | -| clip_fraction | 0.075 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.959 | -| learning_rate | 0.0003 | -| loss | -0.0252 | -| n_updates | 330 | -| policy_gradient_loss | -0.00593 | -| std | 1.07 | -| value_loss | 0.0131 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1922 | -| iterations | 34 | -| time_elapsed | 289 | -| total_timesteps | 557056 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1950 | -| iterations | 35 | -| time_elapsed | 294 | -| total_timesteps | 573440 | -| train/ | | -| approx_kl | 0.006152669 | -| clip_fraction | 0.0626 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.954 | -| learning_rate | 0.0003 | -| loss | -0.0376 | -| n_updates | 340 | -| policy_gradient_loss | -0.00514 | -| std | 1.07 | -| value_loss | 0.0187 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1977 | -| iterations | 36 | -| time_elapsed | 298 | -| total_timesteps | 589824 | -| train/ | | -| approx_kl | 0.006685758 | -| clip_fraction | 0.0729 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.958 | -| learning_rate | 0.0003 | -| loss | -0.0387 | -| n_updates | 350 | -| policy_gradient_loss | -0.00632 | -| std | 1.07 | -| value_loss | 0.0118 | ------------------------------------------ -Eval num_timesteps=600000, episode_reward=-31.39 +/- 8.94 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -31.4 | -| time/ | | -| total_timesteps | 600000 | -| train/ | | -| approx_kl | 0.008094068 | -| clip_fraction | 0.0985 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.937 | -| learning_rate | 0.0003 | -| loss | -0.0439 | -| n_updates | 360 | -| policy_gradient_loss | -0.00782 | -| std | 1.07 | -| value_loss | 0.0116 | ------------------------------------------ - -[Diag @ 600,000 | n_sheep=1 | success=5%] - COMPACT_CANT_DRIVE 16/20 - DROVE_NO_SHEEP 3/20 - SUCCESS 1/20 - action_mag mean=0.150 p10=0.000 p90=0.392 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=3.64m best=0.68m (FLEE_DIST=7m) - min_com_to_pen mean=10.60m best=1.50m - reward/step (mean): progress=+0.0025 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0026 - -[Curriculum] leaving stage n_sheep=1 after 600,000 steps | training success rate (last 100 eps) = 9% -[Curriculum] → 2 sheep at step 600,000 - -------------------------------- -| time/ | | -| fps | 1894 | -| iterations | 37 | -| time_elapsed | 319 | -| total_timesteps | 606208 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 1917 | -| iterations | 38 | -| time_elapsed | 324 | -| total_timesteps | 622592 | -| train/ | | -| approx_kl | 0.0067913756 | -| clip_fraction | 0.0689 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.861 | -| learning_rate | 0.0003 | -| loss | 0.0772 | -| n_updates | 370 | -| policy_gradient_loss | -0.00184 | -| std | 1.07 | -| value_loss | 0.101 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1938 | -| iterations | 39 | -| time_elapsed | 329 | -| total_timesteps | 638976 | -| train/ | | -| approx_kl | 0.0061344057 | -| clip_fraction | 0.0666 | -| clip_range | 0.2 | -| entropy_loss | -2.98 | -| explained_variance | 0.928 | -| learning_rate | 0.0003 | -| loss | -0.0147 | -| n_updates | 380 | -| policy_gradient_loss | -0.00148 | -| std | 1.08 | -| value_loss | 0.0386 | ------------------------------------------- -Eval num_timesteps=650000, episode_reward=-42.39 +/- 31.99 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -42.4 | -| time/ | | -| total_timesteps | 650000 | -| train/ | | -| approx_kl | 0.0061708866 | -| clip_fraction | 0.06 | -| clip_range | 0.2 | -| entropy_loss | -2.98 | -| explained_variance | 0.918 | -| learning_rate | 0.0003 | -| loss | -0.0203 | -| n_updates | 390 | -| policy_gradient_loss | -0.00313 | -| std | 1.07 | -| value_loss | 0.0242 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1896 | -| iterations | 40 | -| time_elapsed | 345 | -| total_timesteps | 655360 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1918 | -| iterations | 41 | -| time_elapsed | 350 | -| total_timesteps | 671744 | -| train/ | | -| approx_kl | 0.007122565 | -| clip_fraction | 0.0765 | -| clip_range | 0.2 | -| entropy_loss | -2.98 | -| explained_variance | 0.855 | -| learning_rate | 0.0003 | -| loss | -0.00749 | -| n_updates | 400 | -| policy_gradient_loss | -0.00529 | -| std | 1.07 | -| value_loss | 0.0596 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1941 | -| iterations | 42 | -| time_elapsed | 354 | -| total_timesteps | 688128 | -| train/ | | -| approx_kl | 0.0078532845 | -| clip_fraction | 0.0975 | -| clip_range | 0.2 | -| entropy_loss | -2.98 | -| explained_variance | 0.89 | -| learning_rate | 0.0003 | -| loss | -0.0188 | -| n_updates | 410 | -| policy_gradient_loss | -0.00699 | -| std | 1.07 | -| value_loss | 0.0207 | ------------------------------------------- -Eval num_timesteps=700000, episode_reward=-39.79 +/- 29.60 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -39.8 | -| time/ | | -| total_timesteps | 700000 | -| train/ | | -| approx_kl | 0.0073551387 | -| clip_fraction | 0.084 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.824 | -| learning_rate | 0.0003 | -| loss | 0.0126 | -| n_updates | 420 | -| policy_gradient_loss | -0.0064 | -| std | 1.06 | -| value_loss | 0.0438 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1904 | -| iterations | 43 | -| time_elapsed | 370 | -| total_timesteps | 704512 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1922 | -| iterations | 44 | -| time_elapsed | 375 | -| total_timesteps | 720896 | -| train/ | | -| approx_kl | 0.006614036 | -| clip_fraction | 0.0611 | -| clip_range | 0.2 | -| entropy_loss | -2.95 | -| explained_variance | 0.881 | -| learning_rate | 0.0003 | -| loss | -0.0207 | -| n_updates | 430 | -| policy_gradient_loss | -0.00371 | -| std | 1.06 | -| value_loss | 0.0244 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1940 | -| iterations | 45 | -| time_elapsed | 380 | -| total_timesteps | 737280 | -| train/ | | -| approx_kl | 0.0060790265 | -| clip_fraction | 0.0591 | -| clip_range | 0.2 | -| entropy_loss | -2.95 | -| explained_variance | 0.885 | -| learning_rate | 0.0003 | -| loss | -0.0284 | -| n_updates | 440 | -| policy_gradient_loss | -0.00447 | -| std | 1.06 | -| value_loss | 0.0206 | ------------------------------------------- -Eval num_timesteps=750000, episode_reward=-40.21 +/- 27.55 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -40.2 | -| time/ | | -| total_timesteps | 750000 | -| train/ | | -| approx_kl | 0.0066163363 | -| clip_fraction | 0.0691 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.924 | -| learning_rate | 0.0003 | -| loss | -0.032 | -| n_updates | 450 | -| policy_gradient_loss | -0.0043 | -| std | 1.06 | -| value_loss | 0.0127 | ------------------------------------------- - -[Diag @ 750,000 | n_sheep=2 | success=0%] - COMPACT_CANT_DRIVE 14/20 - NEVER_COMPACT 5/20 - DROVE_NO_SHEEP 1/20 - action_mag mean=0.313 p10=0.081 p90=0.638 (0=stopped, 1=full speed) - min_flock_radius mean=2.72m best=0.00m (target <5m to compact) - min_dog_to_com mean=3.96m best=0.02m (FLEE_DIST=7m) - min_com_to_pen mean=12.68m best=2.17m - reward/step (mean): progress=-0.0005 alignment=+0.0000 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0000 -------------------------------- -| time/ | | -| fps | 1866 | -| iterations | 46 | -| time_elapsed | 403 | -| total_timesteps | 753664 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1887 | -| iterations | 47 | -| time_elapsed | 407 | -| total_timesteps | 770048 | -| train/ | | -| approx_kl | 0.005094421 | -| clip_fraction | 0.0496 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.917 | -| learning_rate | 0.0003 | -| loss | -0.0237 | -| n_updates | 460 | -| policy_gradient_loss | -0.00332 | -| std | 1.06 | -| value_loss | 0.0275 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1906 | -| iterations | 48 | -| time_elapsed | 412 | -| total_timesteps | 786432 | -| train/ | | -| approx_kl | 0.006302662 | -| clip_fraction | 0.0571 | -| clip_range | 0.2 | -| entropy_loss | -2.94 | -| explained_variance | 0.944 | -| learning_rate | 0.0003 | -| loss | -0.0353 | -| n_updates | 470 | -| policy_gradient_loss | -0.00424 | -| std | 1.05 | -| value_loss | 0.0201 | ------------------------------------------ -Eval num_timesteps=800000, episode_reward=-31.43 +/- 45.97 -Episode length: 1953.35 +/- 203.34 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.95e+03 | -| mean_reward | -31.4 | -| time/ | | -| total_timesteps | 800000 | -| train/ | | -| approx_kl | 0.0055750986 | -| clip_fraction | 0.0494 | -| clip_range | 0.2 | -| entropy_loss | -2.95 | -| explained_variance | 0.959 | -| learning_rate | 0.0003 | -| loss | -0.0262 | -| n_updates | 480 | -| policy_gradient_loss | -0.00386 | -| std | 1.06 | -| value_loss | 0.0218 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1878 | -| iterations | 49 | -| time_elapsed | 427 | -| total_timesteps | 802816 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 1897 | -| iterations | 50 | -| time_elapsed | 431 | -| total_timesteps | 819200 | -| train/ | | -| approx_kl | 0.0057711033 | -| clip_fraction | 0.0568 | -| clip_range | 0.2 | -| entropy_loss | -2.95 | -| explained_variance | 0.838 | -| learning_rate | 0.0003 | -| loss | -0.0362 | -| n_updates | 490 | -| policy_gradient_loss | -0.00438 | -| std | 1.06 | -| value_loss | 0.00952 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1914 | -| iterations | 51 | -| time_elapsed | 436 | -| total_timesteps | 835584 | -| train/ | | -| approx_kl | 0.0073408587 | -| clip_fraction | 0.077 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0283 | -| n_updates | 500 | -| policy_gradient_loss | -0.00553 | -| std | 1.07 | -| value_loss | 0.0142 | ------------------------------------------- -Eval num_timesteps=850000, episode_reward=-37.98 +/- 27.04 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -38 | -| time/ | | -| total_timesteps | 850000 | -| train/ | | -| approx_kl | 0.0055803536 | -| clip_fraction | 0.0536 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0338 | -| n_updates | 510 | -| policy_gradient_loss | -0.00469 | -| std | 1.06 | -| value_loss | 0.0156 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1884 | -| iterations | 52 | -| time_elapsed | 452 | -| total_timesteps | 851968 | -------------------------------- ----------------------------------------- -| time/ | | -| fps | 1899 | -| iterations | 53 | -| time_elapsed | 457 | -| total_timesteps | 868352 | -| train/ | | -| approx_kl | 0.00585186 | -| clip_fraction | 0.0638 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.83 | -| learning_rate | 0.0003 | -| loss | -0.0333 | -| n_updates | 520 | -| policy_gradient_loss | -0.00395 | -| std | 1.07 | -| value_loss | 0.0322 | ----------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1915 | -| iterations | 54 | -| time_elapsed | 461 | -| total_timesteps | 884736 | -| train/ | | -| approx_kl | 0.0055105407 | -| clip_fraction | 0.045 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.845 | -| learning_rate | 0.0003 | -| loss | -0.0283 | -| n_updates | 530 | -| policy_gradient_loss | -0.00367 | -| std | 1.06 | -| value_loss | 0.0109 | ------------------------------------------- -Eval num_timesteps=900000, episode_reward=-41.53 +/- 35.40 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -41.5 | -| time/ | | -| total_timesteps | 900000 | -| train/ | | -| approx_kl | 0.0064837057 | -| clip_fraction | 0.0625 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.909 | -| learning_rate | 0.0003 | -| loss | -0.0394 | -| n_updates | 540 | -| policy_gradient_loss | -0.00409 | -| std | 1.06 | -| value_loss | 0.0147 | ------------------------------------------- - -[Diag @ 900,000 | n_sheep=2 | success=0%] - COMPACT_CANT_DRIVE 12/20 - NEVER_COMPACT 8/20 - action_mag mean=0.276 p10=0.038 p90=0.580 (0=stopped, 1=full speed) - min_flock_radius mean=4.30m best=0.98m (target <5m to compact) - min_dog_to_com mean=3.24m best=0.24m (FLEE_DIST=7m) - min_com_to_pen mean=12.15m best=5.60m - reward/step (mean): progress=-0.0048 alignment=+0.0000 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 -------------------------------- -| time/ | | -| fps | 1857 | -| iterations | 55 | -| time_elapsed | 485 | -| total_timesteps | 901120 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1874 | -| iterations | 56 | -| time_elapsed | 489 | -| total_timesteps | 917504 | -| train/ | | -| approx_kl | 0.006582682 | -| clip_fraction | 0.0662 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.961 | -| learning_rate | 0.0003 | -| loss | -0.039 | -| n_updates | 550 | -| policy_gradient_loss | -0.00462 | -| std | 1.07 | -| value_loss | 0.0103 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1888 | -| iterations | 57 | -| time_elapsed | 494 | -| total_timesteps | 933888 | -| train/ | | -| approx_kl | 0.0059698187 | -| clip_fraction | 0.0573 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.907 | -| learning_rate | 0.0003 | -| loss | -0.0291 | -| n_updates | 560 | -| policy_gradient_loss | -0.00446 | -| std | 1.07 | -| value_loss | 0.0113 | ------------------------------------------- -Eval num_timesteps=950000, episode_reward=-26.73 +/- 22.82 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -26.7 | -| time/ | | -| total_timesteps | 950000 | -| train/ | | -| approx_kl | 0.006601461 | -| clip_fraction | 0.0594 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.872 | -| learning_rate | 0.0003 | -| loss | -0.034 | -| n_updates | 570 | -| policy_gradient_loss | -0.00455 | -| std | 1.06 | -| value_loss | 0.00901 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 1856 | -| iterations | 58 | -| time_elapsed | 511 | -| total_timesteps | 950272 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1869 | -| iterations | 59 | -| time_elapsed | 517 | -| total_timesteps | 966656 | -| train/ | | -| approx_kl | 0.005824944 | -| clip_fraction | 0.0624 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.789 | -| learning_rate | 0.0003 | -| loss | -0.0214 | -| n_updates | 580 | -| policy_gradient_loss | -0.00363 | -| std | 1.07 | -| value_loss | 0.0359 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1882 | -| iterations | 60 | -| time_elapsed | 522 | -| total_timesteps | 983040 | -| train/ | | -| approx_kl | 0.005888001 | -| clip_fraction | 0.0573 | -| clip_range | 0.2 | -| entropy_loss | -2.98 | -| explained_variance | 0.887 | -| learning_rate | 0.0003 | -| loss | -0.0391 | -| n_updates | 590 | -| policy_gradient_loss | -0.00371 | -| std | 1.07 | -| value_loss | 0.00935 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1895 | -| iterations | 61 | -| time_elapsed | 527 | -| total_timesteps | 999424 | -| train/ | | -| approx_kl | 0.005874036 | -| clip_fraction | 0.0611 | -| clip_range | 0.2 | -| entropy_loss | -2.98 | -| explained_variance | 0.871 | -| learning_rate | 0.0003 | -| loss | -0.0246 | -| n_updates | 600 | -| policy_gradient_loss | -0.00492 | -| std | 1.07 | -| value_loss | 0.00877 | ------------------------------------------ -Eval num_timesteps=1000000, episode_reward=-22.72 +/- 33.15 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -22.7 | -| time/ | | -| total_timesteps | 1000000 | -| train/ | | -| approx_kl | 0.0060388125 | -| clip_fraction | 0.0637 | -| clip_range | 0.2 | -| entropy_loss | -2.97 | -| explained_variance | 0.737 | -| learning_rate | 0.0003 | -| loss | -0.0511 | -| n_updates | 610 | -| policy_gradient_loss | -0.00387 | -| std | 1.07 | -| value_loss | 0.0538 | ------------------------------------------- -New best mean reward! --------------------------------- -| time/ | | -| fps | 1869 | -| iterations | 62 | -| time_elapsed | 543 | -| total_timesteps | 1015808 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1882 | -| iterations | 63 | -| time_elapsed | 548 | -| total_timesteps | 1032192 | -| train/ | | -| approx_kl | 0.007320485 | -| clip_fraction | 0.0723 | -| clip_range | 0.2 | -| entropy_loss | -2.99 | -| explained_variance | 0.946 | -| learning_rate | 0.0003 | -| loss | -0.0342 | -| n_updates | 620 | -| policy_gradient_loss | -0.0052 | -| std | 1.08 | -| value_loss | 0.0174 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1894 | -| iterations | 64 | -| time_elapsed | 553 | -| total_timesteps | 1048576 | -| train/ | | -| approx_kl | 0.0066477214 | -| clip_fraction | 0.0621 | -| clip_range | 0.2 | -| entropy_loss | -3 | -| explained_variance | 0.919 | -| learning_rate | 0.0003 | -| loss | -0.0301 | -| n_updates | 630 | -| policy_gradient_loss | -0.00449 | -| std | 1.08 | -| value_loss | 0.0109 | ------------------------------------------- -Eval num_timesteps=1050000, episode_reward=-39.86 +/- 28.77 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -39.9 | -| time/ | | -| total_timesteps | 1050000 | -| train/ | | -| approx_kl | 0.0066243596 | -| clip_fraction | 0.0772 | -| clip_range | 0.2 | -| entropy_loss | -2.99 | -| explained_variance | 0.861 | -| learning_rate | 0.0003 | -| loss | -0.0313 | -| n_updates | 640 | -| policy_gradient_loss | -0.00462 | -| std | 1.07 | -| value_loss | 0.0324 | ------------------------------------------- - -[Diag @ 1,050,000 | n_sheep=2 | success=0%] - COMPACT_CANT_DRIVE 18/20 - NEVER_COMPACT 2/20 - action_mag mean=0.200 p10=0.022 p90=0.478 (0=stopped, 1=full speed) - min_flock_radius mean=2.29m best=0.00m (target <5m to compact) - min_dog_to_com mean=3.23m best=0.05m (FLEE_DIST=7m) - min_com_to_pen mean=12.84m best=3.77m - reward/step (mean): progress=+0.0016 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1843 | -| iterations | 65 | -| time_elapsed | 577 | -| total_timesteps | 1064960 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1855 | -| iterations | 66 | -| time_elapsed | 582 | -| total_timesteps | 1081344 | -| train/ | | -| approx_kl | 0.0066154073 | -| clip_fraction | 0.0657 | -| clip_range | 0.2 | -| entropy_loss | -2.99 | -| explained_variance | 0.836 | -| learning_rate | 0.0003 | -| loss | -0.029 | -| n_updates | 650 | -| policy_gradient_loss | -0.0049 | -| std | 1.08 | -| value_loss | 0.0135 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1865 | -| iterations | 67 | -| time_elapsed | 588 | -| total_timesteps | 1097728 | -| train/ | | -| approx_kl | 0.0059733046 | -| clip_fraction | 0.0634 | -| clip_range | 0.2 | -| entropy_loss | -3.01 | -| explained_variance | 0.852 | -| learning_rate | 0.0003 | -| loss | -0.0254 | -| n_updates | 660 | -| policy_gradient_loss | -0.00452 | -| std | 1.09 | -| value_loss | 0.0395 | ------------------------------------------- -Eval num_timesteps=1100000, episode_reward=-33.30 +/- 26.65 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -33.3 | -| time/ | | -| total_timesteps | 1100000 | -| train/ | | -| approx_kl | 0.0054050894 | -| clip_fraction | 0.048 | -| clip_range | 0.2 | -| entropy_loss | -3.02 | -| explained_variance | 0.851 | -| learning_rate | 0.0003 | -| loss | -0.0348 | -| n_updates | 670 | -| policy_gradient_loss | -0.00385 | -| std | 1.1 | -| value_loss | 0.0247 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1843 | -| iterations | 68 | -| time_elapsed | 604 | -| total_timesteps | 1114112 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1856 | -| iterations | 69 | -| time_elapsed | 608 | -| total_timesteps | 1130496 | -| train/ | | -| approx_kl | 0.0073612374 | -| clip_fraction | 0.076 | -| clip_range | 0.2 | -| entropy_loss | -3.01 | -| explained_variance | 0.885 | -| learning_rate | 0.0003 | -| loss | -0.0424 | -| n_updates | 680 | -| policy_gradient_loss | -0.00512 | -| std | 1.09 | -| value_loss | 0.0278 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1869 | -| iterations | 70 | -| time_elapsed | 613 | -| total_timesteps | 1146880 | -| train/ | | -| approx_kl | 0.0063554104 | -| clip_fraction | 0.067 | -| clip_range | 0.2 | -| entropy_loss | -3.01 | -| explained_variance | 0.915 | -| learning_rate | 0.0003 | -| loss | -0.0302 | -| n_updates | 690 | -| policy_gradient_loss | -0.00577 | -| std | 1.09 | -| value_loss | 0.0116 | ------------------------------------------- -Eval num_timesteps=1150000, episode_reward=-26.91 +/- 26.08 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -26.9 | -| time/ | | -| total_timesteps | 1150000 | -| train/ | | -| approx_kl | 0.006060633 | -| clip_fraction | 0.0603 | -| clip_range | 0.2 | -| entropy_loss | -3.02 | -| explained_variance | 0.905 | -| learning_rate | 0.0003 | -| loss | -0.0374 | -| n_updates | 700 | -| policy_gradient_loss | -0.00442 | -| std | 1.1 | -| value_loss | 0.0101 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1847 | -| iterations | 71 | -| time_elapsed | 629 | -| total_timesteps | 1163264 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1859 | -| iterations | 72 | -| time_elapsed | 634 | -| total_timesteps | 1179648 | -| train/ | | -| approx_kl | 0.0070389216 | -| clip_fraction | 0.0728 | -| clip_range | 0.2 | -| entropy_loss | -3.03 | -| explained_variance | 0.854 | -| learning_rate | 0.0003 | -| loss | -0.0409 | -| n_updates | 710 | -| policy_gradient_loss | -0.00505 | -| std | 1.1 | -| value_loss | 0.0196 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1871 | -| iterations | 73 | -| time_elapsed | 638 | -| total_timesteps | 1196032 | -| train/ | | -| approx_kl | 0.0055403598 | -| clip_fraction | 0.0567 | -| clip_range | 0.2 | -| entropy_loss | -3.03 | -| explained_variance | 0.906 | -| learning_rate | 0.0003 | -| loss | -0.0324 | -| n_updates | 720 | -| policy_gradient_loss | -0.00494 | -| std | 1.1 | -| value_loss | 0.0109 | ------------------------------------------- -Eval num_timesteps=1200000, episode_reward=-23.57 +/- 26.30 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -23.6 | -| time/ | | -| total_timesteps | 1200000 | -| train/ | | -| approx_kl | 0.0055604624 | -| clip_fraction | 0.0522 | -| clip_range | 0.2 | -| entropy_loss | -3.02 | -| explained_variance | 0.819 | -| learning_rate | 0.0003 | -| loss | -0.00379 | -| n_updates | 730 | -| policy_gradient_loss | -0.00374 | -| std | 1.1 | -| value_loss | 0.0453 | ------------------------------------------- - -[Diag @ 1,200,000 | n_sheep=2 | success=0%] - COMPACT_CANT_DRIVE 15/20 - NEVER_COMPACT 4/20 - DROVE_NO_SHEEP 1/20 - action_mag mean=0.399 p10=0.067 p90=0.794 (0=stopped, 1=full speed) - min_flock_radius mean=2.96m best=0.00m (target <5m to compact) - min_dog_to_com mean=2.17m best=0.14m (FLEE_DIST=7m) - min_com_to_pen mean=11.07m best=2.66m - reward/step (mean): progress=+0.0064 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 - -[Curriculum] leaving stage n_sheep=2 after 600,000 steps | training success rate (last 100 eps) = 0% -[Curriculum] → 3 sheep at step 1,200,000 - --------------------------------- -| time/ | | -| fps | 1828 | -| iterations | 74 | -| time_elapsed | 663 | -| total_timesteps | 1212416 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1839 | -| iterations | 75 | -| time_elapsed | 668 | -| total_timesteps | 1228800 | -| train/ | | -| approx_kl | 0.007044647 | -| clip_fraction | 0.0819 | -| clip_range | 0.2 | -| entropy_loss | -3.02 | -| explained_variance | 0.902 | -| learning_rate | 0.0003 | -| loss | -0.00823 | -| n_updates | 740 | -| policy_gradient_loss | -0.00327 | -| std | 1.1 | -| value_loss | 0.042 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1849 | -| iterations | 76 | -| time_elapsed | 673 | -| total_timesteps | 1245184 | -| train/ | | -| approx_kl | 0.0064169513 | -| clip_fraction | 0.0699 | -| clip_range | 0.2 | -| entropy_loss | -3.03 | -| explained_variance | 0.928 | -| learning_rate | 0.0003 | -| loss | -0.0323 | -| n_updates | 750 | -| policy_gradient_loss | -0.00459 | -| std | 1.1 | -| value_loss | 0.0102 | ------------------------------------------- -Eval num_timesteps=1250000, episode_reward=-27.97 +/- 37.55 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -28 | -| time/ | | -| total_timesteps | 1250000 | -| train/ | | -| approx_kl | 0.006859841 | -| clip_fraction | 0.0783 | -| clip_range | 0.2 | -| entropy_loss | -3.04 | -| explained_variance | 0.94 | -| learning_rate | 0.0003 | -| loss | -0.0368 | -| n_updates | 760 | -| policy_gradient_loss | -0.00472 | -| std | 1.11 | -| value_loss | 0.00931 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1825 | -| iterations | 77 | -| time_elapsed | 691 | -| total_timesteps | 1261568 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1836 | -| iterations | 78 | -| time_elapsed | 696 | -| total_timesteps | 1277952 | -| train/ | | -| approx_kl | 0.0066901552 | -| clip_fraction | 0.0704 | -| clip_range | 0.2 | -| entropy_loss | -3.04 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | -0.0329 | -| n_updates | 770 | -| policy_gradient_loss | -0.00458 | -| std | 1.11 | -| value_loss | 0.00938 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1845 | -| iterations | 79 | -| time_elapsed | 701 | -| total_timesteps | 1294336 | -| train/ | | -| approx_kl | 0.007008245 | -| clip_fraction | 0.082 | -| clip_range | 0.2 | -| entropy_loss | -3.03 | -| explained_variance | 0.899 | -| learning_rate | 0.0003 | -| loss | -0.0194 | -| n_updates | 780 | -| policy_gradient_loss | -0.00426 | -| std | 1.1 | -| value_loss | 0.052 | ------------------------------------------ -Eval num_timesteps=1300000, episode_reward=-41.12 +/- 37.68 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -41.1 | -| time/ | | -| total_timesteps | 1300000 | -| train/ | | -| approx_kl | 0.0070775724 | -| clip_fraction | 0.0742 | -| clip_range | 0.2 | -| entropy_loss | -3.03 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | -0.0238 | -| n_updates | 790 | -| policy_gradient_loss | -0.0052 | -| std | 1.11 | -| value_loss | 0.00657 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1823 | -| iterations | 80 | -| time_elapsed | 718 | -| total_timesteps | 1310720 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1832 | -| iterations | 81 | -| time_elapsed | 724 | -| total_timesteps | 1327104 | -| train/ | | -| approx_kl | 0.008046751 | -| clip_fraction | 0.0851 | -| clip_range | 0.2 | -| entropy_loss | -3.04 | -| explained_variance | 0.897 | -| learning_rate | 0.0003 | -| loss | -0.0384 | -| n_updates | 800 | -| policy_gradient_loss | -0.0057 | -| std | 1.11 | -| value_loss | 0.009 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1840 | -| iterations | 82 | -| time_elapsed | 730 | -| total_timesteps | 1343488 | -| train/ | | -| approx_kl | 0.006007643 | -| clip_fraction | 0.0548 | -| clip_range | 0.2 | -| entropy_loss | -3.06 | -| explained_variance | 0.871 | -| learning_rate | 0.0003 | -| loss | -0.0251 | -| n_updates | 810 | -| policy_gradient_loss | -0.00416 | -| std | 1.12 | -| value_loss | 0.0179 | ------------------------------------------ -Eval num_timesteps=1350000, episode_reward=-24.46 +/- 41.24 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -24.5 | -| time/ | | -| total_timesteps | 1350000 | -| train/ | | -| approx_kl | 0.0065572546 | -| clip_fraction | 0.0698 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.877 | -| learning_rate | 0.0003 | -| loss | -0.0219 | -| n_updates | 820 | -| policy_gradient_loss | -0.00456 | -| std | 1.13 | -| value_loss | 0.0242 | ------------------------------------------- - -[Diag @ 1,350,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 14/20 - COMPACT_CANT_DRIVE 6/20 - action_mag mean=0.195 p10=0.018 p90=0.576 (0=stopped, 1=full speed) - min_flock_radius mean=6.32m best=1.36m (target <5m to compact) - min_dog_to_com mean=4.15m best=0.61m (FLEE_DIST=7m) - min_com_to_pen mean=11.37m best=4.88m - reward/step (mean): progress=+0.0029 alignment=+0.0000 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1798 | -| iterations | 83 | -| time_elapsed | 756 | -| total_timesteps | 1359872 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1809 | -| iterations | 84 | -| time_elapsed | 760 | -| total_timesteps | 1376256 | -| train/ | | -| approx_kl | 0.0072198315 | -| clip_fraction | 0.0764 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.909 | -| learning_rate | 0.0003 | -| loss | -0.0208 | -| n_updates | 830 | -| policy_gradient_loss | -0.00626 | -| std | 1.13 | -| value_loss | 0.0106 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1817 | -| iterations | 85 | -| time_elapsed | 766 | -| total_timesteps | 1392640 | -| train/ | | -| approx_kl | 0.0070813587 | -| clip_fraction | 0.0733 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.907 | -| learning_rate | 0.0003 | -| loss | -0.0324 | -| n_updates | 840 | -| policy_gradient_loss | -0.00505 | -| std | 1.13 | -| value_loss | 0.0166 | ------------------------------------------- -Eval num_timesteps=1400000, episode_reward=-36.32 +/- 33.15 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -36.3 | -| time/ | | -| total_timesteps | 1400000 | -| train/ | | -| approx_kl | 0.0067584305 | -| clip_fraction | 0.08 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.906 | -| learning_rate | 0.0003 | -| loss | -0.0308 | -| n_updates | 850 | -| policy_gradient_loss | -0.0054 | -| std | 1.13 | -| value_loss | 0.0112 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1798 | -| iterations | 86 | -| time_elapsed | 783 | -| total_timesteps | 1409024 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1807 | -| iterations | 87 | -| time_elapsed | 788 | -| total_timesteps | 1425408 | -| train/ | | -| approx_kl | 0.007411341 | -| clip_fraction | 0.0716 | -| clip_range | 0.2 | -| entropy_loss | -3.09 | -| explained_variance | 0.904 | -| learning_rate | 0.0003 | -| loss | -0.0322 | -| n_updates | 860 | -| policy_gradient_loss | -0.00641 | -| std | 1.14 | -| value_loss | 0.0191 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1815 | -| iterations | 88 | -| time_elapsed | 794 | -| total_timesteps | 1441792 | -| train/ | | -| approx_kl | 0.0077011855 | -| clip_fraction | 0.0774 | -| clip_range | 0.2 | -| entropy_loss | -3.09 | -| explained_variance | 0.914 | -| learning_rate | 0.0003 | -| loss | -0.0316 | -| n_updates | 870 | -| policy_gradient_loss | -0.00545 | -| std | 1.13 | -| value_loss | 0.0148 | ------------------------------------------- -Eval num_timesteps=1450000, episode_reward=-40.58 +/- 38.17 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -40.6 | -| time/ | | -| total_timesteps | 1450000 | -| train/ | | -| approx_kl | 0.007694071 | -| clip_fraction | 0.0816 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.937 | -| learning_rate | 0.0003 | -| loss | -0.036 | -| n_updates | 880 | -| policy_gradient_loss | -0.0054 | -| std | 1.12 | -| value_loss | 0.0111 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1796 | -| iterations | 89 | -| time_elapsed | 811 | -| total_timesteps | 1458176 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1805 | -| iterations | 90 | -| time_elapsed | 816 | -| total_timesteps | 1474560 | -| train/ | | -| approx_kl | 0.007034345 | -| clip_fraction | 0.0693 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.924 | -| learning_rate | 0.0003 | -| loss | 0.0472 | -| n_updates | 890 | -| policy_gradient_loss | -0.00472 | -| std | 1.13 | -| value_loss | 0.0352 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1815 | -| iterations | 91 | -| time_elapsed | 821 | -| total_timesteps | 1490944 | -| train/ | | -| approx_kl | 0.0078114523 | -| clip_fraction | 0.0917 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | -0.0461 | -| n_updates | 900 | -| policy_gradient_loss | -0.00668 | -| std | 1.13 | -| value_loss | 0.00844 | ------------------------------------------- -Eval num_timesteps=1500000, episode_reward=-19.66 +/- 25.98 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -19.7 | -| time/ | | -| total_timesteps | 1500000 | -| train/ | | -| approx_kl | 0.0067999987 | -| clip_fraction | 0.0606 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.893 | -| learning_rate | 0.0003 | -| loss | -0.0283 | -| n_updates | 910 | -| policy_gradient_loss | -0.00385 | -| std | 1.12 | -| value_loss | 0.0409 | ------------------------------------------- -New best mean reward! - -[Diag @ 1,500,000 | n_sheep=3 | success=0%] - COMPACT_CANT_DRIVE 11/20 - NEVER_COMPACT 7/20 - DROVE_NO_SHEEP 2/20 - action_mag mean=0.185 p10=0.015 p90=0.426 (0=stopped, 1=full speed) - min_flock_radius mean=4.43m best=1.38m (target <5m to compact) - min_dog_to_com mean=2.89m best=0.07m (FLEE_DIST=7m) - min_com_to_pen mean=11.88m best=2.23m - reward/step (mean): progress=+0.0008 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1781 | -| iterations | 92 | -| time_elapsed | 846 | -| total_timesteps | 1507328 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1789 | -| iterations | 93 | -| time_elapsed | 851 | -| total_timesteps | 1523712 | -| train/ | | -| approx_kl | 0.0069550863 | -| clip_fraction | 0.0787 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.897 | -| learning_rate | 0.0003 | -| loss | -0.0204 | -| n_updates | 920 | -| policy_gradient_loss | -0.00394 | -| std | 1.13 | -| value_loss | 0.0324 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1798 | -| iterations | 94 | -| time_elapsed | 856 | -| total_timesteps | 1540096 | -| train/ | | -| approx_kl | 0.006749108 | -| clip_fraction | 0.0787 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0338 | -| n_updates | 930 | -| policy_gradient_loss | -0.00534 | -| std | 1.13 | -| value_loss | 0.00967 | ------------------------------------------ -Eval num_timesteps=1550000, episode_reward=-26.47 +/- 25.94 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -26.5 | -| time/ | | -| total_timesteps | 1550000 | -| train/ | | -| approx_kl | 0.0073381998 | -| clip_fraction | 0.0679 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.919 | -| learning_rate | 0.0003 | -| loss | -0.0259 | -| n_updates | 940 | -| policy_gradient_loss | -0.00554 | -| std | 1.13 | -| value_loss | 0.00999 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1782 | -| iterations | 95 | -| time_elapsed | 873 | -| total_timesteps | 1556480 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1790 | -| iterations | 96 | -| time_elapsed | 878 | -| total_timesteps | 1572864 | -| train/ | | -| approx_kl | 0.0071112993 | -| clip_fraction | 0.0781 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0324 | -| n_updates | 950 | -| policy_gradient_loss | -0.00428 | -| std | 1.13 | -| value_loss | 0.0246 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1798 | -| iterations | 97 | -| time_elapsed | 883 | -| total_timesteps | 1589248 | -| train/ | | -| approx_kl | 0.0077134473 | -| clip_fraction | 0.0784 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.917 | -| learning_rate | 0.0003 | -| loss | -0.0365 | -| n_updates | 960 | -| policy_gradient_loss | -0.00445 | -| std | 1.13 | -| value_loss | 0.0122 | ------------------------------------------- -Eval num_timesteps=1600000, episode_reward=-35.13 +/- 31.01 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -35.1 | -| time/ | | -| total_timesteps | 1600000 | -| train/ | | -| approx_kl | 0.0070123896 | -| clip_fraction | 0.0712 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.919 | -| learning_rate | 0.0003 | -| loss | -0.026 | -| n_updates | 970 | -| policy_gradient_loss | -0.00519 | -| std | 1.13 | -| value_loss | 0.0171 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1781 | -| iterations | 98 | -| time_elapsed | 901 | -| total_timesteps | 1605632 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1789 | -| iterations | 99 | -| time_elapsed | 906 | -| total_timesteps | 1622016 | -| train/ | | -| approx_kl | 0.007990176 | -| clip_fraction | 0.0845 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.873 | -| learning_rate | 0.0003 | -| loss | -0.04 | -| n_updates | 980 | -| policy_gradient_loss | -0.0045 | -| std | 1.13 | -| value_loss | 0.0153 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1798 | -| iterations | 100 | -| time_elapsed | 911 | -| total_timesteps | 1638400 | -| train/ | | -| approx_kl | 0.006477687 | -| clip_fraction | 0.0593 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.946 | -| learning_rate | 0.0003 | -| loss | -0.0396 | -| n_updates | 990 | -| policy_gradient_loss | -0.00442 | -| std | 1.13 | -| value_loss | 0.0107 | ------------------------------------------ -Eval num_timesteps=1650000, episode_reward=-31.86 +/- 47.05 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -31.9 | -| time/ | | -| total_timesteps | 1650000 | -| train/ | | -| approx_kl | 0.006796476 | -| clip_fraction | 0.0672 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0264 | -| n_updates | 1000 | -| policy_gradient_loss | -0.00375 | -| std | 1.13 | -| value_loss | 0.0385 | ------------------------------------------ - -[Diag @ 1,650,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 11/20 - COMPACT_CANT_DRIVE 9/20 - action_mag mean=0.154 p10=0.005 p90=0.398 (0=stopped, 1=full speed) - min_flock_radius mean=5.81m best=0.00m (target <5m to compact) - min_dog_to_com mean=3.22m best=0.52m (FLEE_DIST=7m) - min_com_to_pen mean=13.42m best=7.08m - reward/step (mean): progress=+0.0061 alignment=+0.0000 pen_bonus=+0.0010 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1768 | -| iterations | 101 | -| time_elapsed | 935 | -| total_timesteps | 1654784 | --------------------------------- ----------------------------------------- -| time/ | | -| fps | 1774 | -| iterations | 102 | -| time_elapsed | 941 | -| total_timesteps | 1671168 | -| train/ | | -| approx_kl | 0.00682881 | -| clip_fraction | 0.0694 | -| clip_range | 0.2 | -| entropy_loss | -3.08 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0233 | -| n_updates | 1010 | -| policy_gradient_loss | -0.00461 | -| std | 1.13 | -| value_loss | 0.0183 | ----------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1779 | -| iterations | 103 | -| time_elapsed | 948 | -| total_timesteps | 1687552 | -| train/ | | -| approx_kl | 0.0071003223 | -| clip_fraction | 0.0782 | -| clip_range | 0.2 | -| entropy_loss | -3.1 | -| explained_variance | 0.923 | -| learning_rate | 0.0003 | -| loss | -0.0398 | -| n_updates | 1020 | -| policy_gradient_loss | -0.00491 | -| std | 1.15 | -| value_loss | 0.0101 | ------------------------------------------- -Eval num_timesteps=1700000, episode_reward=-32.11 +/- 36.59 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -32.1 | -| time/ | | -| total_timesteps | 1700000 | -| train/ | | -| approx_kl | 0.0064870613 | -| clip_fraction | 0.0624 | -| clip_range | 0.2 | -| entropy_loss | -3.13 | -| explained_variance | 0.909 | -| learning_rate | 0.0003 | -| loss | -0.0365 | -| n_updates | 1030 | -| policy_gradient_loss | -0.00404 | -| std | 1.17 | -| value_loss | 0.00855 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1762 | -| iterations | 104 | -| time_elapsed | 966 | -| total_timesteps | 1703936 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1769 | -| iterations | 105 | -| time_elapsed | 972 | -| total_timesteps | 1720320 | -| train/ | | -| approx_kl | 0.007349294 | -| clip_fraction | 0.0833 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.926 | -| learning_rate | 0.0003 | -| loss | -0.0358 | -| n_updates | 1040 | -| policy_gradient_loss | -0.00514 | -| std | 1.17 | -| value_loss | 0.00848 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1777 | -| iterations | 106 | -| time_elapsed | 976 | -| total_timesteps | 1736704 | -| train/ | | -| approx_kl | 0.0070306472 | -| clip_fraction | 0.0814 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.887 | -| learning_rate | 0.0003 | -| loss | -0.0359 | -| n_updates | 1050 | -| policy_gradient_loss | -0.00489 | -| std | 1.17 | -| value_loss | 0.0134 | ------------------------------------------- -Eval num_timesteps=1750000, episode_reward=-34.24 +/- 43.23 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -34.2 | -| time/ | | -| total_timesteps | 1750000 | -| train/ | | -| approx_kl | 0.008487761 | -| clip_fraction | 0.102 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.962 | -| learning_rate | 0.0003 | -| loss | -0.0369 | -| n_updates | 1060 | -| policy_gradient_loss | -0.0077 | -| std | 1.17 | -| value_loss | 0.00786 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1762 | -| iterations | 107 | -| time_elapsed | 994 | -| total_timesteps | 1753088 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1766 | -| iterations | 108 | -| time_elapsed | 1001 | -| total_timesteps | 1769472 | -| train/ | | -| approx_kl | 0.0074267983 | -| clip_fraction | 0.0742 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0404 | -| n_updates | 1070 | -| policy_gradient_loss | -0.00575 | -| std | 1.18 | -| value_loss | 0.0158 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1772 | -| iterations | 109 | -| time_elapsed | 1007 | -| total_timesteps | 1785856 | -| train/ | | -| approx_kl | 0.0075380025 | -| clip_fraction | 0.074 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.961 | -| learning_rate | 0.0003 | -| loss | -0.034 | -| n_updates | 1080 | -| policy_gradient_loss | -0.00553 | -| std | 1.17 | -| value_loss | 0.00651 | ------------------------------------------- -Eval num_timesteps=1800000, episode_reward=-31.16 +/- 37.32 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -31.2 | -| time/ | | -| total_timesteps | 1800000 | -| train/ | | -| approx_kl | 0.007386248 | -| clip_fraction | 0.0843 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.922 | -| learning_rate | 0.0003 | -| loss | -0.0419 | -| n_updates | 1090 | -| policy_gradient_loss | -0.00596 | -| std | 1.17 | -| value_loss | 0.00858 | ------------------------------------------ - -[Diag @ 1,800,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 17/20 - COMPACT_CANT_DRIVE 3/20 - action_mag mean=0.164 p10=0.007 p90=0.418 (0=stopped, 1=full speed) - min_flock_radius mean=7.52m best=2.00m (target <5m to compact) - min_dog_to_com mean=2.24m best=0.21m (FLEE_DIST=7m) - min_com_to_pen mean=12.87m best=3.90m - reward/step (mean): progress=-0.0007 alignment=+0.0000 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 - -[Curriculum] leaving stage n_sheep=3 after 600,000 steps | training success rate (last 100 eps) = 0% -[Curriculum] → 4 sheep at step 1,800,000 - --------------------------------- -| time/ | | -| fps | 1743 | -| iterations | 110 | -| time_elapsed | 1033 | -| total_timesteps | 1802240 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1749 | -| iterations | 111 | -| time_elapsed | 1039 | -| total_timesteps | 1818624 | -| train/ | | -| approx_kl | 0.009158293 | -| clip_fraction | 0.0991 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.893 | -| learning_rate | 0.0003 | -| loss | -0.0414 | -| n_updates | 1100 | -| policy_gradient_loss | -0.00701 | -| std | 1.17 | -| value_loss | 0.0237 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1755 | -| iterations | 112 | -| time_elapsed | 1045 | -| total_timesteps | 1835008 | -| train/ | | -| approx_kl | 0.007241189 | -| clip_fraction | 0.0831 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.874 | -| learning_rate | 0.0003 | -| loss | -0.0241 | -| n_updates | 1110 | -| policy_gradient_loss | -0.00634 | -| std | 1.17 | -| value_loss | 0.0226 | ------------------------------------------ -Eval num_timesteps=1850000, episode_reward=-29.45 +/- 31.10 -Episode length: 2000.00 +/- 0.00 ---------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -29.5 | -| time/ | | -| total_timesteps | 1850000 | -| train/ | | -| approx_kl | 0.0078688 | -| clip_fraction | 0.0777 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.895 | -| learning_rate | 0.0003 | -| loss | -0.036 | -| n_updates | 1120 | -| policy_gradient_loss | -0.00602 | -| std | 1.17 | -| value_loss | 0.0128 | ---------------------------------------- --------------------------------- -| time/ | | -| fps | 1742 | -| iterations | 113 | -| time_elapsed | 1062 | -| total_timesteps | 1851392 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1749 | -| iterations | 114 | -| time_elapsed | 1067 | -| total_timesteps | 1867776 | -| train/ | | -| approx_kl | 0.008158936 | -| clip_fraction | 0.0963 | -| clip_range | 0.2 | -| entropy_loss | -3.14 | -| explained_variance | 0.897 | -| learning_rate | 0.0003 | -| loss | -0.0324 | -| n_updates | 1130 | -| policy_gradient_loss | -0.00854 | -| std | 1.17 | -| value_loss | 0.0144 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1754 | -| iterations | 115 | -| time_elapsed | 1073 | -| total_timesteps | 1884160 | -| train/ | | -| approx_kl | 0.0074978825 | -| clip_fraction | 0.0844 | -| clip_range | 0.2 | -| entropy_loss | -3.14 | -| explained_variance | 0.92 | -| learning_rate | 0.0003 | -| loss | -0.0246 | -| n_updates | 1140 | -| policy_gradient_loss | -0.00578 | -| std | 1.16 | -| value_loss | 0.0134 | ------------------------------------------- -Eval num_timesteps=1900000, episode_reward=-38.21 +/- 31.08 -Episode length: 2000.00 +/- 0.00 ----------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -38.2 | -| time/ | | -| total_timesteps | 1900000 | -| train/ | | -| approx_kl | 0.00678163 | -| clip_fraction | 0.0711 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.892 | -| learning_rate | 0.0003 | -| loss | -0.0345 | -| n_updates | 1150 | -| policy_gradient_loss | -0.00409 | -| std | 1.18 | -| value_loss | 0.0221 | ----------------------------------------- --------------------------------- -| time/ | | -| fps | 1740 | -| iterations | 116 | -| time_elapsed | 1091 | -| total_timesteps | 1900544 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1746 | -| iterations | 117 | -| time_elapsed | 1097 | -| total_timesteps | 1916928 | -| train/ | | -| approx_kl | 0.006992462 | -| clip_fraction | 0.0731 | -| clip_range | 0.2 | -| entropy_loss | -3.16 | -| explained_variance | 0.895 | -| learning_rate | 0.0003 | -| loss | -0.0243 | -| n_updates | 1160 | -| policy_gradient_loss | -0.00588 | -| std | 1.18 | -| value_loss | 0.0145 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1750 | -| iterations | 118 | -| time_elapsed | 1104 | -| total_timesteps | 1933312 | -| train/ | | -| approx_kl | 0.0069225584 | -| clip_fraction | 0.068 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.905 | -| learning_rate | 0.0003 | -| loss | -0.0297 | -| n_updates | 1170 | -| policy_gradient_loss | -0.00516 | -| std | 1.17 | -| value_loss | 0.0153 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1756 | -| iterations | 119 | -| time_elapsed | 1109 | -| total_timesteps | 1949696 | -| train/ | | -| approx_kl | 0.005966103 | -| clip_fraction | 0.059 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.896 | -| learning_rate | 0.0003 | -| loss | -0.0337 | -| n_updates | 1180 | -| policy_gradient_loss | -0.00413 | -| std | 1.17 | -| value_loss | 0.0091 | ------------------------------------------ -Eval num_timesteps=1950000, episode_reward=-59.72 +/- 38.15 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -59.7 | -| time/ | | -| total_timesteps | 1950000 | -| train/ | | -| approx_kl | 0.0067311125 | -| clip_fraction | 0.0733 | -| clip_range | 0.2 | -| entropy_loss | -3.16 | -| explained_variance | 0.861 | -| learning_rate | 0.0003 | -| loss | -0.0147 | -| n_updates | 1190 | -| policy_gradient_loss | -0.00459 | -| std | 1.18 | -| value_loss | 0.0083 | ------------------------------------------- - -[Diag @ 1,950,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 14/20 - COMPACT_CANT_DRIVE 6/20 - action_mag mean=0.325 p10=0.025 p90=0.778 (0=stopped, 1=full speed) - min_flock_radius mean=7.27m best=2.17m (target <5m to compact) - min_dog_to_com mean=3.74m best=0.07m (FLEE_DIST=7m) - min_com_to_pen mean=13.01m best=6.24m - reward/step (mean): progress=+0.0026 alignment=+0.0000 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1728 | -| iterations | 120 | -| time_elapsed | 1137 | -| total_timesteps | 1966080 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1734 | -| iterations | 121 | -| time_elapsed | 1143 | -| total_timesteps | 1982464 | -| train/ | | -| approx_kl | 0.0061555626 | -| clip_fraction | 0.0631 | -| clip_range | 0.2 | -| entropy_loss | -3.17 | -| explained_variance | 0.932 | -| learning_rate | 0.0003 | -| loss | -0.0328 | -| n_updates | 1200 | -| policy_gradient_loss | -0.00446 | -| std | 1.19 | -| value_loss | 0.0133 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1739 | -| iterations | 122 | -| time_elapsed | 1149 | -| total_timesteps | 1998848 | -| train/ | | -| approx_kl | 0.0060347347 | -| clip_fraction | 0.057 | -| clip_range | 0.2 | -| entropy_loss | -3.18 | -| explained_variance | 0.841 | -| learning_rate | 0.0003 | -| loss | -0.0352 | -| n_updates | 1210 | -| policy_gradient_loss | -0.00322 | -| std | 1.19 | -| value_loss | 0.0104 | ------------------------------------------- -Eval num_timesteps=2000000, episode_reward=-37.97 +/- 46.26 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -38 | -| time/ | | -| total_timesteps | 2000000 | -| train/ | | -| approx_kl | 0.0063244104 | -| clip_fraction | 0.0675 | -| clip_range | 0.2 | -| entropy_loss | -3.18 | -| explained_variance | 0.865 | -| learning_rate | 0.0003 | -| loss | -0.0217 | -| n_updates | 1220 | -| policy_gradient_loss | -0.00489 | -| std | 1.2 | -| value_loss | 0.0219 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1725 | -| iterations | 123 | -| time_elapsed | 1167 | -| total_timesteps | 2015232 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1730 | -| iterations | 124 | -| time_elapsed | 1173 | -| total_timesteps | 2031616 | -| train/ | | -| approx_kl | 0.007022621 | -| clip_fraction | 0.0816 | -| clip_range | 0.2 | -| entropy_loss | -3.19 | -| explained_variance | 0.949 | -| learning_rate | 0.0003 | -| loss | -0.0248 | -| n_updates | 1230 | -| policy_gradient_loss | -0.0053 | -| std | 1.19 | -| value_loss | 0.00677 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1735 | -| iterations | 125 | -| time_elapsed | 1179 | -| total_timesteps | 2048000 | -| train/ | | -| approx_kl | 0.006686856 | -| clip_fraction | 0.0653 | -| clip_range | 0.2 | -| entropy_loss | -3.18 | -| explained_variance | 0.928 | -| learning_rate | 0.0003 | -| loss | -0.0333 | -| n_updates | 1240 | -| policy_gradient_loss | -0.00445 | -| std | 1.19 | -| value_loss | 0.00651 | ------------------------------------------ -Eval num_timesteps=2050000, episode_reward=-27.67 +/- 36.42 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -27.7 | -| time/ | | -| total_timesteps | 2050000 | -| train/ | | -| approx_kl | 0.006721792 | -| clip_fraction | 0.0675 | -| clip_range | 0.2 | -| entropy_loss | -3.2 | -| explained_variance | 0.921 | -| learning_rate | 0.0003 | -| loss | -0.0278 | -| n_updates | 1250 | -| policy_gradient_loss | -0.00408 | -| std | 1.21 | -| value_loss | 0.00793 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1721 | -| iterations | 126 | -| time_elapsed | 1198 | -| total_timesteps | 2064384 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1726 | -| iterations | 127 | -| time_elapsed | 1205 | -| total_timesteps | 2080768 | -| train/ | | -| approx_kl | 0.006730888 | -| clip_fraction | 0.0617 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.911 | -| learning_rate | 0.0003 | -| loss | -0.0276 | -| n_updates | 1260 | -| policy_gradient_loss | -0.00378 | -| std | 1.22 | -| value_loss | 0.00964 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1732 | -| iterations | 128 | -| time_elapsed | 1210 | -| total_timesteps | 2097152 | -| train/ | | -| approx_kl | 0.007725292 | -| clip_fraction | 0.0775 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.913 | -| learning_rate | 0.0003 | -| loss | -0.0371 | -| n_updates | 1270 | -| policy_gradient_loss | -0.006 | -| std | 1.22 | -| value_loss | 0.0109 | ------------------------------------------ -Eval num_timesteps=2100000, episode_reward=-40.56 +/- 44.37 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -40.6 | -| time/ | | -| total_timesteps | 2100000 | -| train/ | | -| approx_kl | 0.0067186276 | -| clip_fraction | 0.0644 | -| clip_range | 0.2 | -| entropy_loss | -3.24 | -| explained_variance | 0.845 | -| learning_rate | 0.0003 | -| loss | -0.0357 | -| n_updates | 1280 | -| policy_gradient_loss | -0.00433 | -| std | 1.23 | -| value_loss | 0.0263 | ------------------------------------------- - -[Diag @ 2,100,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 12/20 - COMPACT_CANT_DRIVE 8/20 - action_mag mean=0.384 p10=0.018 p90=0.884 (0=stopped, 1=full speed) - min_flock_radius mean=6.36m best=2.11m (target <5m to compact) - min_dog_to_com mean=2.94m best=0.40m (FLEE_DIST=7m) - min_com_to_pen mean=12.34m best=5.56m - reward/step (mean): progress=-0.0084 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1706 | -| iterations | 129 | -| time_elapsed | 1238 | -| total_timesteps | 2113536 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1712 | -| iterations | 130 | -| time_elapsed | 1243 | -| total_timesteps | 2129920 | -| train/ | | -| approx_kl | 0.006317258 | -| clip_fraction | 0.0623 | -| clip_range | 0.2 | -| entropy_loss | -3.26 | -| explained_variance | 0.912 | -| learning_rate | 0.0003 | -| loss | -0.0419 | -| n_updates | 1290 | -| policy_gradient_loss | -0.00427 | -| std | 1.24 | -| value_loss | 0.00859 | ------------------------------------------ ----------------------------------------- -| time/ | | -| fps | 1716 | -| iterations | 131 | -| time_elapsed | 1250 | -| total_timesteps | 2146304 | -| train/ | | -| approx_kl | 0.00636432 | -| clip_fraction | 0.0698 | -| clip_range | 0.2 | -| entropy_loss | -3.28 | -| explained_variance | 0.851 | -| learning_rate | 0.0003 | -| loss | -0.0266 | -| n_updates | 1300 | -| policy_gradient_loss | -0.00374 | -| std | 1.25 | -| value_loss | 0.0299 | ----------------------------------------- -Eval num_timesteps=2150000, episode_reward=-63.32 +/- 33.74 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -63.3 | -| time/ | | -| total_timesteps | 2150000 | -| train/ | | -| approx_kl | 0.0060345423 | -| clip_fraction | 0.0563 | -| clip_range | 0.2 | -| entropy_loss | -3.27 | -| explained_variance | 0.898 | -| learning_rate | 0.0003 | -| loss | -0.0404 | -| n_updates | 1310 | -| policy_gradient_loss | -0.00356 | -| std | 1.24 | -| value_loss | 0.0205 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1704 | -| iterations | 132 | -| time_elapsed | 1268 | -| total_timesteps | 2162688 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1709 | -| iterations | 133 | -| time_elapsed | 1274 | -| total_timesteps | 2179072 | -| train/ | | -| approx_kl | 0.007027424 | -| clip_fraction | 0.0693 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.9 | -| learning_rate | 0.0003 | -| loss | -0.0315 | -| n_updates | 1320 | -| policy_gradient_loss | -0.00521 | -| std | 1.23 | -| value_loss | 0.0194 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1715 | -| iterations | 134 | -| time_elapsed | 1279 | -| total_timesteps | 2195456 | -| train/ | | -| approx_kl | 0.006112649 | -| clip_fraction | 0.0635 | -| clip_range | 0.2 | -| entropy_loss | -3.24 | -| explained_variance | 0.957 | -| learning_rate | 0.0003 | -| loss | -0.0339 | -| n_updates | 1330 | -| policy_gradient_loss | -0.00383 | -| std | 1.23 | -| value_loss | 0.00861 | ------------------------------------------ -Eval num_timesteps=2200000, episode_reward=-31.28 +/- 44.80 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -31.3 | -| time/ | | -| total_timesteps | 2200000 | -| train/ | | -| approx_kl | 0.0070182728 | -| clip_fraction | 0.076 | -| clip_range | 0.2 | -| entropy_loss | -3.26 | -| explained_variance | 0.883 | -| learning_rate | 0.0003 | -| loss | -0.0412 | -| n_updates | 1340 | -| policy_gradient_loss | -0.00534 | -| std | 1.25 | -| value_loss | 0.013 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1704 | -| iterations | 135 | -| time_elapsed | 1297 | -| total_timesteps | 2211840 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1708 | -| iterations | 136 | -| time_elapsed | 1304 | -| total_timesteps | 2228224 | -| train/ | | -| approx_kl | 0.0062820893 | -| clip_fraction | 0.062 | -| clip_range | 0.2 | -| entropy_loss | -3.26 | -| explained_variance | 0.924 | -| learning_rate | 0.0003 | -| loss | -0.0377 | -| n_updates | 1350 | -| policy_gradient_loss | -0.00497 | -| std | 1.24 | -| value_loss | 0.00797 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1713 | -| iterations | 137 | -| time_elapsed | 1310 | -| total_timesteps | 2244608 | -| train/ | | -| approx_kl | 0.0072454046 | -| clip_fraction | 0.0747 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.94 | -| learning_rate | 0.0003 | -| loss | -0.0366 | -| n_updates | 1360 | -| policy_gradient_loss | -0.00572 | -| std | 1.23 | -| value_loss | 0.00852 | ------------------------------------------- -Eval num_timesteps=2250000, episode_reward=-36.00 +/- 38.67 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -36 | -| time/ | | -| total_timesteps | 2250000 | -| train/ | | -| approx_kl | 0.005690419 | -| clip_fraction | 0.0546 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.957 | -| learning_rate | 0.0003 | -| loss | -0.0376 | -| n_updates | 1370 | -| policy_gradient_loss | -0.00425 | -| std | 1.23 | -| value_loss | 0.00524 | ------------------------------------------ - -[Diag @ 2,250,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 13/20 - COMPACT_CANT_DRIVE 7/20 - action_mag mean=0.416 p10=0.038 p90=0.887 (0=stopped, 1=full speed) - min_flock_radius mean=6.62m best=2.03m (target <5m to compact) - min_dog_to_com mean=3.54m best=0.40m (FLEE_DIST=7m) - min_com_to_pen mean=14.24m best=9.65m - reward/step (mean): progress=-0.0070 alignment=+0.0000 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1690 | -| iterations | 138 | -| time_elapsed | 1337 | -| total_timesteps | 2260992 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1696 | -| iterations | 139 | -| time_elapsed | 1342 | -| total_timesteps | 2277376 | -| train/ | | -| approx_kl | 0.0072061084 | -| clip_fraction | 0.0728 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.954 | -| learning_rate | 0.0003 | -| loss | -0.0312 | -| n_updates | 1380 | -| policy_gradient_loss | -0.00512 | -| std | 1.23 | -| value_loss | 0.006 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1702 | -| iterations | 140 | -| time_elapsed | 1347 | -| total_timesteps | 2293760 | -| train/ | | -| approx_kl | 0.0066916933 | -| clip_fraction | 0.0626 | -| clip_range | 0.2 | -| entropy_loss | -3.24 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0408 | -| n_updates | 1390 | -| policy_gradient_loss | -0.00463 | -| std | 1.23 | -| value_loss | 0.00827 | ------------------------------------------- -Eval num_timesteps=2300000, episode_reward=-43.65 +/- 42.86 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -43.7 | -| time/ | | -| total_timesteps | 2300000 | -| train/ | | -| approx_kl | 0.0062987795 | -| clip_fraction | 0.0609 | -| clip_range | 0.2 | -| entropy_loss | -3.26 | -| explained_variance | 0.898 | -| learning_rate | 0.0003 | -| loss | -0.0316 | -| n_updates | 1400 | -| policy_gradient_loss | -0.00442 | -| std | 1.25 | -| value_loss | 0.00955 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1691 | -| iterations | 141 | -| time_elapsed | 1365 | -| total_timesteps | 2310144 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1696 | -| iterations | 142 | -| time_elapsed | 1371 | -| total_timesteps | 2326528 | -| train/ | | -| approx_kl | 0.005443076 | -| clip_fraction | 0.054 | -| clip_range | 0.2 | -| entropy_loss | -3.27 | -| explained_variance | 0.877 | -| learning_rate | 0.0003 | -| loss | -0.0296 | -| n_updates | 1410 | -| policy_gradient_loss | -0.00375 | -| std | 1.24 | -| value_loss | 0.00928 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1701 | -| iterations | 143 | -| time_elapsed | 1376 | -| total_timesteps | 2342912 | -| train/ | | -| approx_kl | 0.004740049 | -| clip_fraction | 0.0456 | -| clip_range | 0.2 | -| entropy_loss | -3.26 | -| explained_variance | 0.922 | -| learning_rate | 0.0003 | -| loss | -0.0318 | -| n_updates | 1420 | -| policy_gradient_loss | -0.00351 | -| std | 1.24 | -| value_loss | 0.0156 | ------------------------------------------ -Eval num_timesteps=2350000, episode_reward=-37.57 +/- 37.78 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -37.6 | -| time/ | | -| total_timesteps | 2350000 | -| train/ | | -| approx_kl | 0.0056120222 | -| clip_fraction | 0.0542 | -| clip_range | 0.2 | -| entropy_loss | -3.27 | -| explained_variance | 0.911 | -| learning_rate | 0.0003 | -| loss | -0.0272 | -| n_updates | 1430 | -| policy_gradient_loss | -0.0035 | -| std | 1.25 | -| value_loss | 0.00811 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1690 | -| iterations | 144 | -| time_elapsed | 1395 | -| total_timesteps | 2359296 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1695 | -| iterations | 145 | -| time_elapsed | 1401 | -| total_timesteps | 2375680 | -| train/ | | -| approx_kl | 0.0064737825 | -| clip_fraction | 0.0697 | -| clip_range | 0.2 | -| entropy_loss | -3.28 | -| explained_variance | 0.93 | -| learning_rate | 0.0003 | -| loss | -0.036 | -| n_updates | 1440 | -| policy_gradient_loss | -0.00403 | -| std | 1.25 | -| value_loss | 0.00488 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1699 | -| iterations | 146 | -| time_elapsed | 1407 | -| total_timesteps | 2392064 | -| train/ | | -| approx_kl | 0.0050720195 | -| clip_fraction | 0.0466 | -| clip_range | 0.2 | -| entropy_loss | -3.29 | -| explained_variance | 0.902 | -| learning_rate | 0.0003 | -| loss | -0.0374 | -| n_updates | 1450 | -| policy_gradient_loss | -0.00283 | -| std | 1.26 | -| value_loss | 0.00958 | ------------------------------------------- -Eval num_timesteps=2400000, episode_reward=-42.55 +/- 37.89 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -42.6 | -| time/ | | -| total_timesteps | 2400000 | -| train/ | | -| approx_kl | 0.005990128 | -| clip_fraction | 0.0565 | -| clip_range | 0.2 | -| entropy_loss | -3.31 | -| explained_variance | 0.869 | -| learning_rate | 0.0003 | -| loss | -0.0448 | -| n_updates | 1460 | -| policy_gradient_loss | -0.0051 | -| std | 1.27 | -| value_loss | 0.00854 | ------------------------------------------ - -[Diag @ 2,400,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 15/20 - COMPACT_CANT_DRIVE 5/20 - action_mag mean=0.424 p10=0.025 p90=0.948 (0=stopped, 1=full speed) - min_flock_radius mean=7.66m best=1.63m (target <5m to compact) - min_dog_to_com mean=4.77m best=0.32m (FLEE_DIST=7m) - min_com_to_pen mean=14.47m best=8.96m - reward/step (mean): progress=-0.0008 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1677 | -| iterations | 147 | -| time_elapsed | 1435 | -| total_timesteps | 2408448 | --------------------------------- - -Training complete. Artefacts saved to runs/ppo_fix_check/ diff --git a/training/runs/ppo_fix_check/best_model/best_model.zip b/training/runs/ppo_fix_check/best_model/best_model.zip deleted file mode 100644 index 8533c33..0000000 Binary files a/training/runs/ppo_fix_check/best_model/best_model.zip and /dev/null differ diff --git a/training/runs/ppo_fix_check/evaluations.npz b/training/runs/ppo_fix_check/evaluations.npz deleted file mode 100644 index 9ae65e5..0000000 Binary files a/training/runs/ppo_fix_check/evaluations.npz and /dev/null differ diff --git a/training/runs/ppo_fix_check/final_model.zip b/training/runs/ppo_fix_check/final_model.zip deleted file mode 100644 index 7e1248e..0000000 Binary files a/training/runs/ppo_fix_check/final_model.zip and /dev/null differ diff --git a/training/runs/ppo_fix_check/vecnorm.pkl b/training/runs/ppo_fix_check/vecnorm.pkl deleted file mode 100644 index f51753c..0000000 Binary files a/training/runs/ppo_fix_check/vecnorm.pkl and /dev/null differ diff --git a/training/runs/ppo_fix_check2.log b/training/runs/ppo_fix_check2.log deleted file mode 100644 index a345ff5..0000000 --- a/training/runs/ppo_fix_check2.log +++ /dev/null @@ -1,3391 +0,0 @@ -Using cpu device -Logging to runs/ppo_fix_check2/ppo_1 ------------------------------- -| time/ | | -| fps | 4605 | -| iterations | 1 | -| time_elapsed | 3 | -| total_timesteps | 16384 | ------------------------------- ------------------------------------------- -| time/ | | -| fps | 4011 | -| iterations | 2 | -| time_elapsed | 8 | -| total_timesteps | 32768 | -| train/ | | -| approx_kl | 0.0033352287 | -| clip_fraction | 0.0253 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.271 | -| learning_rate | 0.0003 | -| loss | -0.00687 | -| n_updates | 10 | -| policy_gradient_loss | -0.00103 | -| std | 0.996 | -| value_loss | 0.0684 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 3789 | -| iterations | 3 | -| time_elapsed | 12 | -| total_timesteps | 49152 | -| train/ | | -| approx_kl | 0.005950423 | -| clip_fraction | 0.0552 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.527 | -| learning_rate | 0.0003 | -| loss | -0.0153 | -| n_updates | 20 | -| policy_gradient_loss | -0.0029 | -| std | 0.997 | -| value_loss | 0.0663 | ------------------------------------------ -/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. - warnings.warn( -Eval num_timesteps=50000, episode_reward=-25.68 +/- 59.67 -Episode length: 1815.95 +/- 456.88 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.82e+03 | -| mean_reward | -25.7 | -| time/ | | -| total_timesteps | 50000 | -| train/ | | -| approx_kl | 0.0040030424 | -| clip_fraction | 0.0356 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.421 | -| learning_rate | 0.0003 | -| loss | 0.149 | -| n_updates | 30 | -| policy_gradient_loss | -0.00198 | -| std | 1.01 | -| value_loss | 0.114 | ------------------------------------------- -New best mean reward! ------------------------------- -| time/ | | -| fps | 2351 | -| iterations | 4 | -| time_elapsed | 27 | -| total_timesteps | 65536 | ------------------------------- ------------------------------------------ -| time/ | | -| fps | 2446 | -| iterations | 5 | -| time_elapsed | 33 | -| total_timesteps | 81920 | -| train/ | | -| approx_kl | 0.005522004 | -| clip_fraction | 0.0604 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.737 | -| learning_rate | 0.0003 | -| loss | -0.0301 | -| n_updates | 40 | -| policy_gradient_loss | -0.00434 | -| std | 1.01 | -| value_loss | 0.0164 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 2617 | -| iterations | 6 | -| time_elapsed | 37 | -| total_timesteps | 98304 | -| train/ | | -| approx_kl | 0.0052388343 | -| clip_fraction | 0.0463 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.626 | -| learning_rate | 0.0003 | -| loss | -0.0294 | -| n_updates | 50 | -| policy_gradient_loss | -0.00297 | -| std | 1.01 | -| value_loss | 0.0597 | ------------------------------------------- -/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. - warnings.warn( -Eval num_timesteps=100000, episode_reward=-22.76 +/- 46.60 -Episode length: 1900.95 +/- 430.60 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.9e+03 | -| mean_reward | -22.8 | -| time/ | | -| total_timesteps | 100000 | -| train/ | | -| approx_kl | 0.005612197 | -| clip_fraction | 0.0475 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.747 | -| learning_rate | 0.0003 | -| loss | -0.0261 | -| n_updates | 60 | -| policy_gradient_loss | -0.00393 | -| std | 1.01 | -| value_loss | 0.0517 | ------------------------------------------ -New best mean reward! -------------------------------- -| time/ | | -| fps | 2178 | -| iterations | 7 | -| time_elapsed | 52 | -| total_timesteps | 114688 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2294 | -| iterations | 8 | -| time_elapsed | 57 | -| total_timesteps | 131072 | -| train/ | | -| approx_kl | 0.0057119504 | -| clip_fraction | 0.0541 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.896 | -| learning_rate | 0.0003 | -| loss | -0.0144 | -| n_updates | 70 | -| policy_gradient_loss | -0.00364 | -| std | 1 | -| value_loss | 0.0738 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2393 | -| iterations | 9 | -| time_elapsed | 61 | -| total_timesteps | 147456 | -| train/ | | -| approx_kl | 0.005940904 | -| clip_fraction | 0.0565 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.89 | -| learning_rate | 0.0003 | -| loss | -0.0283 | -| n_updates | 80 | -| policy_gradient_loss | -0.00245 | -| std | 1.01 | -| value_loss | 0.0761 | ------------------------------------------ -Eval num_timesteps=150000, episode_reward=-29.37 +/- 28.32 -Episode length: 1997.50 +/- 10.90 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -29.4 | -| time/ | | -| total_timesteps | 150000 | -| train/ | | -| approx_kl | 0.004531667 | -| clip_fraction | 0.0392 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.958 | -| learning_rate | 0.0003 | -| loss | -0.0343 | -| n_updates | 90 | -| policy_gradient_loss | -0.00379 | -| std | 1.01 | -| value_loss | 0.00995 | ------------------------------------------ - -[Diag @ 150,000 | n_sheep=1 | success=0%] - COMPACT_CANT_DRIVE 17/20 - DROVE_NO_SHEEP 3/20 - action_mag mean=0.089 p10=0.003 p90=0.274 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=4.40m best=2.07m (FLEE_DIST=7m) - min_com_to_pen mean=11.66m best=1.50m - reward/step (mean): progress=+0.0004 alignment=+0.0000 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 -------------------------------- -| time/ | | -| fps | 1950 | -| iterations | 10 | -| time_elapsed | 84 | -| total_timesteps | 163840 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2020 | -| iterations | 11 | -| time_elapsed | 89 | -| total_timesteps | 180224 | -| train/ | | -| approx_kl | 0.0061831754 | -| clip_fraction | 0.068 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.975 | -| learning_rate | 0.0003 | -| loss | -0.0349 | -| n_updates | 100 | -| policy_gradient_loss | -0.00607 | -| std | 1.02 | -| value_loss | 0.0156 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2084 | -| iterations | 12 | -| time_elapsed | 94 | -| total_timesteps | 196608 | -| train/ | | -| approx_kl | 0.009407628 | -| clip_fraction | 0.123 | -| clip_range | 0.2 | -| entropy_loss | -2.87 | -| explained_variance | 0.899 | -| learning_rate | 0.0003 | -| loss | -0.0305 | -| n_updates | 110 | -| policy_gradient_loss | -0.00932 | -| std | 1.02 | -| value_loss | 0.0223 | ------------------------------------------ -Eval num_timesteps=200000, episode_reward=-12.36 +/- 51.37 -Episode length: 1880.20 +/- 355.04 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.88e+03 | -| mean_reward | -12.4 | -| time/ | | -| total_timesteps | 200000 | -| train/ | | -| approx_kl | 0.008270489 | -| clip_fraction | 0.0945 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.945 | -| learning_rate | 0.0003 | -| loss | -0.0339 | -| n_updates | 120 | -| policy_gradient_loss | -0.00809 | -| std | 1 | -| value_loss | 0.0162 | ------------------------------------------ -New best mean reward! -------------------------------- -| time/ | | -| fps | 1936 | -| iterations | 13 | -| time_elapsed | 109 | -| total_timesteps | 212992 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1989 | -| iterations | 14 | -| time_elapsed | 115 | -| total_timesteps | 229376 | -| train/ | | -| approx_kl | 0.008541125 | -| clip_fraction | 0.112 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.944 | -| learning_rate | 0.0003 | -| loss | -0.0184 | -| n_updates | 130 | -| policy_gradient_loss | -0.00846 | -| std | 0.994 | -| value_loss | 0.0284 | ------------------------------------------ ----------------------------------------- -| time/ | | -| fps | 2037 | -| iterations | 15 | -| time_elapsed | 120 | -| total_timesteps | 245760 | -| train/ | | -| approx_kl | 0.00763176 | -| clip_fraction | 0.0894 | -| clip_range | 0.2 | -| entropy_loss | -2.81 | -| explained_variance | 0.9 | -| learning_rate | 0.0003 | -| loss | -0.0128 | -| n_updates | 140 | -| policy_gradient_loss | -0.00655 | -| std | 0.987 | -| value_loss | 0.071 | ----------------------------------------- -Eval num_timesteps=250000, episode_reward=45.82 +/- 68.33 -Episode length: 1391.70 +/- 757.58 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.39e+03 | -| mean_reward | 45.8 | -| time/ | | -| total_timesteps | 250000 | -| train/ | | -| approx_kl | 0.009210973 | -| clip_fraction | 0.11 | -| clip_range | 0.2 | -| entropy_loss | -2.81 | -| explained_variance | 0.95 | -| learning_rate | 0.0003 | -| loss | -0.0401 | -| n_updates | 150 | -| policy_gradient_loss | -0.0082 | -| std | 0.986 | -| value_loss | 0.0202 | ------------------------------------------ -New best mean reward! -------------------------------- -| time/ | | -| fps | 1958 | -| iterations | 16 | -| time_elapsed | 133 | -| total_timesteps | 262144 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2005 | -| iterations | 17 | -| time_elapsed | 138 | -| total_timesteps | 278528 | -| train/ | | -| approx_kl | 0.008197077 | -| clip_fraction | 0.096 | -| clip_range | 0.2 | -| entropy_loss | -2.79 | -| explained_variance | 0.949 | -| learning_rate | 0.0003 | -| loss | -0.0375 | -| n_updates | 160 | -| policy_gradient_loss | -0.00834 | -| std | 0.976 | -| value_loss | 0.0207 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2061 | -| iterations | 18 | -| time_elapsed | 143 | -| total_timesteps | 294912 | -| train/ | | -| approx_kl | 0.006078005 | -| clip_fraction | 0.0598 | -| clip_range | 0.2 | -| entropy_loss | -2.78 | -| explained_variance | 0.965 | -| learning_rate | 0.0003 | -| loss | -0.0188 | -| n_updates | 170 | -| policy_gradient_loss | -0.00464 | -| std | 0.969 | -| value_loss | 0.0178 | ------------------------------------------ -Eval num_timesteps=300000, episode_reward=56.19 +/- 63.26 -Episode length: 1246.75 +/- 843.82 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.25e+03 | -| mean_reward | 56.2 | -| time/ | | -| total_timesteps | 300000 | -| train/ | | -| approx_kl | 0.0056289425 | -| clip_fraction | 0.0523 | -| clip_range | 0.2 | -| entropy_loss | -2.76 | -| explained_variance | 0.969 | -| learning_rate | 0.0003 | -| loss | -0.0246 | -| n_updates | 180 | -| policy_gradient_loss | -0.00378 | -| std | 0.961 | -| value_loss | 0.0174 | ------------------------------------------- -New best mean reward! - -[Diag @ 300,000 | n_sheep=1 | success=40%] - DROVE_NO_SHEEP 11/20 - SUCCESS 8/20 - COMPACT_CANT_DRIVE 1/20 - action_mag mean=0.076 p10=0.000 p90=0.193 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=2.83m best=0.24m (FLEE_DIST=7m) - min_com_to_pen mean=2.99m best=1.50m - reward/step (mean): progress=+0.0236 alignment=+0.0012 pen_bonus=+0.0029 step_cost=-0.0200 complete=+0.0291 -------------------------------- -| time/ | | -| fps | 1939 | -| iterations | 19 | -| time_elapsed | 160 | -| total_timesteps | 311296 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1983 | -| iterations | 20 | -| time_elapsed | 165 | -| total_timesteps | 327680 | -| train/ | | -| approx_kl | 0.005042998 | -| clip_fraction | 0.05 | -| clip_range | 0.2 | -| entropy_loss | -2.73 | -| explained_variance | 0.941 | -| learning_rate | 0.0003 | -| loss | -0.0242 | -| n_updates | 190 | -| policy_gradient_loss | -0.00399 | -| std | 0.947 | -| value_loss | 0.00505 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 2018 | -| iterations | 21 | -| time_elapsed | 170 | -| total_timesteps | 344064 | -| train/ | | -| approx_kl | 0.0054986854 | -| clip_fraction | 0.0569 | -| clip_range | 0.2 | -| entropy_loss | -2.72 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | -0.0248 | -| n_updates | 200 | -| policy_gradient_loss | -0.00415 | -| std | 0.941 | -| value_loss | 0.00784 | ------------------------------------------- -Eval num_timesteps=350000, episode_reward=25.08 +/- 61.55 -Episode length: 1562.00 +/- 761.23 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.56e+03 | -| mean_reward | 25.1 | -| time/ | | -| total_timesteps | 350000 | -| train/ | | -| approx_kl | 0.0046333643 | -| clip_fraction | 0.0476 | -| clip_range | 0.2 | -| entropy_loss | -2.71 | -| explained_variance | 0.934 | -| learning_rate | 0.0003 | -| loss | -0.0244 | -| n_updates | 210 | -| policy_gradient_loss | -0.00237 | -| std | 0.934 | -| value_loss | 0.00827 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1950 | -| iterations | 22 | -| time_elapsed | 184 | -| total_timesteps | 360448 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1990 | -| iterations | 23 | -| time_elapsed | 189 | -| total_timesteps | 376832 | -| train/ | | -| approx_kl | 0.006686668 | -| clip_fraction | 0.0757 | -| clip_range | 0.2 | -| entropy_loss | -2.7 | -| explained_variance | 0.963 | -| learning_rate | 0.0003 | -| loss | -0.0423 | -| n_updates | 220 | -| policy_gradient_loss | -0.00244 | -| std | 0.936 | -| value_loss | 0.00575 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2027 | -| iterations | 24 | -| time_elapsed | 193 | -| total_timesteps | 393216 | -| train/ | | -| approx_kl | 0.009116547 | -| clip_fraction | 0.103 | -| clip_range | 0.2 | -| entropy_loss | -2.71 | -| explained_variance | 0.97 | -| learning_rate | 0.0003 | -| loss | -0.0353 | -| n_updates | 230 | -| policy_gradient_loss | -0.0042 | -| std | 0.941 | -| value_loss | 0.006 | ------------------------------------------ -Eval num_timesteps=400000, episode_reward=56.91 +/- 71.91 -Episode length: 1225.25 +/- 861.21 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.23e+03 | -| mean_reward | 56.9 | -| time/ | | -| total_timesteps | 400000 | -| train/ | | -| approx_kl | 0.0061917743 | -| clip_fraction | 0.0658 | -| clip_range | 0.2 | -| entropy_loss | -2.72 | -| explained_variance | 0.975 | -| learning_rate | 0.0003 | -| loss | -0.0378 | -| n_updates | 240 | -| policy_gradient_loss | -0.00282 | -| std | 0.943 | -| value_loss | 0.00633 | ------------------------------------------- -New best mean reward! -------------------------------- -| time/ | | -| fps | 1981 | -| iterations | 25 | -| time_elapsed | 206 | -| total_timesteps | 409600 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2011 | -| iterations | 26 | -| time_elapsed | 211 | -| total_timesteps | 425984 | -| train/ | | -| approx_kl | 0.007945089 | -| clip_fraction | 0.1 | -| clip_range | 0.2 | -| entropy_loss | -2.73 | -| explained_variance | 0.978 | -| learning_rate | 0.0003 | -| loss | -0.0343 | -| n_updates | 250 | -| policy_gradient_loss | -0.00475 | -| std | 0.95 | -| value_loss | 0.00708 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2044 | -| iterations | 27 | -| time_elapsed | 216 | -| total_timesteps | 442368 | -| train/ | | -| approx_kl | 0.013059773 | -| clip_fraction | 0.152 | -| clip_range | 0.2 | -| entropy_loss | -2.76 | -| explained_variance | 0.984 | -| learning_rate | 0.0003 | -| loss | -0.0421 | -| n_updates | 260 | -| policy_gradient_loss | -0.00542 | -| std | 0.967 | -| value_loss | 0.00331 | ------------------------------------------ -Eval num_timesteps=450000, episode_reward=58.80 +/- 74.46 -Episode length: 1123.15 +/- 881.85 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.12e+03 | -| mean_reward | 58.8 | -| time/ | | -| total_timesteps | 450000 | -| train/ | | -| approx_kl | 0.0085322345 | -| clip_fraction | 0.0967 | -| clip_range | 0.2 | -| entropy_loss | -2.77 | -| explained_variance | 0.98 | -| learning_rate | 0.0003 | -| loss | -0.0264 | -| n_updates | 270 | -| policy_gradient_loss | -0.00612 | -| std | 0.963 | -| value_loss | 0.00919 | ------------------------------------------- -New best mean reward! - -[Diag @ 450,000 | n_sheep=1 | success=65%] - SUCCESS 13/20 - DROVE_NO_SHEEP 4/20 - COMPACT_CANT_DRIVE 3/20 - action_mag mean=0.105 p10=0.000 p90=0.272 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=1.67m best=0.43m (FLEE_DIST=7m) - min_com_to_pen mean=3.26m best=2.29m - reward/step (mean): progress=+0.0326 alignment=+0.0024 pen_bonus=+0.0076 step_cost=-0.0200 complete=+0.0762 -------------------------------- -| time/ | | -| fps | 1974 | -| iterations | 28 | -| time_elapsed | 232 | -| total_timesteps | 458752 | -------------------------------- ----------------------------------------- -| time/ | | -| fps | 2005 | -| iterations | 29 | -| time_elapsed | 236 | -| total_timesteps | 475136 | -| train/ | | -| approx_kl | 0.01203198 | -| clip_fraction | 0.146 | -| clip_range | 0.2 | -| entropy_loss | -2.79 | -| explained_variance | 0.963 | -| learning_rate | 0.0003 | -| loss | 0.00738 | -| n_updates | 280 | -| policy_gradient_loss | -0.0128 | -| std | 0.982 | -| value_loss | 0.0749 | ----------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2037 | -| iterations | 30 | -| time_elapsed | 241 | -| total_timesteps | 491520 | -| train/ | | -| approx_kl | 0.0078244675 | -| clip_fraction | 0.0856 | -| clip_range | 0.2 | -| entropy_loss | -2.8 | -| explained_variance | 0.937 | -| learning_rate | 0.0003 | -| loss | 0.0631 | -| n_updates | 290 | -| policy_gradient_loss | -0.00651 | -| std | 0.977 | -| value_loss | 0.131 | ------------------------------------------- -Eval num_timesteps=500000, episode_reward=135.29 +/- 9.81 -Episode length: 287.30 +/- 88.71 ----------------------------------------- -| eval/ | | -| mean_ep_length | 287 | -| mean_reward | 135 | -| time/ | | -| total_timesteps | 500000 | -| train/ | | -| approx_kl | 0.00837522 | -| clip_fraction | 0.0866 | -| clip_range | 0.2 | -| entropy_loss | -2.77 | -| explained_variance | 0.948 | -| learning_rate | 0.0003 | -| loss | 0.041 | -| n_updates | 300 | -| policy_gradient_loss | -0.00532 | -| std | 0.962 | -| value_loss | 0.0898 | ----------------------------------------- -New best mean reward! -------------------------------- -| time/ | | -| fps | 2048 | -| iterations | 31 | -| time_elapsed | 247 | -| total_timesteps | 507904 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2070 | -| iterations | 32 | -| time_elapsed | 253 | -| total_timesteps | 524288 | -| train/ | | -| approx_kl | 0.0067581255 | -| clip_fraction | 0.0543 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.932 | -| learning_rate | 0.0003 | -| loss | 0.0518 | -| n_updates | 310 | -| policy_gradient_loss | -0.00297 | -| std | 0.954 | -| value_loss | 0.111 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2090 | -| iterations | 33 | -| time_elapsed | 258 | -| total_timesteps | 540672 | -| train/ | | -| approx_kl | 0.0066835573 | -| clip_fraction | 0.0597 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.934 | -| learning_rate | 0.0003 | -| loss | 0.00545 | -| n_updates | 320 | -| policy_gradient_loss | -0.00508 | -| std | 0.949 | -| value_loss | 0.101 | ------------------------------------------- -Eval num_timesteps=550000, episode_reward=136.08 +/- 11.93 -Episode length: 285.80 +/- 123.59 ------------------------------------------- -| eval/ | | -| mean_ep_length | 286 | -| mean_reward | 136 | -| time/ | | -| total_timesteps | 550000 | -| train/ | | -| approx_kl | 0.0062076193 | -| clip_fraction | 0.0672 | -| clip_range | 0.2 | -| entropy_loss | -2.71 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | 0.0229 | -| n_updates | 330 | -| policy_gradient_loss | -0.00616 | -| std | 0.933 | -| value_loss | 0.0813 | ------------------------------------------- -New best mean reward! -------------------------------- -| time/ | | -| fps | 2104 | -| iterations | 34 | -| time_elapsed | 264 | -| total_timesteps | 557056 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2130 | -| iterations | 35 | -| time_elapsed | 269 | -| total_timesteps | 573440 | -| train/ | | -| approx_kl | 0.0064913128 | -| clip_fraction | 0.0631 | -| clip_range | 0.2 | -| entropy_loss | -2.67 | -| explained_variance | 0.971 | -| learning_rate | 0.0003 | -| loss | -0.0199 | -| n_updates | 340 | -| policy_gradient_loss | -0.00631 | -| std | 0.917 | -| value_loss | 0.0185 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2155 | -| iterations | 36 | -| time_elapsed | 273 | -| total_timesteps | 589824 | -| train/ | | -| approx_kl | 0.0067110434 | -| clip_fraction | 0.0719 | -| clip_range | 0.2 | -| entropy_loss | -2.63 | -| explained_variance | 0.98 | -| learning_rate | 0.0003 | -| loss | -0.0343 | -| n_updates | 350 | -| policy_gradient_loss | -0.0069 | -| std | 0.897 | -| value_loss | 0.0113 | ------------------------------------------- -Eval num_timesteps=600000, episode_reward=135.45 +/- 12.96 -Episode length: 273.05 +/- 118.26 ------------------------------------------- -| eval/ | | -| mean_ep_length | 273 | -| mean_reward | 135 | -| time/ | | -| total_timesteps | 600000 | -| train/ | | -| approx_kl | 0.0054842415 | -| clip_fraction | 0.0564 | -| clip_range | 0.2 | -| entropy_loss | -2.59 | -| explained_variance | 0.983 | -| learning_rate | 0.0003 | -| loss | -0.033 | -| n_updates | 360 | -| policy_gradient_loss | -0.0042 | -| std | 0.883 | -| value_loss | 0.00479 | ------------------------------------------- - -[Diag @ 600,000 | n_sheep=1 | success=100%] - SUCCESS 20/20 - action_mag mean=0.343 p10=0.232 p90=0.548 (0=stopped, 1=full speed) - min_flock_radius mean=0.00m best=0.00m (target <5m to compact) - min_dog_to_com mean=1.53m best=0.76m (FLEE_DIST=7m) - min_com_to_pen mean=3.49m best=2.84m - reward/step (mean): progress=+0.1066 alignment=+0.0088 pen_bonus=+0.0357 step_cost=-0.0200 complete=+0.3567 - -[Curriculum] leaving stage n_sheep=1 after 600,000 steps | training success rate (last 100 eps) = 100% -[Curriculum] → 2 sheep at step 600,000 - -------------------------------- -| time/ | | -| fps | 2156 | -| iterations | 37 | -| time_elapsed | 281 | -| total_timesteps | 606208 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2173 | -| iterations | 38 | -| time_elapsed | 286 | -| total_timesteps | 622592 | -| train/ | | -| approx_kl | 0.011170821 | -| clip_fraction | 0.117 | -| clip_range | 0.2 | -| entropy_loss | -2.59 | -| explained_variance | 0.924 | -| learning_rate | 0.0003 | -| loss | -0.0137 | -| n_updates | 370 | -| policy_gradient_loss | 0.00714 | -| std | 0.886 | -| value_loss | 0.0417 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2192 | -| iterations | 39 | -| time_elapsed | 291 | -| total_timesteps | 638976 | -| train/ | | -| approx_kl | 0.012632904 | -| clip_fraction | 0.156 | -| clip_range | 0.2 | -| entropy_loss | -2.6 | -| explained_variance | 0.858 | -| learning_rate | 0.0003 | -| loss | -0.00445 | -| n_updates | 380 | -| policy_gradient_loss | 0.00112 | -| std | 0.892 | -| value_loss | 0.0156 | ------------------------------------------ -Eval num_timesteps=650000, episode_reward=-38.36 +/- 29.94 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -38.4 | -| time/ | | -| total_timesteps | 650000 | -| train/ | | -| approx_kl | 0.012015635 | -| clip_fraction | 0.133 | -| clip_range | 0.2 | -| entropy_loss | -2.62 | -| explained_variance | 0.946 | -| learning_rate | 0.0003 | -| loss | -0.0168 | -| n_updates | 390 | -| policy_gradient_loss | -0.000726 | -| std | 0.904 | -| value_loss | 0.0126 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 2131 | -| iterations | 40 | -| time_elapsed | 307 | -| total_timesteps | 655360 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2145 | -| iterations | 41 | -| time_elapsed | 313 | -| total_timesteps | 671744 | -| train/ | | -| approx_kl | 0.009391339 | -| clip_fraction | 0.121 | -| clip_range | 0.2 | -| entropy_loss | -2.63 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.0164 | -| n_updates | 400 | -| policy_gradient_loss | -0.00177 | -| std | 0.905 | -| value_loss | 0.00536 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 2156 | -| iterations | 42 | -| time_elapsed | 319 | -| total_timesteps | 688128 | -| train/ | | -| approx_kl | 0.0077482145 | -| clip_fraction | 0.0977 | -| clip_range | 0.2 | -| entropy_loss | -2.64 | -| explained_variance | 0.895 | -| learning_rate | 0.0003 | -| loss | -0.023 | -| n_updates | 410 | -| policy_gradient_loss | -0.00158 | -| std | 0.908 | -| value_loss | 0.0068 | ------------------------------------------- -Eval num_timesteps=700000, episode_reward=-16.26 +/- 48.54 -Episode length: 1934.20 +/- 286.82 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.93e+03 | -| mean_reward | -16.3 | -| time/ | | -| total_timesteps | 700000 | -| train/ | | -| approx_kl | 0.007948186 | -| clip_fraction | 0.0933 | -| clip_range | 0.2 | -| entropy_loss | -2.64 | -| explained_variance | 0.934 | -| learning_rate | 0.0003 | -| loss | -0.0205 | -| n_updates | 420 | -| policy_gradient_loss | -0.00233 | -| std | 0.904 | -| value_loss | 0.00556 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 2093 | -| iterations | 43 | -| time_elapsed | 336 | -| total_timesteps | 704512 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2109 | -| iterations | 44 | -| time_elapsed | 341 | -| total_timesteps | 720896 | -| train/ | | -| approx_kl | 0.0077707805 | -| clip_fraction | 0.101 | -| clip_range | 0.2 | -| entropy_loss | -2.64 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.00469 | -| n_updates | 430 | -| policy_gradient_loss | -0.00226 | -| std | 0.909 | -| value_loss | 0.0031 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2129 | -| iterations | 45 | -| time_elapsed | 346 | -| total_timesteps | 737280 | -| train/ | | -| approx_kl | 0.0063995067 | -| clip_fraction | 0.0823 | -| clip_range | 0.2 | -| entropy_loss | -2.66 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.0249 | -| n_updates | 440 | -| policy_gradient_loss | -0.00261 | -| std | 0.922 | -| value_loss | 0.00343 | ------------------------------------------- -Eval num_timesteps=750000, episode_reward=-12.10 +/- 56.78 -Episode length: 1850.50 +/- 449.09 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.85e+03 | -| mean_reward | -12.1 | -| time/ | | -| total_timesteps | 750000 | -| train/ | | -| approx_kl | 0.0069549307 | -| clip_fraction | 0.0847 | -| clip_range | 0.2 | -| entropy_loss | -2.68 | -| explained_variance | 0.862 | -| learning_rate | 0.0003 | -| loss | -0.0192 | -| n_updates | 450 | -| policy_gradient_loss | -0.00165 | -| std | 0.929 | -| value_loss | 0.0032 | ------------------------------------------- - -[Diag @ 750,000 | n_sheep=2 | success=5%] - COMPACT_CANT_DRIVE 9/20 - NEVER_COMPACT 9/20 - PARTIAL_1of2 1/20 - SUCCESS 1/20 - action_mag mean=0.261 p10=0.002 p90=0.983 (0=stopped, 1=full speed) - min_flock_radius mean=3.93m best=0.00m (target <5m to compact) - min_dog_to_com mean=0.79m best=0.07m (FLEE_DIST=7m) - min_com_to_pen mean=13.43m best=1.62m - reward/step (mean): progress=-0.0058 alignment=+0.0087 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0025 -------------------------------- -| time/ | | -| fps | 2043 | -| iterations | 46 | -| time_elapsed | 368 | -| total_timesteps | 753664 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2062 | -| iterations | 47 | -| time_elapsed | 373 | -| total_timesteps | 770048 | -| train/ | | -| approx_kl | 0.008165602 | -| clip_fraction | 0.0997 | -| clip_range | 0.2 | -| entropy_loss | -2.69 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0461 | -| n_updates | 460 | -| policy_gradient_loss | -0.00412 | -| std | 0.932 | -| value_loss | 0.00308 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2074 | -| iterations | 48 | -| time_elapsed | 379 | -| total_timesteps | 786432 | -| train/ | | -| approx_kl | 0.006088208 | -| clip_fraction | 0.0805 | -| clip_range | 0.2 | -| entropy_loss | -2.71 | -| explained_variance | 0.917 | -| learning_rate | 0.0003 | -| loss | -0.034 | -| n_updates | 470 | -| policy_gradient_loss | -0.000257 | -| std | 0.943 | -| value_loss | 0.00533 | ------------------------------------------ -Eval num_timesteps=800000, episode_reward=-32.78 +/- 23.33 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -32.8 | -| time/ | | -| total_timesteps | 800000 | -| train/ | | -| approx_kl | 0.0069386996 | -| clip_fraction | 0.0883 | -| clip_range | 0.2 | -| entropy_loss | -2.73 | -| explained_variance | 0.954 | -| learning_rate | 0.0003 | -| loss | -0.0361 | -| n_updates | 480 | -| policy_gradient_loss | -0.00228 | -| std | 0.948 | -| value_loss | 0.00495 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 2028 | -| iterations | 49 | -| time_elapsed | 395 | -| total_timesteps | 802816 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 2045 | -| iterations | 50 | -| time_elapsed | 400 | -| total_timesteps | 819200 | -| train/ | | -| approx_kl | 0.0070893797 | -| clip_fraction | 0.0687 | -| clip_range | 0.2 | -| entropy_loss | -2.74 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.035 | -| n_updates | 490 | -| policy_gradient_loss | -0.00221 | -| std | 0.954 | -| value_loss | 0.00229 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 2060 | -| iterations | 51 | -| time_elapsed | 405 | -| total_timesteps | 835584 | -| train/ | | -| approx_kl | 0.0068652867 | -| clip_fraction | 0.0787 | -| clip_range | 0.2 | -| entropy_loss | -2.75 | -| explained_variance | 0.863 | -| learning_rate | 0.0003 | -| loss | -0.0337 | -| n_updates | 500 | -| policy_gradient_loss | -0.00277 | -| std | 0.959 | -| value_loss | 0.00229 | ------------------------------------------- -Eval num_timesteps=850000, episode_reward=-14.34 +/- 48.77 -Episode length: 1998.40 +/- 6.97 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -14.3 | -| time/ | | -| total_timesteps | 850000 | -| train/ | | -| approx_kl | 0.007872021 | -| clip_fraction | 0.0815 | -| clip_range | 0.2 | -| entropy_loss | -2.76 | -| explained_variance | 0.852 | -| learning_rate | 0.0003 | -| loss | -0.0358 | -| n_updates | 510 | -| policy_gradient_loss | -0.00365 | -| std | 0.966 | -| value_loss | 0.00272 | ------------------------------------------ -------------------------------- -| time/ | | -| fps | 2018 | -| iterations | 52 | -| time_elapsed | 422 | -| total_timesteps | 851968 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 2032 | -| iterations | 53 | -| time_elapsed | 427 | -| total_timesteps | 868352 | -| train/ | | -| approx_kl | 0.007002457 | -| clip_fraction | 0.0752 | -| clip_range | 0.2 | -| entropy_loss | -2.78 | -| explained_variance | 0.879 | -| learning_rate | 0.0003 | -| loss | -0.0414 | -| n_updates | 520 | -| policy_gradient_loss | -0.00242 | -| std | 0.977 | -| value_loss | 0.00166 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2047 | -| iterations | 54 | -| time_elapsed | 432 | -| total_timesteps | 884736 | -| train/ | | -| approx_kl | 0.007822147 | -| clip_fraction | 0.0813 | -| clip_range | 0.2 | -| entropy_loss | -2.8 | -| explained_variance | 0.871 | -| learning_rate | 0.0003 | -| loss | -0.0376 | -| n_updates | 530 | -| policy_gradient_loss | -0.00362 | -| std | 0.984 | -| value_loss | 0.00212 | ------------------------------------------ -Eval num_timesteps=900000, episode_reward=-20.41 +/- 60.01 -Episode length: 1929.40 +/- 284.99 ----------------------------------------- -| eval/ | | -| mean_ep_length | 1.93e+03 | -| mean_reward | -20.4 | -| time/ | | -| total_timesteps | 900000 | -| train/ | | -| approx_kl | 0.00738756 | -| clip_fraction | 0.0793 | -| clip_range | 0.2 | -| entropy_loss | -2.81 | -| explained_variance | 0.808 | -| learning_rate | 0.0003 | -| loss | -0.0355 | -| n_updates | 540 | -| policy_gradient_loss | -0.00195 | -| std | 0.988 | -| value_loss | 0.00721 | ----------------------------------------- - -[Diag @ 900,000 | n_sheep=2 | success=5%] - COMPACT_CANT_DRIVE 11/20 - NEVER_COMPACT 8/20 - SUCCESS 1/20 - action_mag mean=0.203 p10=0.007 p90=0.704 (0=stopped, 1=full speed) - min_flock_radius mean=3.40m best=0.00m (target <5m to compact) - min_dog_to_com mean=0.60m best=0.11m (FLEE_DIST=7m) - min_com_to_pen mean=14.01m best=3.61m - reward/step (mean): progress=-0.0040 alignment=+0.0071 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0026 -------------------------------- -| time/ | | -| fps | 1977 | -| iterations | 55 | -| time_elapsed | 455 | -| total_timesteps | 901120 | -------------------------------- ------------------------------------------ -| time/ | | -| fps | 1990 | -| iterations | 56 | -| time_elapsed | 460 | -| total_timesteps | 917504 | -| train/ | | -| approx_kl | 0.007000256 | -| clip_fraction | 0.0831 | -| clip_range | 0.2 | -| entropy_loss | -2.8 | -| explained_variance | 0.889 | -| learning_rate | 0.0003 | -| loss | -0.0285 | -| n_updates | 550 | -| policy_gradient_loss | -0.00402 | -| std | 0.984 | -| value_loss | 0.00171 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 2005 | -| iterations | 57 | -| time_elapsed | 465 | -| total_timesteps | 933888 | -| train/ | | -| approx_kl | 0.007749311 | -| clip_fraction | 0.0755 | -| clip_range | 0.2 | -| entropy_loss | -2.83 | -| explained_variance | 0.599 | -| learning_rate | 0.0003 | -| loss | -0.032 | -| n_updates | 560 | -| policy_gradient_loss | -0.00239 | -| std | 1.01 | -| value_loss | 0.00351 | ------------------------------------------ -Eval num_timesteps=950000, episode_reward=-13.16 +/- 44.70 -Episode length: 1949.30 +/- 221.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.95e+03 | -| mean_reward | -13.2 | -| time/ | | -| total_timesteps | 950000 | -| train/ | | -| approx_kl | 0.0075328955 | -| clip_fraction | 0.0829 | -| clip_range | 0.2 | -| entropy_loss | -2.85 | -| explained_variance | 0.783 | -| learning_rate | 0.0003 | -| loss | -0.0306 | -| n_updates | 570 | -| policy_gradient_loss | -0.00352 | -| std | 1.01 | -| value_loss | 0.00319 | ------------------------------------------- -------------------------------- -| time/ | | -| fps | 1971 | -| iterations | 58 | -| time_elapsed | 482 | -| total_timesteps | 950272 | -------------------------------- ------------------------------------------- -| time/ | | -| fps | 1981 | -| iterations | 59 | -| time_elapsed | 487 | -| total_timesteps | 966656 | -| train/ | | -| approx_kl | 0.0072506005 | -| clip_fraction | 0.0835 | -| clip_range | 0.2 | -| entropy_loss | -2.86 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0291 | -| n_updates | 580 | -| policy_gradient_loss | -0.00173 | -| std | 1.01 | -| value_loss | 0.00491 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1991 | -| iterations | 60 | -| time_elapsed | 493 | -| total_timesteps | 983040 | -| train/ | | -| approx_kl | 0.0068104668 | -| clip_fraction | 0.0799 | -| clip_range | 0.2 | -| entropy_loss | -2.87 | -| explained_variance | 0.813 | -| learning_rate | 0.0003 | -| loss | -0.0282 | -| n_updates | 590 | -| policy_gradient_loss | -0.00162 | -| std | 1.02 | -| value_loss | 0.00477 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 2005 | -| iterations | 61 | -| time_elapsed | 498 | -| total_timesteps | 999424 | -| train/ | | -| approx_kl | 0.007103944 | -| clip_fraction | 0.0774 | -| clip_range | 0.2 | -| entropy_loss | -2.88 | -| explained_variance | 0.942 | -| learning_rate | 0.0003 | -| loss | -0.0322 | -| n_updates | 600 | -| policy_gradient_loss | -0.00143 | -| std | 1.03 | -| value_loss | 0.0033 | ------------------------------------------ -Eval num_timesteps=1000000, episode_reward=-25.58 +/- 49.00 -Episode length: 1999.50 +/- 2.18 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -25.6 | -| time/ | | -| total_timesteps | 1000000 | -| train/ | | -| approx_kl | 0.0075788023 | -| clip_fraction | 0.088 | -| clip_range | 0.2 | -| entropy_loss | -2.9 | -| explained_variance | 0.864 | -| learning_rate | 0.0003 | -| loss | -0.0352 | -| n_updates | 610 | -| policy_gradient_loss | -0.003 | -| std | 1.04 | -| value_loss | 0.00192 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1971 | -| iterations | 62 | -| time_elapsed | 515 | -| total_timesteps | 1015808 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1983 | -| iterations | 63 | -| time_elapsed | 520 | -| total_timesteps | 1032192 | -| train/ | | -| approx_kl | 0.009131588 | -| clip_fraction | 0.0902 | -| clip_range | 0.2 | -| entropy_loss | -2.89 | -| explained_variance | 0.941 | -| learning_rate | 0.0003 | -| loss | -0.0476 | -| n_updates | 620 | -| policy_gradient_loss | -0.00341 | -| std | 1.03 | -| value_loss | 0.00705 | ------------------------------------------ ----------------------------------------- -| time/ | | -| fps | 1995 | -| iterations | 64 | -| time_elapsed | 525 | -| total_timesteps | 1048576 | -| train/ | | -| approx_kl | 0.00746674 | -| clip_fraction | 0.0838 | -| clip_range | 0.2 | -| entropy_loss | -2.89 | -| explained_variance | 0.958 | -| learning_rate | 0.0003 | -| loss | -0.022 | -| n_updates | 630 | -| policy_gradient_loss | -0.00392 | -| std | 1.03 | -| value_loss | 0.00592 | ----------------------------------------- -Eval num_timesteps=1050000, episode_reward=-12.04 +/- 64.56 -Episode length: 1889.90 +/- 333.38 ------------------------------------------- -| eval/ | | -| mean_ep_length | 1.89e+03 | -| mean_reward | -12 | -| time/ | | -| total_timesteps | 1050000 | -| train/ | | -| approx_kl | 0.0058071706 | -| clip_fraction | 0.0721 | -| clip_range | 0.2 | -| entropy_loss | -2.9 | -| explained_variance | 0.932 | -| learning_rate | 0.0003 | -| loss | -0.0188 | -| n_updates | 640 | -| policy_gradient_loss | -0.00235 | -| std | 1.03 | -| value_loss | 0.00513 | ------------------------------------------- - -[Diag @ 1,050,000 | n_sheep=2 | success=5%] - COMPACT_CANT_DRIVE 10/20 - NEVER_COMPACT 9/20 - SUCCESS 1/20 - action_mag mean=0.190 p10=0.001 p90=0.686 (0=stopped, 1=full speed) - min_flock_radius mean=4.60m best=0.00m (target <5m to compact) - min_dog_to_com mean=0.54m best=0.21m (FLEE_DIST=7m) - min_com_to_pen mean=13.05m best=3.62m - reward/step (mean): progress=-0.0023 alignment=+0.0072 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0025 --------------------------------- -| time/ | | -| fps | 1931 | -| iterations | 65 | -| time_elapsed | 551 | -| total_timesteps | 1064960 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1944 | -| iterations | 66 | -| time_elapsed | 556 | -| total_timesteps | 1081344 | -| train/ | | -| approx_kl | 0.006802067 | -| clip_fraction | 0.0701 | -| clip_range | 0.2 | -| entropy_loss | -2.92 | -| explained_variance | 0.937 | -| learning_rate | 0.0003 | -| loss | -0.0304 | -| n_updates | 650 | -| policy_gradient_loss | -0.0019 | -| std | 1.04 | -| value_loss | 0.00206 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1956 | -| iterations | 67 | -| time_elapsed | 561 | -| total_timesteps | 1097728 | -| train/ | | -| approx_kl | 0.007102525 | -| clip_fraction | 0.074 | -| clip_range | 0.2 | -| entropy_loss | -2.92 | -| explained_variance | 0.953 | -| learning_rate | 0.0003 | -| loss | -0.00869 | -| n_updates | 660 | -| policy_gradient_loss | -0.00208 | -| std | 1.04 | -| value_loss | 0.00579 | ------------------------------------------ -Eval num_timesteps=1100000, episode_reward=-29.51 +/- 23.80 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -29.5 | -| time/ | | -| total_timesteps | 1100000 | -| train/ | | -| approx_kl | 0.006372301 | -| clip_fraction | 0.0669 | -| clip_range | 0.2 | -| entropy_loss | -2.94 | -| explained_variance | 0.829 | -| learning_rate | 0.0003 | -| loss | -0.0349 | -| n_updates | 670 | -| policy_gradient_loss | -0.00135 | -| std | 1.06 | -| value_loss | 0.00208 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1932 | -| iterations | 68 | -| time_elapsed | 576 | -| total_timesteps | 1114112 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1942 | -| iterations | 69 | -| time_elapsed | 581 | -| total_timesteps | 1130496 | -| train/ | | -| approx_kl | 0.007083354 | -| clip_fraction | 0.0839 | -| clip_range | 0.2 | -| entropy_loss | -2.95 | -| explained_variance | 0.845 | -| learning_rate | 0.0003 | -| loss | -0.0464 | -| n_updates | 680 | -| policy_gradient_loss | -0.00298 | -| std | 1.06 | -| value_loss | 0.00747 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1954 | -| iterations | 70 | -| time_elapsed | 586 | -| total_timesteps | 1146880 | -| train/ | | -| approx_kl | 0.007034454 | -| clip_fraction | 0.0875 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.892 | -| learning_rate | 0.0003 | -| loss | -0.0382 | -| n_updates | 690 | -| policy_gradient_loss | -0.00359 | -| std | 1.06 | -| value_loss | 0.00208 | ------------------------------------------ -Eval num_timesteps=1150000, episode_reward=-20.98 +/- 49.18 -Episode length: 1959.70 +/- 175.66 ------------------------------------------ -| eval/ | | -| mean_ep_length | 1.96e+03 | -| mean_reward | -21 | -| time/ | | -| total_timesteps | 1150000 | -| train/ | | -| approx_kl | 0.006192833 | -| clip_fraction | 0.0626 | -| clip_range | 0.2 | -| entropy_loss | -2.94 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.0224 | -| n_updates | 700 | -| policy_gradient_loss | -0.00299 | -| std | 1.05 | -| value_loss | 0.00883 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1926 | -| iterations | 71 | -| time_elapsed | 603 | -| total_timesteps | 1163264 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1937 | -| iterations | 72 | -| time_elapsed | 608 | -| total_timesteps | 1179648 | -| train/ | | -| approx_kl | 0.008185772 | -| clip_fraction | 0.0969 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.944 | -| learning_rate | 0.0003 | -| loss | -0.0278 | -| n_updates | 710 | -| policy_gradient_loss | -0.00316 | -| std | 1.07 | -| value_loss | 0.00421 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1947 | -| iterations | 73 | -| time_elapsed | 614 | -| total_timesteps | 1196032 | -| train/ | | -| approx_kl | 0.0063469247 | -| clip_fraction | 0.065 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.912 | -| learning_rate | 0.0003 | -| loss | -0.0239 | -| n_updates | 720 | -| policy_gradient_loss | -0.00224 | -| std | 1.06 | -| value_loss | 0.0054 | ------------------------------------------- -Eval num_timesteps=1200000, episode_reward=-29.34 +/- 18.71 -Episode length: 2000.00 +/- 0.00 ----------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -29.3 | -| time/ | | -| total_timesteps | 1200000 | -| train/ | | -| approx_kl | 0.00778389 | -| clip_fraction | 0.0734 | -| clip_range | 0.2 | -| entropy_loss | -2.95 | -| explained_variance | 0.961 | -| learning_rate | 0.0003 | -| loss | -0.0435 | -| n_updates | 730 | -| policy_gradient_loss | -0.00184 | -| std | 1.06 | -| value_loss | 0.0048 | ----------------------------------------- - -[Diag @ 1,200,000 | n_sheep=2 | success=10%] - NEVER_COMPACT 9/20 - COMPACT_CANT_DRIVE 9/20 - SUCCESS 2/20 - action_mag mean=0.198 p10=0.002 p90=0.744 (0=stopped, 1=full speed) - min_flock_radius mean=3.94m best=0.00m (target <5m to compact) - min_dog_to_com mean=0.50m best=0.14m (FLEE_DIST=7m) - min_com_to_pen mean=11.36m best=3.58m - reward/step (mean): progress=-0.0002 alignment=+0.0073 pen_bonus=+0.0013 step_cost=-0.0200 complete=+0.0053 - -[Curriculum] leaving stage n_sheep=2 after 600,000 steps | training success rate (last 100 eps) = 5% -[Curriculum] → 3 sheep at step 1,200,000 - --------------------------------- -| time/ | | -| fps | 1898 | -| iterations | 74 | -| time_elapsed | 638 | -| total_timesteps | 1212416 | --------------------------------- ----------------------------------------- -| time/ | | -| fps | 1909 | -| iterations | 75 | -| time_elapsed | 643 | -| total_timesteps | 1228800 | -| train/ | | -| approx_kl | 0.00918101 | -| clip_fraction | 0.106 | -| clip_range | 0.2 | -| entropy_loss | -2.95 | -| explained_variance | 0.919 | -| learning_rate | 0.0003 | -| loss | -0.0112 | -| n_updates | 740 | -| policy_gradient_loss | -0.00123 | -| std | 1.06 | -| value_loss | 0.0427 | ----------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1917 | -| iterations | 76 | -| time_elapsed | 649 | -| total_timesteps | 1245184 | -| train/ | | -| approx_kl | 0.010076641 | -| clip_fraction | 0.137 | -| clip_range | 0.2 | -| entropy_loss | -2.94 | -| explained_variance | 0.919 | -| learning_rate | 0.0003 | -| loss | -0.0229 | -| n_updates | 750 | -| policy_gradient_loss | -0.000617 | -| std | 1.05 | -| value_loss | 0.0222 | ------------------------------------------ -Eval num_timesteps=1250000, episode_reward=-38.73 +/- 33.85 -Episode length: 2000.00 +/- 0.00 ---------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -38.7 | -| time/ | | -| total_timesteps | 1250000 | -| train/ | | -| approx_kl | 0.0084493 | -| clip_fraction | 0.109 | -| clip_range | 0.2 | -| entropy_loss | -2.96 | -| explained_variance | 0.96 | -| learning_rate | 0.0003 | -| loss | -0.0259 | -| n_updates | 760 | -| policy_gradient_loss | -0.00168 | -| std | 1.06 | -| value_loss | 0.0024 | ---------------------------------------- --------------------------------- -| time/ | | -| fps | 1890 | -| iterations | 77 | -| time_elapsed | 667 | -| total_timesteps | 1261568 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1899 | -| iterations | 78 | -| time_elapsed | 672 | -| total_timesteps | 1277952 | -| train/ | | -| approx_kl | 0.008724872 | -| clip_fraction | 0.109 | -| clip_range | 0.2 | -| entropy_loss | -2.98 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0293 | -| n_updates | 770 | -| policy_gradient_loss | -0.00204 | -| std | 1.08 | -| value_loss | 0.0067 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1906 | -| iterations | 79 | -| time_elapsed | 678 | -| total_timesteps | 1294336 | -| train/ | | -| approx_kl | 0.008191848 | -| clip_fraction | 0.096 | -| clip_range | 0.2 | -| entropy_loss | -2.99 | -| explained_variance | 0.963 | -| learning_rate | 0.0003 | -| loss | -0.0247 | -| n_updates | 780 | -| policy_gradient_loss | -0.002 | -| std | 1.08 | -| value_loss | 0.00632 | ------------------------------------------ -Eval num_timesteps=1300000, episode_reward=-26.68 +/- 27.12 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -26.7 | -| time/ | | -| total_timesteps | 1300000 | -| train/ | | -| approx_kl | 0.006018152 | -| clip_fraction | 0.0869 | -| clip_range | 0.2 | -| entropy_loss | -3 | -| explained_variance | 0.96 | -| learning_rate | 0.0003 | -| loss | -0.0311 | -| n_updates | 790 | -| policy_gradient_loss | -0.00129 | -| std | 1.09 | -| value_loss | 0.00189 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1881 | -| iterations | 80 | -| time_elapsed | 696 | -| total_timesteps | 1310720 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1892 | -| iterations | 81 | -| time_elapsed | 701 | -| total_timesteps | 1327104 | -| train/ | | -| approx_kl | 0.0077671953 | -| clip_fraction | 0.082 | -| clip_range | 0.2 | -| entropy_loss | -3.01 | -| explained_variance | 0.972 | -| learning_rate | 0.0003 | -| loss | -0.0308 | -| n_updates | 800 | -| policy_gradient_loss | -0.00219 | -| std | 1.09 | -| value_loss | 0.00177 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1902 | -| iterations | 82 | -| time_elapsed | 706 | -| total_timesteps | 1343488 | -| train/ | | -| approx_kl | 0.008806022 | -| clip_fraction | 0.0947 | -| clip_range | 0.2 | -| entropy_loss | -3.02 | -| explained_variance | 0.962 | -| learning_rate | 0.0003 | -| loss | -0.0426 | -| n_updates | 810 | -| policy_gradient_loss | -0.00231 | -| std | 1.1 | -| value_loss | 0.00235 | ------------------------------------------ -Eval num_timesteps=1350000, episode_reward=-24.30 +/- 32.03 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -24.3 | -| time/ | | -| total_timesteps | 1350000 | -| train/ | | -| approx_kl | 0.007263833 | -| clip_fraction | 0.0797 | -| clip_range | 0.2 | -| entropy_loss | -3.03 | -| explained_variance | 0.957 | -| learning_rate | 0.0003 | -| loss | -0.0338 | -| n_updates | 820 | -| policy_gradient_loss | -0.00251 | -| std | 1.11 | -| value_loss | 0.00397 | ------------------------------------------ - -[Diag @ 1,350,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 16/20 - COMPACT_CANT_DRIVE 4/20 - action_mag mean=0.058 p10=0.004 p90=0.054 (0=stopped, 1=full speed) - min_flock_radius mean=6.77m best=1.04m (target <5m to compact) - min_dog_to_com mean=0.58m best=0.28m (FLEE_DIST=7m) - min_com_to_pen mean=12.71m best=4.27m - reward/step (mean): progress=-0.0038 alignment=+0.0015 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1859 | -| iterations | 83 | -| time_elapsed | 731 | -| total_timesteps | 1359872 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1870 | -| iterations | 84 | -| time_elapsed | 735 | -| total_timesteps | 1376256 | -| train/ | | -| approx_kl | 0.007816839 | -| clip_fraction | 0.0812 | -| clip_range | 0.2 | -| entropy_loss | -3.05 | -| explained_variance | 0.946 | -| learning_rate | 0.0003 | -| loss | -0.0285 | -| n_updates | 830 | -| policy_gradient_loss | -0.00277 | -| std | 1.11 | -| value_loss | 0.0018 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1880 | -| iterations | 85 | -| time_elapsed | 740 | -| total_timesteps | 1392640 | -| train/ | | -| approx_kl | 0.0064534983 | -| clip_fraction | 0.0774 | -| clip_range | 0.2 | -| entropy_loss | -3.06 | -| explained_variance | 0.958 | -| learning_rate | 0.0003 | -| loss | -0.0305 | -| n_updates | 840 | -| policy_gradient_loss | -0.00158 | -| std | 1.12 | -| value_loss | 0.00988 | ------------------------------------------- -Eval num_timesteps=1400000, episode_reward=-39.10 +/- 41.08 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -39.1 | -| time/ | | -| total_timesteps | 1400000 | -| train/ | | -| approx_kl | 0.0069560152 | -| clip_fraction | 0.0835 | -| clip_range | 0.2 | -| entropy_loss | -3.07 | -| explained_variance | 0.96 | -| learning_rate | 0.0003 | -| loss | -0.0302 | -| n_updates | 850 | -| policy_gradient_loss | -0.00283 | -| std | 1.12 | -| value_loss | 0.00307 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1857 | -| iterations | 86 | -| time_elapsed | 758 | -| total_timesteps | 1409024 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1864 | -| iterations | 87 | -| time_elapsed | 764 | -| total_timesteps | 1425408 | -| train/ | | -| approx_kl | 0.007682803 | -| clip_fraction | 0.0931 | -| clip_range | 0.2 | -| entropy_loss | -3.09 | -| explained_variance | 0.902 | -| learning_rate | 0.0003 | -| loss | -0.0322 | -| n_updates | 860 | -| policy_gradient_loss | -0.00224 | -| std | 1.14 | -| value_loss | 0.013 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1869 | -| iterations | 88 | -| time_elapsed | 771 | -| total_timesteps | 1441792 | -| train/ | | -| approx_kl | 0.0063949013 | -| clip_fraction | 0.0786 | -| clip_range | 0.2 | -| entropy_loss | -3.1 | -| explained_variance | 0.953 | -| learning_rate | 0.0003 | -| loss | -0.0401 | -| n_updates | 870 | -| policy_gradient_loss | -0.00134 | -| std | 1.14 | -| value_loss | 0.00193 | ------------------------------------------- -Eval num_timesteps=1450000, episode_reward=-28.59 +/- 25.61 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -28.6 | -| time/ | | -| total_timesteps | 1450000 | -| train/ | | -| approx_kl | 0.007503539 | -| clip_fraction | 0.0774 | -| clip_range | 0.2 | -| entropy_loss | -3.13 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.0378 | -| n_updates | 880 | -| policy_gradient_loss | -0.00309 | -| std | 1.16 | -| value_loss | 0.00551 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1845 | -| iterations | 89 | -| time_elapsed | 789 | -| total_timesteps | 1458176 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1852 | -| iterations | 90 | -| time_elapsed | 796 | -| total_timesteps | 1474560 | -| train/ | | -| approx_kl | 0.0075057503 | -| clip_fraction | 0.0793 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.0439 | -| n_updates | 890 | -| policy_gradient_loss | -0.00264 | -| std | 1.17 | -| value_loss | 0.00265 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1857 | -| iterations | 91 | -| time_elapsed | 802 | -| total_timesteps | 1490944 | -| train/ | | -| approx_kl | 0.0068523246 | -| clip_fraction | 0.0755 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.935 | -| learning_rate | 0.0003 | -| loss | -0.0282 | -| n_updates | 900 | -| policy_gradient_loss | -0.00292 | -| std | 1.17 | -| value_loss | 0.00268 | ------------------------------------------- -Eval num_timesteps=1500000, episode_reward=-40.66 +/- 25.29 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -40.7 | -| time/ | | -| total_timesteps | 1500000 | -| train/ | | -| approx_kl | 0.007249858 | -| clip_fraction | 0.0857 | -| clip_range | 0.2 | -| entropy_loss | -3.15 | -| explained_variance | 0.952 | -| learning_rate | 0.0003 | -| loss | -0.0366 | -| n_updates | 910 | -| policy_gradient_loss | -0.00319 | -| std | 1.17 | -| value_loss | 0.00564 | ------------------------------------------ - -[Diag @ 1,500,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 14/20 - COMPACT_CANT_DRIVE 6/20 - action_mag mean=0.050 p10=0.005 p90=0.049 (0=stopped, 1=full speed) - min_flock_radius mean=6.53m best=0.98m (target <5m to compact) - min_dog_to_com mean=0.46m best=0.06m (FLEE_DIST=7m) - min_com_to_pen mean=12.38m best=5.44m - reward/step (mean): progress=+0.0039 alignment=+0.0011 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1819 | -| iterations | 92 | -| time_elapsed | 828 | -| total_timesteps | 1507328 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1828 | -| iterations | 93 | -| time_elapsed | 833 | -| total_timesteps | 1523712 | -| train/ | | -| approx_kl | 0.007471386 | -| clip_fraction | 0.0834 | -| clip_range | 0.2 | -| entropy_loss | -3.16 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0275 | -| n_updates | 920 | -| policy_gradient_loss | -0.00192 | -| std | 1.17 | -| value_loss | 0.00791 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1835 | -| iterations | 94 | -| time_elapsed | 838 | -| total_timesteps | 1540096 | -| train/ | | -| approx_kl | 0.007296456 | -| clip_fraction | 0.0765 | -| clip_range | 0.2 | -| entropy_loss | -3.17 | -| explained_variance | 0.95 | -| learning_rate | 0.0003 | -| loss | -0.0484 | -| n_updates | 930 | -| policy_gradient_loss | -0.00366 | -| std | 1.18 | -| value_loss | 0.00788 | ------------------------------------------ -Eval num_timesteps=1550000, episode_reward=-34.66 +/- 25.47 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -34.7 | -| time/ | | -| total_timesteps | 1550000 | -| train/ | | -| approx_kl | 0.007654687 | -| clip_fraction | 0.095 | -| clip_range | 0.2 | -| entropy_loss | -3.18 | -| explained_variance | 0.92 | -| learning_rate | 0.0003 | -| loss | -0.0386 | -| n_updates | 940 | -| policy_gradient_loss | -0.00316 | -| std | 1.19 | -| value_loss | 0.00363 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1817 | -| iterations | 95 | -| time_elapsed | 856 | -| total_timesteps | 1556480 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1823 | -| iterations | 96 | -| time_elapsed | 862 | -| total_timesteps | 1572864 | -| train/ | | -| approx_kl | 0.007030643 | -| clip_fraction | 0.0881 | -| clip_range | 0.2 | -| entropy_loss | -3.18 | -| explained_variance | 0.944 | -| learning_rate | 0.0003 | -| loss | -0.0346 | -| n_updates | 950 | -| policy_gradient_loss | -0.00321 | -| std | 1.19 | -| value_loss | 0.00208 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1828 | -| iterations | 97 | -| time_elapsed | 869 | -| total_timesteps | 1589248 | -| train/ | | -| approx_kl | 0.0071562277 | -| clip_fraction | 0.0834 | -| clip_range | 0.2 | -| entropy_loss | -3.19 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.0196 | -| n_updates | 960 | -| policy_gradient_loss | -0.00259 | -| std | 1.2 | -| value_loss | 0.00773 | ------------------------------------------- -Eval num_timesteps=1600000, episode_reward=-33.49 +/- 36.88 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -33.5 | -| time/ | | -| total_timesteps | 1600000 | -| train/ | | -| approx_kl | 0.0069667175 | -| clip_fraction | 0.0741 | -| clip_range | 0.2 | -| entropy_loss | -3.2 | -| explained_variance | 0.94 | -| learning_rate | 0.0003 | -| loss | -0.0313 | -| n_updates | 970 | -| policy_gradient_loss | -0.00399 | -| std | 1.2 | -| value_loss | 0.00419 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1810 | -| iterations | 98 | -| time_elapsed | 886 | -| total_timesteps | 1605632 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1819 | -| iterations | 99 | -| time_elapsed | 891 | -| total_timesteps | 1622016 | -| train/ | | -| approx_kl | 0.0061995042 | -| clip_fraction | 0.0767 | -| clip_range | 0.2 | -| entropy_loss | -3.21 | -| explained_variance | 0.968 | -| learning_rate | 0.0003 | -| loss | -0.036 | -| n_updates | 980 | -| policy_gradient_loss | -0.00289 | -| std | 1.2 | -| value_loss | 0.00241 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1826 | -| iterations | 100 | -| time_elapsed | 896 | -| total_timesteps | 1638400 | -| train/ | | -| approx_kl | 0.006502889 | -| clip_fraction | 0.0714 | -| clip_range | 0.2 | -| entropy_loss | -3.22 | -| explained_variance | 0.976 | -| learning_rate | 0.0003 | -| loss | -0.0445 | -| n_updates | 990 | -| policy_gradient_loss | -0.00314 | -| std | 1.21 | -| value_loss | 0.00218 | ------------------------------------------ -Eval num_timesteps=1650000, episode_reward=-38.00 +/- 30.02 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -38 | -| time/ | | -| total_timesteps | 1650000 | -| train/ | | -| approx_kl | 0.006163503 | -| clip_fraction | 0.0739 | -| clip_range | 0.2 | -| entropy_loss | -3.22 | -| explained_variance | 0.955 | -| learning_rate | 0.0003 | -| loss | -0.0391 | -| n_updates | 1000 | -| policy_gradient_loss | -0.00257 | -| std | 1.22 | -| value_loss | 0.0027 | ------------------------------------------ - -[Diag @ 1,650,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 16/20 - COMPACT_CANT_DRIVE 4/20 - action_mag mean=0.054 p10=0.002 p90=0.051 (0=stopped, 1=full speed) - min_flock_radius mean=6.63m best=3.72m (target <5m to compact) - min_dog_to_com mean=0.60m best=0.09m (FLEE_DIST=7m) - min_com_to_pen mean=13.17m best=5.44m - reward/step (mean): progress=+0.0032 alignment=+0.0015 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1793 | -| iterations | 101 | -| time_elapsed | 922 | -| total_timesteps | 1654784 | --------------------------------- ----------------------------------------- -| time/ | | -| fps | 1800 | -| iterations | 102 | -| time_elapsed | 927 | -| total_timesteps | 1671168 | -| train/ | | -| approx_kl | 0.00634938 | -| clip_fraction | 0.073 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.97 | -| learning_rate | 0.0003 | -| loss | -0.0462 | -| n_updates | 1010 | -| policy_gradient_loss | -0.00394 | -| std | 1.22 | -| value_loss | 0.00334 | ----------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1807 | -| iterations | 103 | -| time_elapsed | 933 | -| total_timesteps | 1687552 | -| train/ | | -| approx_kl | 0.0072235917 | -| clip_fraction | 0.0774 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.957 | -| learning_rate | 0.0003 | -| loss | -0.0284 | -| n_updates | 1020 | -| policy_gradient_loss | -0.00292 | -| std | 1.22 | -| value_loss | 0.00807 | ------------------------------------------- -Eval num_timesteps=1700000, episode_reward=-32.26 +/- 31.96 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -32.3 | -| time/ | | -| total_timesteps | 1700000 | -| train/ | | -| approx_kl | 0.0060304543 | -| clip_fraction | 0.0721 | -| clip_range | 0.2 | -| entropy_loss | -3.23 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0427 | -| n_updates | 1030 | -| policy_gradient_loss | -0.00306 | -| std | 1.21 | -| value_loss | 0.00208 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1789 | -| iterations | 104 | -| time_elapsed | 952 | -| total_timesteps | 1703936 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1795 | -| iterations | 105 | -| time_elapsed | 958 | -| total_timesteps | 1720320 | -| train/ | | -| approx_kl | 0.006440907 | -| clip_fraction | 0.0642 | -| clip_range | 0.2 | -| entropy_loss | -3.22 | -| explained_variance | 0.947 | -| learning_rate | 0.0003 | -| loss | -0.0317 | -| n_updates | 1040 | -| policy_gradient_loss | -0.00158 | -| std | 1.21 | -| value_loss | 0.00165 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1801 | -| iterations | 106 | -| time_elapsed | 963 | -| total_timesteps | 1736704 | -| train/ | | -| approx_kl | 0.006897255 | -| clip_fraction | 0.0738 | -| clip_range | 0.2 | -| entropy_loss | -3.2 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0408 | -| n_updates | 1050 | -| policy_gradient_loss | -0.00349 | -| std | 1.19 | -| value_loss | 0.00814 | ------------------------------------------ -Eval num_timesteps=1750000, episode_reward=-40.58 +/- 28.91 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -40.6 | -| time/ | | -| total_timesteps | 1750000 | -| train/ | | -| approx_kl | 0.0070952754 | -| clip_fraction | 0.0742 | -| clip_range | 0.2 | -| entropy_loss | -3.19 | -| explained_variance | 0.957 | -| learning_rate | 0.0003 | -| loss | -0.0308 | -| n_updates | 1060 | -| policy_gradient_loss | -0.0037 | -| std | 1.19 | -| value_loss | 0.0191 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1784 | -| iterations | 107 | -| time_elapsed | 982 | -| total_timesteps | 1753088 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1791 | -| iterations | 108 | -| time_elapsed | 987 | -| total_timesteps | 1769472 | -| train/ | | -| approx_kl | 0.006444447 | -| clip_fraction | 0.0736 | -| clip_range | 0.2 | -| entropy_loss | -3.2 | -| explained_variance | 0.968 | -| learning_rate | 0.0003 | -| loss | -0.0362 | -| n_updates | 1070 | -| policy_gradient_loss | -0.00409 | -| std | 1.2 | -| value_loss | 0.00395 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1797 | -| iterations | 109 | -| time_elapsed | 993 | -| total_timesteps | 1785856 | -| train/ | | -| approx_kl | 0.007391736 | -| clip_fraction | 0.0758 | -| clip_range | 0.2 | -| entropy_loss | -3.22 | -| explained_variance | 0.96 | -| learning_rate | 0.0003 | -| loss | -0.0341 | -| n_updates | 1080 | -| policy_gradient_loss | -0.00272 | -| std | 1.21 | -| value_loss | 0.00221 | ------------------------------------------ -Eval num_timesteps=1800000, episode_reward=-29.06 +/- 30.98 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -29.1 | -| time/ | | -| total_timesteps | 1800000 | -| train/ | | -| approx_kl | 0.006899439 | -| clip_fraction | 0.0695 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.965 | -| learning_rate | 0.0003 | -| loss | -0.0317 | -| n_updates | 1090 | -| policy_gradient_loss | -0.00226 | -| std | 1.23 | -| value_loss | 0.00615 | ------------------------------------------ - -[Diag @ 1,800,000 | n_sheep=3 | success=0%] - NEVER_COMPACT 11/20 - COMPACT_CANT_DRIVE 9/20 - action_mag mean=0.054 p10=0.003 p90=0.057 (0=stopped, 1=full speed) - min_flock_radius mean=6.01m best=1.13m (target <5m to compact) - min_dog_to_com mean=0.51m best=0.11m (FLEE_DIST=7m) - min_com_to_pen mean=12.52m best=3.21m - reward/step (mean): progress=+0.0050 alignment=+0.0017 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0000 - -[Curriculum] leaving stage n_sheep=3 after 600,000 steps | training success rate (last 100 eps) = 0% -[Curriculum] → 4 sheep at step 1,800,000 - --------------------------------- -| time/ | | -| fps | 1769 | -| iterations | 110 | -| time_elapsed | 1018 | -| total_timesteps | 1802240 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1776 | -| iterations | 111 | -| time_elapsed | 1023 | -| total_timesteps | 1818624 | -| train/ | | -| approx_kl | 0.006710761 | -| clip_fraction | 0.0761 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.867 | -| learning_rate | 0.0003 | -| loss | -0.031 | -| n_updates | 1100 | -| policy_gradient_loss | -0.00311 | -| std | 1.23 | -| value_loss | 0.0186 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1783 | -| iterations | 112 | -| time_elapsed | 1028 | -| total_timesteps | 1835008 | -| train/ | | -| approx_kl | 0.006202608 | -| clip_fraction | 0.0682 | -| clip_range | 0.2 | -| entropy_loss | -3.25 | -| explained_variance | 0.954 | -| learning_rate | 0.0003 | -| loss | -0.0245 | -| n_updates | 1110 | -| policy_gradient_loss | -0.00429 | -| std | 1.23 | -| value_loss | 0.00641 | ------------------------------------------ -Eval num_timesteps=1850000, episode_reward=-35.87 +/- 42.36 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -35.9 | -| time/ | | -| total_timesteps | 1850000 | -| train/ | | -| approx_kl | 0.008398036 | -| clip_fraction | 0.086 | -| clip_range | 0.2 | -| entropy_loss | -3.28 | -| explained_variance | 0.938 | -| learning_rate | 0.0003 | -| loss | -0.0514 | -| n_updates | 1120 | -| policy_gradient_loss | -0.00497 | -| std | 1.25 | -| value_loss | 0.00614 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1768 | -| iterations | 113 | -| time_elapsed | 1046 | -| total_timesteps | 1851392 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1775 | -| iterations | 114 | -| time_elapsed | 1052 | -| total_timesteps | 1867776 | -| train/ | | -| approx_kl | 0.007641702 | -| clip_fraction | 0.0742 | -| clip_range | 0.2 | -| entropy_loss | -3.31 | -| explained_variance | 0.935 | -| learning_rate | 0.0003 | -| loss | -0.046 | -| n_updates | 1130 | -| policy_gradient_loss | -0.00349 | -| std | 1.28 | -| value_loss | 0.0228 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1781 | -| iterations | 115 | -| time_elapsed | 1057 | -| total_timesteps | 1884160 | -| train/ | | -| approx_kl | 0.0073437546 | -| clip_fraction | 0.0747 | -| clip_range | 0.2 | -| entropy_loss | -3.34 | -| explained_variance | 0.928 | -| learning_rate | 0.0003 | -| loss | -0.0498 | -| n_updates | 1140 | -| policy_gradient_loss | -0.00496 | -| std | 1.29 | -| value_loss | 0.00764 | ------------------------------------------- -Eval num_timesteps=1900000, episode_reward=-41.88 +/- 27.01 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -41.9 | -| time/ | | -| total_timesteps | 1900000 | -| train/ | | -| approx_kl | 0.006885264 | -| clip_fraction | 0.0728 | -| clip_range | 0.2 | -| entropy_loss | -3.36 | -| explained_variance | 0.934 | -| learning_rate | 0.0003 | -| loss | -0.0503 | -| n_updates | 1150 | -| policy_gradient_loss | -0.00384 | -| std | 1.3 | -| value_loss | 0.00423 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1767 | -| iterations | 116 | -| time_elapsed | 1075 | -| total_timesteps | 1900544 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1773 | -| iterations | 117 | -| time_elapsed | 1080 | -| total_timesteps | 1916928 | -| train/ | | -| approx_kl | 0.0077611385 | -| clip_fraction | 0.0792 | -| clip_range | 0.2 | -| entropy_loss | -3.38 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0374 | -| n_updates | 1160 | -| policy_gradient_loss | -0.00399 | -| std | 1.31 | -| value_loss | 0.00292 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1780 | -| iterations | 118 | -| time_elapsed | 1085 | -| total_timesteps | 1933312 | -| train/ | | -| approx_kl | 0.006831214 | -| clip_fraction | 0.0758 | -| clip_range | 0.2 | -| entropy_loss | -3.4 | -| explained_variance | 0.963 | -| learning_rate | 0.0003 | -| loss | -0.0175 | -| n_updates | 1170 | -| policy_gradient_loss | -0.00471 | -| std | 1.33 | -| value_loss | 0.00235 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1786 | -| iterations | 119 | -| time_elapsed | 1091 | -| total_timesteps | 1949696 | -| train/ | | -| approx_kl | 0.006474304 | -| clip_fraction | 0.0666 | -| clip_range | 0.2 | -| entropy_loss | -3.43 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0318 | -| n_updates | 1180 | -| policy_gradient_loss | -0.00285 | -| std | 1.35 | -| value_loss | 0.00699 | ------------------------------------------ -Eval num_timesteps=1950000, episode_reward=-35.80 +/- 28.95 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -35.8 | -| time/ | | -| total_timesteps | 1950000 | -| train/ | | -| approx_kl | 0.008532442 | -| clip_fraction | 0.0746 | -| clip_range | 0.2 | -| entropy_loss | -3.43 | -| explained_variance | 0.958 | -| learning_rate | 0.0003 | -| loss | -0.00337 | -| n_updates | 1190 | -| policy_gradient_loss | -0.00376 | -| std | 1.34 | -| value_loss | 0.0156 | ------------------------------------------ - -[Diag @ 1,950,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 19/20 - COMPACT_CANT_DRIVE 1/20 - action_mag mean=0.049 p10=0.007 p90=0.044 (0=stopped, 1=full speed) - min_flock_radius mean=8.95m best=4.96m (target <5m to compact) - min_dog_to_com mean=0.39m best=0.07m (FLEE_DIST=7m) - min_com_to_pen mean=14.18m best=9.30m - reward/step (mean): progress=-0.0121 alignment=+0.0010 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1759 | -| iterations | 120 | -| time_elapsed | 1117 | -| total_timesteps | 1966080 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1766 | -| iterations | 121 | -| time_elapsed | 1122 | -| total_timesteps | 1982464 | -| train/ | | -| approx_kl | 0.006549825 | -| clip_fraction | 0.0665 | -| clip_range | 0.2 | -| entropy_loss | -3.43 | -| explained_variance | 0.966 | -| learning_rate | 0.0003 | -| loss | -0.0345 | -| n_updates | 1200 | -| policy_gradient_loss | -0.00349 | -| std | 1.34 | -| value_loss | 0.00315 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1773 | -| iterations | 122 | -| time_elapsed | 1127 | -| total_timesteps | 1998848 | -| train/ | | -| approx_kl | 0.0062008686 | -| clip_fraction | 0.0699 | -| clip_range | 0.2 | -| entropy_loss | -3.44 | -| explained_variance | 0.959 | -| learning_rate | 0.0003 | -| loss | -0.0512 | -| n_updates | 1210 | -| policy_gradient_loss | -0.00291 | -| std | 1.35 | -| value_loss | 0.00544 | ------------------------------------------- -Eval num_timesteps=2000000, episode_reward=-45.28 +/- 26.78 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -45.3 | -| time/ | | -| total_timesteps | 2000000 | -| train/ | | -| approx_kl | 0.006553275 | -| clip_fraction | 0.0739 | -| clip_range | 0.2 | -| entropy_loss | -3.45 | -| explained_variance | 0.924 | -| learning_rate | 0.0003 | -| loss | -0.0416 | -| n_updates | 1220 | -| policy_gradient_loss | -0.00427 | -| std | 1.36 | -| value_loss | 0.0127 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1761 | -| iterations | 123 | -| time_elapsed | 1144 | -| total_timesteps | 2015232 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1767 | -| iterations | 124 | -| time_elapsed | 1149 | -| total_timesteps | 2031616 | -| train/ | | -| approx_kl | 0.0059226304 | -| clip_fraction | 0.0653 | -| clip_range | 0.2 | -| entropy_loss | -3.46 | -| explained_variance | 0.947 | -| learning_rate | 0.0003 | -| loss | -0.025 | -| n_updates | 1230 | -| policy_gradient_loss | -0.00273 | -| std | 1.36 | -| value_loss | 0.00879 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1775 | -| iterations | 125 | -| time_elapsed | 1153 | -| total_timesteps | 2048000 | -| train/ | | -| approx_kl | 0.0076779695 | -| clip_fraction | 0.0729 | -| clip_range | 0.2 | -| entropy_loss | -3.47 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0382 | -| n_updates | 1240 | -| policy_gradient_loss | -0.00385 | -| std | 1.37 | -| value_loss | 0.00692 | ------------------------------------------- -Eval num_timesteps=2050000, episode_reward=-44.22 +/- 28.52 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -44.2 | -| time/ | | -| total_timesteps | 2050000 | -| train/ | | -| approx_kl | 0.0073502595 | -| clip_fraction | 0.0822 | -| clip_range | 0.2 | -| entropy_loss | -3.49 | -| explained_variance | 0.946 | -| learning_rate | 0.0003 | -| loss | -0.0342 | -| n_updates | 1250 | -| policy_gradient_loss | -0.00592 | -| std | 1.39 | -| value_loss | 0.00555 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1764 | -| iterations | 126 | -| time_elapsed | 1170 | -| total_timesteps | 2064384 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1770 | -| iterations | 127 | -| time_elapsed | 1175 | -| total_timesteps | 2080768 | -| train/ | | -| approx_kl | 0.006628736 | -| clip_fraction | 0.0767 | -| clip_range | 0.2 | -| entropy_loss | -3.51 | -| explained_variance | 0.95 | -| learning_rate | 0.0003 | -| loss | -0.035 | -| n_updates | 1260 | -| policy_gradient_loss | -0.00457 | -| std | 1.4 | -| value_loss | 0.00416 | ------------------------------------------ ------------------------------------------- -| time/ | | -| fps | 1776 | -| iterations | 128 | -| time_elapsed | 1180 | -| total_timesteps | 2097152 | -| train/ | | -| approx_kl | 0.0068027405 | -| clip_fraction | 0.0719 | -| clip_range | 0.2 | -| entropy_loss | -3.53 | -| explained_variance | 0.891 | -| learning_rate | 0.0003 | -| loss | -0.0391 | -| n_updates | 1270 | -| policy_gradient_loss | -0.00312 | -| std | 1.42 | -| value_loss | 0.00492 | ------------------------------------------- -Eval num_timesteps=2100000, episode_reward=-39.37 +/- 34.76 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -39.4 | -| time/ | | -| total_timesteps | 2100000 | -| train/ | | -| approx_kl | 0.005523986 | -| clip_fraction | 0.0604 | -| clip_range | 0.2 | -| entropy_loss | -3.54 | -| explained_variance | 0.938 | -| learning_rate | 0.0003 | -| loss | -0.0364 | -| n_updates | 1280 | -| policy_gradient_loss | -0.00281 | -| std | 1.42 | -| value_loss | 0.015 | ------------------------------------------ - -[Diag @ 2,100,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 20/20 - action_mag mean=0.047 p10=0.002 p90=0.041 (0=stopped, 1=full speed) - min_flock_radius mean=8.62m best=5.89m (target <5m to compact) - min_dog_to_com mean=0.46m best=0.04m (FLEE_DIST=7m) - min_com_to_pen mean=14.19m best=7.53m - reward/step (mean): progress=-0.0012 alignment=+0.0012 pen_bonus=+0.0010 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1751 | -| iterations | 129 | -| time_elapsed | 1206 | -| total_timesteps | 2113536 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1756 | -| iterations | 130 | -| time_elapsed | 1212 | -| total_timesteps | 2129920 | -| train/ | | -| approx_kl | 0.007766474 | -| clip_fraction | 0.0823 | -| clip_range | 0.2 | -| entropy_loss | -3.53 | -| explained_variance | 0.96 | -| learning_rate | 0.0003 | -| loss | -0.0396 | -| n_updates | 1290 | -| policy_gradient_loss | -0.00492 | -| std | 1.41 | -| value_loss | 0.00554 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1762 | -| iterations | 131 | -| time_elapsed | 1217 | -| total_timesteps | 2146304 | -| train/ | | -| approx_kl | 0.006704482 | -| clip_fraction | 0.0748 | -| clip_range | 0.2 | -| entropy_loss | -3.53 | -| explained_variance | 0.97 | -| learning_rate | 0.0003 | -| loss | -0.0466 | -| n_updates | 1300 | -| policy_gradient_loss | -0.00339 | -| std | 1.42 | -| value_loss | 0.00432 | ------------------------------------------ -Eval num_timesteps=2150000, episode_reward=-43.17 +/- 26.95 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -43.2 | -| time/ | | -| total_timesteps | 2150000 | -| train/ | | -| approx_kl | 0.0065447316 | -| clip_fraction | 0.0751 | -| clip_range | 0.2 | -| entropy_loss | -3.53 | -| explained_variance | 0.888 | -| learning_rate | 0.0003 | -| loss | -0.0369 | -| n_updates | 1310 | -| policy_gradient_loss | -0.00369 | -| std | 1.41 | -| value_loss | 0.0165 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1750 | -| iterations | 132 | -| time_elapsed | 1235 | -| total_timesteps | 2162688 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1755 | -| iterations | 133 | -| time_elapsed | 1241 | -| total_timesteps | 2179072 | -| train/ | | -| approx_kl | 0.0070872563 | -| clip_fraction | 0.075 | -| clip_range | 0.2 | -| entropy_loss | -3.54 | -| explained_variance | 0.954 | -| learning_rate | 0.0003 | -| loss | -0.0427 | -| n_updates | 1320 | -| policy_gradient_loss | -0.00406 | -| std | 1.42 | -| value_loss | 0.00977 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1762 | -| iterations | 134 | -| time_elapsed | 1245 | -| total_timesteps | 2195456 | -| train/ | | -| approx_kl | 0.0073371828 | -| clip_fraction | 0.077 | -| clip_range | 0.2 | -| entropy_loss | -3.55 | -| explained_variance | 0.939 | -| learning_rate | 0.0003 | -| loss | -0.0303 | -| n_updates | 1330 | -| policy_gradient_loss | -0.00371 | -| std | 1.43 | -| value_loss | 0.00862 | ------------------------------------------- -Eval num_timesteps=2200000, episode_reward=-40.81 +/- 44.39 -Episode length: 2000.00 +/- 0.00 ------------------------------------------- -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -40.8 | -| time/ | | -| total_timesteps | 2200000 | -| train/ | | -| approx_kl | 0.0072064474 | -| clip_fraction | 0.0714 | -| clip_range | 0.2 | -| entropy_loss | -3.58 | -| explained_variance | 0.951 | -| learning_rate | 0.0003 | -| loss | -0.0517 | -| n_updates | 1340 | -| policy_gradient_loss | -0.00405 | -| std | 1.45 | -| value_loss | 0.00351 | ------------------------------------------- --------------------------------- -| time/ | | -| fps | 1751 | -| iterations | 135 | -| time_elapsed | 1262 | -| total_timesteps | 2211840 | --------------------------------- ------------------------------------------ -| time/ | | -| fps | 1758 | -| iterations | 136 | -| time_elapsed | 1267 | -| total_timesteps | 2228224 | -| train/ | | -| approx_kl | 0.008551812 | -| clip_fraction | 0.0911 | -| clip_range | 0.2 | -| entropy_loss | -3.58 | -| explained_variance | 0.929 | -| learning_rate | 0.0003 | -| loss | -0.0258 | -| n_updates | 1350 | -| policy_gradient_loss | -0.00599 | -| std | 1.45 | -| value_loss | 0.0034 | ------------------------------------------ ------------------------------------------ -| time/ | | -| fps | 1764 | -| iterations | 137 | -| time_elapsed | 1271 | -| total_timesteps | 2244608 | -| train/ | | -| approx_kl | 0.006960677 | -| clip_fraction | 0.0702 | -| clip_range | 0.2 | -| entropy_loss | -3.59 | -| explained_variance | 0.9 | -| learning_rate | 0.0003 | -| loss | -0.0396 | -| n_updates | 1360 | -| policy_gradient_loss | -0.00412 | -| std | 1.46 | -| value_loss | 0.00429 | ------------------------------------------ -Eval num_timesteps=2250000, episode_reward=-37.92 +/- 31.68 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -37.9 | -| time/ | | -| total_timesteps | 2250000 | -| train/ | | -| approx_kl | 0.005949891 | -| clip_fraction | 0.0683 | -| clip_range | 0.2 | -| entropy_loss | -3.59 | -| explained_variance | 0.948 | -| learning_rate | 0.0003 | -| loss | -0.0381 | -| n_updates | 1370 | -| policy_gradient_loss | -0.00328 | -| std | 1.46 | -| value_loss | 0.0113 | ------------------------------------------ - -[Diag @ 2,250,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 19/20 - COMPACT_CANT_DRIVE 1/20 - action_mag mean=0.068 p10=0.004 p90=0.045 (0=stopped, 1=full speed) - min_flock_radius mean=7.87m best=3.57m (target <5m to compact) - min_dog_to_com mean=0.45m best=0.15m (FLEE_DIST=7m) - min_com_to_pen mean=14.06m best=6.95m - reward/step (mean): progress=-0.0035 alignment=+0.0020 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1743 | -| iterations | 138 | -| time_elapsed | 1297 | -| total_timesteps | 2260992 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1749 | -| iterations | 139 | -| time_elapsed | 1301 | -| total_timesteps | 2277376 | -| train/ | | -| approx_kl | 0.0071727796 | -| clip_fraction | 0.0784 | -| clip_range | 0.2 | -| entropy_loss | -3.6 | -| explained_variance | 0.943 | -| learning_rate | 0.0003 | -| loss | -0.0387 | -| n_updates | 1380 | -| policy_gradient_loss | -0.0042 | -| std | 1.46 | -| value_loss | 0.0113 | ------------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1755 | -| iterations | 140 | -| time_elapsed | 1306 | -| total_timesteps | 2293760 | -| train/ | | -| approx_kl | 0.006800391 | -| clip_fraction | 0.0662 | -| clip_range | 0.2 | -| entropy_loss | -3.59 | -| explained_variance | 0.931 | -| learning_rate | 0.0003 | -| loss | -0.0283 | -| n_updates | 1390 | -| policy_gradient_loss | -0.00421 | -| std | 1.46 | -| value_loss | 0.00659 | ------------------------------------------ -Eval num_timesteps=2300000, episode_reward=-47.47 +/- 37.24 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -47.5 | -| time/ | | -| total_timesteps | 2300000 | -| train/ | | -| approx_kl | 0.008103053 | -| clip_fraction | 0.081 | -| clip_range | 0.2 | -| entropy_loss | -3.59 | -| explained_variance | 0.945 | -| learning_rate | 0.0003 | -| loss | -0.0433 | -| n_updates | 1400 | -| policy_gradient_loss | -0.00404 | -| std | 1.46 | -| value_loss | 0.00796 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1745 | -| iterations | 141 | -| time_elapsed | 1323 | -| total_timesteps | 2310144 | --------------------------------- ------------------------------------------- -| time/ | | -| fps | 1751 | -| iterations | 142 | -| time_elapsed | 1328 | -| total_timesteps | 2326528 | -| train/ | | -| approx_kl | 0.0061590094 | -| clip_fraction | 0.066 | -| clip_range | 0.2 | -| entropy_loss | -3.61 | -| explained_variance | 0.957 | -| learning_rate | 0.0003 | -| loss | -0.0436 | -| n_updates | 1410 | -| policy_gradient_loss | -0.00287 | -| std | 1.47 | -| value_loss | 0.0102 | ------------------------------------------- ------------------------------------------- -| time/ | | -| fps | 1757 | -| iterations | 143 | -| time_elapsed | 1332 | -| total_timesteps | 2342912 | -| train/ | | -| approx_kl | 0.0070403973 | -| clip_fraction | 0.0733 | -| clip_range | 0.2 | -| entropy_loss | -3.62 | -| explained_variance | 0.863 | -| learning_rate | 0.0003 | -| loss | -0.0356 | -| n_updates | 1420 | -| policy_gradient_loss | -0.00525 | -| std | 1.48 | -| value_loss | 0.0103 | ------------------------------------------- -Eval num_timesteps=2350000, episode_reward=-47.95 +/- 27.60 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -48 | -| time/ | | -| total_timesteps | 2350000 | -| train/ | | -| approx_kl | 0.007505033 | -| clip_fraction | 0.0729 | -| clip_range | 0.2 | -| entropy_loss | -3.64 | -| explained_variance | 0.94 | -| learning_rate | 0.0003 | -| loss | -0.0473 | -| n_updates | 1430 | -| policy_gradient_loss | -0.00385 | -| std | 1.5 | -| value_loss | 0.00449 | ------------------------------------------ --------------------------------- -| time/ | | -| fps | 1747 | -| iterations | 144 | -| time_elapsed | 1350 | -| total_timesteps | 2359296 | --------------------------------- ----------------------------------------- -| time/ | | -| fps | 1752 | -| iterations | 145 | -| time_elapsed | 1355 | -| total_timesteps | 2375680 | -| train/ | | -| approx_kl | 0.00724002 | -| clip_fraction | 0.0739 | -| clip_range | 0.2 | -| entropy_loss | -3.65 | -| explained_variance | 0.948 | -| learning_rate | 0.0003 | -| loss | -0.0419 | -| n_updates | 1440 | -| policy_gradient_loss | -0.00426 | -| std | 1.5 | -| value_loss | 0.00886 | ----------------------------------------- ------------------------------------------ -| time/ | | -| fps | 1758 | -| iterations | 146 | -| time_elapsed | 1360 | -| total_timesteps | 2392064 | -| train/ | | -| approx_kl | 0.007578165 | -| clip_fraction | 0.0713 | -| clip_range | 0.2 | -| entropy_loss | -3.64 | -| explained_variance | 0.859 | -| learning_rate | 0.0003 | -| loss | -0.0427 | -| n_updates | 1450 | -| policy_gradient_loss | -0.0049 | -| std | 1.49 | -| value_loss | 0.00429 | ------------------------------------------ -Eval num_timesteps=2400000, episode_reward=-47.88 +/- 34.39 -Episode length: 2000.00 +/- 0.00 ------------------------------------------ -| eval/ | | -| mean_ep_length | 2e+03 | -| mean_reward | -47.9 | -| time/ | | -| total_timesteps | 2400000 | -| train/ | | -| approx_kl | 0.006707498 | -| clip_fraction | 0.0692 | -| clip_range | 0.2 | -| entropy_loss | -3.65 | -| explained_variance | 0.861 | -| learning_rate | 0.0003 | -| loss | -0.0426 | -| n_updates | 1460 | -| policy_gradient_loss | -0.00411 | -| std | 1.5 | -| value_loss | 0.00639 | ------------------------------------------ - -[Diag @ 2,400,000 | n_sheep=4 | success=0%] - NEVER_COMPACT 19/20 - COMPACT_CANT_DRIVE 1/20 - action_mag mean=0.052 p10=0.005 p90=0.045 (0=stopped, 1=full speed) - min_flock_radius mean=8.79m best=3.32m (target <5m to compact) - min_dog_to_com mean=0.45m best=0.20m (FLEE_DIST=7m) - min_com_to_pen mean=13.96m best=9.02m - reward/step (mean): progress=-0.0047 alignment=+0.0013 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 --------------------------------- -| time/ | | -| fps | 1737 | -| iterations | 147 | -| time_elapsed | 1386 | -| total_timesteps | 2408448 | --------------------------------- - -Training complete. Artefacts saved to runs/ppo_fix_check2/ diff --git a/training/runs/ppo_fix_check2/best_model/best_model.zip b/training/runs/ppo_fix_check2/best_model/best_model.zip deleted file mode 100644 index b07d85b..0000000 Binary files a/training/runs/ppo_fix_check2/best_model/best_model.zip and /dev/null differ diff --git a/training/runs/ppo_fix_check2/evaluations.npz b/training/runs/ppo_fix_check2/evaluations.npz deleted file mode 100644 index cc6f67e..0000000 Binary files a/training/runs/ppo_fix_check2/evaluations.npz and /dev/null differ diff --git a/training/runs/ppo_fix_check2/final_model.zip b/training/runs/ppo_fix_check2/final_model.zip deleted file mode 100644 index ac482b3..0000000 Binary files a/training/runs/ppo_fix_check2/final_model.zip and /dev/null differ diff --git a/training/runs/ppo_fix_check2/vecnorm.pkl b/training/runs/ppo_fix_check2/vecnorm.pkl deleted file mode 100644 index 20a640e..0000000 Binary files a/training/runs/ppo_fix_check2/vecnorm.pkl and /dev/null differ diff --git a/training/runs/ppo_v2/best_model/best_model.zip b/training/runs/ppo_v2/best_model/best_model.zip deleted file mode 100644 index 9d6f244..0000000 Binary files a/training/runs/ppo_v2/best_model/best_model.zip and /dev/null differ diff --git a/training/runs/ppo_v2/evaluations.npz b/training/runs/ppo_v2/evaluations.npz deleted file mode 100644 index 5f2a578..0000000 Binary files a/training/runs/ppo_v2/evaluations.npz and /dev/null differ diff --git a/training/runs/ppo_v2/final_model.zip b/training/runs/ppo_v2/final_model.zip deleted file mode 100644 index 49e3adf..0000000 Binary files a/training/runs/ppo_v2/final_model.zip and /dev/null differ diff --git a/training/runs/ppo_v2/vecnorm.pkl b/training/runs/ppo_v2/vecnorm.pkl deleted file mode 100644 index 76f9df6..0000000 Binary files a/training/runs/ppo_v2/vecnorm.pkl and /dev/null differ diff --git a/training/runs/ppo_v2_cont/best_model/best_model.zip b/training/runs/ppo_v2_cont/best_model/best_model.zip deleted file mode 100644 index fa1a968..0000000 Binary files a/training/runs/ppo_v2_cont/best_model/best_model.zip and /dev/null differ diff --git a/training/runs/ppo_v2_cont/evaluations.npz b/training/runs/ppo_v2_cont/evaluations.npz deleted file mode 100644 index 634b804..0000000 Binary files a/training/runs/ppo_v2_cont/evaluations.npz and /dev/null differ diff --git a/training/runs/ppo_v2_cont/final_model.zip b/training/runs/ppo_v2_cont/final_model.zip deleted file mode 100644 index 49a0296..0000000 Binary files a/training/runs/ppo_v2_cont/final_model.zip and /dev/null differ diff --git a/training/runs/ppo_v2_cont/vecnorm.pkl b/training/runs/ppo_v2_cont/vecnorm.pkl deleted file mode 100644 index d7b6d54..0000000 Binary files a/training/runs/ppo_v2_cont/vecnorm.pkl and /dev/null differ diff --git a/training/runs/ppo_v3/best_model/best_model.zip b/training/runs/ppo_v3/best_model/best_model.zip deleted file mode 100644 index 82d0259..0000000 Binary files a/training/runs/ppo_v3/best_model/best_model.zip and /dev/null differ diff --git a/training/runs/ppo_v3/evaluations.npz b/training/runs/ppo_v3/evaluations.npz deleted file mode 100644 index 1d5ee82..0000000 Binary files a/training/runs/ppo_v3/evaluations.npz and /dev/null differ diff --git a/training/runs/ppo_v3/final_model.zip b/training/runs/ppo_v3/final_model.zip deleted file mode 100644 index ce84843..0000000 Binary files a/training/runs/ppo_v3/final_model.zip and /dev/null differ diff --git a/training/runs/ppo_v3/vecnorm.pkl b/training/runs/ppo_v3/vecnorm.pkl deleted file mode 100644 index 4729c11..0000000 Binary files a/training/runs/ppo_v3/vecnorm.pkl and /dev/null differ diff --git a/training/runs/replay_20260425_152857/config.json b/training/runs/replay_20260425_152857/config.json deleted file mode 100644 index b2d15fe..0000000 --- a/training/runs/replay_20260425_152857/config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "W_PER_SHEEP": 1.0, - "W_ALIGN": 0.0, - "W_PEN_BONUS": 5.0, - "W_STEP_COST": 0.02, - "W_COMPLETE": 200.0, - "W_COMPACT": 1.5, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": false, - "ent_coef": 0.02 -} \ No newline at end of file diff --git a/training/runs/replay_20260425_152857/final_model.zip b/training/runs/replay_20260425_152857/final_model.zip deleted file mode 100644 index b326e4c..0000000 Binary files a/training/runs/replay_20260425_152857/final_model.zip and /dev/null differ diff --git a/training/runs/replay_20260425_152857/stage_results.json b/training/runs/replay_20260425_152857/stage_results.json deleted file mode 100644 index c4e1ec0..0000000 --- a/training/runs/replay_20260425_152857/stage_results.json +++ /dev/null @@ -1,23 +0,0 @@ -[ - { - "n_sheep": 1, - "sr": 1.0, - "mean_len": 267.6333333333333, - "mean_min_pen": 3.7235233147939044, - "mean_act": 0.3746675180125346 - }, - { - "n_sheep": 2, - "sr": 0.06666666666666667, - "mean_len": 1458.6666666666667, - "mean_min_pen": 14.14484707514445, - "mean_act": 0.284232099657656 - }, - { - "n_sheep": 3, - "sr": 0.0, - "mean_len": 1500.0, - "mean_min_pen": 12.514182837804158, - "mean_act": 1.2590703022670828 - } -] \ No newline at end of file diff --git a/training/runs/replay_20260425_152857/vecnorm.pkl b/training/runs/replay_20260425_152857/vecnorm.pkl deleted file mode 100644 index 0a57434..0000000 Binary files a/training/runs/replay_20260425_152857/vecnorm.pkl and /dev/null differ diff --git a/training/runs/replay_best.log b/training/runs/replay_best.log deleted file mode 100644 index 5fd9f21..0000000 --- a/training/runs/replay_best.log +++ /dev/null @@ -1,72 +0,0 @@ -Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02} -Run dir: runs/replay_20260425_152857 -Curriculum: 1 → 3 sheep, 1,500,000 steps/stage - -[Stage n_sheep=1] training 1,500,000 steps - ... [trial 1 | 1 sheep | 100,000 steps | ret(last 50)=-20.83 sr=6%] - ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-21.40 sr=4%] - ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=-22.31 sr=0%] - ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=-19.13 sr=4%] - ... [trial 1 | 1 sheep | 500,000 steps | ret(last 50)=-18.79 sr=8%] - ... [trial 1 | 1 sheep | 600,000 steps | ret(last 50)=-10.15 sr=8%] - ... [trial 1 | 1 sheep | 700,000 steps | ret(last 50)=+10.14 sr=82%] - ... [trial 1 | 1 sheep | 800,000 steps | ret(last 50)=+11.90 sr=100%] - ... [trial 1 | 1 sheep | 900,000 steps | ret(last 50)=+11.32 sr=100%] - ... [trial 1 | 1 sheep | 1,000,000 steps | ret(last 50)=+11.36 sr=100%] - ... [trial 1 | 1 sheep | 1,100,000 steps | ret(last 50)=+11.18 sr=100%] - ... [trial 1 | 1 sheep | 1,200,000 steps | ret(last 50)=+11.08 sr=100%] - ... [trial 1 | 1 sheep | 1,300,000 steps | ret(last 50)=+11.14 sr=100%] - ... [trial 1 | 1 sheep | 1,400,000 steps | ret(last 50)=+11.10 sr=100%] - ... [trial 1 | 1 sheep | 1,500,000 steps | ret(last 50)=+10.99 sr=100%] -[Stage n_sheep=1] evaluating 30 eps -[Stage n_sheep=1] sr=100% mean_len=268 mean_min_pen=3.7m mean_act=0.37 - -[Stage n_sheep=2] training 1,500,000 steps - ... [trial 1 | 2 sheep | 1,507,336 steps | ret(last 0)=+nan sr=nan%] - ... [trial 1 | 2 sheep | 1,607,336 steps | ret(last 50)=-3.10 sr=2%] - ... [trial 1 | 2 sheep | 1,707,336 steps | ret(last 50)=-3.41 sr=2%] - ... [trial 1 | 2 sheep | 1,807,336 steps | ret(last 50)=-3.11 sr=6%] - ... [trial 1 | 2 sheep | 1,907,336 steps | ret(last 50)=-2.65 sr=8%] - ... [trial 1 | 2 sheep | 2,007,336 steps | ret(last 50)=-4.11 sr=2%] - ... [trial 1 | 2 sheep | 2,107,336 steps | ret(last 50)=-3.19 sr=6%] - ... [trial 1 | 2 sheep | 2,207,336 steps | ret(last 50)=-3.45 sr=4%] - ... [trial 1 | 2 sheep | 2,307,336 steps | ret(last 50)=-4.13 sr=0%] - ... [trial 1 | 2 sheep | 2,407,336 steps | ret(last 50)=-3.47 sr=8%] - ... [trial 1 | 2 sheep | 2,507,336 steps | ret(last 50)=-3.83 sr=4%] - ... [trial 1 | 2 sheep | 2,607,336 steps | ret(last 50)=-4.58 sr=0%] - ... [trial 1 | 2 sheep | 2,707,336 steps | ret(last 50)=-3.94 sr=2%] - ... [trial 1 | 2 sheep | 2,807,336 steps | ret(last 50)=-4.15 sr=2%] - ... [trial 1 | 2 sheep | 2,907,336 steps | ret(last 50)=-3.95 sr=4%] - ... [trial 1 | 2 sheep | 3,007,336 steps | ret(last 50)=-4.44 sr=0%] -[Stage n_sheep=2] evaluating 30 eps -[Stage n_sheep=2] sr=7% mean_len=1459 mean_min_pen=14.1m mean_act=0.28 - -[Stage n_sheep=3] training 1,500,000 steps - ... [trial 1 | 3 sheep | 3,014,664 steps | ret(last 0)=+nan sr=nan%] - ... [trial 1 | 3 sheep | 3,114,664 steps | ret(last 50)=-4.16 sr=0%] - ... [trial 1 | 3 sheep | 3,214,664 steps | ret(last 50)=-4.94 sr=0%] - ... [trial 1 | 3 sheep | 3,314,664 steps | ret(last 50)=-4.42 sr=0%] - ... [trial 1 | 3 sheep | 3,414,664 steps | ret(last 50)=-4.69 sr=0%] - ... [trial 1 | 3 sheep | 3,514,664 steps | ret(last 50)=-3.72 sr=0%] - ... [trial 1 | 3 sheep | 3,614,664 steps | ret(last 50)=-5.04 sr=0%] - ... [trial 1 | 3 sheep | 3,714,664 steps | ret(last 50)=-4.26 sr=0%] - ... [trial 1 | 3 sheep | 3,814,664 steps | ret(last 50)=-4.70 sr=0%] - ... [trial 1 | 3 sheep | 3,914,664 steps | ret(last 50)=-4.61 sr=0%] - ... [trial 1 | 3 sheep | 4,014,664 steps | ret(last 50)=-4.19 sr=0%] - ... [trial 1 | 3 sheep | 4,114,664 steps | ret(last 50)=-4.35 sr=0%] - ... [trial 1 | 3 sheep | 4,214,664 steps | ret(last 50)=-4.41 sr=0%] - ... [trial 1 | 3 sheep | 4,314,664 steps | ret(last 50)=-4.42 sr=0%] - ... [trial 1 | 3 sheep | 4,414,664 steps | ret(last 50)=-4.77 sr=0%] - ... [trial 1 | 3 sheep | 4,514,664 steps | ret(last 50)=-4.49 sr=0%] -[Stage n_sheep=3] evaluating 30 eps -[Stage n_sheep=3] sr=0% mean_len=1500 mean_min_pen=12.5m mean_act=1.26 - -============================================================ - REPLAY SUMMARY -============================================================ - n_sheep=1 sr=100% len= 268 min_pen= 3.7m act=0.37 - n_sheep=2 sr= 7% len= 1459 min_pen= 14.1m act=0.28 - n_sheep=3 sr= 0% len= 1500 min_pen= 12.5m act=1.26 - - Total time: 26.9 min - Artefacts: runs/replay_20260425_152857/ diff --git a/training/runs/smoke_stage1/model.zip b/training/runs/smoke_stage1/model.zip deleted file mode 100644 index 232a47b..0000000 Binary files a/training/runs/smoke_stage1/model.zip and /dev/null differ diff --git a/training/runs/smoke_stage1/timeseries.png b/training/runs/smoke_stage1/timeseries.png deleted file mode 100644 index 9d6bf7c..0000000 Binary files a/training/runs/smoke_stage1/timeseries.png and /dev/null differ diff --git a/training/runs/smoke_stage1/trajectory.png b/training/runs/smoke_stage1/trajectory.png deleted file mode 100644 index 965f743..0000000 Binary files a/training/runs/smoke_stage1/trajectory.png and /dev/null differ diff --git a/training/runs/smoke_stage1/vecnorm.pkl b/training/runs/smoke_stage1/vecnorm.pkl deleted file mode 100644 index 731c388..0000000 Binary files a/training/runs/smoke_stage1/vecnorm.pkl and /dev/null differ diff --git a/training/runs/smoke_stage2/model.zip b/training/runs/smoke_stage2/model.zip deleted file mode 100644 index 7c746b0..0000000 Binary files a/training/runs/smoke_stage2/model.zip and /dev/null differ diff --git a/training/runs/smoke_stage2/timeseries.png b/training/runs/smoke_stage2/timeseries.png deleted file mode 100644 index 2165716..0000000 Binary files a/training/runs/smoke_stage2/timeseries.png and /dev/null differ diff --git a/training/runs/smoke_stage2/trajectory.png b/training/runs/smoke_stage2/trajectory.png deleted file mode 100644 index 52340b9..0000000 Binary files a/training/runs/smoke_stage2/trajectory.png and /dev/null differ diff --git a/training/runs/smoke_stage2/vecnorm.pkl b/training/runs/smoke_stage2/vecnorm.pkl deleted file mode 100644 index 8870baa..0000000 Binary files a/training/runs/smoke_stage2/vecnorm.pkl and /dev/null differ diff --git a/training/runs/smoke_stage3/model.zip b/training/runs/smoke_stage3/model.zip deleted file mode 100644 index 4640505..0000000 Binary files a/training/runs/smoke_stage3/model.zip and /dev/null differ diff --git a/training/runs/smoke_stage3/timeseries.png b/training/runs/smoke_stage3/timeseries.png deleted file mode 100644 index c548598..0000000 Binary files a/training/runs/smoke_stage3/timeseries.png and /dev/null differ diff --git a/training/runs/smoke_stage3/trajectory.png b/training/runs/smoke_stage3/trajectory.png deleted file mode 100644 index 1b70804..0000000 Binary files a/training/runs/smoke_stage3/trajectory.png and /dev/null differ diff --git a/training/runs/smoke_stage3/vecnorm.pkl b/training/runs/smoke_stage3/vecnorm.pkl deleted file mode 100644 index 6cd290a..0000000 Binary files a/training/runs/smoke_stage3/vecnorm.pkl and /dev/null differ diff --git a/training/runs/sweep_20260425_124021/best.json b/training/runs/sweep_20260425_124021/best.json deleted file mode 100644 index ee2c2ff..0000000 --- a/training/runs/sweep_20260425_124021/best.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "trial": 0, - "config": { - "W_PER_SHEEP": 1.0, - "W_ALIGN": 0.1, - "W_PEN_BONUS": 10.0, - "W_STEP_COST": 0.02, - "W_COMPLETE": 100.0, - "W_COMPACT": 3.0, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": false, - "ent_coef": 0.005 - }, - "score": 0.06, - "sr": { - "1": 0.3, - "2": 0.0, - "3": 0.0 - }, - "details": { - "1": { - "sr": 0.3, - "mean_len": 1252.2, - "mean_min_pen": 2.1085331559181215, - "mean_act": 0.07743233270979732 - }, - "2": { - "sr": 0.0, - "mean_len": 1500.0, - "mean_min_pen": 12.107558453083039, - "mean_act": 0.15608626089841424 - }, - "3": { - "sr": 0.0, - "mean_len": 1500.0, - "mean_min_pen": 13.675278377532958, - "mean_act": 0.10535904271739319 - } - }, - "elapsed_s": 307.773992061615 -} \ No newline at end of file diff --git a/training/runs/sweep_20260425_124021/results.jsonl b/training/runs/sweep_20260425_124021/results.jsonl deleted file mode 100644 index 191ddee..0000000 --- a/training/runs/sweep_20260425_124021/results.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"trial": 0, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1252.2, "mean_min_pen": 2.1085331559181215, "mean_act": 0.07743233270979732}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.107558453083039, "mean_act": 0.15608626089841424}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.675278377532958, "mean_act": 0.10535904271739319}}, "elapsed_s": 307.773992061615} diff --git a/training/runs/sweep_20260425_124630/best.json b/training/runs/sweep_20260425_124630/best.json deleted file mode 100644 index 4f41880..0000000 --- a/training/runs/sweep_20260425_124630/best.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "trial": 13, - "config": { - "W_PER_SHEEP": 1.0, - "W_ALIGN": 0.0, - "W_PEN_BONUS": 5.0, - "W_STEP_COST": 0.02, - "W_COMPLETE": 200.0, - "W_COMPACT": 1.5, - "ALIGN_SHAPE": "standoff", - "ALIGN_GATED": false, - "ent_coef": 0.02 - }, - "score": 0.35, - "sr": { - "1": 1.0, - "2": 0.3, - "3": 0.0 - }, - "details": { - "1": { - "sr": 1.0, - "mean_len": 428.9, - "mean_min_pen": 3.731236696243286, - "mean_act": 0.33429858573849425 - }, - "2": { - "sr": 0.3, - "mean_len": 1242.7, - "mean_min_pen": 8.937442195415496, - "mean_act": 0.3998076917437125 - }, - "3": { - "sr": 0.0, - "mean_len": 1500.0, - "mean_min_pen": 14.061083602905274, - "mean_act": 0.5966902794524755 - } - }, - "elapsed_s": 313.8281009197235 -} \ No newline at end of file diff --git a/training/runs/sweep_20260425_124630/results.jsonl b/training/runs/sweep_20260425_124630/results.jsonl deleted file mode 100644 index cbecd6f..0000000 --- a/training/runs/sweep_20260425_124630/results.jsonl +++ /dev/null @@ -1,25 +0,0 @@ -{"trial": 0, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.15000000000000002, "sr": {"1": 0.5, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.5, "mean_len": 1051.6, "mean_min_pen": 3.0551586985588073, "mean_act": 0.0887192903536989}, "2": {"sr": 0.1, "mean_len": 1438.1, "mean_min_pen": 10.993862140178681, "mean_act": 0.1723056222816755}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 11.92835488319397, "mean_act": 0.15403316749989074}}, "elapsed_s": 316.9084241390228} -{"trial": 1, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.05, "W_COMPLETE": 200.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1153.8, "mean_min_pen": 3.8145030617713926, "mean_act": 0.15146865127462797}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.058024168014526, "mean_act": 0.10904584494279744}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.5988187789917, "mean_act": 0.09578829008591905}}, "elapsed_s": 310.8732409477234} -{"trial": 2, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 50.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.01}, "score": 0.27, "sr": {"1": 0.7, "2": 0.2, "3": 0.1}, "details": {"1": {"sr": 0.7, "mean_len": 772.1, "mean_min_pen": 2.92204372882843, "mean_act": 0.1583604314471399}, "2": {"sr": 0.2, "mean_len": 1390.6, "mean_min_pen": 12.992859578132629, "mean_act": 0.16090679360424953}, "3": {"sr": 0.1, "mean_len": 1403.7, "mean_min_pen": 13.045468378067017, "mean_act": 0.07991531561051667}}, "elapsed_s": 303.7708294391632} -{"trial": 3, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 50.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1137.5, "mean_min_pen": 2.1229824781417848, "mean_act": 0.08172097406143335}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 11.521494126319885, "mean_act": 0.16864279503144788}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.317158126831055, "mean_act": 0.05537428615499472}}, "elapsed_s": 301.6172459125519} -{"trial": 4, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.02, "W_COMPLETE": 50.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": true, "ent_coef": 0.005}, "score": 0.2, "sr": {"1": 1.0, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 1.0, "mean_len": 567.0, "mean_min_pen": 3.2795117855072022, "mean_act": 0.1855437107780058}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 9.976170372962951, "mean_act": 0.2074074002778701}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.89306182861328, "mean_act": 0.21666522849385267}}, "elapsed_s": 313.525591135025} -{"trial": 5, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.05, "W_COMPLETE": 200.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.01}, "score": 0.16000000000000003, "sr": {"1": 0.8, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.8, "mean_len": 675.5, "mean_min_pen": 3.1338732481002807, "mean_act": 0.11691584614814514}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 9.693846690654755, "mean_act": 0.19984676872865814}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.684805488586425, "mean_act": 0.06430307933471292}}, "elapsed_s": 312.4476580619812} -{"trial": 6, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.005, "W_COMPLETE": 200.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.01}, "score": 0.08000000000000002, "sr": {"1": 0.4, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.4, "mean_len": 1343.9, "mean_min_pen": 4.092962062358856, "mean_act": 0.07675616785431166}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.157618689537049, "mean_act": 0.13906600509098352}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.079688358306885, "mean_act": 0.07073271389845953}}, "elapsed_s": 337.7615342140198} -{"trial": 7, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.11, "sr": {"1": 0.3, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1177.5, "mean_min_pen": 2.261639392375946, "mean_act": 0.11013885321646562}, "2": {"sr": 0.1, "mean_len": 1437.5, "mean_min_pen": 5.9263048529624935, "mean_act": 0.16420815230170227}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.130784749984741, "mean_act": 0.20303070502222206}}, "elapsed_s": 451.2424490451813} -{"trial": 8, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.02, "W_COMPLETE": 50.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.05}, "score": 0.19, "sr": {"1": 0.7, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.7, "mean_len": 874.2, "mean_min_pen": 4.152815592288971, "mean_act": 0.1303976929043709}, "2": {"sr": 0.1, "mean_len": 1381.4, "mean_min_pen": 12.115124177932739, "mean_act": 0.3749806733317197}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.596546864509582, "mean_act": 0.10082290474528718}}, "elapsed_s": 349.3926422595978} -{"trial": 9, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.02, "W_COMPLETE": 200.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.05}, "score": 0.0, "sr": {"1": 0.0, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 8.404254817962647, "mean_act": 0.6749623541596586}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 11.970247220993041, "mean_act": 0.45562502020561796}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.029277420043945, "mean_act": 0.1599790089856222}}, "elapsed_s": 319.38924622535706} -{"trial": 10, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.02, "W_COMPLETE": 200.0, "W_COMPACT": 0.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.16000000000000003, "sr": {"1": 0.8, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.8, "mean_len": 690.7, "mean_min_pen": 3.1264367938041686, "mean_act": 0.13493279961414406}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.040377330780029, "mean_act": 0.20203861368317985}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.379706478118896, "mean_act": 0.05979441475490263}}, "elapsed_s": 310.1806254386902} -{"trial": 11, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.05, "W_COMPLETE": 50.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.24, "sr": {"1": 0.7, "2": 0.2, "3": 0.0}, "details": {"1": {"sr": 0.7, "mean_len": 727.5, "mean_min_pen": 2.933144009113312, "mean_act": 0.11888058594495643}, "2": {"sr": 0.2, "mean_len": 1317.8, "mean_min_pen": 10.2599928855896, "mean_act": 0.14370172662258304}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.231103086471558, "mean_act": 0.0614644922383149}}, "elapsed_s": 330.0620620250702} -{"trial": 12, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 0.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1244.8, "mean_min_pen": 2.1193889737129212, "mean_act": 0.08216679023110932}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.745809042453766, "mean_act": 0.16497857472260813}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.016976690292358, "mean_act": 0.09897869050660908}}, "elapsed_s": 323.27931213378906} -{"trial": 13, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.02, "W_COMPLETE": 200.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.35, "sr": {"1": 1.0, "2": 0.3, "3": 0.0}, "details": {"1": {"sr": 1.0, "mean_len": 428.9, "mean_min_pen": 3.731236696243286, "mean_act": 0.33429858573849425}, "2": {"sr": 0.3, "mean_len": 1242.7, "mean_min_pen": 8.937442195415496, "mean_act": 0.3998076917437125}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.061083602905274, "mean_act": 0.5966902794524755}}, "elapsed_s": 313.8281009197235} -{"trial": 14, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.05}, "score": 0.13999999999999999, "sr": {"1": 0.7, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.7, "mean_len": 912.4, "mean_min_pen": 2.940706562995911, "mean_act": 1.3471978399000248}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 9.901372599601746, "mean_act": 0.9463685217667609}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.291404342651367, "mean_act": 0.08601266834173493}}, "elapsed_s": 322.57220220565796} -{"trial": 15, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.01}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1210.5, "mean_min_pen": 2.107759189605713, "mean_act": 0.08131515106917063}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.824185514450074, "mean_act": 0.20362997558291535}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.042323064804076, "mean_act": 0.17125511734669563}}, "elapsed_s": 312.3465087413788} -{"trial": 16, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.005, "W_COMPLETE": 200.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": true, "ent_coef": 0.05}, "score": 0.24, "sr": {"1": 0.7, "2": 0.2, "3": 0.0}, "details": {"1": {"sr": 0.7, "mean_len": 650.1, "mean_min_pen": 2.981771671772003, "mean_act": 0.1621352170537764}, "2": {"sr": 0.2, "mean_len": 1435.5, "mean_min_pen": 8.686615812778474, "mean_act": 0.3279171284351484}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.374159717559815, "mean_act": 0.04937917392927017}}, "elapsed_s": 303.71519470214844} -{"trial": 17, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.005, "W_COMPLETE": 100.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.16, "sr": {"1": 0.3, "2": 0.2, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1088.1, "mean_min_pen": 3.4793057322502134, "mean_act": 0.09515179877670824}, "2": {"sr": 0.2, "mean_len": 1428.5, "mean_min_pen": 10.024536824226379, "mean_act": 0.4135459636897354}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.302330660820008, "mean_act": 0.34973196326509737}}, "elapsed_s": 315.76633620262146} -{"trial": 18, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.05, "W_COMPLETE": 50.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.005}, "score": 0.16000000000000003, "sr": {"1": 0.8, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.8, "mean_len": 645.4, "mean_min_pen": 3.1326077818870544, "mean_act": 0.15081361126264722}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.723365247249603, "mean_act": 0.10806036127302399}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.303192138671875, "mean_act": 0.08246586098832388}}, "elapsed_s": 318.483638048172} -{"trial": 19, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.13, "sr": {"1": 0.4, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.4, "mean_len": 1231.4, "mean_min_pen": 2.6246669054031373, "mean_act": 0.07338090033141094}, "2": {"sr": 0.1, "mean_len": 1420.2, "mean_min_pen": 8.371916389465332, "mean_act": 0.16944798908643302}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.287557554244994, "mean_act": 0.09957915147298428}}, "elapsed_s": 315.07627868652344} -{"trial": 20, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 0.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.005}, "score": 0.05, "sr": {"1": 0.0, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 1.5734932541847229, "mean_act": 0.08394606926547861}, "2": {"sr": 0.1, "mean_len": 1498.9, "mean_min_pen": 6.444609999656677, "mean_act": 0.2938110977638972}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 11.258054113388061, "mean_act": 0.16288984295733971}}, "elapsed_s": 309.5854580402374} -{"trial": 21, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.005}, "score": 0.11, "sr": {"1": 0.3, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1324.6, "mean_min_pen": 3.3425565361976624, "mean_act": 0.1115106962044226}, "2": {"sr": 0.1, "mean_len": 1443.0, "mean_min_pen": 11.069470012187958, "mean_act": 0.17271345215252376}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.845431709289551, "mean_act": 0.13337391122176}}, "elapsed_s": 315.54923272132874} -{"trial": 22, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.05}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1220.2, "mean_min_pen": 2.1276236534118653, "mean_act": 0.4312911105166665}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 8.770305395126343, "mean_act": 0.6047595652043354}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.12634140253067, "mean_act": 0.14348885283676113}}, "elapsed_s": 471.740927696228} -{"trial": 23, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.005, "W_COMPLETE": 200.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.01}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1209.4, "mean_min_pen": 3.811609184741974, "mean_act": 0.08888363576016632}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.143073177337646, "mean_act": 0.27062979487000655}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 15.135865116119385, "mean_act": 0.3670903712440903}}, "elapsed_s": 335.26912212371826} -{"trial": 24, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.05, "W_COMPLETE": 50.0, "W_COMPACT": 0.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": true, "ent_coef": 0.02}, "score": 0.0, "sr": {"1": 0.0, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.014724779129029, "mean_act": 1.024556803444028}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.734652400016785, "mean_act": 1.0186923123559604}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.690151166915893, "mean_act": 1.000638129701217}}, "elapsed_s": 306.1110165119171} diff --git a/training/runs/sweep_full.log b/training/runs/sweep_full.log deleted file mode 100644 index a60c7e4..0000000 --- a/training/runs/sweep_full.log +++ /dev/null @@ -1,681 +0,0 @@ -Sweep dir: runs/sweep_20260425_124630 -Search space: ['W_PER_SHEEP', 'W_ALIGN', 'W_PEN_BONUS', 'W_STEP_COST', 'W_COMPLETE', 'W_COMPACT', 'ALIGN_SHAPE', 'ALIGN_GATED', 'ent_coef'] -Per-trial: 1,000,000 steps train + 30 eval eps -Time budget: 7.5h - -[Trial 1] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.005} - ... [trial 1 | 1 sheep | 50,000 steps | ret(last 33)=-7.72 sr=6%] - ... [trial 1 | 1 sheep | 100,000 steps | ret(last 50)=-10.07 sr=2%] - ... [trial 1 | 1 sheep | 150,000 steps | ret(last 50)=-9.89 sr=2%] - ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-7.94 sr=4%] - ... [trial 1 | 1 sheep | 250,000 steps | ret(last 50)=+2.69 sr=2%] - ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=+18.25 sr=24%] - ... [trial 1 | 1 sheep | 350,000 steps | ret(last 50)=+24.63 sr=20%] - ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=+24.83 sr=26%] - ... [trial 1 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 1 | 2 sheep | 459,608 steps | ret(last 32)=+10.08 sr=0%] - ... [trial 1 | 2 sheep | 509,608 steps | ret(last 50)=+11.51 sr=0%] - ... [trial 1 | 2 sheep | 559,608 steps | ret(last 50)=+12.82 sr=0%] - ... [trial 1 | 2 sheep | 609,608 steps | ret(last 50)=+14.39 sr=0%] - ... [trial 1 | 2 sheep | 659,608 steps | ret(last 50)=+14.14 sr=0%] - ... [trial 1 | 2 sheep | 709,608 steps | ret(last 50)=+12.36 sr=2%] - ... [trial 1 | 2 sheep | 759,608 steps | ret(last 50)=+13.08 sr=0%] - ... [trial 1 | 2 sheep | 809,608 steps | ret(last 50)=+13.24 sr=0%] - ... [trial 1 | 2 sheep | 859,608 steps | ret(last 50)=+13.23 sr=0%] - ... [trial 1 | 2 sheep | 909,608 steps | ret(last 50)=+14.23 sr=2%] - ... [trial 1 | 2 sheep | 959,608 steps | ret(last 50)=+14.69 sr=0%] - ... [trial 1 | 2 sheep | 1,009,608 steps | ret(last 50)=+20.23 sr=0%] - ... [trial 1 | eval n=1] - ... [trial 1 | eval n=2] - ... [trial 1 | eval n=3] - → score=0.150 sr1=0.50 sr2=0.10 sr3=0.00 [317s] -[Trial 2] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.005} - ... [trial 2 | 1 sheep | 50,000 steps | ret(last 34)=-24.61 sr=9%] - ... [trial 2 | 1 sheep | 100,000 steps | ret(last 50)=-28.20 sr=10%] - ... [trial 2 | 1 sheep | 150,000 steps | ret(last 50)=-28.14 sr=8%] - ... [trial 2 | 1 sheep | 200,000 steps | ret(last 50)=-31.36 sr=2%] - ... [trial 2 | 1 sheep | 250,000 steps | ret(last 50)=-31.38 sr=6%] - ... [trial 2 | 1 sheep | 300,000 steps | ret(last 50)=-32.89 sr=4%] - ... [trial 2 | 1 sheep | 350,000 steps | ret(last 50)=-29.11 sr=8%] - ... [trial 2 | 1 sheep | 400,000 steps | ret(last 50)=-19.16 sr=30%] - ... [trial 2 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 2 | 2 sheep | 459,608 steps | ret(last 34)=-17.61 sr=9%] - ... [trial 2 | 2 sheep | 509,608 steps | ret(last 50)=-18.59 sr=2%] - ... [trial 2 | 2 sheep | 559,608 steps | ret(last 50)=-16.92 sr=0%] - ... [trial 2 | 2 sheep | 609,608 steps | ret(last 50)=-17.40 sr=0%] - ... [trial 2 | 2 sheep | 659,608 steps | ret(last 50)=-18.13 sr=0%] - ... [trial 2 | 2 sheep | 709,608 steps | ret(last 50)=-17.45 sr=0%] - ... [trial 2 | 2 sheep | 759,608 steps | ret(last 50)=-16.06 sr=0%] - ... [trial 2 | 2 sheep | 809,608 steps | ret(last 50)=-15.35 sr=0%] - ... [trial 2 | 2 sheep | 859,608 steps | ret(last 50)=-12.63 sr=0%] - ... [trial 2 | 2 sheep | 909,608 steps | ret(last 50)=-12.41 sr=0%] - ... [trial 2 | 2 sheep | 959,608 steps | ret(last 50)=-12.91 sr=0%] - ... [trial 2 | 2 sheep | 1,009,608 steps | ret(last 50)=-10.94 sr=0%] - ... [trial 2 | eval n=1] - ... [trial 2 | eval n=2] - ... [trial 2 | eval n=3] - → score=0.060 sr1=0.30 sr2=0.00 sr3=0.00 [311s] -[Trial 3] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 50.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.01} - ... [trial 3 | 1 sheep | 50,000 steps | ret(last 32)=-1.75 sr=0%] - ... [trial 3 | 1 sheep | 100,000 steps | ret(last 50)=-3.70 sr=0%] - ... [trial 3 | 1 sheep | 150,000 steps | ret(last 50)=-6.09 sr=2%] - ... [trial 3 | 1 sheep | 200,000 steps | ret(last 50)=-3.44 sr=4%] - ... [trial 3 | 1 sheep | 250,000 steps | ret(last 50)=+6.68 sr=8%] - ... [trial 3 | 1 sheep | 300,000 steps | ret(last 50)=+14.58 sr=22%] - ... [trial 3 | 1 sheep | 350,000 steps | ret(last 50)=+15.28 sr=64%] - ... [trial 3 | 1 sheep | 400,000 steps | ret(last 50)=+14.70 sr=74%] - ... [trial 3 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 3 | 2 sheep | 459,608 steps | ret(last 35)=+0.82 sr=9%] - ... [trial 3 | 2 sheep | 509,608 steps | ret(last 50)=-0.66 sr=2%] - ... [trial 3 | 2 sheep | 559,608 steps | ret(last 50)=-0.02 sr=0%] - ... [trial 3 | 2 sheep | 609,608 steps | ret(last 50)=-0.02 sr=0%] - ... [trial 3 | 2 sheep | 659,608 steps | ret(last 50)=+1.37 sr=4%] - ... [trial 3 | 2 sheep | 709,608 steps | ret(last 50)=+2.75 sr=8%] - ... [trial 3 | 2 sheep | 759,608 steps | ret(last 50)=+1.25 sr=6%] - ... [trial 3 | 2 sheep | 809,608 steps | ret(last 50)=+4.20 sr=10%] - ... [trial 3 | 2 sheep | 859,608 steps | ret(last 50)=+2.14 sr=4%] - ... [trial 3 | 2 sheep | 909,608 steps | ret(last 50)=+3.13 sr=8%] - ... [trial 3 | 2 sheep | 959,608 steps | ret(last 50)=+5.16 sr=6%] - ... [trial 3 | 2 sheep | 1,009,608 steps | ret(last 50)=+5.95 sr=8%] - ... [trial 3 | eval n=1] - ... [trial 3 | eval n=2] - ... [trial 3 | eval n=3] - → score=0.270 sr1=0.70 sr2=0.20 sr3=0.10 [304s] -[Trial 4] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 50.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.005} - ... [trial 4 | 1 sheep | 50,000 steps | ret(last 33)=-2.86 sr=9%] - ... [trial 4 | 1 sheep | 100,000 steps | ret(last 50)=-3.54 sr=6%] - ... [trial 4 | 1 sheep | 150,000 steps | ret(last 50)=-2.76 sr=8%] - ... [trial 4 | 1 sheep | 200,000 steps | ret(last 50)=-1.56 sr=8%] - ... [trial 4 | 1 sheep | 250,000 steps | ret(last 50)=+9.18 sr=24%] - ... [trial 4 | 1 sheep | 300,000 steps | ret(last 50)=+18.46 sr=46%] - ... [trial 4 | 1 sheep | 350,000 steps | ret(last 50)=+15.01 sr=34%] - ... [trial 4 | 1 sheep | 400,000 steps | ret(last 50)=+14.44 sr=42%] - ... [trial 4 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 4 | 2 sheep | 459,608 steps | ret(last 35)=+6.77 sr=9%] - ... [trial 4 | 2 sheep | 509,608 steps | ret(last 50)=+5.50 sr=6%] - ... [trial 4 | 2 sheep | 559,608 steps | ret(last 50)=+4.39 sr=0%] - ... [trial 4 | 2 sheep | 609,608 steps | ret(last 50)=+4.54 sr=0%] - ... [trial 4 | 2 sheep | 659,608 steps | ret(last 50)=+6.97 sr=0%] - ... [trial 4 | 2 sheep | 709,608 steps | ret(last 50)=+4.28 sr=4%] - ... [trial 4 | 2 sheep | 759,608 steps | ret(last 50)=+4.30 sr=2%] - ... [trial 4 | 2 sheep | 809,608 steps | ret(last 50)=+6.34 sr=4%] - ... [trial 4 | 2 sheep | 859,608 steps | ret(last 50)=+7.27 sr=2%] - ... [trial 4 | 2 sheep | 909,608 steps | ret(last 50)=+8.22 sr=4%] - ... [trial 4 | 2 sheep | 959,608 steps | ret(last 50)=+7.23 sr=6%] - ... [trial 4 | 2 sheep | 1,009,608 steps | ret(last 50)=+7.24 sr=2%] - ... [trial 4 | eval n=1] - ... [trial 4 | eval n=2] - ... [trial 4 | eval n=3] - → score=0.060 sr1=0.30 sr2=0.00 sr3=0.00 [302s] -[Trial 5] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 50.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': True, 'ent_coef': 0.005} - ... [trial 5 | 1 sheep | 50,000 steps | ret(last 33)=+3.70 sr=6%] - ... [trial 5 | 1 sheep | 100,000 steps | ret(last 50)=-2.32 sr=0%] - ... [trial 5 | 1 sheep | 150,000 steps | ret(last 50)=-4.36 sr=4%] - ... [trial 5 | 1 sheep | 200,000 steps | ret(last 50)=-4.30 sr=6%] - ... [trial 5 | 1 sheep | 250,000 steps | ret(last 50)=-0.15 sr=14%] - ... [trial 5 | 1 sheep | 300,000 steps | ret(last 50)=+1.39 sr=8%] - ... [trial 5 | 1 sheep | 350,000 steps | ret(last 50)=+11.40 sr=36%] - ... [trial 5 | 1 sheep | 400,000 steps | ret(last 50)=+11.08 sr=24%] - ... [trial 5 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 5 | 2 sheep | 459,608 steps | ret(last 34)=+6.85 sr=6%] - ... [trial 5 | 2 sheep | 509,608 steps | ret(last 50)=+7.35 sr=8%] - ... [trial 5 | 2 sheep | 559,608 steps | ret(last 50)=+7.57 sr=4%] - ... [trial 5 | 2 sheep | 609,608 steps | ret(last 50)=+6.64 sr=2%] - ... [trial 5 | 2 sheep | 659,608 steps | ret(last 50)=+9.15 sr=10%] - ... [trial 5 | 2 sheep | 709,608 steps | ret(last 50)=+14.27 sr=10%] - ... [trial 5 | 2 sheep | 759,608 steps | ret(last 50)=+10.93 sr=6%] - ... [trial 5 | 2 sheep | 809,608 steps | ret(last 50)=+10.17 sr=12%] - ... [trial 5 | 2 sheep | 859,608 steps | ret(last 50)=+8.20 sr=8%] - ... [trial 5 | 2 sheep | 909,608 steps | ret(last 50)=+9.61 sr=14%] - ... [trial 5 | 2 sheep | 959,608 steps | ret(last 50)=+11.14 sr=10%] - ... [trial 5 | 2 sheep | 1,009,608 steps | ret(last 50)=+10.75 sr=12%] - ... [trial 5 | eval n=1] - ... [trial 5 | eval n=2] - ... [trial 5 | eval n=3] - → score=0.200 sr1=1.00 sr2=0.00 sr3=0.00 [314s] -[Trial 6] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 200.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.01} - ... [trial 6 | 1 sheep | 50,000 steps | ret(last 32)=-13.18 sr=9%] - ... [trial 6 | 1 sheep | 100,000 steps | ret(last 50)=-10.28 sr=16%] - ... [trial 6 | 1 sheep | 150,000 steps | ret(last 50)=+5.28 sr=44%] - ... [trial 6 | 1 sheep | 200,000 steps | ret(last 50)=+9.40 sr=38%] - ... [trial 6 | 1 sheep | 250,000 steps | ret(last 50)=+8.62 sr=32%] - ... [trial 6 | 1 sheep | 300,000 steps | ret(last 50)=+9.14 sr=34%] - ... [trial 6 | 1 sheep | 350,000 steps | ret(last 50)=+12.59 sr=60%] - ... [trial 6 | 1 sheep | 400,000 steps | ret(last 50)=+14.10 sr=72%] - ... [trial 6 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 6 | 2 sheep | 459,608 steps | ret(last 34)=+0.12 sr=9%] - ... [trial 6 | 2 sheep | 509,608 steps | ret(last 50)=-2.84 sr=4%] - ... [trial 6 | 2 sheep | 559,608 steps | ret(last 50)=-2.11 sr=10%] - ... [trial 6 | 2 sheep | 609,608 steps | ret(last 50)=-1.91 sr=14%] - ... [trial 6 | 2 sheep | 659,608 steps | ret(last 50)=-2.14 sr=14%] - ... [trial 6 | 2 sheep | 709,608 steps | ret(last 50)=-4.30 sr=6%] - ... [trial 6 | 2 sheep | 759,608 steps | ret(last 50)=-1.89 sr=10%] - ... [trial 6 | 2 sheep | 809,608 steps | ret(last 50)=-3.47 sr=8%] - ... [trial 6 | 2 sheep | 859,608 steps | ret(last 50)=-1.45 sr=8%] - ... [trial 6 | 2 sheep | 909,608 steps | ret(last 50)=-3.55 sr=2%] - ... [trial 6 | 2 sheep | 959,608 steps | ret(last 50)=-2.93 sr=4%] - ... [trial 6 | 2 sheep | 1,009,608 steps | ret(last 50)=-1.45 sr=10%] - ... [trial 6 | eval n=1] - ... [trial 6 | eval n=2] - ... [trial 6 | eval n=3] - → score=0.160 sr1=0.80 sr2=0.00 sr3=0.00 [312s] -[Trial 7] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.005, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.01} - ... [trial 7 | 1 sheep | 50,000 steps | ret(last 32)=-8.47 sr=0%] - ... [trial 7 | 1 sheep | 100,000 steps | ret(last 50)=-5.40 sr=4%] - ... [trial 7 | 1 sheep | 150,000 steps | ret(last 50)=-2.72 sr=10%] - ... [trial 7 | 1 sheep | 200,000 steps | ret(last 50)=-1.59 sr=10%] - ... [trial 7 | 1 sheep | 250,000 steps | ret(last 50)=-1.58 sr=6%] - ... [trial 7 | 1 sheep | 300,000 steps | ret(last 50)=-3.68 sr=2%] - ... [trial 7 | 1 sheep | 350,000 steps | ret(last 50)=+4.82 sr=10%] - ... [trial 7 | 1 sheep | 400,000 steps | ret(last 50)=+15.81 sr=54%] - ... [trial 7 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 7 | 2 sheep | 459,608 steps | ret(last 32)=-2.50 sr=6%] - ... [trial 7 | 2 sheep | 509,608 steps | ret(last 50)=-2.32 sr=2%] - ... [trial 7 | 2 sheep | 559,608 steps | ret(last 50)=+0.76 sr=4%] - ... [trial 7 | 2 sheep | 609,608 steps | ret(last 50)=+0.45 sr=0%] - ... [trial 7 | 2 sheep | 659,608 steps | ret(last 50)=+1.03 sr=8%] - ... [trial 7 | 2 sheep | 709,608 steps | ret(last 50)=+0.62 sr=6%] - ... [trial 7 | 2 sheep | 759,608 steps | ret(last 50)=+0.36 sr=8%] - ... [trial 7 | 2 sheep | 809,608 steps | ret(last 50)=+2.27 sr=10%] - ... [trial 7 | 2 sheep | 859,608 steps | ret(last 50)=+2.31 sr=6%] - ... [trial 7 | 2 sheep | 909,608 steps | ret(last 50)=+3.78 sr=4%] - ... [trial 7 | 2 sheep | 959,608 steps | ret(last 50)=+2.21 sr=10%] - ... [trial 7 | 2 sheep | 1,009,608 steps | ret(last 50)=+2.66 sr=4%] - ... [trial 7 | eval n=1] - ... [trial 7 | eval n=2] - ... [trial 7 | eval n=3] - → score=0.080 sr1=0.40 sr2=0.00 sr3=0.00 [338s] -[Trial 8] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.005} - ... [trial 8 | 1 sheep | 50,000 steps | ret(last 32)=-7.73 sr=6%] - ... [trial 8 | 1 sheep | 100,000 steps | ret(last 50)=-9.58 sr=8%] - ... [trial 8 | 1 sheep | 150,000 steps | ret(last 50)=-10.87 sr=8%] - ... [trial 8 | 1 sheep | 200,000 steps | ret(last 50)=-9.79 sr=6%] - ... [trial 8 | 1 sheep | 250,000 steps | ret(last 50)=-7.19 sr=8%] - ... [trial 8 | 1 sheep | 300,000 steps | ret(last 50)=-3.84 sr=18%] - ... [trial 8 | 1 sheep | 350,000 steps | ret(last 50)=-0.03 sr=26%] - ... [trial 8 | 1 sheep | 400,000 steps | ret(last 50)=+6.80 sr=44%] - ... [trial 8 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 8 | 2 sheep | 459,608 steps | ret(last 35)=-3.00 sr=9%] - ... [trial 8 | 2 sheep | 509,608 steps | ret(last 50)=-4.26 sr=4%] - ... [trial 8 | 2 sheep | 559,608 steps | ret(last 50)=+1.91 sr=14%] - ... [trial 8 | 2 sheep | 609,608 steps | ret(last 50)=-0.57 sr=16%] - ... [trial 8 | 2 sheep | 659,608 steps | ret(last 50)=+1.65 sr=14%] - ... [trial 8 | 2 sheep | 709,608 steps | ret(last 50)=+2.90 sr=8%] - ... [trial 8 | 2 sheep | 759,608 steps | ret(last 50)=+0.98 sr=2%] - ... [trial 8 | 2 sheep | 809,608 steps | ret(last 50)=-2.52 sr=4%] - ... [trial 8 | 2 sheep | 859,608 steps | ret(last 50)=-1.11 sr=2%] - ... [trial 8 | 2 sheep | 909,608 steps | ret(last 50)=+2.74 sr=2%] - ... [trial 8 | 2 sheep | 959,608 steps | ret(last 50)=+2.94 sr=0%] - ... [trial 8 | 2 sheep | 1,009,608 steps | ret(last 50)=+5.13 sr=0%] - ... [trial 8 | eval n=1] - ... [trial 8 | eval n=2] - ... [trial 8 | eval n=3] - → score=0.110 sr1=0.30 sr2=0.10 sr3=0.00 [451s] -[Trial 9] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 50.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.05} - ... [trial 9 | 1 sheep | 50,000 steps | ret(last 34)=-11.25 sr=15%] - ... [trial 9 | 1 sheep | 100,000 steps | ret(last 50)=-11.98 sr=8%] - ... [trial 9 | 1 sheep | 150,000 steps | ret(last 50)=-10.46 sr=14%] - ... [trial 9 | 1 sheep | 200,000 steps | ret(last 50)=-2.86 sr=14%] - ... [trial 9 | 1 sheep | 250,000 steps | ret(last 50)=+8.65 sr=60%] - ... [trial 9 | 1 sheep | 300,000 steps | ret(last 50)=+10.48 sr=58%] - ... [trial 9 | 1 sheep | 350,000 steps | ret(last 50)=+8.65 sr=56%] - ... [trial 9 | 1 sheep | 400,000 steps | ret(last 50)=+10.25 sr=68%] - ... [trial 9 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 9 | 2 sheep | 459,608 steps | ret(last 35)=-0.75 sr=20%] - ... [trial 9 | 2 sheep | 509,608 steps | ret(last 50)=-6.64 sr=2%] - ... [trial 9 | 2 sheep | 559,608 steps | ret(last 50)=-7.43 sr=4%] - ... [trial 9 | 2 sheep | 609,608 steps | ret(last 50)=-4.32 sr=6%] - ... [trial 9 | 2 sheep | 659,608 steps | ret(last 50)=-3.64 sr=6%] - ... [trial 9 | 2 sheep | 709,608 steps | ret(last 50)=-7.09 sr=0%] - ... [trial 9 | 2 sheep | 759,608 steps | ret(last 50)=-5.60 sr=4%] - ... [trial 9 | 2 sheep | 809,608 steps | ret(last 50)=-5.70 sr=6%] - ... [trial 9 | 2 sheep | 859,608 steps | ret(last 50)=-4.99 sr=4%] - ... [trial 9 | 2 sheep | 909,608 steps | ret(last 50)=-4.60 sr=6%] - ... [trial 9 | 2 sheep | 959,608 steps | ret(last 50)=-6.53 sr=4%] - ... [trial 9 | 2 sheep | 1,009,608 steps | ret(last 50)=-7.46 sr=2%] - ... [trial 9 | eval n=1] - ... [trial 9 | eval n=2] - ... [trial 9 | eval n=3] - → score=0.190 sr1=0.70 sr2=0.10 sr3=0.00 [349s] -[Trial 10] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.05} - ... [trial 10 | 1 sheep | 50,000 steps | ret(last 32)=-13.35 sr=3%] - ... [trial 10 | 1 sheep | 100,000 steps | ret(last 50)=-12.49 sr=4%] - ... [trial 10 | 1 sheep | 150,000 steps | ret(last 50)=-13.24 sr=8%] - ... [trial 10 | 1 sheep | 200,000 steps | ret(last 50)=-12.73 sr=10%] - ... [trial 10 | 1 sheep | 250,000 steps | ret(last 50)=-15.27 sr=4%] - ... [trial 10 | 1 sheep | 300,000 steps | ret(last 50)=-9.43 sr=8%] - ... [trial 10 | 1 sheep | 350,000 steps | ret(last 50)=-2.65 sr=22%] - ... [trial 10 | 1 sheep | 400,000 steps | ret(last 50)=+5.12 sr=46%] - ... [trial 10 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 10 | 2 sheep | 459,608 steps | ret(last 34)=-4.93 sr=6%] - ... [trial 10 | 2 sheep | 509,608 steps | ret(last 50)=-6.25 sr=2%] - ... [trial 10 | 2 sheep | 559,608 steps | ret(last 50)=-5.57 sr=4%] - ... [trial 10 | 2 sheep | 609,608 steps | ret(last 50)=-6.24 sr=4%] - ... [trial 10 | 2 sheep | 659,608 steps | ret(last 50)=-9.34 sr=0%] - ... [trial 10 | 2 sheep | 709,608 steps | ret(last 50)=-8.23 sr=0%] - ... [trial 10 | 2 sheep | 759,608 steps | ret(last 50)=-8.34 sr=0%] - ... [trial 10 | 2 sheep | 809,608 steps | ret(last 50)=-5.27 sr=0%] - ... [trial 10 | 2 sheep | 859,608 steps | ret(last 50)=-8.24 sr=0%] - ... [trial 10 | 2 sheep | 909,608 steps | ret(last 50)=-8.75 sr=0%] - ... [trial 10 | 2 sheep | 959,608 steps | ret(last 50)=-9.15 sr=0%] - ... [trial 10 | 2 sheep | 1,009,608 steps | ret(last 50)=-9.75 sr=0%] - ... [trial 10 | eval n=1] - ... [trial 10 | eval n=2] - ... [trial 10 | eval n=3] - → score=0.000 sr1=0.00 sr2=0.00 sr3=0.00 [319s] -[Trial 11] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 0.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02} - ... [trial 11 | 1 sheep | 50,000 steps | ret(last 32)=-3.50 sr=12%] - ... [trial 11 | 1 sheep | 100,000 steps | ret(last 50)=-5.79 sr=6%] - ... [trial 11 | 1 sheep | 150,000 steps | ret(last 50)=-2.10 sr=18%] - ... [trial 11 | 1 sheep | 200,000 steps | ret(last 50)=+2.60 sr=8%] - ... [trial 11 | 1 sheep | 250,000 steps | ret(last 50)=+11.49 sr=8%] - ... [trial 11 | 1 sheep | 300,000 steps | ret(last 50)=+21.73 sr=26%] - ... [trial 11 | 1 sheep | 350,000 steps | ret(last 50)=+20.73 sr=36%] - ... [trial 11 | 1 sheep | 400,000 steps | ret(last 50)=+19.77 sr=62%] - ... [trial 11 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 11 | 2 sheep | 459,608 steps | ret(last 36)=+10.19 sr=11%] - ... [trial 11 | 2 sheep | 509,608 steps | ret(last 50)=+11.56 sr=6%] - ... [trial 11 | 2 sheep | 559,608 steps | ret(last 50)=+13.61 sr=2%] - ... [trial 11 | 2 sheep | 609,608 steps | ret(last 50)=+15.44 sr=4%] - ... [trial 11 | 2 sheep | 659,608 steps | ret(last 50)=+15.61 sr=10%] - ... [trial 11 | 2 sheep | 709,608 steps | ret(last 50)=+16.30 sr=6%] - ... [trial 11 | 2 sheep | 759,608 steps | ret(last 50)=+17.33 sr=4%] - ... [trial 11 | 2 sheep | 809,608 steps | ret(last 50)=+18.36 sr=2%] - ... [trial 11 | 2 sheep | 859,608 steps | ret(last 50)=+19.78 sr=8%] - ... [trial 11 | 2 sheep | 909,608 steps | ret(last 50)=+20.12 sr=14%] - ... [trial 11 | 2 sheep | 959,608 steps | ret(last 50)=+18.93 sr=8%] - ... [trial 11 | 2 sheep | 1,009,608 steps | ret(last 50)=+18.16 sr=2%] - ... [trial 11 | eval n=1] - ... [trial 11 | eval n=2] - ... [trial 11 | eval n=3] - → score=0.160 sr1=0.80 sr2=0.00 sr3=0.00 [310s] -[Trial 12] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 50.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.02} - ... [trial 12 | 1 sheep | 50,000 steps | ret(last 32)=-42.77 sr=0%] - ... [trial 12 | 1 sheep | 100,000 steps | ret(last 50)=-39.16 sr=2%] - ... [trial 12 | 1 sheep | 150,000 steps | ret(last 50)=-35.02 sr=6%] - ... [trial 12 | 1 sheep | 200,000 steps | ret(last 50)=-31.49 sr=4%] - ... [trial 12 | 1 sheep | 250,000 steps | ret(last 50)=-8.31 sr=16%] - ... [trial 12 | 1 sheep | 300,000 steps | ret(last 50)=+7.97 sr=36%] - ... [trial 12 | 1 sheep | 350,000 steps | ret(last 50)=+11.77 sr=68%] - ... [trial 12 | 1 sheep | 400,000 steps | ret(last 50)=+12.47 sr=74%] - ... [trial 12 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 12 | 2 sheep | 459,608 steps | ret(last 34)=-9.76 sr=0%] - ... [trial 12 | 2 sheep | 509,608 steps | ret(last 50)=-4.85 sr=0%] - ... [trial 12 | 2 sheep | 559,608 steps | ret(last 50)=-2.81 sr=8%] - ... [trial 12 | 2 sheep | 609,608 steps | ret(last 50)=+2.27 sr=10%] - ... [trial 12 | 2 sheep | 659,608 steps | ret(last 50)=+1.66 sr=6%] - ... [trial 12 | 2 sheep | 709,608 steps | ret(last 50)=+3.42 sr=4%] - ... [trial 12 | 2 sheep | 759,608 steps | ret(last 50)=+4.08 sr=2%] - ... [trial 12 | 2 sheep | 809,608 steps | ret(last 50)=+5.49 sr=2%] - ... [trial 12 | 2 sheep | 859,608 steps | ret(last 50)=+7.12 sr=10%] - ... [trial 12 | 2 sheep | 909,608 steps | ret(last 50)=+7.91 sr=6%] - ... [trial 12 | 2 sheep | 959,608 steps | ret(last 50)=+6.87 sr=2%] - ... [trial 12 | 2 sheep | 1,009,608 steps | ret(last 50)=+5.83 sr=2%] - ... [trial 12 | eval n=1] - ... [trial 12 | eval n=2] - ... [trial 12 | eval n=3] - → score=0.240 sr1=0.70 sr2=0.20 sr3=0.00 [330s] -[Trial 13] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 0.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.005} - ... [trial 13 | 1 sheep | 50,000 steps | ret(last 34)=-31.15 sr=9%] - ... [trial 13 | 1 sheep | 100,000 steps | ret(last 50)=-32.34 sr=4%] - ... [trial 13 | 1 sheep | 150,000 steps | ret(last 50)=-33.16 sr=0%] - ... [trial 13 | 1 sheep | 200,000 steps | ret(last 50)=-29.98 sr=6%] - ... [trial 13 | 1 sheep | 250,000 steps | ret(last 50)=-28.64 sr=4%] - ... [trial 13 | 1 sheep | 300,000 steps | ret(last 50)=-17.91 sr=14%] - ... [trial 13 | 1 sheep | 350,000 steps | ret(last 50)=-15.27 sr=22%] - ... [trial 13 | 1 sheep | 400,000 steps | ret(last 50)=-11.36 sr=16%] - ... [trial 13 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 13 | 2 sheep | 459,608 steps | ret(last 34)=-16.78 sr=0%] - ... [trial 13 | 2 sheep | 509,608 steps | ret(last 50)=-16.84 sr=2%] - ... [trial 13 | 2 sheep | 559,608 steps | ret(last 50)=-14.28 sr=0%] - ... [trial 13 | 2 sheep | 609,608 steps | ret(last 50)=-12.35 sr=6%] - ... [trial 13 | 2 sheep | 659,608 steps | ret(last 50)=-14.50 sr=2%] - ... [trial 13 | 2 sheep | 709,608 steps | ret(last 50)=-12.96 sr=2%] - ... [trial 13 | 2 sheep | 759,608 steps | ret(last 50)=-9.86 sr=4%] - ... [trial 13 | 2 sheep | 809,608 steps | ret(last 50)=-13.88 sr=2%] - ... [trial 13 | 2 sheep | 859,608 steps | ret(last 50)=-14.76 sr=0%] - ... [trial 13 | 2 sheep | 909,608 steps | ret(last 50)=-12.79 sr=0%] - ... [trial 13 | 2 sheep | 959,608 steps | ret(last 50)=-12.54 sr=0%] - ... [trial 13 | 2 sheep | 1,009,608 steps | ret(last 50)=-12.11 sr=8%] - ... [trial 13 | eval n=1] - ... [trial 13 | eval n=2] - ... [trial 13 | eval n=3] - → score=0.060 sr1=0.30 sr2=0.00 sr3=0.00 [323s] -[Trial 14] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02} - ... [trial 14 | 1 sheep | 50,000 steps | ret(last 32)=-20.15 sr=9%] - ... [trial 14 | 1 sheep | 100,000 steps | ret(last 50)=-15.28 sr=8%] - ... [trial 14 | 1 sheep | 150,000 steps | ret(last 50)=-8.87 sr=26%] - ... [trial 14 | 1 sheep | 200,000 steps | ret(last 50)=-9.94 sr=8%] - ... [trial 14 | 1 sheep | 250,000 steps | ret(last 50)=-9.04 sr=8%] - ... [trial 14 | 1 sheep | 300,000 steps | ret(last 50)=-7.40 sr=14%] - ... [trial 14 | 1 sheep | 350,000 steps | ret(last 50)=+2.22 sr=50%] - ... [trial 14 | 1 sheep | 400,000 steps | ret(last 50)=+4.06 sr=58%] - ... [trial 14 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 14 | 2 sheep | 459,608 steps | ret(last 33)=-5.93 sr=3%] - ... [trial 14 | 2 sheep | 509,608 steps | ret(last 50)=-6.85 sr=4%] - ... [trial 14 | 2 sheep | 559,608 steps | ret(last 50)=-6.81 sr=6%] - ... [trial 14 | 2 sheep | 609,608 steps | ret(last 50)=-4.80 sr=4%] - ... [trial 14 | 2 sheep | 659,608 steps | ret(last 50)=-6.55 sr=4%] - ... [trial 14 | 2 sheep | 709,608 steps | ret(last 50)=-4.81 sr=12%] - ... [trial 14 | 2 sheep | 759,608 steps | ret(last 50)=-5.41 sr=10%] - ... [trial 14 | 2 sheep | 809,608 steps | ret(last 50)=-0.00 sr=30%] - ... [trial 14 | 2 sheep | 859,608 steps | ret(last 50)=+1.17 sr=26%] - ... [trial 14 | 2 sheep | 909,608 steps | ret(last 50)=+0.17 sr=20%] - ... [trial 14 | 2 sheep | 959,608 steps | ret(last 50)=-0.96 sr=18%] - ... [trial 14 | 2 sheep | 1,009,608 steps | ret(last 50)=-1.33 sr=20%] - ... [trial 14 | eval n=1] - ... [trial 14 | eval n=2] - ... [trial 14 | eval n=3] - → score=0.350 sr1=1.00 sr2=0.30 sr3=0.00 [314s] -[Trial 15] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.05} - ... [trial 15 | 1 sheep | 50,000 steps | ret(last 32)=-6.83 sr=3%] - ... [trial 15 | 1 sheep | 100,000 steps | ret(last 50)=-7.59 sr=4%] - ... [trial 15 | 1 sheep | 150,000 steps | ret(last 50)=-5.74 sr=6%] - ... [trial 15 | 1 sheep | 200,000 steps | ret(last 50)=-5.92 sr=6%] - ... [trial 15 | 1 sheep | 250,000 steps | ret(last 50)=+8.14 sr=22%] - ... [trial 15 | 1 sheep | 300,000 steps | ret(last 50)=+15.51 sr=22%] - ... [trial 15 | 1 sheep | 350,000 steps | ret(last 50)=+21.46 sr=20%] - ... [trial 15 | 1 sheep | 400,000 steps | ret(last 50)=+22.52 sr=16%] - ... [trial 15 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 15 | 2 sheep | 459,608 steps | ret(last 35)=+6.28 sr=0%] - ... [trial 15 | 2 sheep | 509,608 steps | ret(last 50)=+13.19 sr=2%] - ... [trial 15 | 2 sheep | 559,608 steps | ret(last 50)=+15.58 sr=4%] - ... [trial 15 | 2 sheep | 609,608 steps | ret(last 50)=+18.78 sr=10%] - ... [trial 15 | 2 sheep | 659,608 steps | ret(last 50)=+22.71 sr=10%] - ... [trial 15 | 2 sheep | 709,608 steps | ret(last 50)=+23.95 sr=6%] - ... [trial 15 | 2 sheep | 759,608 steps | ret(last 50)=+24.84 sr=14%] - ... [trial 15 | 2 sheep | 809,608 steps | ret(last 50)=+24.00 sr=8%] - ... [trial 15 | 2 sheep | 859,608 steps | ret(last 50)=+23.91 sr=2%] - ... [trial 15 | 2 sheep | 909,608 steps | ret(last 50)=+23.73 sr=4%] - ... [trial 15 | 2 sheep | 959,608 steps | ret(last 50)=+24.23 sr=4%] - ... [trial 15 | 2 sheep | 1,009,608 steps | ret(last 50)=+24.77 sr=4%] - ... [trial 15 | eval n=1] - ... [trial 15 | eval n=2] - ... [trial 15 | eval n=3] - → score=0.140 sr1=0.70 sr2=0.00 sr3=0.00 [323s] -[Trial 16] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.01} - ... [trial 16 | 1 sheep | 50,000 steps | ret(last 32)=-7.14 sr=9%] - ... [trial 16 | 1 sheep | 100,000 steps | ret(last 50)=-5.58 sr=12%] - ... [trial 16 | 1 sheep | 150,000 steps | ret(last 50)=+5.93 sr=26%] - ... [trial 16 | 1 sheep | 200,000 steps | ret(last 50)=+15.53 sr=68%] - ... [trial 16 | 1 sheep | 250,000 steps | ret(last 50)=+14.88 sr=56%] - ... [trial 16 | 1 sheep | 300,000 steps | ret(last 50)=+13.86 sr=36%] - ... [trial 16 | 1 sheep | 350,000 steps | ret(last 50)=+14.84 sr=54%] - ... [trial 16 | 1 sheep | 400,000 steps | ret(last 50)=+15.15 sr=70%] - ... [trial 16 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 16 | 2 sheep | 459,608 steps | ret(last 34)=-1.47 sr=6%] - ... [trial 16 | 2 sheep | 509,608 steps | ret(last 50)=-1.63 sr=2%] - ... [trial 16 | 2 sheep | 559,608 steps | ret(last 50)=-3.78 sr=2%] - ... [trial 16 | 2 sheep | 609,608 steps | ret(last 50)=-2.17 sr=4%] - ... [trial 16 | 2 sheep | 659,608 steps | ret(last 50)=+1.25 sr=6%] - ... [trial 16 | 2 sheep | 709,608 steps | ret(last 50)=+0.28 sr=4%] - ... [trial 16 | 2 sheep | 759,608 steps | ret(last 50)=+2.74 sr=4%] - ... [trial 16 | 2 sheep | 809,608 steps | ret(last 50)=+7.19 sr=6%] - ... [trial 16 | 2 sheep | 859,608 steps | ret(last 50)=+7.68 sr=4%] - ... [trial 16 | 2 sheep | 909,608 steps | ret(last 50)=+2.38 sr=0%] - ... [trial 16 | 2 sheep | 959,608 steps | ret(last 50)=+3.43 sr=0%] - ... [trial 16 | 2 sheep | 1,009,608 steps | ret(last 50)=+11.11 sr=0%] - ... [trial 16 | eval n=1] - ... [trial 16 | eval n=2] - ... [trial 16 | eval n=3] - → score=0.060 sr1=0.30 sr2=0.00 sr3=0.00 [312s] -[Trial 17] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.005, 'W_COMPLETE': 200.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': True, 'ent_coef': 0.05} - ... [trial 17 | 1 sheep | 50,000 steps | ret(last 32)=+2.15 sr=6%] - ... [trial 17 | 1 sheep | 100,000 steps | ret(last 50)=-0.51 sr=2%] - ... [trial 17 | 1 sheep | 150,000 steps | ret(last 50)=+0.84 sr=6%] - ... [trial 17 | 1 sheep | 200,000 steps | ret(last 50)=+2.96 sr=6%] - ... [trial 17 | 1 sheep | 250,000 steps | ret(last 50)=+3.04 sr=4%] - ... [trial 17 | 1 sheep | 300,000 steps | ret(last 50)=+10.58 sr=10%] - ... [trial 17 | 1 sheep | 350,000 steps | ret(last 50)=+21.95 sr=36%] - ... [trial 17 | 1 sheep | 400,000 steps | ret(last 50)=+19.20 sr=16%] - ... [trial 17 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 17 | 2 sheep | 459,608 steps | ret(last 32)=+10.27 sr=16%] - ... [trial 17 | 2 sheep | 509,608 steps | ret(last 50)=+12.25 sr=6%] - ... [trial 17 | 2 sheep | 559,608 steps | ret(last 50)=+12.94 sr=6%] - ... [trial 17 | 2 sheep | 609,608 steps | ret(last 50)=+11.82 sr=4%] - ... [trial 17 | 2 sheep | 659,608 steps | ret(last 50)=+13.45 sr=4%] - ... [trial 17 | 2 sheep | 709,608 steps | ret(last 50)=+13.03 sr=4%] - ... [trial 17 | 2 sheep | 759,608 steps | ret(last 50)=+10.69 sr=6%] - ... [trial 17 | 2 sheep | 809,608 steps | ret(last 50)=+7.79 sr=6%] - ... [trial 17 | 2 sheep | 859,608 steps | ret(last 50)=+12.16 sr=16%] - ... [trial 17 | 2 sheep | 909,608 steps | ret(last 50)=+11.75 sr=12%] - ... [trial 17 | 2 sheep | 959,608 steps | ret(last 50)=+13.65 sr=16%] - ... [trial 17 | 2 sheep | 1,009,608 steps | ret(last 50)=+12.43 sr=10%] - ... [trial 17 | eval n=1] - ... [trial 17 | eval n=2] - ... [trial 17 | eval n=3] - → score=0.240 sr1=0.70 sr2=0.20 sr3=0.00 [304s] -[Trial 18] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.005, 'W_COMPLETE': 100.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.02} - ... [trial 18 | 1 sheep | 50,000 steps | ret(last 32)=-3.63 sr=3%] - ... [trial 18 | 1 sheep | 100,000 steps | ret(last 50)=-2.28 sr=12%] - ... [trial 18 | 1 sheep | 150,000 steps | ret(last 50)=-3.15 sr=10%] - ... [trial 18 | 1 sheep | 200,000 steps | ret(last 50)=-3.31 sr=6%] - ... [trial 18 | 1 sheep | 250,000 steps | ret(last 50)=-3.23 sr=2%] - ... [trial 18 | 1 sheep | 300,000 steps | ret(last 50)=+3.55 sr=22%] - ... [trial 18 | 1 sheep | 350,000 steps | ret(last 50)=+8.15 sr=28%] - ... [trial 18 | 1 sheep | 400,000 steps | ret(last 50)=+10.56 sr=18%] - ... [trial 18 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 18 | 2 sheep | 459,608 steps | ret(last 34)=+3.80 sr=0%] - ... [trial 18 | 2 sheep | 509,608 steps | ret(last 50)=+7.30 sr=4%] - ... [trial 18 | 2 sheep | 559,608 steps | ret(last 50)=+9.61 sr=10%] - ... [trial 18 | 2 sheep | 609,608 steps | ret(last 50)=+7.70 sr=8%] - ... [trial 18 | 2 sheep | 659,608 steps | ret(last 50)=+6.01 sr=2%] - ... [trial 18 | 2 sheep | 709,608 steps | ret(last 50)=+8.28 sr=6%] - ... [trial 18 | 2 sheep | 759,608 steps | ret(last 50)=+6.74 sr=0%] - ... [trial 18 | 2 sheep | 809,608 steps | ret(last 50)=+10.61 sr=0%] - ... [trial 18 | 2 sheep | 859,608 steps | ret(last 50)=+12.20 sr=0%] - ... [trial 18 | 2 sheep | 909,608 steps | ret(last 50)=+11.25 sr=2%] - ... [trial 18 | 2 sheep | 959,608 steps | ret(last 50)=+13.58 sr=4%] - ... [trial 18 | 2 sheep | 1,009,608 steps | ret(last 50)=+16.61 sr=20%] - ... [trial 18 | eval n=1] - ... [trial 18 | eval n=2] - ... [trial 18 | eval n=3] - → score=0.160 sr1=0.30 sr2=0.20 sr3=0.00 [316s] -[Trial 19] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 50.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.005} - ... [trial 19 | 1 sheep | 50,000 steps | ret(last 32)=-36.89 sr=3%] - ... [trial 19 | 1 sheep | 100,000 steps | ret(last 50)=-30.93 sr=4%] - ... [trial 19 | 1 sheep | 150,000 steps | ret(last 50)=-28.35 sr=12%] - ... [trial 19 | 1 sheep | 200,000 steps | ret(last 50)=-30.73 sr=8%] - ... [trial 19 | 1 sheep | 250,000 steps | ret(last 50)=-29.54 sr=4%] - ... [trial 19 | 1 sheep | 300,000 steps | ret(last 50)=-20.15 sr=20%] - ... [trial 19 | 1 sheep | 350,000 steps | ret(last 50)=-0.07 sr=68%] - ... [trial 19 | 1 sheep | 400,000 steps | ret(last 50)=+1.66 sr=52%] - ... [trial 19 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 19 | 2 sheep | 459,608 steps | ret(last 36)=-12.82 sr=19%] - ... [trial 19 | 2 sheep | 509,608 steps | ret(last 50)=-20.66 sr=0%] - ... [trial 19 | 2 sheep | 559,608 steps | ret(last 50)=-16.54 sr=4%] - ... [trial 19 | 2 sheep | 609,608 steps | ret(last 50)=-17.11 sr=4%] - ... [trial 19 | 2 sheep | 659,608 steps | ret(last 50)=-19.32 sr=0%] - ... [trial 19 | 2 sheep | 709,608 steps | ret(last 50)=-16.20 sr=0%] - ... [trial 19 | 2 sheep | 759,608 steps | ret(last 50)=-13.12 sr=2%] - ... [trial 19 | 2 sheep | 809,608 steps | ret(last 50)=-17.18 sr=4%] - ... [trial 19 | 2 sheep | 859,608 steps | ret(last 50)=-18.16 sr=2%] - ... [trial 19 | 2 sheep | 909,608 steps | ret(last 50)=-18.12 sr=4%] - ... [trial 19 | 2 sheep | 959,608 steps | ret(last 50)=-17.79 sr=2%] - ... [trial 19 | 2 sheep | 1,009,608 steps | ret(last 50)=-17.58 sr=0%] - ... [trial 19 | eval n=1] - ... [trial 19 | eval n=2] - ... [trial 19 | eval n=3] - → score=0.160 sr1=0.80 sr2=0.00 sr3=0.00 [318s] -[Trial 20] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02} - ... [trial 20 | 1 sheep | 50,000 steps | ret(last 33)=-15.83 sr=9%] - ... [trial 20 | 1 sheep | 100,000 steps | ret(last 50)=-18.74 sr=10%] - ... [trial 20 | 1 sheep | 150,000 steps | ret(last 50)=-22.88 sr=6%] - ... [trial 20 | 1 sheep | 200,000 steps | ret(last 50)=-23.86 sr=4%] - ... [trial 20 | 1 sheep | 250,000 steps | ret(last 50)=-21.10 sr=6%] - ... [trial 20 | 1 sheep | 300,000 steps | ret(last 50)=-18.42 sr=6%] - ... [trial 20 | 1 sheep | 350,000 steps | ret(last 50)=+1.74 sr=14%] - ... [trial 20 | 1 sheep | 400,000 steps | ret(last 50)=+7.62 sr=34%] - ... [trial 20 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 20 | 2 sheep | 459,608 steps | ret(last 34)=-2.63 sr=3%] - ... [trial 20 | 2 sheep | 509,608 steps | ret(last 50)=+1.10 sr=2%] - ... [trial 20 | 2 sheep | 559,608 steps | ret(last 50)=+5.57 sr=4%] - ... [trial 20 | 2 sheep | 609,608 steps | ret(last 50)=+8.54 sr=8%] - ... [trial 20 | 2 sheep | 659,608 steps | ret(last 50)=+12.02 sr=8%] - ... [trial 20 | 2 sheep | 709,608 steps | ret(last 50)=+11.28 sr=4%] - ... [trial 20 | 2 sheep | 759,608 steps | ret(last 50)=+11.45 sr=2%] - ... [trial 20 | 2 sheep | 809,608 steps | ret(last 50)=+9.52 sr=0%] - ... [trial 20 | 2 sheep | 859,608 steps | ret(last 50)=+9.07 sr=2%] - ... [trial 20 | 2 sheep | 909,608 steps | ret(last 50)=+12.06 sr=8%] - ... [trial 20 | 2 sheep | 959,608 steps | ret(last 50)=+12.77 sr=8%] - ... [trial 20 | 2 sheep | 1,009,608 steps | ret(last 50)=+11.55 sr=2%] - ... [trial 20 | eval n=1] - ... [trial 20 | eval n=2] - ... [trial 20 | eval n=3] - → score=0.130 sr1=0.40 sr2=0.10 sr3=0.00 [315s] -[Trial 21] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 0.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.005} - ... [trial 21 | 1 sheep | 50,000 steps | ret(last 32)=-14.94 sr=6%] - ... [trial 21 | 1 sheep | 100,000 steps | ret(last 50)=-12.47 sr=4%] - ... [trial 21 | 1 sheep | 150,000 steps | ret(last 50)=-12.65 sr=6%] - ... [trial 21 | 1 sheep | 200,000 steps | ret(last 50)=-12.44 sr=2%] - ... [trial 21 | 1 sheep | 250,000 steps | ret(last 50)=-12.95 sr=6%] - ... [trial 21 | 1 sheep | 300,000 steps | ret(last 50)=-13.04 sr=6%] - ... [trial 21 | 1 sheep | 350,000 steps | ret(last 50)=-5.14 sr=8%] - ... [trial 21 | 1 sheep | 400,000 steps | ret(last 50)=-0.46 sr=8%] - ... [trial 21 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 21 | 2 sheep | 459,608 steps | ret(last 33)=-7.10 sr=0%] - ... [trial 21 | 2 sheep | 509,608 steps | ret(last 50)=-8.26 sr=0%] - ... [trial 21 | 2 sheep | 559,608 steps | ret(last 50)=-6.17 sr=4%] - ... [trial 21 | 2 sheep | 609,608 steps | ret(last 50)=-4.23 sr=4%] - ... [trial 21 | 2 sheep | 659,608 steps | ret(last 50)=-5.62 sr=0%] - ... [trial 21 | 2 sheep | 709,608 steps | ret(last 50)=-3.72 sr=0%] - ... [trial 21 | 2 sheep | 759,608 steps | ret(last 50)=-2.06 sr=0%] - ... [trial 21 | 2 sheep | 809,608 steps | ret(last 50)=-1.23 sr=0%] - ... [trial 21 | 2 sheep | 859,608 steps | ret(last 50)=-0.14 sr=0%] - ... [trial 21 | 2 sheep | 909,608 steps | ret(last 50)=+1.30 sr=2%] - ... [trial 21 | 2 sheep | 959,608 steps | ret(last 50)=+0.64 sr=2%] - ... [trial 21 | 2 sheep | 1,009,608 steps | ret(last 50)=+2.62 sr=6%] - ... [trial 21 | eval n=1] - ... [trial 21 | eval n=2] - ... [trial 21 | eval n=3] - → score=0.050 sr1=0.00 sr2=0.10 sr3=0.00 [310s] -[Trial 22] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.005} - ... [trial 22 | 1 sheep | 50,000 steps | ret(last 32)=-11.10 sr=6%] - ... [trial 22 | 1 sheep | 100,000 steps | ret(last 50)=-10.61 sr=8%] - ... [trial 22 | 1 sheep | 150,000 steps | ret(last 50)=-11.16 sr=4%] - ... [trial 22 | 1 sheep | 200,000 steps | ret(last 50)=-11.15 sr=4%] - ... [trial 22 | 1 sheep | 250,000 steps | ret(last 50)=-10.56 sr=6%] - ... [trial 22 | 1 sheep | 300,000 steps | ret(last 50)=-14.90 sr=0%] - ... [trial 22 | 1 sheep | 350,000 steps | ret(last 50)=-5.11 sr=14%] - ... [trial 22 | 1 sheep | 400,000 steps | ret(last 50)=+2.22 sr=24%] - ... [trial 22 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 22 | 2 sheep | 459,608 steps | ret(last 35)=-4.69 sr=6%] - ... [trial 22 | 2 sheep | 509,608 steps | ret(last 50)=-3.17 sr=0%] - ... [trial 22 | 2 sheep | 559,608 steps | ret(last 50)=+2.18 sr=2%] - ... [trial 22 | 2 sheep | 609,608 steps | ret(last 50)=+4.53 sr=8%] - ... [trial 22 | 2 sheep | 659,608 steps | ret(last 50)=+4.97 sr=10%] - ... [trial 22 | 2 sheep | 709,608 steps | ret(last 50)=+5.06 sr=8%] - ... [trial 22 | 2 sheep | 759,608 steps | ret(last 50)=+6.04 sr=4%] - ... [trial 22 | 2 sheep | 809,608 steps | ret(last 50)=+5.95 sr=4%] - ... [trial 22 | 2 sheep | 859,608 steps | ret(last 50)=+3.34 sr=2%] - ... [trial 22 | 2 sheep | 909,608 steps | ret(last 50)=+6.80 sr=8%] - ... [trial 22 | 2 sheep | 959,608 steps | ret(last 50)=+4.13 sr=8%] - ... [trial 22 | 2 sheep | 1,009,608 steps | ret(last 50)=+4.17 sr=2%] - ... [trial 22 | eval n=1] - ... [trial 22 | eval n=2] - ... [trial 22 | eval n=3] - → score=0.110 sr1=0.30 sr2=0.10 sr3=0.00 [316s] -[Trial 23] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.05} - ... [trial 23 | 1 sheep | 50,000 steps | ret(last 32)=-22.59 sr=9%] - ... [trial 23 | 1 sheep | 100,000 steps | ret(last 50)=-21.14 sr=6%] - ... [trial 23 | 1 sheep | 150,000 steps | ret(last 50)=-20.75 sr=6%] - ... [trial 23 | 1 sheep | 200,000 steps | ret(last 50)=-20.37 sr=8%] - ... [trial 23 | 1 sheep | 250,000 steps | ret(last 50)=-5.04 sr=18%] - ... [trial 23 | 1 sheep | 300,000 steps | ret(last 50)=+7.25 sr=12%] - ... [trial 23 | 1 sheep | 350,000 steps | ret(last 50)=+11.34 sr=32%] - ... [trial 23 | 1 sheep | 400,000 steps | ret(last 50)=+13.02 sr=24%] - ... [trial 23 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 23 | 2 sheep | 459,608 steps | ret(last 32)=+0.29 sr=3%] - ... [trial 23 | 2 sheep | 509,608 steps | ret(last 50)=-0.39 sr=4%] - ... [trial 23 | 2 sheep | 559,608 steps | ret(last 50)=+6.56 sr=2%] - ... [trial 23 | 2 sheep | 609,608 steps | ret(last 50)=+10.45 sr=2%] - ... [trial 23 | 2 sheep | 659,608 steps | ret(last 50)=+9.75 sr=2%] - ... [trial 23 | 2 sheep | 709,608 steps | ret(last 50)=+7.98 sr=6%] - ... [trial 23 | 2 sheep | 759,608 steps | ret(last 50)=+9.20 sr=4%] - ... [trial 23 | 2 sheep | 809,608 steps | ret(last 50)=+11.03 sr=6%] - ... [trial 23 | 2 sheep | 859,608 steps | ret(last 50)=+12.53 sr=6%] - ... [trial 23 | 2 sheep | 909,608 steps | ret(last 50)=+10.86 sr=6%] - ... [trial 23 | 2 sheep | 959,608 steps | ret(last 50)=+13.16 sr=14%] - ... [trial 23 | 2 sheep | 1,009,608 steps | ret(last 50)=+12.36 sr=12%] - ... [trial 23 | eval n=1] - ... [trial 23 | eval n=2] - ... [trial 23 | eval n=3] - → score=0.060 sr1=0.30 sr2=0.00 sr3=0.00 [472s] -[Trial 24] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.005, 'W_COMPLETE': 200.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.01} - ... [trial 24 | 1 sheep | 50,000 steps | ret(last 32)=-1.97 sr=0%] - ... [trial 24 | 1 sheep | 100,000 steps | ret(last 50)=-1.86 sr=2%] - ... [trial 24 | 1 sheep | 150,000 steps | ret(last 50)=-2.97 sr=4%] - ... [trial 24 | 1 sheep | 200,000 steps | ret(last 50)=-0.45 sr=8%] - ... [trial 24 | 1 sheep | 250,000 steps | ret(last 50)=-1.73 sr=4%] - ... [trial 24 | 1 sheep | 300,000 steps | ret(last 50)=+0.64 sr=4%] - ... [trial 24 | 1 sheep | 350,000 steps | ret(last 50)=+1.35 sr=2%] - ... [trial 24 | 1 sheep | 400,000 steps | ret(last 50)=+0.95 sr=4%] - ... [trial 24 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 24 | 2 sheep | 459,608 steps | ret(last 33)=+1.34 sr=0%] - ... [trial 24 | 2 sheep | 509,608 steps | ret(last 50)=+1.48 sr=0%] - ... [trial 24 | 2 sheep | 559,608 steps | ret(last 50)=+6.05 sr=0%] - ... [trial 24 | 2 sheep | 609,608 steps | ret(last 50)=+3.58 sr=0%] - ... [trial 24 | 2 sheep | 659,608 steps | ret(last 50)=+2.33 sr=0%] - ... [trial 24 | 2 sheep | 709,608 steps | ret(last 50)=+4.05 sr=2%] - ... [trial 24 | 2 sheep | 759,608 steps | ret(last 50)=+0.93 sr=0%] - ... [trial 24 | 2 sheep | 809,608 steps | ret(last 50)=-0.39 sr=0%] - ... [trial 24 | 2 sheep | 859,608 steps | ret(last 50)=-2.68 sr=0%] - ... [trial 24 | 2 sheep | 909,608 steps | ret(last 50)=+0.90 sr=0%] - ... [trial 24 | 2 sheep | 959,608 steps | ret(last 50)=+2.63 sr=0%] - ... [trial 24 | 2 sheep | 1,009,608 steps | ret(last 50)=+2.88 sr=0%] - ... [trial 24 | eval n=1] - ... [trial 24 | eval n=2] - ... [trial 24 | eval n=3] - → score=0.060 sr1=0.30 sr2=0.00 sr3=0.00 [335s] -[Trial 25] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 50.0, 'W_COMPACT': 0.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': True, 'ent_coef': 0.02} - ... [trial 25 | 1 sheep | 50,000 steps | ret(last 32)=-56.03 sr=3%] - ... [trial 25 | 1 sheep | 100,000 steps | ret(last 50)=-53.61 sr=4%] - ... [trial 25 | 1 sheep | 150,000 steps | ret(last 50)=-54.50 sr=4%] - ... [trial 25 | 1 sheep | 200,000 steps | ret(last 50)=-57.55 sr=4%] - ... [trial 25 | 1 sheep | 250,000 steps | ret(last 50)=-54.77 sr=8%] - ... [trial 25 | 1 sheep | 300,000 steps | ret(last 50)=-55.53 sr=4%] - ... [trial 25 | 1 sheep | 350,000 steps | ret(last 50)=-55.26 sr=4%] - ... [trial 25 | 1 sheep | 400,000 steps | ret(last 50)=-56.11 sr=4%] - ... [trial 25 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 25 | 2 sheep | 459,608 steps | ret(last 32)=-48.36 sr=0%] - ... [trial 25 | 2 sheep | 509,608 steps | ret(last 50)=-54.87 sr=0%] - ... [trial 25 | 2 sheep | 559,608 steps | ret(last 50)=-56.08 sr=0%] - ... [trial 25 | 2 sheep | 609,608 steps | ret(last 50)=-54.86 sr=0%] - ... [trial 25 | 2 sheep | 659,608 steps | ret(last 50)=-50.62 sr=0%] - ... [trial 25 | 2 sheep | 709,608 steps | ret(last 50)=-49.92 sr=0%] - ... [trial 25 | 2 sheep | 759,608 steps | ret(last 50)=-50.11 sr=0%] - ... [trial 25 | 2 sheep | 809,608 steps | ret(last 50)=-51.41 sr=0%] - ... [trial 25 | 2 sheep | 859,608 steps | ret(last 50)=-51.02 sr=0%] - ... [trial 25 | 2 sheep | 909,608 steps | ret(last 50)=-50.80 sr=0%] - ... [trial 25 | 2 sheep | 959,608 steps | ret(last 50)=-50.01 sr=0%] - ... [trial 25 | 2 sheep | 1,009,608 steps | ret(last 50)=-49.71 sr=0%] - ... [trial 25 | eval n=1] - ... [trial 25 | eval n=2] - ... [trial 25 | eval n=3] - → score=0.000 sr1=0.00 sr2=0.00 sr3=0.00 [306s] - -============================================================================================ - LEADERBOARD -============================================================================================ - rank score sr1 sr2 sr3 config - ---------------------------------------------------------------------------------------- - 1 0.350 1.00 0.30 0.00 W_PER_SHEEP=1.0 W_ALIGN=0.0 W_PEN_BONUS=5.0 W_STEP_COST=0.02 W_COMPLETE=200.0 W_COMPACT=1.5 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.02 - 2 0.270 0.70 0.20 0.10 W_PER_SHEEP=6.0 W_ALIGN=0.025 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=50.0 W_COMPACT=3.0 ALIGN_SHAPE=near ALIGN_GATED=False ent_coef=0.01 - 3 0.240 0.70 0.20 0.00 W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=5.0 W_STEP_COST=0.05 W_COMPLETE=50.0 W_COMPACT=3.0 ALIGN_SHAPE=near ALIGN_GATED=False ent_coef=0.02 - 4 0.240 0.70 0.20 0.00 W_PER_SHEEP=6.0 W_ALIGN=0.1 W_PEN_BONUS=5.0 W_STEP_COST=0.005 W_COMPLETE=200.0 W_COMPACT=0.0 ALIGN_SHAPE=near ALIGN_GATED=True ent_coef=0.05 - 5 0.200 1.00 0.00 0.00 W_PER_SHEEP=6.0 W_ALIGN=0.1 W_PEN_BONUS=5.0 W_STEP_COST=0.02 W_COMPLETE=50.0 W_COMPACT=3.0 ALIGN_SHAPE=near ALIGN_GATED=True ent_coef=0.005 - 6 0.190 0.70 0.10 0.00 W_PER_SHEEP=2.0 W_ALIGN=0.0 W_PEN_BONUS=20.0 W_STEP_COST=0.02 W_COMPLETE=50.0 W_COMPACT=0.0 ALIGN_SHAPE=near ALIGN_GATED=False ent_coef=0.05 - 7 0.160 0.80 0.00 0.00 W_PER_SHEEP=6.0 W_ALIGN=0.025 W_PEN_BONUS=20.0 W_STEP_COST=0.05 W_COMPLETE=200.0 W_COMPACT=3.0 ALIGN_SHAPE=standoff ALIGN_GATED=True ent_coef=0.01 - 8 0.160 0.80 0.00 0.00 W_PER_SHEEP=2.0 W_ALIGN=0.1 W_PEN_BONUS=20.0 W_STEP_COST=0.02 W_COMPLETE=200.0 W_COMPACT=0.5 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.02 - 9 0.160 0.80 0.00 0.00 W_PER_SHEEP=2.0 W_ALIGN=0.025 W_PEN_BONUS=10.0 W_STEP_COST=0.05 W_COMPLETE=50.0 W_COMPACT=0.0 ALIGN_SHAPE=standoff ALIGN_GATED=True ent_coef=0.005 - 10 0.160 0.30 0.20 0.00 W_PER_SHEEP=2.0 W_ALIGN=0.025 W_PEN_BONUS=10.0 W_STEP_COST=0.005 W_COMPLETE=100.0 W_COMPACT=1.5 ALIGN_SHAPE=near ALIGN_GATED=False ent_coef=0.02 - 11 0.150 0.50 0.10 0.00 W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=100.0 W_COMPACT=3.0 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.005 - 12 0.140 0.70 0.00 0.00 W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=100.0 W_COMPACT=1.5 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.05 - 13 0.130 0.40 0.10 0.00 W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=20.0 W_STEP_COST=0.05 W_COMPLETE=100.0 W_COMPACT=1.5 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.02 - 14 0.110 0.30 0.10 0.00 W_PER_SHEEP=6.0 W_ALIGN=0.025 W_PEN_BONUS=5.0 W_STEP_COST=0.05 W_COMPLETE=100.0 W_COMPACT=0.0 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.005 - 15 0.110 0.30 0.10 0.00 W_PER_SHEEP=2.0 W_ALIGN=0.05 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=100.0 W_COMPACT=3.0 ALIGN_SHAPE=standoff ALIGN_GATED=True ent_coef=0.005 - - Best config saved to runs/sweep_20260425_124630/best.json - Total trials: 25 (25 successful, 0 failed) - Total time: 2.28h - diff --git a/training/runs/sweep_smoke.log b/training/runs/sweep_smoke.log deleted file mode 100644 index ae47b2a..0000000 --- a/training/runs/sweep_smoke.log +++ /dev/null @@ -1,43 +0,0 @@ -Sweep dir: runs/sweep_20260425_124021 -Search space: ['W_PER_SHEEP', 'W_ALIGN', 'W_PEN_BONUS', 'W_STEP_COST', 'W_COMPLETE', 'W_COMPACT', 'ALIGN_SHAPE', 'ALIGN_GATED', 'ent_coef'] -Per-trial: 1,000,000 steps train + 30 eval eps -Time budget: 0.5h - -[Trial 1] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.005} - ... [trial 1 | 1 sheep | 50,000 steps | ret(last 32)=-8.33 sr=6%] - ... [trial 1 | 1 sheep | 100,000 steps | ret(last 50)=-2.95 sr=6%] - ... [trial 1 | 1 sheep | 150,000 steps | ret(last 50)=+12.68 sr=10%] - ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=+22.15 sr=22%] - ... [trial 1 | 1 sheep | 250,000 steps | ret(last 50)=+22.47 sr=18%] - ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=+23.58 sr=24%] - ... [trial 1 | 1 sheep | 350,000 steps | ret(last 50)=+23.42 sr=18%] - ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=+24.39 sr=32%] - ... [trial 1 | 2 sheep | 409,608 steps | ret(last 0)=+nan sr=nan%] - ... [trial 1 | 2 sheep | 459,608 steps | ret(last 35)=+15.39 sr=3%] - ... [trial 1 | 2 sheep | 509,608 steps | ret(last 50)=+20.25 sr=0%] - ... [trial 1 | 2 sheep | 559,608 steps | ret(last 50)=+23.24 sr=4%] - ... [trial 1 | 2 sheep | 609,608 steps | ret(last 50)=+23.36 sr=4%] - ... [trial 1 | 2 sheep | 659,608 steps | ret(last 50)=+25.32 sr=2%] - ... [trial 1 | 2 sheep | 709,608 steps | ret(last 50)=+24.02 sr=4%] - ... [trial 1 | 2 sheep | 759,608 steps | ret(last 50)=+24.66 sr=4%] - ... [trial 1 | 2 sheep | 809,608 steps | ret(last 50)=+25.41 sr=4%] - ... [trial 1 | 2 sheep | 859,608 steps | ret(last 50)=+24.27 sr=4%] - ... [trial 1 | 2 sheep | 909,608 steps | ret(last 50)=+25.13 sr=8%] - ... [trial 1 | 2 sheep | 959,608 steps | ret(last 50)=+25.10 sr=2%] - ... [trial 1 | 2 sheep | 1,009,608 steps | ret(last 50)=+26.02 sr=2%] - ... [trial 1 | eval n=1] - ... [trial 1 | eval n=2] - ... [trial 1 | eval n=3] - → score=0.060 sr1=0.30 sr2=0.00 sr3=0.00 [308s] - -============================================================================================ - LEADERBOARD -============================================================================================ - rank score sr1 sr2 sr3 config - ---------------------------------------------------------------------------------------- - 1 0.060 0.30 0.00 0.00 W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=100.0 W_COMPACT=3.0 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.005 - - Best config saved to runs/sweep_20260425_124021/best.json - Total trials: 1 (1 successful, 0 failed) - Total time: 0.09h - diff --git a/training/smoke_test.py b/training/smoke_test.py deleted file mode 100644 index 7892aca..0000000 --- a/training/smoke_test.py +++ /dev/null @@ -1,369 +0,0 @@ -""" -Quick sanity check before committing to a full 15M-step training run. - -Trains 1 sheep for 500k steps (~5 min), then 3 sheep for 500k steps. -If both pass, the obs/reward setup is sound and full training is worth running. -If either fails, abort and fix before wasting 15M steps. - -Usage: - python smoke_test.py # fresh run - python smoke_test.py --render # watch episodes after each stage -""" - -import argparse -import os -import sys -import numpy as np -from copy import deepcopy - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import matplotlib.patches as mpatches -from matplotlib.collections import LineCollection - -from stable_baselines3 import PPO -from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize - -from herding_env import HerdingEnv - - -COMPACT_RADIUS = 5.0 -PASS_THRESHOLD = 0.60 # success rate required to pass each stage - - -def make_env(n_sheep, seed, max_steps=2000): - def _init(): - env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps) - env.reset(seed=seed) - return env - return _init - - -def classify_failure(ep_radius, ep_com_dist, n_penned, n_sheep, success): - if success: - return "SUCCESS" - if min(ep_radius) > COMPACT_RADIUS: - return "NEVER_COMPACT" - first_compact = next(i for i, r in enumerate(ep_radius) if r <= COMPACT_RADIUS) - if min(ep_com_dist[first_compact:]) > 3.0: - return "COMPACT_CANT_DRIVE" - if n_penned == 0: - return "DROVE_NO_SHEEP" - return f"PARTIAL_{n_penned}of{n_sheep}" - - -def run_episodes(model, eval_env, n_episodes=30, max_steps=2000, render=False): - """ - Run N deterministic episodes. - Returns (success_rate, failure_counts, diagnostics_dict). - diagnostics_dict contains per-episode and aggregate stats useful for - understanding WHY the policy is failing without assuming the cause. - """ - failure_counts = {} - successes = 0 - - all_action_mags = [] # action magnitude every step across all episodes - all_pen_progress = [] # per-episode: total pen-dist reduction (positive = good) - ep_steps_list = [] - ep_min_pen_list = [] # min pen dist reached in each episode - - for ep in range(n_episodes): - obs = eval_env.reset() - done = False - ep_radius, ep_com_dist = [], [] - ep_action_mags = [] - n_penned = 0 - n_sheep = 1 - prev_pen_dist = None - - while not done: - action, _ = model.predict(obs, deterministic=True) - obs, _, dones, infos = eval_env.step(action) - done = dones[0] - - inner = eval_env.envs[0] - com, radius, _ = inner._flock_stats() - com_dist = float(np.linalg.norm(com - inner.PEN_CENTER)) - ep_radius.append(radius) - ep_com_dist.append(com_dist) - - act_mag = float(np.linalg.norm(action[0])) - ep_action_mags.append(act_mag) - - active = ~inner.penned[:inner.n_sheep] - if active.any(): - pen_dist = float(np.linalg.norm( - inner.sheep_pos[:inner.n_sheep][active] - inner.PEN_CENTER, axis=1 - ).sum()) - else: - pen_dist = 0.0 - if prev_pen_dist is None: - prev_pen_dist = pen_dist - prev_pen_dist = pen_dist - - if render and ep == 0: - inner.render() - - info = infos[0] - n_penned = info.get("n_penned", 0) - n_sheep = info.get("n_sheep", 1) - success = n_penned == n_sheep - successes += int(success) - mode = classify_failure(ep_radius, ep_com_dist, n_penned, n_sheep, success) - failure_counts[mode] = failure_counts.get(mode, 0) + 1 - - all_action_mags.extend(ep_action_mags) - ep_steps_list.append(len(ep_action_mags)) - ep_min_pen_list.append(min(ep_com_dist)) - - # Per-episode one-liner for real-time feedback - mean_act = float(np.mean(ep_action_mags)) - min_pen = min(ep_com_dist) - print(f" ep {ep+1:>3} steps={len(ep_action_mags):>5} " - f"penned={n_penned}/{n_sheep} " - f"act={mean_act:.2f} " - f"min_pen={min_pen:.1f}m [{mode}]") - - success_rate = successes / n_episodes - - diag = { - "mean_action_mag" : float(np.mean(all_action_mags)), - "p10_action_mag" : float(np.percentile(all_action_mags, 10)), - "p90_action_mag" : float(np.percentile(all_action_mags, 90)), - "mean_min_pen_dist": float(np.mean(ep_min_pen_list)), - "best_min_pen_dist": float(np.min(ep_min_pen_list)), - "mean_ep_steps" : float(np.mean(ep_steps_list)), - } - - print(f"\n Action magnitude mean={diag['mean_action_mag']:.3f} " - f"p10={diag['p10_action_mag']:.3f} p90={diag['p90_action_mag']:.3f}" - f" (0=stopped, 1=full speed)") - print(f" Pen distance mean_min={diag['mean_min_pen_dist']:.1f}m " - f"best_min={diag['best_min_pen_dist']:.1f}m " - f"(how close sheep got to pen center)") - - return success_rate, failure_counts, diag - - -def train_stage(n_sheep, steps, n_envs=4, prev_model=None, prev_vecnorm=None): - """Train one stage; return (model, vecnorm).""" - train_env = SubprocVecEnv([make_env(n_sheep, i) for i in range(n_envs)]) - - if prev_vecnorm is not None: - vn = deepcopy(prev_vecnorm) - vn.set_venv(train_env) - vn.training = True - vn.norm_reward = True - else: - vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0) - - if prev_model is not None: - model = prev_model - model.set_env(vn) - else: - model = PPO( - "MlpPolicy", vn, - learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10, - gamma=0.995, gae_lambda=0.95, clip_range=0.2, ent_coef=0.02, - vf_coef=0.5, max_grad_norm=0.5, - policy_kwargs=dict(net_arch=[256, 256]), - verbose=1, - ) - - model.learn(total_timesteps=steps, reset_num_timesteps=(prev_model is None), - tb_log_name="ppo_smoke") - return model, vn - - -def make_eval_env(model, vecnorm, n_sheep, max_steps=2000): - raw = DummyVecEnv([make_env(n_sheep, seed=9999, max_steps=max_steps)]) - vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False) - vn.obs_rms = deepcopy(vecnorm.obs_rms) - vn.ret_rms = deepcopy(vecnorm.ret_rms) - return vn - - -def report(n_sheep, success_rate, failure_counts, n_episodes, threshold=PASS_THRESHOLD): - print(f"\n{'='*52}") - print(f" Stage n_sheep={n_sheep} | success={success_rate*100:.0f}% ({int(success_rate*n_episodes)}/{n_episodes})") - print(f" {'─'*48}") - for mode, cnt in sorted(failure_counts.items(), key=lambda x: -x[1]): - bar = "█" * cnt - print(f" {mode:<26} {cnt:>3}/{n_episodes} {bar}") - print(f"{'='*52}") - - passed = success_rate >= threshold - if passed: - print(f" ✓ PASS (threshold {threshold*100:.0f}%)") - else: - dominant = max(failure_counts, key=failure_counts.get) - print(f" ✗ FAIL — dominant: {dominant}") - if dominant == "NEVER_COMPACT": - print(" Dog can't compact flock. Check W_COLLECT, obs contains straggler positions?") - elif dominant == "COMPACT_CANT_DRIVE": - print(" Flock compacts but dog doesn't drive to pen. Check alignment reward / W_DRIVE.") - elif dominant.startswith("PARTIAL"): - print(" Flock splits near pen. Dog loses stragglers at the end.") - print() - return passed - - -SHEEP_COLORS = ["#e41a1c","#377eb8","#4daf4a","#984ea3","#ff7f00", - "#a65628","#f781bf","#999999","#66c2a5","#fc8d62"] - -def _save_smoke_vis(model, vn, n_sheep, save_dir, seed=42, max_steps=2000): - """Run one episode and save trajectory + timeseries PNGs.""" - from copy import deepcopy - raw = DummyVecEnv([make_env(n_sheep, seed=seed, max_steps=max_steps)]) - env = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False) - env.obs_rms = deepcopy(vn.obs_rms) - env.ret_rms = deepcopy(vn.ret_rms) - - obs = env.reset() - inner = env.envs[0] - dog_xs, dog_ys = [], [] - sheep_xs = [[] for _ in range(n_sheep)] - sheep_ys = [[] for _ in range(n_sheep)] - radii, action_mags, rewards = [], [], [] - pen_dists = [[] for _ in range(n_sheep)] - done = False - - while not done: - action, _ = model.predict(obs, deterministic=True) - obs, reward, dones, _ = env.step(action) - done = dones[0] - dog_xs.append(float(inner.dog_pos[0])); dog_ys.append(float(inner.dog_pos[1])) - com, radius, _ = inner._flock_stats() - radii.append(radius) - rewards.append(float(reward[0])) - action_mags.append(float(np.linalg.norm(action[0]))) - for i in range(n_sheep): - sheep_xs[i].append(float(inner.sheep_pos[i][0])) - sheep_ys[i].append(float(inner.sheep_pos[i][1])) - pen_dists[i].append(float(np.linalg.norm(inner.sheep_pos[i] - inner.PEN_CENTER))) - env.close() - - steps = len(dog_xs) - # Trajectory - fig, ax = plt.subplots(figsize=(6,6)) - ax.set_xlim(-16,16); ax.set_ylim(-16,16); ax.set_aspect("equal") - ax.set_facecolor("#dcedc8") - ax.add_patch(mpatches.Rectangle((-15,-15),30,30,fill=False,edgecolor="#795548",lw=2)) - ax.add_patch(mpatches.Rectangle((10,-15),3,7,facecolor="#ffe082",edgecolor="#795548",lw=2)) - ax.text(11.5,-11.5,"pen",ha="center",va="center",fontsize=8,color="#795548") - for i in range(n_sheep): - c = SHEEP_COLORS[i % len(SHEEP_COLORS)] - ax.plot(sheep_xs[i], sheep_ys[i], color=c, lw=1, alpha=0.6, label=f"sheep {i+1}") - ax.plot(sheep_xs[i][0], sheep_ys[i][0], "o", color=c, ms=7) - ax.plot(sheep_xs[i][-1], sheep_ys[i][-1], "*", color=c, ms=10) - ax.plot(dog_xs, dog_ys, color="#4e342e", lw=1.5, label="dog", alpha=0.8) - ax.plot(dog_xs[0], dog_ys[0], "s", color="#4e342e", ms=9) - ax.plot(dog_xs[-1], dog_ys[-1], "D", color="#4e342e", ms=9) - ax.set_title(f"n_sheep={n_sheep} {steps} steps min_r={min(radii):.1f}m") - ax.legend(fontsize=7, loc="upper left") - plt.tight_layout() - fig.savefig(os.path.join(save_dir, "trajectory.png"), dpi=100) - plt.close(fig) - - # Timeseries - t = np.arange(steps) - fig, axes = plt.subplots(4,1,figsize=(10,8),sharex=True) - axes[0].plot(t, radii, color="steelblue"); axes[0].axhline(5,color="orange",ls="--",lw=1) - axes[0].set_ylabel("radius (m)"); axes[0].set_title("Flock radius (orange=5m threshold)") - for i in range(n_sheep): - axes[1].plot(t, pen_dists[i], color=SHEEP_COLORS[i%len(SHEEP_COLORS)], lw=1, label=f"sheep {i+1}") - axes[1].set_ylabel("pen dist (m)"); axes[1].set_title("Per-sheep distance to pen"); axes[1].legend(fontsize=7) - axes[2].plot(t, action_mags, color="tomato", lw=1, alpha=0.8) - axes[2].axhline(1.0,color="gray",ls="--",lw=1); axes[2].set_ylim(0,1.5) - axes[2].set_ylabel("action mag"); axes[2].set_title("Dog action magnitude (0=stopped)") - axes[3].plot(t, rewards, color="purple", lw=1, alpha=0.7); axes[3].axhline(0,color="black",lw=0.5) - axes[3].set_ylabel("reward"); axes[3].set_xlabel("step"); axes[3].set_title("Reward per step") - fig.suptitle(f"Smoke stage n_sheep={n_sheep}", fontsize=12) - plt.tight_layout() - fig.savefig(os.path.join(save_dir, "timeseries.png"), dpi=100) - plt.close(fig) - print(f" Viz saved to {save_dir}/trajectory.png + timeseries.png") - - -def main(): - p = argparse.ArgumentParser() - p.add_argument("--steps", type=int, default=500_000, - help="Steps per smoke-test stage (default 500k)") - p.add_argument("--n-envs", type=int, default=4) - p.add_argument("--episodes", type=int, default=30, - help="Validation episodes per stage") - p.add_argument("--render", action="store_true") - args = p.parse_args() - - # 1 sheep (500k): hard check — obs/reward structurally correct? - # Thresholds are MINIMUM bars — smoke test always runs ALL stages even on failure. - # The per-episode diagnostics tell you WHY a stage failed. - stages = [(1, args.steps, 0.10), (2, args.steps * 2, 0.20), (3, args.steps * 3, 0.10)] - - model, vn = None, None - stage_results = [] - - for n_sheep, steps, threshold in stages: - print(f"\n{'#'*52}") - print(f"# Smoke-test stage: n_sheep={n_sheep}, {steps:,} steps") - print(f"{'#'*52}") - - model, vn = train_stage(n_sheep, steps, args.n_envs, model, vn) - - eval_env = make_eval_env(model, vn, n_sheep) - success_rate, failure_counts, diag = run_episodes( - model, eval_env, args.episodes, render=args.render - ) - eval_env.close() - - save_dir = f"runs/smoke_stage{n_sheep}" - os.makedirs(save_dir, exist_ok=True) - model.save(os.path.join(save_dir, "model")) - vn.save(os.path.join(save_dir, "vecnorm.pkl")) - print(f" Model saved to {save_dir}/") - _save_smoke_vis(model, vn, n_sheep, save_dir) - - passed = report(n_sheep, success_rate, failure_counts, args.episodes, threshold) - stage_results.append((n_sheep, success_rate, passed, diag)) - - if not passed: - print(f" ⚠ Stage {n_sheep} BELOW threshold — continuing to next stage.") - print(f" mean_action={diag['mean_action_mag']:.3f} " - f"best_pen_approach={diag['best_min_pen_dist']:.1f}m") - if diag['mean_action_mag'] < 0.05: - print(" !! Dog is NOT moving (sit-still). " - "Check ent_coef / step_cost / alignment.") - elif diag['best_min_pen_dist'] > 5.0: - print(" !! Dog never gets sheep near pen. " - "Check reward direction / initialization.") - else: - print(" !! Dog moves and approaches pen but low success rate. " - "Likely needs more training time.") - - print(f"\n{'='*52}") - print(" SMOKE TEST SUMMARY") - print(f"{'='*52}") - all_passed = True - for n_sheep, sr, passed, diag in stage_results: - status = "PASS" if passed else "FAIL" - print(f" n_sheep={n_sheep} success={sr*100:.0f}% " - f"act={diag['mean_action_mag']:.2f} " - f"best_pen={diag['best_min_pen_dist']:.1f}m [{status}]") - if not passed: - all_passed = False - - if all_passed: - print("\n All stages passed. Ready for full curriculum training:") - print(" python train.py --curriculum --steps-per-stage 1500000 " - "--total-steps 15000000 --n-sheep 1 --max-sheep 10 " - "--n-envs 8 --run-dir runs/ppo_v3") - else: - print("\n Some stages below threshold — check diagnostics above.") - print(" Key signals: act<0.05=sit-still, best_pen>5=wrong direction, " - "else needs more training time.") - print() - - -if __name__ == "__main__": - main() diff --git a/training/sweep_reward.py b/training/sweep_reward.py deleted file mode 100644 index 03f318a..0000000 --- a/training/sweep_reward.py +++ /dev/null @@ -1,314 +0,0 @@ -""" -Random-search sweep over reward-function hyperparameters. - -Each trial trains a fresh PPO policy through a 1→2-sheep curriculum on a tight -budget, then evaluates at n=1,2,3 sheep. A composite score is computed and -written to a JSONL log. After all trials, a leaderboard is printed and the -best config is saved. - -Sized to fit in ~4 hours wall-clock with default settings on 8 envs. - -Usage ------ - python sweep_reward.py # 25 trials, default budget - python sweep_reward.py --n-trials 15 - python sweep_reward.py --time-budget 6 # stop adding trials past 6h - python sweep_reward.py --resume runs/sweep_ # continue logging - -Per-trial budget (see TRAIN_*_STEPS below): ~1.0M training steps + 30 eval -episodes × 3 sheep counts. On this env that runs in ~8–12 min per trial. -""" -import argparse -import json -import os -import time -import traceback -from copy import deepcopy - -import numpy as np -from stable_baselines3 import PPO -from stable_baselines3.common.callbacks import BaseCallback -from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize - -from herding_env import HerdingEnv - - -class ProgressCallback(BaseCallback): - """Print a one-line trial-progress summary every `freq` env steps. - Tracks per-env returns and success directly from rollout rewards/infos - (no Monitor wrapper needed). The success window is COUNT-BASED, not - time-based, so successful episodes (which finish faster) don't oversample - the window vs truncated episodes (which take max_steps).""" - def __init__(self, trial_id: int, stage_label: str, freq: int = 50_000): - super().__init__() - self.trial_id = trial_id - self.stage_label = stage_label - self.freq = freq - self._last = 0 - self._ep_returns = [] - self._ep_success = [] - self._completed_count = 0 # total completed episodes since callback start - self._success_count = 0 # total successful episodes since callback start - self._cur_ret = None # per-env running return - - def _on_step(self) -> bool: - rewards = self.locals.get("rewards") - dones = self.locals.get("dones") - infos = self.locals.get("infos", []) - if rewards is None or dones is None: - return True - if self._cur_ret is None or len(self._cur_ret) != len(rewards): - self._cur_ret = np.zeros(len(rewards), dtype=np.float64) - self._cur_ret += np.asarray(rewards, dtype=np.float64) - for i, d in enumerate(dones): - if not d: continue - self._ep_returns.append(float(self._cur_ret[i])) - info = infos[i] if i < len(infos) else {} - success = int(info.get("n_penned", 0) == info.get("n_sheep", -1)) - self._ep_success.append(success) - self._completed_count += 1 - self._success_count += success - self._cur_ret[i] = 0.0 - if len(self._ep_returns) > 50: - self._ep_returns.pop(0); self._ep_success.pop(0) - if self.num_timesteps - self._last >= self.freq: - self._last = self.num_timesteps - n_eps = len(self._ep_returns) - mean_r = float(np.mean(self._ep_returns)) if n_eps else float("nan") - # Window sr (biased: short eps over-represented), and cumulative sr - # (unbiased over the whole stage). - win_sr = float(np.mean(self._ep_success)) if n_eps else float("nan") - cum_sr = (self._success_count / self._completed_count - if self._completed_count else float("nan")) - print(f" ... [trial {self.trial_id+1} | {self.stage_label} | " - f"{self.num_timesteps:>7,} steps | " - f"ret(last {n_eps})={mean_r:+.2f} " - f"win_sr={win_sr*100:.0f}% cum_sr={cum_sr*100:.0f}%]", - flush=True) - return True - -# --------------------------------------------------------------------------- -# Search space — reward weights + a couple of hyperparams -# --------------------------------------------------------------------------- -SEARCH_SPACE = { - "W_PER_SHEEP": [1.0, 2.0, 4.0, 6.0], - "W_ALIGN": [0.0, 0.025, 0.05, 0.1], - "W_PEN_BONUS": [5.0, 10.0, 20.0], - "W_STEP_COST": [0.005, 0.02, 0.05], - "W_COMPLETE": [50.0, 100.0, 200.0], - "W_COMPACT": [0.0, 0.5, 1.5, 3.0], - "ALIGN_SHAPE": ["standoff", "near"], - "ALIGN_GATED": [True, False], - "ent_coef": [0.005, 0.01, 0.02, 0.05], -} - -# Per-trial training budget — keep tight; total = sum + eval -TRAIN_STAGE1_STEPS = 400_000 # 1 sheep -TRAIN_STAGE2_STEPS = 600_000 # 2 sheep -EVAL_EPISODES = 10 -EVAL_NSHEEP = (1, 2, 3) -MAX_STEPS = 1500 -N_ENVS = 8 - - -def sample_config(rng: np.random.Generator) -> dict: - cfg = {} - for k, v in SEARCH_SPACE.items(): - choice = v[int(rng.integers(0, len(v)))] - cfg[k] = bool(choice) if isinstance(choice, np.bool_) else choice - return cfg - - -def reward_cfg(cfg: dict) -> dict: - """Strip non-env keys (anything that isn't a HerdingEnv attribute).""" - return {k: v for k, v in cfg.items() if k != "ent_coef"} - - -def make_env(n_sheep, seed, max_steps, rcfg, random_n_sheep=False): - def _init(): - env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, - reward_cfg=rcfg, random_n_sheep=random_n_sheep) - env.reset(seed=seed) - return env - return _init - - -def evaluate(model, vn_template, n_sheep, n_episodes, max_steps, rcfg): - raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, rcfg)]) - vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False) - vn.obs_rms = deepcopy(vn_template.obs_rms) - vn.ret_rms = deepcopy(vn_template.ret_rms) - successes = 0 - ep_lens, min_pen_list, action_mags = [], [], [] - for _ in range(n_episodes): - obs = vn.reset() - done = False - steps, min_pen, mags = 0, float("inf"), [] - while not done: - action, _ = model.predict(obs, deterministic=True) - obs, _, dones, infos = vn.step(action) - done = dones[0] - inner = vn.envs[0] - com, _, _ = inner._flock_stats() - min_pen = min(min_pen, float(np.linalg.norm(com - inner.PEN_CENTER))) - mags.append(float(np.linalg.norm(action[0]))) - steps += 1 - successes += int(infos[0].get("n_penned") == n_sheep) - ep_lens.append(steps) - min_pen_list.append(min_pen) - action_mags.extend(mags) - vn.close() - return { - "sr": successes / n_episodes, - "mean_len": float(np.mean(ep_lens)), - "mean_min_pen": float(np.mean(min_pen_list)), - "mean_act": float(np.mean(action_mags)), - } - - -def run_trial(trial_id: int, cfg: dict, log_path: str, run_dir: str) -> dict: - rcfg = reward_cfg(cfg) - trial_dir = os.path.join(run_dir, f"trial_{trial_id:03d}") - os.makedirs(trial_dir, exist_ok=True) - with open(os.path.join(trial_dir, "config.json"), "w") as f: - json.dump(cfg, f, indent=2) - - train_env = SubprocVecEnv([ - make_env(1, seed=trial_id * 100 + i, max_steps=MAX_STEPS, rcfg=rcfg) - for i in range(N_ENVS) - ]) - vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0) - - model = PPO( - "MlpPolicy", vn, - learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10, - gamma=0.995, gae_lambda=0.95, clip_range=0.2, - ent_coef=cfg["ent_coef"], vf_coef=0.5, max_grad_norm=0.5, - policy_kwargs=dict(net_arch=[256, 256]), - verbose=0, - ) - - try: - model.learn(total_timesteps=TRAIN_STAGE1_STEPS, - reset_num_timesteps=True, - callback=ProgressCallback(trial_id, "1 sheep")) - vn.env_method("set_n_sheep", 2) - model.learn(total_timesteps=TRAIN_STAGE2_STEPS, - reset_num_timesteps=False, - callback=ProgressCallback(trial_id, "2 sheep")) - - per_sheep = {} - for n in EVAL_NSHEEP: - print(f" ... [trial {trial_id+1} | eval n={n}]", flush=True) - per_sheep[n] = evaluate(model, vn, n, EVAL_EPISODES, MAX_STEPS, rcfg) - - model.save(os.path.join(trial_dir, "model")) - vn.save(os.path.join(trial_dir, "vecnorm.pkl")) - finally: - try: vn.close() - except Exception: pass - - sr = {n: per_sheep[n]["sr"] for n in EVAL_NSHEEP} - score = 0.2 * sr[1] + 0.5 * sr[2] + 0.3 * sr[3] - return { - "trial": trial_id, - "config": cfg, - "score": score, - "sr": sr, - "details": per_sheep, - } - - -def main(): - p = argparse.ArgumentParser() - p.add_argument("--n-trials", type=int, default=25) - p.add_argument("--time-budget", type=float, default=7.5, - help="Stop launching new trials past this many hours.") - p.add_argument("--seed", type=int, default=42) - p.add_argument("--run-dir", type=str, default=None, - help="If unset, creates runs/sweep_/") - p.add_argument("--resume", type=str, default=None, - help="Continue logging into an existing sweep dir") - args = p.parse_args() - - run_dir = args.resume or args.run_dir or os.path.join( - "runs", "sweep_" + time.strftime("%Y%m%d_%H%M%S") - ) - os.makedirs(run_dir, exist_ok=True) - log_path = os.path.join(run_dir, "results.jsonl") - - rng = np.random.default_rng(args.seed) - start = time.time() - budget_s = args.time_budget * 3600 - results = [] - - # If resuming, replay the existing log into memory - if args.resume and os.path.exists(log_path): - with open(log_path) as f: - for line in f: - try: results.append(json.loads(line)) - except Exception: pass - print(f"Resumed sweep: {len(results)} prior trials loaded from {log_path}") - - print(f"Sweep dir: {run_dir}") - print(f"Search space: {list(SEARCH_SPACE.keys())}") - print(f"Per-trial: {TRAIN_STAGE1_STEPS+TRAIN_STAGE2_STEPS:,} steps train + " - f"{EVAL_EPISODES * len(EVAL_NSHEEP)} eval eps") - print(f"Time budget: {args.time_budget}h\n") - - n_done = sum(1 for r in results if "error" not in r) - trial_id = len(results) - while n_done < args.n_trials: - elapsed_h = (time.time() - start) / 3600 - if elapsed_h >= args.time_budget: - print(f"\n[Sweep] time budget reached ({elapsed_h:.2f}h) — stopping.") - break - - cfg = sample_config(rng) - t0 = time.time() - print(f"[Trial {trial_id+1:>3}] {cfg}") - try: - result = run_trial(trial_id, cfg, log_path, run_dir) - result["elapsed_s"] = time.time() - t0 - sr = result["sr"] - print(f" → score={result['score']:.3f} " - f"sr1={sr[1]:.2f} sr2={sr[2]:.2f} sr3={sr[3]:.2f} " - f"[{result['elapsed_s']:.0f}s]") - results.append(result) - n_done += 1 - except Exception as e: - traceback.print_exc() - err = {"trial": trial_id, "config": cfg, - "error": f"{type(e).__name__}: {e}", - "elapsed_s": time.time() - t0} - results.append(err) - print(f" ! FAILED: {err['error']}") - with open(log_path, "a") as f: - f.write(json.dumps(results[-1]) + "\n") - trial_id += 1 - - # Leaderboard - succ = [r for r in results if "error" not in r] - succ.sort(key=lambda r: -r["score"]) - print("\n" + "=" * 92) - print(" LEADERBOARD") - print("=" * 92) - hdr = f" {'rank':>4} {'score':>6} {'sr1':>5} {'sr2':>5} {'sr3':>5} config" - print(hdr); print(" " + "-" * 88) - for i, r in enumerate(succ[:15], 1): - sr = r["sr"] - cfg_short = " ".join(f"{k}={v}" for k, v in r["config"].items()) - print(f" {i:>4d} {r['score']:>6.3f} {sr[1]:>5.2f} {sr[2]:>5.2f} {sr[3]:>5.2f} {cfg_short}") - - if succ: - best = succ[0] - with open(os.path.join(run_dir, "best.json"), "w") as f: - json.dump(best, f, indent=2) - print(f"\n Best config saved to {run_dir}/best.json") - print(f" Total trials: {len(results)} ({len(succ)} successful, " - f"{len(results)-len(succ)} failed)") - print(f" Total time: {(time.time()-start)/3600:.2f}h\n") - - -if __name__ == "__main__": - main() diff --git a/training/train.py b/training/train.py index b961cd6..7e549c4 100644 --- a/training/train.py +++ b/training/train.py @@ -1,414 +1,529 @@ """ -PPO training script for the herding task. +PPO training for the herding task with curriculum learning. -Usage examples --------------- -# Proper 5-sheep curriculum, 1 M steps per stage: - python train.py --curriculum --steps-per-stage 1000000 --total-steps 5000000 +Trains from scratch through a 1→max_sheep curriculum, evaluates after each +stage, and auto-generates trajectory/timeseries plots plus a summary chart. -# Success-rate curriculum (advances when 70 % success over 100 episodes): - python train.py --curriculum --threshold 0.70 +Usage +----- + python train.py # defaults from config.json + python train.py --config my_config.json --max-sheep 5 + python train.py --max-sheep 3 --steps-per-stage 1000000 -# Resume from checkpoint at stage 3: - python train.py --resume runs/ppo_herding/ckpt_3000000_steps.zip --n-sheep 3 \ - --curriculum --steps-per-stage 1000000 --total-steps 5000000 - -# Quick smoke-test: - python train.py --n-envs 1 --total-steps 50000 +Outputs (in runs//): + config.json resolved config + final_model.zip trained PPO model + vecnorm.pkl VecNormalize statistics + stage_results.json per-stage evaluation metrics + success_rate.png summary bar chart + eval/ trajectory & timeseries plots per sheep count """ import argparse +import json import os +import time from copy import deepcopy +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches import numpy as np +from matplotlib.collections import LineCollection from stable_baselines3 import PPO -from stable_baselines3.common.callbacks import ( - BaseCallback, - CallbackList, - CheckpointCallback, - EvalCallback, +from stable_baselines3.common.callbacks import BaseCallback +from stable_baselines3.common.vec_env import ( + DummyVecEnv, + SubprocVecEnv, + VecNormalize, ) -from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize from herding_env import HerdingEnv + +# ── Colours ────────────────────────────────────────────────────────────────── + +SHEEP_COLORS = [ + "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", + "#a65628", "#f781bf", "#999999", "#66c2a5", "#fc8d62", +] +DOG_COLOR = "#4e342e" + + +# ── Callbacks ──────────────────────────────────────────────────────────────── + +class ProgressCallback(BaseCallback): + """One-line progress summary every `freq` env steps.""" + + def __init__(self, stage_label: str, freq: int = 100_000): + super().__init__() + self.stage_label = stage_label + self.freq = freq + self._last = 0 + self._ep_returns = [] + self._ep_success = [] + self._total_eps = 0 + self._total_success = 0 + self._cur_ret = None + + def _on_step(self) -> bool: + rewards = self.locals.get("rewards") + dones = self.locals.get("dones") + infos = self.locals.get("infos", []) + if rewards is None or dones is None: + return True + if self._cur_ret is None or len(self._cur_ret) != len(rewards): + self._cur_ret = np.zeros(len(rewards), dtype=np.float64) + self._cur_ret += np.asarray(rewards, dtype=np.float64) + for i, d in enumerate(dones): + if not d: + continue + self._ep_returns.append(float(self._cur_ret[i])) + info = infos[i] if i < len(infos) else {} + success = int(info.get("n_penned", 0) == info.get("n_sheep", -1)) + self._ep_success.append(success) + self._total_eps += 1 + self._total_success += success + self._cur_ret[i] = 0.0 + if len(self._ep_returns) > 50: + self._ep_returns.pop(0) + self._ep_success.pop(0) + if self.num_timesteps - self._last >= self.freq: + self._last = self.num_timesteps + n = len(self._ep_returns) + mean_r = float(np.mean(self._ep_returns)) if n else float("nan") + win_sr = float(np.mean(self._ep_success)) if n else float("nan") + cum_sr = (self._total_success / self._total_eps + if self._total_eps else float("nan")) + print(f" ... [{self.stage_label} | " + f"{self.num_timesteps:>7,} steps | " + f"ret(last {n})={mean_r:+.2f} " + f"win_sr={win_sr*100:.0f}% cum_sr={cum_sr*100:.0f}%]", + flush=True) + return True + + +# ── Environment factory ────────────────────────────────────────────────────── + +def make_env(n_sheep, seed, max_steps, reward_cfg=None): + def _init(): + env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, + reward_cfg=reward_cfg) + env.reset(seed=seed) + return env + return _init + + +# ── Failure-mode classification ────────────────────────────────────────────── + COMPACT_RADIUS = 5.0 -def _classify(ep_radius, ep_com_dist, n_penned, n_sheep, success): - if success: +def _classify(ep_radii, ep_com_dists, n_penned, n_sheep): + if n_penned == n_sheep: return "SUCCESS" - if min(ep_radius) > COMPACT_RADIUS: + if min(ep_radii) > COMPACT_RADIUS: return "NEVER_COMPACT" - first = next(i for i, r in enumerate(ep_radius) if r <= COMPACT_RADIUS) - if min(ep_com_dist[first:]) > 3.0: + first = next(i for i, r in enumerate(ep_radii) if r <= COMPACT_RADIUS) + if min(ep_com_dists[first:]) > 3.0: return "COMPACT_CANT_DRIVE" if n_penned == 0: return "DROVE_NO_SHEEP" return f"PARTIAL_{n_penned}of{n_sheep}" -# --------------------------------------------------------------------------- -# Curriculum callback -# --------------------------------------------------------------------------- +# ── Evaluation ─────────────────────────────────────────────────────────────── -class CurriculumCallback(BaseCallback): - """ - Advances n_sheep on both training and eval envs. +def evaluate(model, vn_template, n_sheep, n_episodes, max_steps, + reward_cfg=None): + """Evaluate at a given sheep count; returns metrics dict.""" + raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, reward_cfg)]) + vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False) + vn.obs_rms = deepcopy(vn_template.obs_rms) + vn.ret_rms = deepcopy(vn_template.ret_rms) - Two modes (mutually exclusive): - steps_per_stage — advance every N environment steps regardless of - success rate (recommended for reliability). - threshold — advance when rolling success rate exceeds this value - (requires the policy to actually reach the threshold). - """ + successes = 0 + ep_lens = [] + min_pen_list = [] + action_mags = [] + failure_counts = {} + rc_sums = {} + rc_n = 0 - def __init__(self, start_sheep: int, max_sheep: int, - eval_env=None, - steps_per_stage: int = None, - threshold: float = 0.75, - window: int = 100, - min_episodes: int = 50, - verbose: int = 1): - super().__init__(verbose) - self.max_sheep = max_sheep - self.eval_env = eval_env - self.steps_per_stage = steps_per_stage - self.threshold = threshold - self.window = window - self.min_episodes = min_episodes - self._cur_sheep = start_sheep - self._successes = [] - self._stage_start = 0 + for _ in range(n_episodes): + obs = vn.reset() + done = False + steps = 0 + min_pen = float("inf") + mags = [] + ep_radii = [] + ep_com_dists = [] + while not done: + action, _ = model.predict(obs, deterministic=True) + obs, _, dones, infos = vn.step(action) + done = dones[0] + inner = vn.envs[0] + com, radius, _ = inner._flock_stats() + min_pen = min(min_pen, float(np.linalg.norm(com - inner.PEN_CENTER))) + mags.append(float(np.linalg.norm(action[0]))) + ep_radii.append(radius) + ep_com_dists.append(float(np.linalg.norm(com - inner.PEN_CENTER))) + steps += 1 + rc = infos[0].get("rcomps") + if rc: + for k, v in rc.items(): + rc_sums[k] = rc_sums.get(k, 0.0) + v + rc_n += 1 + n_penned = infos[0].get("n_penned", 0) + success = n_penned == n_sheep + successes += int(success) + ep_lens.append(steps) + min_pen_list.append(min_pen) + action_mags.extend(mags) + mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep) + failure_counts[mode] = failure_counts.get(mode, 0) + 1 - def _advance(self): - prev_sheep = self._cur_sheep - recent_sr = (np.mean(self._successes) if self._successes else float("nan")) - if self.verbose: - print(f"\n[Curriculum] leaving stage n_sheep={prev_sheep} " - f"after {self.num_timesteps - self._stage_start:,} steps " - f"| training success rate (last {len(self._successes)} eps) = " - f"{recent_sr*100:.0f}%") - self._cur_sheep += 1 - self.training_env.env_method("set_n_sheep", self._cur_sheep) - if self.eval_env is not None: - self.eval_env.env_method("set_n_sheep", self._cur_sheep) - self._stage_start = self.num_timesteps - self._successes.clear() - if self.verbose: - print(f"[Curriculum] → {self._cur_sheep} sheep " - f"at step {self.num_timesteps:,}\n") + vn.close() - def _on_step(self) -> bool: - if self._cur_sheep >= self.max_sheep: - return True - - # Always track training-side success (success = sheep all penned, not truncated) - for info, done in zip(self.locals["infos"], self.locals["dones"]): - if done: - npen = info.get("n_penned", 0) - nshp = info.get("n_sheep", self._cur_sheep) - self._successes.append(1 if npen == nshp else 0) - if len(self._successes) > self.window: - self._successes.pop(0) - - if self.steps_per_stage is not None: - if self.num_timesteps - self._stage_start >= self.steps_per_stage: - self._advance() - else: - if (len(self._successes) >= self.min_episodes - and np.mean(self._successes) >= self.threshold): - self._advance() - - return True + result = { + "sr": successes / n_episodes, + "mean_len": float(np.mean(ep_lens)), + "mean_min_pen": float(np.mean(min_pen_list)), + "mean_act": float(np.mean(action_mags)) if action_mags else 0.0, + "failure_modes": failure_counts, + } + if rc_n > 0: + result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()} + return result -# --------------------------------------------------------------------------- -# Diagnostic callback — failure-mode breakdown every diag_freq steps -# --------------------------------------------------------------------------- +# ── Visualization helpers ──────────────────────────────────────────────────── -class DiagnosticCallback(BaseCallback): - """ - Every diag_freq env steps: spin up a temporary eval env, run n_episodes - deterministic episodes, and print a failure-mode breakdown. - Aborts training (returns False) if the dominant failure mode hasn't - changed after two consecutive checks at the same n_sheep — a sign that - training has stalled and further steps are wasted. - """ - - def __init__(self, diag_freq: int = 500_000, n_episodes: int = 20, - max_steps: int = 2000, abort_on_stall: bool = True, - verbose: int = 1): - super().__init__(verbose) - self.diag_freq = diag_freq - self.n_episodes = n_episodes - self.max_steps = max_steps - self.abort_on_stall = abort_on_stall - self._last_diag = 0 - self._prev_dominant = None # (n_sheep, mode) from last check - self._stall_count = 0 - - def _on_step(self) -> bool: - if self.num_timesteps - self._last_diag < self.diag_freq: - return True - self._last_diag = self.num_timesteps - - n_sheep = self.training_env.get_attr("n_sheep")[0] - - # Build a temporary single-env with copied VecNorm stats - raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep, - max_steps=self.max_steps)]) - vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False) - vn.obs_rms = deepcopy(self.training_env.obs_rms) - vn.ret_rms = deepcopy(self.training_env.ret_rms) - - failure_counts = {} - successes = 0 - all_action_mags = [] - ep_min_radii = [] - ep_min_dog_com = [] # closest the dog ever got to flock COM - ep_min_pen_dists = [] # closest COM ever got to pen - rcomp_sums = {"progress":0.0,"alignment":0.0,"pen_bonus":0.0, - "step_cost":0.0,"complete":0.0} - rcomp_n = 0 - - for _ in range(self.n_episodes): - obs = vn.reset() - done = False - ep_radius, ep_com_dist, ep_dog_com = [], [], [] - ep_actions = [] - n_penned = 0 - - while not done: - action, _ = self.model.predict(obs, deterministic=True) - obs, _, dones, infos = vn.step(action) - done = dones[0] - inner = vn.envs[0] - com, radius, _ = inner._flock_stats() - ep_radius.append(radius) - ep_com_dist.append( - float(np.linalg.norm(com - inner.PEN_CENTER)) - ) - ep_dog_com.append( - float(np.linalg.norm(inner.dog_pos - com)) - ) - ep_actions.append(float(np.linalg.norm(action[0]))) - rc = infos[0].get("rcomps") - if rc is not None: - for k in rcomp_sums: rcomp_sums[k] += rc[k] - rcomp_n += 1 - - n_penned = infos[0].get("n_penned", 0) - success = n_penned == n_sheep - successes += int(success) - mode = _classify(ep_radius, ep_com_dist, n_penned, n_sheep, success) - failure_counts[mode] = failure_counts.get(mode, 0) + 1 - all_action_mags.extend(ep_actions) - ep_min_radii.append(min(ep_radius)) - ep_min_dog_com.append(min(ep_dog_com)) - ep_min_pen_dists.append(min(ep_com_dist)) - - vn.close() - - success_rate = successes / self.n_episodes - dominant = max(failure_counts, key=failure_counts.get) - - if self.verbose: - print(f"\n[Diag @ {self.num_timesteps:,} | n_sheep={n_sheep} | " - f"success={success_rate*100:.0f}%]") - for m, c in sorted(failure_counts.items(), key=lambda x: -x[1]): - print(f" {m:<26} {c}/{self.n_episodes}") - mean_act = float(np.mean(all_action_mags)) if all_action_mags else 0.0 - p10 = float(np.percentile(all_action_mags, 10)) if all_action_mags else 0.0 - p90 = float(np.percentile(all_action_mags, 90)) if all_action_mags else 0.0 - print(f" action_mag mean={mean_act:.3f} p10={p10:.3f} p90={p90:.3f} " - f"(0=stopped, 1=full speed)") - print(f" min_flock_radius mean={np.mean(ep_min_radii):.2f}m " - f"best={np.min(ep_min_radii):.2f}m (target <5m to compact)") - print(f" min_dog_to_com mean={np.mean(ep_min_dog_com):.2f}m " - f"best={np.min(ep_min_dog_com):.2f}m (FLEE_DIST=7m)") - print(f" min_com_to_pen mean={np.mean(ep_min_pen_dists):.2f}m " - f"best={np.min(ep_min_pen_dists):.2f}m") - if rcomp_n > 0: - print(f" reward/step (mean): " + " ".join( - f"{k}={rcomp_sums[k]/rcomp_n:+.4f}" for k in - ("progress","alignment","pen_bonus","step_cost","complete") - )) - - # Stall detection — disabled when --no-stall-abort or when we've never - # seen any stage succeed (we want full visibility into what's happening). - key = (n_sheep, dominant) - if key == self._prev_dominant and dominant != "SUCCESS": - self._stall_count += 1 - if (self.abort_on_stall and self._stall_count >= 5 - and self.num_timesteps >= 3_000_000): - print(f"\n[Diag] STALL DETECTED — '{dominant}' on {n_sheep} sheep " - f"for {self._stall_count} consecutive checks. " - f"Aborting training early.") - return False - else: - self._stall_count = 0 - self._prev_dominant = key - - return True +def _draw_field(ax): + ax.set_xlim(-16, 16) + ax.set_ylim(-16, 16) + ax.set_aspect("equal") + ax.set_facecolor("#dcedc8") + ax.add_patch(mpatches.Rectangle((-15, -15), 30, 30, + fill=False, edgecolor="#795548", lw=2)) + ax.add_patch(mpatches.Rectangle((10, -15), 3, 7, + facecolor="#ffe082", edgecolor="#795548", lw=2)) + ax.text(11.5, -11.5, "pen", ha="center", va="center", + fontsize=8, color="#795548") -# --------------------------------------------------------------------------- -# Environment factory -# --------------------------------------------------------------------------- - -def make_env(n_sheep: int, seed: int, max_steps: int, random_n_sheep: bool = False): - def _init(): - env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps, - random_n_sheep=random_n_sheep) - env.reset(seed=seed) - return env - return _init +def _faded_path(ax, xs, ys, color, lw=1.5, label=None): + n = len(xs) + if n < 2: + return + points = np.array([xs, ys]).T.reshape(-1, 1, 2) + segs = np.concatenate([points[:-1], points[1:]], axis=1) + alphas = np.linspace(0.15, 1.0, len(segs)) + colors = [(*matplotlib.colors.to_rgb(color), a) for a in alphas] + ax.add_collection(LineCollection(segs, colors=colors, linewidth=lw)) + if label: + ax.plot([], [], color=color, lw=lw, label=label) -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- +def run_and_record(model, vn_template, n_sheep, max_steps, + reward_cfg=None, seed=42): + """Run one deterministic episode and return full history.""" + raw = DummyVecEnv([make_env(n_sheep, seed, max_steps, reward_cfg)]) + vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False) + vn.obs_rms = deepcopy(vn_template.obs_rms) + vn.ret_rms = deepcopy(vn_template.ret_rms) + + obs = vn.reset() + inner = vn.envs[0] + done = False + + dog_xs, dog_ys = [], [] + sheep_xs = [[] for _ in range(n_sheep)] + sheep_ys = [[] for _ in range(n_sheep)] + radii = [] + pen_dists = [[] for _ in range(n_sheep)] + action_mags = [] + rewards = [] + penned_at = [None] * n_sheep + step = 0 + + while not done: + action, _ = model.predict(obs, deterministic=True) + obs, reward, dones, infos = vn.step(action) + done = dones[0] + step += 1 + + dog_xs.append(float(inner.dog_pos[0])) + dog_ys.append(float(inner.dog_pos[1])) + com, radius, _ = inner._flock_stats() + radii.append(radius) + rewards.append(float(reward[0])) + action_mags.append(float(np.linalg.norm(action[0]))) + + for i in range(n_sheep): + sheep_xs[i].append(float(inner.sheep_pos[i][0])) + sheep_ys[i].append(float(inner.sheep_pos[i][1])) + pen_dists[i].append( + float(np.linalg.norm(inner.sheep_pos[i] - inner.PEN_CENTER))) + if inner.penned[i] and penned_at[i] is None: + penned_at[i] = step + + n_penned = infos[0].get("n_penned", 0) + vn.close() + + return dict( + dog_xs=dog_xs, dog_ys=dog_ys, + sheep_xs=sheep_xs, sheep_ys=sheep_ys, + radii=radii, pen_dists=pen_dists, + action_mags=action_mags, rewards=rewards, + penned_at=penned_at, + n_penned=n_penned, n_sheep=n_sheep, + success=n_penned == n_sheep, steps=step, + ) + + +def plot_trajectory(hist, out_path): + fig, ax = plt.subplots(figsize=(7, 7)) + _draw_field(ax) + for i in range(hist["n_sheep"]): + c = SHEEP_COLORS[i % len(SHEEP_COLORS)] + xs, ys = hist["sheep_xs"][i], hist["sheep_ys"][i] + _faded_path(ax, xs, ys, c, lw=1.2, label=f"sheep {i+1}") + ax.plot(xs[0], ys[0], "o", color=c, ms=7, zorder=4) + end = hist["penned_at"][i] if hist["penned_at"][i] is not None else -1 + ax.plot(xs[end], ys[end], "*", color=c, ms=11, zorder=5) + _faded_path(ax, hist["dog_xs"], hist["dog_ys"], DOG_COLOR, lw=2.0, + label="dog") + ax.plot(hist["dog_xs"][0], hist["dog_ys"][0], "s", color=DOG_COLOR, + ms=10, zorder=5) + ax.plot(hist["dog_xs"][-1], hist["dog_ys"][-1], "D", color=DOG_COLOR, + ms=10, zorder=5) + result = ("SUCCESS" if hist["success"] + else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})") + ax.set_title(f"n={hist['n_sheep']} {result} {hist['steps']} steps", + fontsize=12) + ax.legend(loc="upper left", fontsize=8) + plt.tight_layout() + fig.savefig(out_path, dpi=120) + plt.close(fig) + + +def plot_timeseries(hist, out_path): + t = np.arange(hist["steps"]) + fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True) + + axes[0].plot(t, hist["radii"], color="steelblue") + axes[0].axhline(5.0, color="orange", ls="--", lw=1, label="compact (5m)") + axes[0].set_ylabel("flock radius (m)") + axes[0].legend(fontsize=8) + axes[0].set_title("Flock radius") + + for i in range(hist["n_sheep"]): + c = SHEEP_COLORS[i % len(SHEEP_COLORS)] + axes[1].plot(t, hist["pen_dists"][i], color=c, lw=1, + label=f"sheep {i+1}") + if hist["penned_at"][i] is not None: + axes[1].axvline(hist["penned_at"][i], color=c, ls=":", lw=1) + axes[1].set_ylabel("dist to pen (m)") + axes[1].legend(fontsize=7, ncol=min(hist["n_sheep"], 5)) + axes[1].set_title("Per-sheep distance to pen") + + axes[2].plot(t, hist["action_mags"], color="tomato", lw=1) + axes[2].axhline(1.0, color="gray", ls="--", lw=1, label="max") + axes[2].set_ylabel("action ||(vx,vy)||") + axes[2].set_ylim(0, 1.5) + axes[2].set_title("Dog action magnitude") + axes[2].legend(fontsize=8) + + axes[3].plot(t, hist["rewards"], color="purple", lw=1, alpha=0.7) + axes[3].axhline(0, color="black", lw=0.5) + axes[3].set_ylabel("reward") + axes[3].set_xlabel("step") + axes[3].set_title("Reward per step") + + result = ("SUCCESS" if hist["success"] + else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})") + fig.suptitle(f"n_sheep={hist['n_sheep']} {result} {hist['steps']} steps", + fontsize=13) + plt.tight_layout() + fig.savefig(out_path, dpi=120) + plt.close(fig) + + +def plot_success_rate(stage_results, out_path): + fig, ax = plt.subplots(figsize=(8, 4)) + ns = [r["n_sheep"] for r in stage_results] + srs = [r["sr"] * 100 for r in stage_results] + bars = ax.bar(ns, srs, color="steelblue", edgecolor="white") + ax.set_xlabel("Sheep count") + ax.set_ylabel("Success rate (%)") + ax.set_ylim(0, 105) + ax.axhline(90, color="orange", ls="--", lw=1, label="90% target") + for bar, sr in zip(bars, srs): + ax.text(bar.get_x() + bar.get_width() / 2, + bar.get_height() + 1, f"{sr:.0f}%", + ha="center", fontsize=9) + ax.legend() + ax.set_title("Evaluation success rate per sheep count") + plt.tight_layout() + fig.savefig(out_path, dpi=120) + plt.close(fig) + + +# ── CLI ────────────────────────────────────────────────────────────────────── + +DEFAULT_CONFIG = { + "W_PER_SHEEP": 2.0, + "W_ALIGN": 0.05, + "W_PEN_BONUS": 10.0, + "W_COMPLETE": 100.0, + "W_STEP_COST": 0.02, + "W_COMPACT": 0.0, + "W_WALL_TOUCH": 0.15, + "WALL_TOUCH_BUFFER": 0.8, + "ALIGN_SHAPE": "standoff", + "ALIGN_GATED": True, + "ENTRY_AWARE": False, + "ent_coef": 0.02, +} + def parse_args(): - p = argparse.ArgumentParser() - p.add_argument("--n-sheep", type=int, default=1, - help="Starting sheep count") - p.add_argument("--max-sheep", type=int, default=5, - help="Final sheep count for curriculum") - p.add_argument("--n-envs", type=int, default=8, - help="Parallel training environments") - p.add_argument("--total-steps", type=int, default=5_000_000) - p.add_argument("--max-steps", type=int, default=2000, - help="Episode step limit") - p.add_argument("--curriculum", action="store_true", - help="Enable curriculum advancement") - p.add_argument("--steps-per-stage", type=int, default=None, - help="Advance curriculum every N steps (overrides --threshold)") - p.add_argument("--threshold", type=float, default=0.75, - help="Success-rate threshold to advance (used without --steps-per-stage)") - p.add_argument("--resume", type=str, default=None, - help="Checkpoint .zip to resume from") - p.add_argument("--run-dir", type=str, default="runs/ppo_herding") - p.add_argument("--save-freq", type=int, default=100_000) - p.add_argument("--eval-freq", type=int, default=50_000) - p.add_argument("--eval-eps", type=int, default=20) - p.add_argument("--diag-freq", type=int, default=500_000, - help="Run failure-mode diagnostics every N env steps") - p.add_argument("--no-stall-abort", action="store_true", - help="Disable early-abort on stall — run full --total-steps " - "for diagnostics") - p.add_argument("--mixed", action="store_true", - help="Randomise n_sheep each episode (consolidation pass, " - "use with --resume after curriculum training)") + p = argparse.ArgumentParser( + description="PPO training for herding task with curriculum learning") + p.add_argument("--config", type=str, default=None, + help="JSON config file (reward weights + ent_coef)") + p.add_argument("--max-sheep", type=int, default=10) + p.add_argument("--steps-per-stage", type=int, default=1_500_000) + p.add_argument("--n-envs", type=int, default=8) + p.add_argument("--max-steps", type=int, default=2500) + p.add_argument("--eval-episodes", type=int, default=30) + p.add_argument("--run-dir", type=str, default=None) return p.parse_args() +# ── Main ───────────────────────────────────────────────────────────────────── + def main(): args = parse_args() - os.makedirs(args.run_dir, exist_ok=True) - ckpt_dir = os.path.join(args.run_dir, "checkpoints") - best_dir = os.path.join(args.run_dir, "best_model") - norm_path = os.path.join(args.run_dir, "vecnorm.pkl") - os.makedirs(ckpt_dir, exist_ok=True) + + # Load config + cfg = dict(DEFAULT_CONFIG) + if args.config: + with open(args.config) as f: + cfg.update(json.load(f)) + + rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)} + + # Run directory + run_dir = args.run_dir or os.path.join( + "runs", time.strftime("%Y%m%d_%H%M%S")) + eval_dir = os.path.join(run_dir, "eval") + os.makedirs(eval_dir, exist_ok=True) + with open(os.path.join(run_dir, "config.json"), "w") as f: + json.dump(cfg, f, indent=2) + + print(f"Config: {cfg}") + print(f"Run dir: {run_dir}") + print(f"Curriculum: 1 → {args.max_sheep} sheep, " + f"{args.steps_per_stage:,} steps/stage\n") # Training envs train_env = SubprocVecEnv([ - make_env(args.n_sheep, seed=i, max_steps=args.max_steps, - random_n_sheep=args.mixed) + make_env(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg) for i in range(args.n_envs) ]) - if args.resume and os.path.exists(norm_path): - train_env = VecNormalize.load(norm_path, train_env) - train_env.training = True - train_env.norm_reward = True - else: - train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, - clip_obs=10.0) - - # Eval env — starts at same difficulty, advances with curriculum callback - eval_env = SubprocVecEnv([ - make_env(args.n_sheep, seed=1000 + i, max_steps=args.max_steps) - for i in range(2) - ]) - eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=False, - clip_obs=10.0, training=False) - - # Callbacks - checkpoint_cb = CheckpointCallback( - save_freq=max(args.save_freq // args.n_envs, 1), - save_path=ckpt_dir, - name_prefix="ckpt", - save_vecnormalize=True, - ) - eval_cb = EvalCallback( - eval_env, - best_model_save_path=best_dir, - log_path=args.run_dir, - eval_freq=max(args.eval_freq // args.n_envs, 1), - n_eval_episodes=args.eval_eps, - deterministic=True, - verbose=1, - ) - diag_cb = DiagnosticCallback( - diag_freq=args.diag_freq, - n_episodes=20, - max_steps=args.max_steps, - abort_on_stall=not args.no_stall_abort, - ) - callbacks = [checkpoint_cb, eval_cb, diag_cb] - - if args.curriculum: - cur_cb = CurriculumCallback( - start_sheep=args.n_sheep, - max_sheep=args.max_sheep, - eval_env=eval_env, - steps_per_stage=args.steps_per_stage, - threshold=args.threshold, - ) - callbacks.append(cur_cb) - - callback_list = CallbackList(callbacks) + vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, + clip_obs=10.0) # Model - ppo_kwargs = dict( - policy = "MlpPolicy", - env = train_env, - learning_rate = 3e-4, - n_steps = 2048, - batch_size = 256, - n_epochs = 10, - gamma = 0.995, - gae_lambda = 0.95, - clip_range = 0.2, - ent_coef = 0.01, - vf_coef = 0.5, - max_grad_norm = 0.5, - policy_kwargs = dict(net_arch=[256, 256]), - tensorboard_log = args.run_dir, - verbose = 1, + model = PPO( + "MlpPolicy", vn, + learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10, + gamma=0.995, gae_lambda=0.95, clip_range=0.2, + ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5, + policy_kwargs=dict(net_arch=[256, 256]), + verbose=0, ) - if args.resume: - print(f"Resuming from {args.resume}") - model = PPO.load(args.resume, env=train_env, **{ - k: v for k, v in ppo_kwargs.items() - if k not in ("policy", "env") - }) - else: - model = PPO(**ppo_kwargs) + # Curriculum training + stage_results = [] + t0 = time.time() - model.learn( - total_timesteps=args.total_steps, - callback=callback_list, - reset_num_timesteps=args.resume is None, - tb_log_name="ppo", - ) + try: + for n in range(1, args.max_sheep + 1): + if n > 1: + vn.env_method("set_n_sheep", n) - model.save(os.path.join(args.run_dir, "final_model")) - train_env.save(norm_path) - print(f"\nTraining complete. Artefacts saved to {args.run_dir}/") + print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps") + model.learn( + total_timesteps=args.steps_per_stage, + reset_num_timesteps=(n == 1), + callback=ProgressCallback(f"{n} sheep", freq=100_000), + ) + + # Evaluate + print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps") + r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg) + print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}% " + f"mean_len={r['mean_len']:.0f} " + f"mean_min_pen={r['mean_min_pen']:.1f}m " + f"mean_act={r['mean_act']:.2f}") + + # Failure-mode breakdown + if r["failure_modes"]: + modes = " ".join( + f"{k}={v}" for k, v in sorted( + r["failure_modes"].items(), key=lambda x: -x[1])) + print(f" failure modes: {modes}") + + # Reward breakdown + if "reward_per_step" in r: + rps = r["reward_per_step"] + print(f" reward/step: " + " ".join( + f"{k}={v:+.4f}" for k, v in rps.items())) + + # Episode visualization + hist = run_and_record(model, vn, n, args.max_steps, rcfg, + seed=1000 + n) + tag = "success" if hist["success"] else "fail" + plot_trajectory( + hist, + os.path.join(eval_dir, f"traj_{n}s_{tag}.png")) + plot_timeseries( + hist, + os.path.join(eval_dir, f"ts_{n}s_{tag}.png")) + + r["n_sheep"] = n + stage_results.append(r) + + # Save artefacts + model.save(os.path.join(run_dir, "final_model")) + vn.save(os.path.join(run_dir, "vecnorm.pkl")) + with open(os.path.join(run_dir, "stage_results.json"), "w") as f: + json.dump(stage_results, f, indent=2) + + finally: + try: + vn.close() + except Exception: + pass + + # Summary + elapsed = (time.time() - t0) / 60 + print("\n" + "=" * 70) + print(" TRAINING SUMMARY") + print("=" * 70) + for r in stage_results: + print(f" n_sheep={r['n_sheep']} sr={r['sr']*100:>3.0f}% " + f"len={r['mean_len']:>5.0f} min_pen={r['mean_min_pen']:>5.1f}m " + f"act={r['mean_act']:.2f}") + print(f"\n Total time: {elapsed:.1f} min") + print(f" Artefacts: {run_dir}/") + + plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png")) + print(f" Plots: {run_dir}/success_rate.png, {eval_dir}/") if __name__ == "__main__": diff --git a/training/vis_final_10sheep/episode.gif b/training/vis_final_10sheep/episode.gif deleted file mode 100644 index dbe4cd6..0000000 Binary files a/training/vis_final_10sheep/episode.gif and /dev/null differ diff --git a/training/vis_final_10sheep/timeseries.png b/training/vis_final_10sheep/timeseries.png deleted file mode 100644 index ae80df5..0000000 Binary files a/training/vis_final_10sheep/timeseries.png and /dev/null differ diff --git a/training/vis_final_10sheep/trajectory.png b/training/vis_final_10sheep/trajectory.png deleted file mode 100644 index 2839c5b..0000000 Binary files a/training/vis_final_10sheep/trajectory.png and /dev/null differ diff --git a/training/vis_final_1sheep/episode.gif b/training/vis_final_1sheep/episode.gif deleted file mode 100644 index 47a9aa2..0000000 Binary files a/training/vis_final_1sheep/episode.gif and /dev/null differ diff --git a/training/vis_final_1sheep/timeseries.png b/training/vis_final_1sheep/timeseries.png deleted file mode 100644 index 7f5e026..0000000 Binary files a/training/vis_final_1sheep/timeseries.png and /dev/null differ diff --git a/training/vis_final_1sheep/trajectory.png b/training/vis_final_1sheep/trajectory.png deleted file mode 100644 index de47bb5..0000000 Binary files a/training/vis_final_1sheep/trajectory.png and /dev/null differ diff --git a/training/vis_final_5sheep/episode.gif b/training/vis_final_5sheep/episode.gif deleted file mode 100644 index 4690c8f..0000000 Binary files a/training/vis_final_5sheep/episode.gif and /dev/null differ diff --git a/training/vis_final_5sheep/timeseries.png b/training/vis_final_5sheep/timeseries.png deleted file mode 100644 index 23dcde3..0000000 Binary files a/training/vis_final_5sheep/timeseries.png and /dev/null differ diff --git a/training/vis_final_5sheep/trajectory.png b/training/vis_final_5sheep/trajectory.png deleted file mode 100644 index 2880d46..0000000 Binary files a/training/vis_final_5sheep/trajectory.png and /dev/null differ diff --git a/training/vis_final_v2_10sheep/episode.gif b/training/vis_final_v2_10sheep/episode.gif deleted file mode 100644 index 4daecf4..0000000 Binary files a/training/vis_final_v2_10sheep/episode.gif and /dev/null differ diff --git a/training/vis_final_v2_10sheep/timeseries.png b/training/vis_final_v2_10sheep/timeseries.png deleted file mode 100644 index 7ebaff5..0000000 Binary files a/training/vis_final_v2_10sheep/timeseries.png and /dev/null differ diff --git a/training/vis_final_v2_10sheep/trajectory.png b/training/vis_final_v2_10sheep/trajectory.png deleted file mode 100644 index 46254d7..0000000 Binary files a/training/vis_final_v2_10sheep/trajectory.png and /dev/null differ diff --git a/training/vis_final_v2_1sheep/episode.gif b/training/vis_final_v2_1sheep/episode.gif deleted file mode 100644 index 2f10452..0000000 Binary files a/training/vis_final_v2_1sheep/episode.gif and /dev/null differ diff --git a/training/vis_final_v2_1sheep/timeseries.png b/training/vis_final_v2_1sheep/timeseries.png deleted file mode 100644 index 09a2634..0000000 Binary files a/training/vis_final_v2_1sheep/timeseries.png and /dev/null differ diff --git a/training/vis_final_v2_1sheep/trajectory.png b/training/vis_final_v2_1sheep/trajectory.png deleted file mode 100644 index b955810..0000000 Binary files a/training/vis_final_v2_1sheep/trajectory.png and /dev/null differ diff --git a/training/vis_final_v2_3sheep/episode.gif b/training/vis_final_v2_3sheep/episode.gif deleted file mode 100644 index 02010c5..0000000 Binary files a/training/vis_final_v2_3sheep/episode.gif and /dev/null differ diff --git a/training/vis_final_v2_3sheep/timeseries.png b/training/vis_final_v2_3sheep/timeseries.png deleted file mode 100644 index f5bafb9..0000000 Binary files a/training/vis_final_v2_3sheep/timeseries.png and /dev/null differ diff --git a/training/vis_final_v2_3sheep/trajectory.png b/training/vis_final_v2_3sheep/trajectory.png deleted file mode 100644 index a505c3d..0000000 Binary files a/training/vis_final_v2_3sheep/trajectory.png and /dev/null differ diff --git a/training/vis_final_v2_5sheep/episode.gif b/training/vis_final_v2_5sheep/episode.gif deleted file mode 100644 index 61ed892..0000000 Binary files a/training/vis_final_v2_5sheep/episode.gif and /dev/null differ diff --git a/training/vis_final_v2_5sheep/timeseries.png b/training/vis_final_v2_5sheep/timeseries.png deleted file mode 100644 index 36f4810..0000000 Binary files a/training/vis_final_v2_5sheep/timeseries.png and /dev/null differ diff --git a/training/vis_final_v2_5sheep/trajectory.png b/training/vis_final_v2_5sheep/trajectory.png deleted file mode 100644 index 02c6430..0000000 Binary files a/training/vis_final_v2_5sheep/trajectory.png and /dev/null differ diff --git a/training/vis_final_v2_7sheep/episode.gif b/training/vis_final_v2_7sheep/episode.gif deleted file mode 100644 index bfe678b..0000000 Binary files a/training/vis_final_v2_7sheep/episode.gif and /dev/null differ diff --git a/training/vis_final_v2_7sheep/timeseries.png b/training/vis_final_v2_7sheep/timeseries.png deleted file mode 100644 index 623a8c1..0000000 Binary files a/training/vis_final_v2_7sheep/timeseries.png and /dev/null differ diff --git a/training/vis_final_v2_7sheep/trajectory.png b/training/vis_final_v2_7sheep/trajectory.png deleted file mode 100644 index 6e8c92d..0000000 Binary files a/training/vis_final_v2_7sheep/trajectory.png and /dev/null differ diff --git a/training/vis_random/episode.gif b/training/vis_random/episode.gif deleted file mode 100644 index e2ba22a..0000000 Binary files a/training/vis_random/episode.gif and /dev/null differ diff --git a/training/vis_random/timeseries.png b/training/vis_random/timeseries.png deleted file mode 100644 index 98f2822..0000000 Binary files a/training/vis_random/timeseries.png and /dev/null differ diff --git a/training/vis_random/trajectory.png b/training/vis_random/trajectory.png deleted file mode 100644 index 19dbde1..0000000 Binary files a/training/vis_random/trajectory.png and /dev/null differ diff --git a/training/vis_trained_1sheep/episode.gif b/training/vis_trained_1sheep/episode.gif deleted file mode 100644 index 4d4af6c..0000000 Binary files a/training/vis_trained_1sheep/episode.gif and /dev/null differ diff --git a/training/vis_trained_1sheep/timeseries.png b/training/vis_trained_1sheep/timeseries.png deleted file mode 100644 index ce46af7..0000000 Binary files a/training/vis_trained_1sheep/timeseries.png and /dev/null differ diff --git a/training/vis_trained_1sheep/trajectory.png b/training/vis_trained_1sheep/trajectory.png deleted file mode 100644 index 1ea89ff..0000000 Binary files a/training/vis_trained_1sheep/trajectory.png and /dev/null differ diff --git a/training/vis_trained_3sheep/episode.gif b/training/vis_trained_3sheep/episode.gif deleted file mode 100644 index e902074..0000000 Binary files a/training/vis_trained_3sheep/episode.gif and /dev/null differ diff --git a/training/vis_trained_3sheep/timeseries.png b/training/vis_trained_3sheep/timeseries.png deleted file mode 100644 index ba7685f..0000000 Binary files a/training/vis_trained_3sheep/timeseries.png and /dev/null differ diff --git a/training/vis_trained_3sheep/trajectory.png b/training/vis_trained_3sheep/trajectory.png deleted file mode 100644 index 58dbe70..0000000 Binary files a/training/vis_trained_3sheep/trajectory.png and /dev/null differ diff --git a/training/visualize.py b/training/visualize.py deleted file mode 100644 index ea616e9..0000000 --- a/training/visualize.py +++ /dev/null @@ -1,316 +0,0 @@ -""" -Single-episode visualization for the herding policy. - -Outputs (all saved to --out-dir): - trajectory.png — full field view: dog path + every sheep path - timeseries.png — radius, per-sheep pen distance, action magnitude, reward - episode.gif — animated replay (slow enough to read) - -Run with no model to watch a RANDOM policy (useful baseline): - python visualize.py --random --n-sheep 3 --out-dir vis_random/ - -Usage: - python visualize.py \\ - --model runs/ppo_consolidation/final_model.zip \\ - --vecnorm runs/ppo_consolidation/vecnorm.pkl \\ - --n-sheep 3 --out-dir vis_out/ -""" - -import argparse -import os -import math -import numpy as np -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import matplotlib.patches as mpatches -import matplotlib.animation as animation -from matplotlib.collections import LineCollection -from stable_baselines3 import PPO -from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize -from herding_env import HerdingEnv - - -# ── colours ────────────────────────────────────────────────────────────────── -SHEEP_COLORS = [ - "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", - "#a65628", "#f781bf", "#999999", "#66c2a5", "#fc8d62", -] -DOG_COLOR = "#4e342e" -PEN_COLOR = "#ffe082" -FIELD_COLOR = "#dcedc8" - - -def make_env(n_sheep, max_steps, seed=42): - def _init(): - env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps) - env.reset(seed=seed) - return env - return _init - - -def run_episode(model, env, n_sheep, max_steps): - """Run one deterministic episode; return recorded history.""" - obs = env.reset() - inner = env.envs[0] - done = False - - dog_xs, dog_ys = [], [] - sheep_xs = [[] for _ in range(n_sheep)] - sheep_ys = [[] for _ in range(n_sheep)] - radii = [] - pen_dists = [[] for _ in range(n_sheep)] - action_mags = [] - rewards = [] - penned_at = [None] * n_sheep # step when each sheep was penned - - step = 0 - while not done: - if model is None: - action = env.action_space.sample()[np.newaxis] - else: - action, _ = model.predict(obs, deterministic=True) - - obs, reward, dones, infos = env.step(action) - done = dones[0] - step += 1 - - dx, dy = float(inner.dog_pos[0]), float(inner.dog_pos[1]) - dog_xs.append(dx); dog_ys.append(dy) - - com, radius, _ = inner._flock_stats() - radii.append(radius) - rewards.append(float(reward[0])) - - act = action[0] - action_mags.append(float(np.linalg.norm(act))) - - for i in range(n_sheep): - sx, sy = float(inner.sheep_pos[i][0]), float(inner.sheep_pos[i][1]) - sheep_xs[i].append(sx) - sheep_ys[i].append(sy) - pen_dists[i].append(float(np.linalg.norm(inner.sheep_pos[i] - inner.PEN_CENTER))) - if inner.penned[i] and penned_at[i] is None: - penned_at[i] = step - - info = infos[0] - n_penned = info.get("n_penned", 0) - success = n_penned == n_sheep - - return dict( - dog_xs=dog_xs, dog_ys=dog_ys, - sheep_xs=sheep_xs, sheep_ys=sheep_ys, - radii=radii, pen_dists=pen_dists, - action_mags=action_mags, rewards=rewards, - penned_at=penned_at, - n_penned=n_penned, n_sheep=n_sheep, - success=success, steps=step, - ) - - -# ── plot helpers ───────────────────────────────────────────────────────────── - -def draw_field(ax): - ax.set_xlim(-16, 16); ax.set_ylim(-16, 16) - ax.set_aspect("equal"); ax.set_facecolor(FIELD_COLOR) - ax.add_patch(mpatches.Rectangle((-15,-15), 30, 30, - fill=False, edgecolor="#795548", lw=2)) - ax.add_patch(mpatches.Rectangle((10,-15), 3, 7, - facecolor=PEN_COLOR, edgecolor="#795548", lw=2)) - ax.text(11.5, -11.5, "pen", ha="center", va="center", - fontsize=8, color="#795548") - - -def faded_path(ax, xs, ys, color, lw=1.5, label=None): - """Draw a path with alpha fading from start (transparent) to end (opaque).""" - n = len(xs) - if n < 2: - return - points = np.array([xs, ys]).T.reshape(-1, 1, 2) - segs = np.concatenate([points[:-1], points[1:]], axis=1) - alphas = np.linspace(0.15, 1.0, len(segs)) - colors = [(*matplotlib.colors.to_rgb(color), a) for a in alphas] - lc = LineCollection(segs, colors=colors, linewidth=lw) - ax.add_collection(lc) - if label: - ax.plot([], [], color=color, lw=lw, label=label) - - -# ── main plots ──────────────────────────────────────────────────────────────── - -def plot_trajectory(hist, out_path): - fig, ax = plt.subplots(figsize=(7, 7)) - draw_field(ax) - - # Sheep paths - for i in range(hist["n_sheep"]): - c = SHEEP_COLORS[i % len(SHEEP_COLORS)] - xs, ys = hist["sheep_xs"][i], hist["sheep_ys"][i] - faded_path(ax, xs, ys, c, lw=1.2, label=f"sheep {i+1}") - ax.plot(xs[0], ys[0], "o", color=c, ms=7, zorder=4) - pa = hist["penned_at"][i] - end = pa if pa is not None else -1 - ax.plot(xs[end], ys[end], "*", color=c, ms=11, zorder=5) - - # Dog path - faded_path(ax, hist["dog_xs"], hist["dog_ys"], DOG_COLOR, lw=2.0, label="dog") - ax.plot(hist["dog_xs"][0], hist["dog_ys"][0], "s", color=DOG_COLOR, ms=10, zorder=5) - ax.plot(hist["dog_xs"][-1], hist["dog_ys"][-1], "D", color=DOG_COLOR, ms=10, zorder=5) - - result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']} penned)" - ax.set_title(f"Trajectory — {result} — {hist['steps']} steps", fontsize=12) - ax.legend(loc="upper left", fontsize=8) - plt.tight_layout() - fig.savefig(out_path, dpi=120) - plt.close(fig) - print(f" saved {out_path}") - - -def plot_timeseries(hist, out_path): - t = np.arange(hist["steps"]) - fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True) - - # 1. Flock radius - axes[0].plot(t, hist["radii"], color="steelblue") - axes[0].axhline(5.0, color="orange", ls="--", lw=1, label="compact threshold (5m)") - axes[0].set_ylabel("flock radius (m)") - axes[0].legend(fontsize=8) - axes[0].set_title("Flock radius — goal: get below 5m") - - # 2. Per-sheep distance to pen - for i in range(hist["n_sheep"]): - c = SHEEP_COLORS[i % len(SHEEP_COLORS)] - axes[1].plot(t, hist["pen_dists"][i], color=c, lw=1, label=f"sheep {i+1}") - pa = hist["penned_at"][i] - if pa is not None: - axes[1].axvline(pa, color=c, ls=":", lw=1) - axes[1].set_ylabel("dist to pen (m)") - axes[1].legend(fontsize=7, ncol=min(hist["n_sheep"], 5)) - axes[1].set_title("Per-sheep distance to pen — goal: all reach 0") - - # 3. Action magnitude (how fast dog is moving) - axes[2].plot(t, hist["action_mags"], color="tomato", lw=1) - axes[2].axhline(1.0, color="gray", ls="--", lw=1, label="max") - axes[2].set_ylabel("action ||(vx,vy)||") - axes[2].set_ylim(0, 1.5) - axes[2].set_title("Dog action magnitude — 0=stopped, 1=full speed") - axes[2].legend(fontsize=8) - - # 4. Reward per step - axes[3].plot(t, hist["rewards"], color="purple", lw=1, alpha=0.7) - axes[3].axhline(0, color="black", lw=0.5) - axes[3].set_ylabel("reward") - axes[3].set_xlabel("step") - axes[3].set_title("Reward per step") - - result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']} penned)" - fig.suptitle(f"n_sheep={hist['n_sheep']} {result} {hist['steps']} steps", fontsize=13) - plt.tight_layout() - fig.savefig(out_path, dpi=120) - plt.close(fig) - print(f" saved {out_path}") - - -def save_gif(hist, out_path, fps=15, skip=5): - """Animated replay, every `skip` steps.""" - n = hist["n_sheep"] - idxs = list(range(0, hist["steps"], skip)) - - fig, ax = plt.subplots(figsize=(6, 6)) - - def _frame(k): - ax.clear() - draw_field(ax) - t = idxs[k] - - for i in range(n): - c = SHEEP_COLORS[i % len(SHEEP_COLORS)] - s0 = max(0, t - 30) - ax.plot(hist["sheep_xs"][i][s0:t+1], - hist["sheep_ys"][i][s0:t+1], - color=c, lw=0.8, alpha=0.5) - color = "#ff69b4" if (hist["penned_at"][i] is not None - and t >= hist["penned_at"][i]) else c - ax.plot(hist["sheep_xs"][i][t], hist["sheep_ys"][i][t], - "o", color=color, ms=10, zorder=4, - markeredgecolor="#555", markeredgewidth=1) - - s0 = max(0, t - 30) - ax.plot(hist["dog_xs"][s0:t+1], hist["dog_ys"][s0:t+1], - color=DOG_COLOR, lw=1.5, alpha=0.6) - ax.plot(hist["dog_xs"][t], hist["dog_ys"][t], - "s", color=DOG_COLOR, ms=13, zorder=5, - markeredgecolor="black", markeredgewidth=1.5) - - r = hist["radii"][t] - ax.set_title(f"step {t}/{hist['steps']} radius={r:.1f}m " - f"penned={hist['n_penned'] if t==hist['steps']-1 else '?'}/{n}", - fontsize=10) - - ani = animation.FuncAnimation(fig, _frame, frames=len(idxs), interval=1000//fps) - ani.save(out_path, writer="pillow", fps=fps) - plt.close(fig) - print(f" saved {out_path}") - - -# ── entry point ─────────────────────────────────────────────────────────────── - -def parse_args(): - p = argparse.ArgumentParser() - p.add_argument("--model", default=None, help="Model .zip (omit for random policy)") - p.add_argument("--vecnorm", default=None) - p.add_argument("--n-sheep", type=int, default=3) - p.add_argument("--max-steps", type=int, default=2000) - p.add_argument("--seed", type=int, default=42) - p.add_argument("--out-dir", default="vis_out") - p.add_argument("--random", action="store_true", - help="Use random policy (baseline comparison)") - p.add_argument("--gif-fps", type=int, default=15) - p.add_argument("--gif-skip", type=int, default=5, - help="Render every Nth step in the GIF") - p.add_argument("--no-gif", action="store_true") - return p.parse_args() - - -def main(): - args = parse_args() - os.makedirs(args.out_dir, exist_ok=True) - - raw = DummyVecEnv([make_env(args.n_sheep, args.max_steps, args.seed)]) - - if args.random or args.model is None: - print("Using RANDOM policy") - env = raw - model = None - else: - if args.vecnorm: - env = VecNormalize.load(args.vecnorm, raw) - env.training = False - env.norm_reward = False - else: - env = raw - model = PPO.load(args.model, env=env) - print(f"Loaded model: {args.model}") - - print(f"Running episode n_sheep={args.n_sheep} seed={args.seed} ...") - hist = run_episode(model, env, args.n_sheep, args.max_steps) - - result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']} penned)" - print(f"Episode done: {result} steps={hist['steps']}") - print(f" min radius : {min(hist['radii']):.2f} m") - print(f" mean reward: {np.mean(hist['rewards']):.4f}") - print(f" mean action: {np.mean(hist['action_mags']):.3f}") - - env.close() - - plot_trajectory(hist, os.path.join(args.out_dir, "trajectory.png")) - plot_timeseries(hist, os.path.join(args.out_dir, "timeseries.png")) - if not args.no_gif: - save_gif(hist, os.path.join(args.out_dir, "episode.gif"), - fps=args.gif_fps, skip=args.gif_skip) - - print(f"\nAll outputs saved to {args.out_dir}/") - - -if __name__ == "__main__": - main()