diff --git a/.gitignore b/.gitignore
index 996a8ec..035b6d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # Stuff
 #_example/
+.claude/
 
 # Python
 __pycache__/
diff --git a/training/config.json b/training/config.json
new file mode 100644
index 0000000..3c0774e
--- /dev/null
+++ b/training/config.json
@@ -0,0 +1,14 @@
+{
+    "W_PER_SHEEP": 2.0,
+    "W_ALIGN": 0.05,
+    "W_PEN_BONUS": 10.0,
+    "W_COMPLETE": 100.0,
+    "W_STEP_COST": 0.02,
+    "W_COMPACT": 0.0,
+    "W_WALL_TOUCH": 0.15,
+    "WALL_TOUCH_BUFFER": 0.8,
+    "ALIGN_SHAPE": "standoff",
+    "ALIGN_GATED": true,
+    "ENTRY_AWARE": false,
+    "ent_coef": 0.02
+}
diff --git a/training/debug_plots/10sheep/ep001_SUCCESS.png b/training/debug_plots/10sheep/ep001_SUCCESS.png
deleted file mode 100644
index 93beee7..0000000
Binary files a/training/debug_plots/10sheep/ep001_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep002_SUCCESS.png b/training/debug_plots/10sheep/ep002_SUCCESS.png
deleted file mode 100644
index 4fe5075..0000000
Binary files a/training/debug_plots/10sheep/ep002_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep003_SUCCESS.png b/training/debug_plots/10sheep/ep003_SUCCESS.png
deleted file mode 100644
index 54a413a..0000000
Binary files a/training/debug_plots/10sheep/ep003_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep004_SUCCESS.png b/training/debug_plots/10sheep/ep004_SUCCESS.png
deleted file mode 100644
index 8ad143b..0000000
Binary files a/training/debug_plots/10sheep/ep004_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep005_SUCCESS.png b/training/debug_plots/10sheep/ep005_SUCCESS.png
deleted file mode 100644
index 52e2394..0000000
Binary files a/training/debug_plots/10sheep/ep005_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep006_SUCCESS.png b/training/debug_plots/10sheep/ep006_SUCCESS.png
deleted file mode 100644
index fef269e..0000000
Binary files a/training/debug_plots/10sheep/ep006_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep007_SUCCESS.png b/training/debug_plots/10sheep/ep007_SUCCESS.png
deleted file mode 100644
index 1efed9d..0000000
Binary files a/training/debug_plots/10sheep/ep007_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep008_SUCCESS.png b/training/debug_plots/10sheep/ep008_SUCCESS.png
deleted file mode 100644
index 94c048a..0000000
Binary files a/training/debug_plots/10sheep/ep008_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep009_SUCCESS.png b/training/debug_plots/10sheep/ep009_SUCCESS.png
deleted file mode 100644
index 16888d1..0000000
Binary files a/training/debug_plots/10sheep/ep009_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep010_SUCCESS.png b/training/debug_plots/10sheep/ep010_SUCCESS.png
deleted file mode 100644
index cfb2fdc..0000000
Binary files a/training/debug_plots/10sheep/ep010_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep011_SUCCESS.png b/training/debug_plots/10sheep/ep011_SUCCESS.png
deleted file mode 100644
index 5aa1fe7..0000000
Binary files a/training/debug_plots/10sheep/ep011_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep012_SUCCESS.png b/training/debug_plots/10sheep/ep012_SUCCESS.png
deleted file mode 100644
index 138eead..0000000
Binary files a/training/debug_plots/10sheep/ep012_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep013_SUCCESS.png b/training/debug_plots/10sheep/ep013_SUCCESS.png
deleted file mode 100644
index b13ecf9..0000000
Binary files a/training/debug_plots/10sheep/ep013_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep014_SUCCESS.png b/training/debug_plots/10sheep/ep014_SUCCESS.png
deleted file mode 100644
index e6ca7e6..0000000
Binary files a/training/debug_plots/10sheep/ep014_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep015_SUCCESS.png b/training/debug_plots/10sheep/ep015_SUCCESS.png
deleted file mode 100644
index c0cf257..0000000
Binary files a/training/debug_plots/10sheep/ep015_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep016_SUCCESS.png b/training/debug_plots/10sheep/ep016_SUCCESS.png
deleted file mode 100644
index e0e0e5e..0000000
Binary files a/training/debug_plots/10sheep/ep016_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep017_SUCCESS.png b/training/debug_plots/10sheep/ep017_SUCCESS.png
deleted file mode 100644
index 390bcaa..0000000
Binary files a/training/debug_plots/10sheep/ep017_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep018_SUCCESS.png b/training/debug_plots/10sheep/ep018_SUCCESS.png
deleted file mode 100644
index b9c0ccf..0000000
Binary files a/training/debug_plots/10sheep/ep018_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep019_SUCCESS.png b/training/debug_plots/10sheep/ep019_SUCCESS.png
deleted file mode 100644
index dd1bbf4..0000000
Binary files a/training/debug_plots/10sheep/ep019_SUCCESS.png and /dev/null differ
diff --git a/training/debug_plots/10sheep/ep020_SUCCESS.png b/training/debug_plots/10sheep/ep020_SUCCESS.png
deleted file mode 100644
index d31d1ef..0000000
Binary files a/training/debug_plots/10sheep/ep020_SUCCESS.png and /dev/null differ
diff --git a/training/diagnose.py b/training/diagnose.py
deleted file mode 100644
index 59022a1..0000000
--- a/training/diagnose.py
+++ /dev/null
@@ -1,223 +0,0 @@
-"""
-Episode-level diagnostics for the herding policy.
-
-Runs N episodes and for each one tracks:
-  - flock radius over time
-  - COM-to-pen distance over time
-  - dog position over time
-  - when (if ever) the flock first became compact
-  - failure mode classification
-
-Then produces:
-  1. Console summary of failure modes
-  2. Per-episode time-series plots (radius + com_dist)
-  3. Optional rendered playback of the worst episodes
-
-Usage
------
-    python diagnose.py --model runs/ppo_consolidation/final_model.zip \
-                       --vecnorm runs/ppo_consolidation/vecnorm.pkl \
-                       --n-sheep 5 --episodes 20
-
-    # Watch the policy live (first episode rendered):
-    python diagnose.py ... --render
-
-    # Save plots to a directory instead of showing interactively:
-    python diagnose.py ... --plot-dir debug_plots/
-"""
-
-import argparse
-import os
-import numpy as np
-import matplotlib
-import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
-
-from stable_baselines3 import PPO
-from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
-from herding_env import HerdingEnv
-
-
-# ── failure mode constants ────────────────────────────────────────────────────
-
-COMPACT_RADIUS = 5.0   # must match DRIVE_GATE_RADIUS in herding_env.py
-
-
-def classify_failure(ep_radius, ep_com_dist, n_penned, n_sheep, success):
-    if success:
-        return "SUCCESS"
-    if min(ep_radius) > COMPACT_RADIUS:
-        return "NEVER_COMPACT"         # flock was always too scattered
-    first_compact = next(i for i, r in enumerate(ep_radius) if r <= COMPACT_RADIUS)
-    min_com_after = min(ep_com_dist[first_compact:])
-    pen_close = 3.0   # COM within 3m of pen counts as "got close"
-    if min_com_after > pen_close:
-        return "COMPACT_CANT_DRIVE"    # compacted but never drove to pen
-    if n_penned == 0:
-        return "DROVE_NO_SHEEP"        # got near pen, nothing went in
-    return f"PARTIAL_{n_penned}of{n_sheep}"   # some in, not all
-
-
-# ── main ─────────────────────────────────────────────────────────────────────
-
-def parse_args():
-    p = argparse.ArgumentParser()
-    p.add_argument("--model",    required=True)
-    p.add_argument("--vecnorm",  default=None)
-    p.add_argument("--n-sheep",  type=int, default=5)
-    p.add_argument("--episodes", type=int, default=20)
-    p.add_argument("--max-steps", type=int, default=4000)
-    p.add_argument("--render",   action="store_true",
-                   help="Show matplotlib animation of the first episode")
-    p.add_argument("--plot-dir", default=None,
-                   help="Save time-series plots here (one per episode)")
-    p.add_argument("--seed",     type=int, default=0)
-    return p.parse_args()
-
-
-def make_env(n_sheep, max_steps, render_mode=None):
-    def _init():
-        return HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
-                          render_mode=render_mode)
-    return _init
-
-
-def main():
-    args = parse_args()
-
-    if args.plot_dir:
-        os.makedirs(args.plot_dir, exist_ok=True)
-        matplotlib.use("Agg")
-
-    render_mode = "human" if args.render else None
-    raw_env = DummyVecEnv([make_env(args.n_sheep, args.max_steps, render_mode)])
-
-    if args.vecnorm:
-        env = VecNormalize.load(args.vecnorm, raw_env)
-        env.training    = False
-        env.norm_reward = False
-    else:
-        env = raw_env
-
-    model = PPO.load(args.model, env=env)
-
-    failure_counts = {}
-    all_ep_data    = []
-
-    for ep in range(args.episodes):
-        obs   = env.reset()
-        done  = False
-        step  = 0
-
-        ep_radius   = []
-        ep_com_dist = []
-        ep_dog_x    = []
-        ep_dog_y    = []
-        ep_n_penned = []
-
-        while not done:
-            action, _ = model.predict(obs, deterministic=True)
-            obs, _, dones, infos = env.step(action)
-            done  = dones[0]
-            step += 1
-
-            inner = env.envs[0] if hasattr(env, "envs") else env.venv.envs[0]
-            com, radius, _ = inner._flock_stats()
-            com_dist = float(np.linalg.norm(com - inner.PEN_CENTER))
-            n_penned = int(inner.penned[:inner.n_sheep].sum())
-
-            ep_radius.append(radius)
-            ep_com_dist.append(com_dist)
-            ep_dog_x.append(float(inner.dog_pos[0]))
-            ep_dog_y.append(float(inner.dog_pos[1]))
-            ep_n_penned.append(n_penned)
-
-        info    = infos[0]
-        n_pen   = info.get("n_penned", 0)
-        n_sheep = info.get("n_sheep", args.n_sheep)
-        success = n_pen == n_sheep
-        mode    = classify_failure(ep_radius, ep_com_dist, n_pen, n_sheep, success)
-
-        failure_counts[mode] = failure_counts.get(mode, 0) + 1
-
-        compact_step = next((i for i, r in enumerate(ep_radius)
-                             if r <= COMPACT_RADIUS), None)
-        min_radius   = min(ep_radius)
-        min_com_dist = min(ep_com_dist)
-
-        print(f"  ep {ep+1:>3}  steps={step:>5}  penned={n_pen}/{n_sheep}"
-              f"  min_r={min_radius:.1f}m"
-              f"  min_com={min_com_dist:.1f}m"
-              f"  compact@step={compact_step if compact_step is not None else 'NEVER'}"
-              f"  [{mode}]")
-
-        all_ep_data.append(dict(
-            ep=ep, radius=ep_radius, com_dist=ep_com_dist,
-            dog_x=ep_dog_x, dog_y=ep_dog_y, n_penned=ep_n_penned,
-            steps=step, mode=mode, success=success,
-        ))
-
-        # ── per-episode time-series plot ──────────────────────────────────
-        if args.plot_dir or (not args.render and ep < 5):
-            fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
-            t = np.arange(len(ep_radius))
-
-            axes[0].plot(t, ep_radius, color="steelblue", label="flock radius (m)")
-            axes[0].axhline(COMPACT_RADIUS, color="orange", linestyle="--",
-                            label=f"compact threshold ({COMPACT_RADIUS}m)")
-            if compact_step is not None:
-                axes[0].axvline(compact_step, color="green", linestyle=":",
-                                alpha=0.6, label=f"first compact (step {compact_step})")
-            axes[0].set_ylabel("radius (m)")
-            axes[0].legend(fontsize=8)
-            axes[0].set_title(f"ep {ep+1} | n_sheep={n_sheep} | {mode}")
-
-            axes[1].plot(t, ep_com_dist, color="tomato", label="COM-to-pen dist (m)")
-            axes[1].set_ylabel("COM-to-pen (m)")
-            axes[1].set_xlabel("step")
-            axes[1].legend(fontsize=8)
-
-            plt.tight_layout()
-            if args.plot_dir:
-                fig.savefig(os.path.join(args.plot_dir, f"ep{ep+1:03d}_{mode}.png"),
-                            dpi=100)
-                plt.close(fig)
-            else:
-                plt.show(block=False)
-                plt.pause(0.5)
-
-    env.close()
-
-    # ── summary ──────────────────────────────────────────────────────────────
-    print("\n" + "=" * 55)
-    print(f"  Model   : {args.model}")
-    print(f"  n_sheep : {args.n_sheep}   episodes : {args.episodes}")
-    print("-" * 55)
-    total = sum(failure_counts.values())
-    for mode, cnt in sorted(failure_counts.items(), key=lambda x: -x[1]):
-        bar = "█" * cnt
-        print(f"  {mode:<26} {cnt:>3}/{total}  {bar}")
-    print("-" * 55)
-
-    never_compact = failure_counts.get("NEVER_COMPACT", 0)
-    cant_drive    = failure_counts.get("COMPACT_CANT_DRIVE", 0)
-    partial       = sum(v for k, v in failure_counts.items() if k.startswith("PARTIAL"))
-    successes     = failure_counts.get("SUCCESS", 0)
-
-    print(f"\n  Diagnosis:")
-    if never_compact / total > 0.5:
-        print("  ► COLLECT problem: dog rarely compacts the flock.")
-        print("    → Phase-gate W_DRIVE, increase W_COLLECT, check alignment reward.")
-    if cant_drive / total > 0.3:
-        print("  ► DRIVE problem: flock compacts but doesn't reach pen.")
-        print("    → Check dog alignment, pen direction, W_DRIVE magnitude.")
-    if partial / total > 0.3:
-        print("  ► PARTIAL problem: some sheep penned, stragglers remain.")
-        print("    → Flock splits; need better straggler-chasing behavior.")
-    if successes / total > 0.5:
-        print("  ► Mostly working! Fine-tune for consistency.")
-    print("=" * 55)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/training/eval_per_sheep.py b/training/eval_per_sheep.py
deleted file mode 100644
index 90779a3..0000000
--- a/training/eval_per_sheep.py
+++ /dev/null
@@ -1,109 +0,0 @@
-"""
-Load a saved run and evaluate the policy at every n_sheep from 1..N.
-Tells you exactly where the curriculum stopped working.
-
-Usage:
-    python eval_per_sheep.py --run-dir runs/ppo_v3
-    python eval_per_sheep.py --run-dir runs/ppo_v3 --max-sheep 10 --episodes 20
-    python eval_per_sheep.py --model runs/ppo_v3/final_model.zip \
-                              --vecnorm runs/ppo_v3/vecnorm.pkl
-"""
-import argparse
-import os
-from copy import deepcopy
-
-import numpy as np
-from stable_baselines3 import PPO
-from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
-
-from herding_env import HerdingEnv
-from train import _classify, COMPACT_RADIUS
-
-
-def evaluate(model, vn_template, n_sheep, n_episodes, max_steps):
-    raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep, max_steps=max_steps)])
-    vn  = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
-    vn.obs_rms = deepcopy(vn_template.obs_rms)
-    vn.ret_rms = deepcopy(vn_template.ret_rms)
-
-    failure = {}
-    successes = 0
-    act_mags, min_radii, min_dog_com, min_pen = [], [], [], []
-
-    for _ in range(n_episodes):
-        obs = vn.reset()
-        done = False
-        ep_radius, ep_com_dist, ep_dog_com, ep_act = [], [], [], []
-        while not done:
-            action, _ = model.predict(obs, deterministic=True)
-            obs, _, dones, infos = vn.step(action)
-            done = dones[0]
-            inner = vn.envs[0]
-            com, radius, _ = inner._flock_stats()
-            ep_radius.append(radius)
-            ep_com_dist.append(float(np.linalg.norm(com - inner.PEN_CENTER)))
-            ep_dog_com.append(float(np.linalg.norm(inner.dog_pos - com)))
-            ep_act.append(float(np.linalg.norm(action[0])))
-        npen = infos[0].get("n_penned", 0)
-        success = npen == n_sheep
-        successes += int(success)
-        mode = _classify(ep_radius, ep_com_dist, npen, n_sheep, success)
-        failure[mode] = failure.get(mode, 0) + 1
-        act_mags.extend(ep_act)
-        min_radii.append(min(ep_radius))
-        min_dog_com.append(min(ep_dog_com))
-        min_pen.append(min(ep_com_dist))
-    vn.close()
-
-    return {
-        "n_sheep": n_sheep,
-        "success_rate": successes / n_episodes,
-        "failure": failure,
-        "mean_action": float(np.mean(act_mags)),
-        "mean_min_radius": float(np.mean(min_radii)),
-        "mean_min_dog_com": float(np.mean(min_dog_com)),
-        "mean_min_pen": float(np.mean(min_pen)),
-    }
-
-
-def main():
-    p = argparse.ArgumentParser()
-    p.add_argument("--run-dir", type=str, default=None)
-    p.add_argument("--model", type=str, default=None)
-    p.add_argument("--vecnorm", type=str, default=None)
-    p.add_argument("--max-sheep", type=int, default=10)
-    p.add_argument("--episodes", type=int, default=10)
-    p.add_argument("--max-steps", type=int, default=2000)
-    args = p.parse_args()
-
-    if args.run_dir:
-        model_path = os.path.join(args.run_dir, "final_model.zip")
-        if not os.path.exists(model_path):
-            model_path = os.path.join(args.run_dir, "best_model", "best_model.zip")
-        vn_path = os.path.join(args.run_dir, "vecnorm.pkl")
-    else:
-        model_path = args.model
-        vn_path    = args.vecnorm
-
-    print(f"Loading model:   {model_path}")
-    print(f"Loading vecnorm: {vn_path}\n")
-    model = PPO.load(model_path, device="cpu")
-    raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=1, max_steps=args.max_steps)])
-    vn_template = VecNormalize.load(vn_path, raw)
-
-    print(f"{'n_sheep':>7} {'success':>8} {'act':>6} {'min_r':>7} "
-          f"{'dog→com':>8} {'com→pen':>8}  failure breakdown")
-    print("-" * 90)
-    for n in range(1, args.max_sheep + 1):
-        r = evaluate(model, vn_template, n, args.episodes, args.max_steps)
-        fb = " ".join(f"{m}={c}" for m, c in
-                      sorted(r["failure"].items(), key=lambda x: -x[1]))
-        print(f"{n:>7d} {r['success_rate']*100:>6.0f}% "
-              f"{r['mean_action']:>6.2f} "
-              f"{r['mean_min_radius']:>6.2f}m "
-              f"{r['mean_min_dog_com']:>7.2f}m "
-              f"{r['mean_min_pen']:>7.2f}m  {fb}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/training/evaluate.py b/training/evaluate.py
deleted file mode 100644
index 6fe7560..0000000
--- a/training/evaluate.py
+++ /dev/null
@@ -1,142 +0,0 @@
-"""
-Evaluation script for a trained herding policy.
-
-Runs N episodes and reports the three project metrics:
-  1. Success rate       — fraction of episodes where all sheep are penned
-  2. Time-to-pen        — mean steps across successful episodes (per sheep)
-  3. Flock dispersion   — mean pairwise distance among active sheep, averaged
-                          over all timesteps (lower = tighter herding)
-
-Usage
------
-    python evaluate.py --model runs/ppo_herding/best_model/best_model.zip \
-                       --vecnorm runs/ppo_herding/vecnorm.pkl \
-                       --n-sheep 5 --episodes 100
-
-Add --render to watch the first episode in a matplotlib window.
-"""
-
-import argparse
-
-import numpy as np
-from stable_baselines3 import PPO
-from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
-
-from herding_env import HerdingEnv
-
-
-def make_single_env(n_sheep: int, max_steps: int, render_mode: str = None):
-    def _init():
-        return HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
-                          render_mode=render_mode)
-    return _init
-
-
-def pairwise_mean(positions: np.ndarray, n_active: int) -> float:
-    """Mean pairwise distance among the first n_active sheep."""
-    if n_active < 2:
-        return 0.0
-    pts = positions[:n_active]
-    dists = []
-    for i in range(n_active):
-        for j in range(i + 1, n_active):
-            dists.append(float(np.linalg.norm(pts[i] - pts[j])))
-    return float(np.mean(dists))
-
-
-def parse_args():
-    p = argparse.ArgumentParser()
-    p.add_argument("--model",    required=True,
-                   help="Path to saved model .zip")
-    p.add_argument("--vecnorm",  default=None,
-                   help="Path to VecNormalize stats .pkl (optional)")
-    p.add_argument("--n-sheep",  type=int, default=1)
-    p.add_argument("--episodes", type=int, default=50)
-    p.add_argument("--max-steps", type=int, default=2000)
-    p.add_argument("--render",   action="store_true",
-                   help="Render first episode in matplotlib")
-    p.add_argument("--seed",     type=int, default=42)
-    return p.parse_args()
-
-
-def main():
-    args = parse_args()
-
-    render_mode = "human" if args.render else None
-    raw_env = DummyVecEnv([make_single_env(args.n_sheep, args.max_steps,
-                                           render_mode)])
-    if args.vecnorm:
-        env = VecNormalize.load(args.vecnorm, raw_env)
-        env.training  = False
-        env.norm_reward = False
-    else:
-        env = raw_env
-
-    model = PPO.load(args.model, env=env)
-
-    successes       = []
-    steps_to_pen    = []   # steps for successful episodes
-    dispersions     = []   # per-episode mean flock dispersion
-
-    for ep in range(args.episodes):
-        obs = env.reset()
-        done = False
-        ep_steps = 0
-        ep_dispersion = []
-        first_ep = ep == 0
-
-        while not done:
-            action, _ = model.predict(obs, deterministic=True)
-            obs, _, dones, infos = env.step(action)
-            done = dones[0]
-            ep_steps += 1
-
-            # Access the underlying HerdingEnv for dispersion calculation
-            inner = env.envs[0] if hasattr(env, "envs") else env.venv.envs[0]
-            if not inner.penned[:inner.n_sheep].all():
-                _, radius, _ = inner._flock_stats()
-                ep_dispersion.append(radius)
-
-            if first_ep and render_mode == "human":
-                pass   # render() is called inside step()
-
-        info = infos[0]
-        n_penned = info.get("n_penned", 0)
-        n_sheep  = info.get("n_sheep",  args.n_sheep)
-        success  = n_penned == n_sheep
-
-        successes.append(int(success))
-        if success:
-            steps_to_pen.append(ep_steps / n_sheep)
-        if ep_dispersion:
-            dispersions.append(float(np.mean(ep_dispersion)))
-
-        if (ep + 1) % 10 == 0:
-            print(f"  Episode {ep + 1:>4}/{args.episodes}  "
-                  f"success={int(success)}  steps={ep_steps}")
-
-    env.close()
-
-    # -----------------------------------------------------------------------
-    # Report
-    # -----------------------------------------------------------------------
-    success_rate = float(np.mean(successes))
-    mean_ttp     = float(np.mean(steps_to_pen)) if steps_to_pen else float("nan")
-    mean_disp    = float(np.mean(dispersions))   if dispersions  else float("nan")
-
-    print("\n" + "=" * 50)
-    print(f"  Model           : {args.model}")
-    print(f"  Sheep           : {args.n_sheep}")
-    print(f"  Episodes        : {args.episodes}")
-    print("-" * 50)
-    print(f"  Success rate    : {success_rate * 100:.1f}%"
-          f"  ({sum(successes)}/{args.episodes})")
-    print(f"  Time-to-pen     : {mean_ttp:.1f} steps/sheep"
-          f"  (successful episodes only)")
-    print(f"  Flock radius    : {mean_disp:.2f} m"
-          f"  (max sheep-to-COM distance while active)")
-    print("=" * 50)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/training/herding_env.py b/training/herding_env.py
index b038551..6665906 100644
--- a/training/herding_env.py
+++ b/training/herding_env.py
@@ -61,18 +61,19 @@ class HerdingEnv(gym.Env):
     W_COMPLETE  = 100.0  # all sheep penned
     W_STEP_COST = 0.02   # time penalty — strong enough to punish doing nothing
     W_COMPACT   = 0.0    # reward for flock-radius reduction (off by default)
-    W_WALL_TOUCH = 0.01  # per-sheep, per-step penalty when an active sheep is
-                         # pinned against the outside of a pen W/E wall. Kept
-                         # small (<step_cost) so the dog isn't incentivised to
-                         # hover above the entrance to avoid the penalty.
-    WALL_TOUCH_BUFFER = 0.3   # metres outside the wall counted as "touching"
+    W_WALL_TOUCH = 0.15  # per-sheep, per-step max penalty, reached at the wall
+                         # surface. It ramps linearly down to 0 over
+                         # WALL_TOUCH_BUFFER, giving the RL agent a gradient signal
+                         # to avoid pinning sheep against pen walls. 0.15 = 7.5 ×
+                         # W_STEP_COST — shapes behavior without swamping progress reward.
+    WALL_TOUCH_BUFFER = 0.8   # metres from wall where penalty starts ramping
     ALIGN_SHAPE = "standoff"   # "standoff" (peaks at IDEAL) | "near" (peaks at 0)
     ALIGN_GATED = True   # gate alignment on action magnitude
-    ENTRY_AWARE = True   # progress reward targets PEN_ENTRY (entrance face), not
-                         # PEN_CENTER. Stops the wall-corraling exploit: when a
-                         # sheep is shoved south past y=-8 outside the pen x-range,
-                         # distance to PEN_ENTRY grows (since target is at y=-8),
-                         # so progress reward goes negative instead of positive.
+    ENTRY_AWARE = False  # When True, progress/obs target PEN_ENTRY (entrance face)
+                         # instead of PEN_CENTER. Meant to stop the wall-corralling
+                         # exploit, but enabling it collapsed the success rate for
+                         # n_sheep ≥ 2. The wall-touch gradient penalty handles wall
+                         # avoidance without breaking the core herding signal.
 
     # Initial sheep spawn: first sheep placed anywhere; rest within CLUSTER_RADIUS
     # of it. Set to None for legacy uniform-scatter behaviour.
@@ -406,16 +407,25 @@ class HerdingEnv(gym.Env):
         else:
             alignment = 0.0
 
-        # Wall-touch penalty: count active sheep pinned against outside W/E pen walls.
+        # Wall-touch penalty: active sheep just outside a solid pen wall (west,
+        # east, south; within that wall's span only, so corners are exempt) pay
+        # a penalty ramping linearly from 0 at the buffer edge to W_WALL_TOUCH
+        # at the wall — a smooth signal to avoid pinning sheep against walls.
         if self.W_WALL_TOUCH and active.any():
             pts = self.sheep_pos[:self.n_sheep][active]
             px0, px1 = self.PEN_X
             py0, py1 = self.PEN_Y
-            in_y     = (pts[:, 1] > py0) & (pts[:, 1] < py1)
-            near_w   = (pts[:, 0] < px0) & (pts[:, 0] > px0 - self.WALL_TOUCH_BUFFER)
-            near_e   = (pts[:, 0] > px1) & (pts[:, 0] < px1 + self.WALL_TOUCH_BUFFER)
-            n_touch  = int(((near_w | near_e) & in_y).sum())
-            r_wall_touch = -n_touch * self.W_WALL_TOUCH
+            buf = self.WALL_TOUCH_BUFFER
+            far = buf + 1.0
+            d_w = np.where((pts[:, 0] < px0) & (pts[:, 1] > py0) & (pts[:, 1] < py1),
+                           px0 - pts[:, 0], far)
+            d_e = np.where((pts[:, 0] > px1) & (pts[:, 1] > py0) & (pts[:, 1] < py1),
+                           pts[:, 0] - px1, far)
+            d_s = np.where((pts[:, 1] < py0) & (pts[:, 0] > px0) & (pts[:, 0] < px1),
+                           py0 - pts[:, 1], far)
+            d_min = np.minimum(np.minimum(d_w, d_e), d_s)
+            penalties = np.maximum(0.0, 1.0 - d_min / buf) * self.W_WALL_TOUCH
+            r_wall_touch = -float(penalties.sum())
         else:
             r_wall_touch = 0.0
 
diff --git a/training/replay_config.py b/training/replay_config.py
deleted file mode 100644
index 08a151d..0000000
--- a/training/replay_config.py
+++ /dev/null
@@ -1,172 +0,0 @@
-"""
-Replay a reward config from the sweep with a longer training budget.
-
-Tells you whether a promising sweep config was bottlenecked by training time
-vs. structurally limited. If sr2/sr3 climb past their sweep numbers given more
-budget, the issue was budget; if they plateau, the policy/obs needs work.
-
-Usage
------
-    python replay_config.py --config runs/sweep_<ts>/best.json
-    python replay_config.py --config runs/sweep_<ts>/trial_007/config.json \
-        --max-sheep 4 --steps-per-stage 1500000
-
-Argument summary:
-    --config           JSON file with the reward config (sweep best.json works)
-    --max-sheep        Final curriculum stage (default 3)
-    --steps-per-stage  Env steps per curriculum stage (default 1.5M)
-    --n-envs           Parallel envs (default 8)
-    --eval-episodes    Per-stage eval episodes (default 30)
-    --run-dir          Output directory (default runs/replay_<ts>/)
-"""
-import argparse
-import json
-import os
-import time
-from copy import deepcopy
-
-import numpy as np
-from stable_baselines3 import PPO
-from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize
-
-from herding_env import HerdingEnv
-from sweep_reward import ProgressCallback, reward_cfg, evaluate, make_env
-
-
-def main():
-    p = argparse.ArgumentParser()
-    p.add_argument("--config", type=str, required=True,
-                   help="Reward config JSON (sweep best.json or trial config.json)")
-    p.add_argument("--start-sheep", type=int, default=1)
-    p.add_argument("--max-sheep", type=int, default=3)
-    p.add_argument("--steps-per-stage", type=int, default=1_500_000)
-    p.add_argument("--mixed", action="store_true",
-                   help="Train with n_sheep randomized per episode (no curriculum). "
-                        "Total train steps = steps-per-stage * max_sheep.")
-    p.add_argument("--final-mixed-steps", type=int, default=0,
-                   help="After the curriculum, train this many extra steps with "
-                        "random_n_sheep ∈ [1, max_sheep] to consolidate the policy "
-                        "across all flock sizes. Re-evaluates all n_sheep at the end.")
-    p.add_argument("--n-envs", type=int, default=8)
-    p.add_argument("--max-steps", type=int, default=2500)
-    p.add_argument("--eval-episodes", type=int, default=30)
-    p.add_argument("--run-dir", type=str, default=None)
-    args = p.parse_args()
-
-    with open(args.config) as f:
-        raw = json.load(f)
-    cfg = raw["config"] if "config" in raw and isinstance(raw["config"], dict) else raw
-    rcfg = reward_cfg(cfg)
-    print(f"Config: {cfg}")
-
-    run_dir = args.run_dir or os.path.join(
-        "runs", "replay_" + time.strftime("%Y%m%d_%H%M%S")
-    )
-    os.makedirs(run_dir, exist_ok=True)
-    with open(os.path.join(run_dir, "config.json"), "w") as f:
-        json.dump(cfg, f, indent=2)
-    print(f"Run dir: {run_dir}")
-    if args.mixed:
-        print(f"MIXED training: random n_sheep ∈ [1, {args.max_sheep}], "
-              f"{args.steps_per_stage * args.max_sheep:,} total steps")
-    else:
-        print(f"Curriculum: {args.start_sheep} → {args.max_sheep} sheep, "
-              f"{args.steps_per_stage:,} steps/stage")
-
-    train_env = SubprocVecEnv([
-        make_env(args.max_sheep if args.mixed else args.start_sheep,
-                 seed=i, max_steps=args.max_steps, rcfg=rcfg,
-                 random_n_sheep=args.mixed)
-        for i in range(args.n_envs)
-    ])
-    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
-
-    model = PPO(
-        "MlpPolicy", vn,
-        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
-        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
-        ent_coef=cfg["ent_coef"], vf_coef=0.5, max_grad_norm=0.5,
-        policy_kwargs=dict(net_arch=[256, 256]),
-        verbose=0,
-    )
-
-    stage_results = []
-    t0 = time.time()
-    try:
-        if args.mixed:
-            total = args.steps_per_stage * args.max_sheep
-            print(f"\n[Mixed] training {total:,} steps")
-            model.learn(
-                total_timesteps=total,
-                reset_num_timesteps=True,
-                callback=ProgressCallback(0, "mixed", freq=100_000),
-            )
-            for n in range(1, args.max_sheep + 1):
-                print(f"[Mixed] evaluating n={n}, {args.eval_episodes} eps")
-                r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
-                print(f"[Mixed] n_sheep={n}  sr={r['sr']*100:.0f}%  "
-                      f"mean_len={r['mean_len']:.0f}  "
-                      f"mean_min_pen={r['mean_min_pen']:.1f}m  "
-                      f"mean_act={r['mean_act']:.2f}")
-                stage_results.append({"n_sheep": n, **r})
-        else:
-            for n in range(args.start_sheep, args.max_sheep + 1):
-                if n > args.start_sheep:
-                    vn.env_method("set_n_sheep", n)
-                print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps")
-                model.learn(
-                    total_timesteps=args.steps_per_stage,
-                    reset_num_timesteps=(n == args.start_sheep),
-                    callback=ProgressCallback(0, f"{n} sheep", freq=100_000),
-                )
-                print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
-                r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
-                print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}%  "
-                      f"mean_len={r['mean_len']:.0f}  "
-                      f"mean_min_pen={r['mean_min_pen']:.1f}m  "
-                      f"mean_act={r['mean_act']:.2f}")
-                stage_results.append({"n_sheep": n, **r})
-
-        # Optional consolidation pass with mixed n_sheep — fixes specialization
-        # imbalance from curriculum order (e.g. n=1 weakness after long n=10
-        # training). Replaces stage_results with the post-consolidation eval.
-        if args.final_mixed_steps > 0 and not args.mixed:
-            print(f"\n[Consolidation] mixed n_sheep ∈ [1, {args.max_sheep}], "
-                  f"{args.final_mixed_steps:,} steps")
-            vn.env_method("__setattr__", "random_n_sheep", True)
-            model.learn(
-                total_timesteps=args.final_mixed_steps,
-                reset_num_timesteps=False,
-                callback=ProgressCallback(0, "consolidate", freq=100_000),
-            )
-            print("[Consolidation] re-evaluating all sheep counts")
-            stage_results = []
-            for n in range(1, args.max_sheep + 1):
-                r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
-                print(f"[Consolidation] n_sheep={n}  sr={r['sr']*100:.0f}%  "
-                      f"mean_len={r['mean_len']:.0f}  "
-                      f"mean_min_pen={r['mean_min_pen']:.1f}m  "
-                      f"mean_act={r['mean_act']:.2f}")
-                stage_results.append({"n_sheep": n, **r})
-
-        model.save(os.path.join(run_dir, "final_model"))
-        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
-        with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
-            json.dump(stage_results, f, indent=2)
-    finally:
-        try: vn.close()
-        except Exception: pass
-
-    print("\n" + "=" * 60)
-    print("  REPLAY SUMMARY")
-    print("=" * 60)
-    for r in stage_results:
-        print(f"  n_sheep={r['n_sheep']}  sr={r['sr']*100:>3.0f}%  "
-              f"len={r['mean_len']:>5.0f}  min_pen={r['mean_min_pen']:>5.1f}m  "
-              f"act={r['mean_act']:.2f}")
-    print(f"\n  Total time: {(time.time()-t0)/60:.1f} min")
-    print(f"  Artefacts:  {run_dir}/")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/training/runs/expA_fresh2.log b/training/runs/expA_fresh2.log
deleted file mode 100644
index 6ea1c89..0000000
--- a/training/runs/expA_fresh2.log
+++ /dev/null
@@ -1,35 +0,0 @@
-Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-Run dir: runs/expA_fresh2
-Curriculum: 2 → 2 sheep, 2,000,000 steps/stage
-
-[Stage n_sheep=2] training 2,000,000 steps
-           ... [trial 1 | 2 sheep | 100,000 steps | ret(last 50)=-13.44  sr=0%]
-           ... [trial 1 | 2 sheep | 200,000 steps | ret(last 50)=-14.60  sr=0%]
-           ... [trial 1 | 2 sheep | 300,000 steps | ret(last 50)=-17.36  sr=0%]
-           ... [trial 1 | 2 sheep | 400,000 steps | ret(last 50)=-17.36  sr=0%]
-           ... [trial 1 | 2 sheep | 500,000 steps | ret(last 50)=-17.92  sr=0%]
-           ... [trial 1 | 2 sheep | 600,000 steps | ret(last 50)=-15.65  sr=0%]
-           ... [trial 1 | 2 sheep | 700,000 steps | ret(last 50)=-17.69  sr=2%]
-           ... [trial 1 | 2 sheep | 800,000 steps | ret(last 50)=-14.61  sr=2%]
-           ... [trial 1 | 2 sheep | 900,000 steps | ret(last 50)=-17.36  sr=0%]
-           ... [trial 1 | 2 sheep | 1,000,000 steps | ret(last 50)=-17.44  sr=0%]
-           ... [trial 1 | 2 sheep | 1,100,000 steps | ret(last 50)=-15.91  sr=2%]
-           ... [trial 1 | 2 sheep | 1,200,000 steps | ret(last 50)=-16.08  sr=0%]
-           ... [trial 1 | 2 sheep | 1,300,000 steps | ret(last 50)=-14.34  sr=0%]
-           ... [trial 1 | 2 sheep | 1,400,000 steps | ret(last 50)=-17.00  sr=2%]
-           ... [trial 1 | 2 sheep | 1,500,000 steps | ret(last 50)=-18.52  sr=0%]
-           ... [trial 1 | 2 sheep | 1,600,000 steps | ret(last 50)=-16.68  sr=0%]
-           ... [trial 1 | 2 sheep | 1,700,000 steps | ret(last 50)=-17.52  sr=0%]
-           ... [trial 1 | 2 sheep | 1,800,000 steps | ret(last 50)=-17.33  sr=0%]
-           ... [trial 1 | 2 sheep | 1,900,000 steps | ret(last 50)=-14.96  sr=2%]
-           ... [trial 1 | 2 sheep | 2,000,000 steps | ret(last 50)=-15.59  sr=0%]
-[Stage n_sheep=2] evaluating 30 eps
-[Stage n_sheep=2] sr=0%  mean_len=1500  mean_min_pen=13.2m  mean_act=0.96
-
-============================================================
-  REPLAY SUMMARY
-============================================================
-  n_sheep=2  sr=  0%  len= 1500  min_pen= 13.2m  act=0.96
-
-  Total time: 10.7 min
-  Artefacts:  runs/expA_fresh2/
diff --git a/training/runs/expA_fresh2/config.json b/training/runs/expA_fresh2/config.json
deleted file mode 100644
index b2d15fe..0000000
--- a/training/runs/expA_fresh2/config.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "W_PER_SHEEP": 1.0,
-  "W_ALIGN": 0.0,
-  "W_PEN_BONUS": 5.0,
-  "W_STEP_COST": 0.02,
-  "W_COMPLETE": 200.0,
-  "W_COMPACT": 1.5,
-  "ALIGN_SHAPE": "standoff",
-  "ALIGN_GATED": false,
-  "ent_coef": 0.02
-}
\ No newline at end of file
diff --git a/training/runs/expA_fresh2/final_model.zip b/training/runs/expA_fresh2/final_model.zip
deleted file mode 100644
index 3d8a3e3..0000000
Binary files a/training/runs/expA_fresh2/final_model.zip and /dev/null differ
diff --git a/training/runs/expA_fresh2/stage_results.json b/training/runs/expA_fresh2/stage_results.json
deleted file mode 100644
index 323888a..0000000
--- a/training/runs/expA_fresh2/stage_results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-[
-  {
-    "n_sheep": 2,
-    "sr": 0.0,
-    "mean_len": 1500.0,
-    "mean_min_pen": 13.171057415008544,
-    "mean_act": 0.960968065615257
-  }
-]
\ No newline at end of file
diff --git a/training/runs/expA_fresh2/vecnorm.pkl b/training/runs/expA_fresh2/vecnorm.pkl
deleted file mode 100644
index 5e15a6d..0000000
Binary files a/training/runs/expA_fresh2/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/expB_mixed.log b/training/runs/expB_mixed.log
deleted file mode 100644
index 02c2b65..0000000
--- a/training/runs/expB_mixed.log
+++ /dev/null
@@ -1,51 +0,0 @@
-Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-Run dir: runs/expB_mixed
-MIXED training: random n_sheep ∈ [1, 3], 3,000,000 total steps
-
-[Mixed] training 3,000,000 steps
-           ... [trial 1 | mixed | 100,000 steps | ret(last 50)=-13.68  sr=2%]
-           ... [trial 1 | mixed | 200,000 steps | ret(last 50)=-14.08  sr=0%]
-           ... [trial 1 | mixed | 300,000 steps | ret(last 50)=-9.80  sr=0%]
-           ... [trial 1 | mixed | 400,000 steps | ret(last 50)=-11.20  sr=0%]
-           ... [trial 1 | mixed | 500,000 steps | ret(last 50)=-10.61  sr=0%]
-           ... [trial 1 | mixed | 600,000 steps | ret(last 50)=-11.19  sr=0%]
-           ... [trial 1 | mixed | 700,000 steps | ret(last 50)=-14.22  sr=0%]
-           ... [trial 1 | mixed | 800,000 steps | ret(last 50)=-6.31  sr=0%]
-           ... [trial 1 | mixed | 900,000 steps | ret(last 50)=-12.68  sr=0%]
-           ... [trial 1 | mixed | 1,000,000 steps | ret(last 50)=-11.06  sr=0%]
-           ... [trial 1 | mixed | 1,100,000 steps | ret(last 50)=-13.39  sr=0%]
-           ... [trial 1 | mixed | 1,200,000 steps | ret(last 50)=-14.20  sr=0%]
-           ... [trial 1 | mixed | 1,300,000 steps | ret(last 50)=-11.33  sr=0%]
-           ... [trial 1 | mixed | 1,400,000 steps | ret(last 50)=-10.73  sr=0%]
-           ... [trial 1 | mixed | 1,500,000 steps | ret(last 50)=-10.91  sr=0%]
-           ... [trial 1 | mixed | 1,600,000 steps | ret(last 50)=-10.44  sr=0%]
-           ... [trial 1 | mixed | 1,700,000 steps | ret(last 50)=-10.56  sr=0%]
-           ... [trial 1 | mixed | 1,800,000 steps | ret(last 50)=-15.74  sr=0%]
-           ... [trial 1 | mixed | 1,900,000 steps | ret(last 50)=-13.46  sr=0%]
-           ... [trial 1 | mixed | 2,000,000 steps | ret(last 50)=-9.86  sr=0%]
-           ... [trial 1 | mixed | 2,100,000 steps | ret(last 50)=-13.07  sr=0%]
-           ... [trial 1 | mixed | 2,200,000 steps | ret(last 50)=-9.86  sr=0%]
-           ... [trial 1 | mixed | 2,300,000 steps | ret(last 50)=-9.73  sr=2%]
-           ... [trial 1 | mixed | 2,400,000 steps | ret(last 50)=-12.21  sr=0%]
-           ... [trial 1 | mixed | 2,500,000 steps | ret(last 50)=-14.27  sr=0%]
-           ... [trial 1 | mixed | 2,600,000 steps | ret(last 50)=-10.90  sr=2%]
-           ... [trial 1 | mixed | 2,700,000 steps | ret(last 50)=-9.67  sr=0%]
-           ... [trial 1 | mixed | 2,800,000 steps | ret(last 50)=-14.29  sr=0%]
-           ... [trial 1 | mixed | 2,900,000 steps | ret(last 50)=-9.08  sr=0%]
-           ... [trial 1 | mixed | 3,000,000 steps | ret(last 50)=-11.62  sr=6%]
-[Mixed] evaluating n=1, 30 eps
-[Mixed] n_sheep=1  sr=0%  mean_len=1500  mean_min_pen=12.1m  mean_act=0.64
-[Mixed] evaluating n=2, 30 eps
-[Mixed] n_sheep=2  sr=0%  mean_len=1500  mean_min_pen=13.6m  mean_act=1.12
-[Mixed] evaluating n=3, 30 eps
-[Mixed] n_sheep=3  sr=0%  mean_len=1500  mean_min_pen=13.3m  mean_act=1.02
-
-============================================================
-  REPLAY SUMMARY
-============================================================
-  n_sheep=1  sr=  0%  len= 1500  min_pen= 12.1m  act=0.64
-  n_sheep=2  sr=  0%  len= 1500  min_pen= 13.6m  act=1.12
-  n_sheep=3  sr=  0%  len= 1500  min_pen= 13.3m  act=1.02
-
-  Total time: 20.6 min
-  Artefacts:  runs/expB_mixed/
diff --git a/training/runs/expB_mixed/config.json b/training/runs/expB_mixed/config.json
deleted file mode 100644
index b2d15fe..0000000
--- a/training/runs/expB_mixed/config.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "W_PER_SHEEP": 1.0,
-  "W_ALIGN": 0.0,
-  "W_PEN_BONUS": 5.0,
-  "W_STEP_COST": 0.02,
-  "W_COMPLETE": 200.0,
-  "W_COMPACT": 1.5,
-  "ALIGN_SHAPE": "standoff",
-  "ALIGN_GATED": false,
-  "ent_coef": 0.02
-}
\ No newline at end of file
diff --git a/training/runs/expB_mixed/final_model.zip b/training/runs/expB_mixed/final_model.zip
deleted file mode 100644
index 211707c..0000000
Binary files a/training/runs/expB_mixed/final_model.zip and /dev/null differ
diff --git a/training/runs/expB_mixed/stage_results.json b/training/runs/expB_mixed/stage_results.json
deleted file mode 100644
index 735c94e..0000000
--- a/training/runs/expB_mixed/stage_results.json
+++ /dev/null
@@ -1,23 +0,0 @@
-[
-  {
-    "n_sheep": 1,
-    "sr": 0.0,
-    "mean_len": 1500.0,
-    "mean_min_pen": 12.136781152089437,
-    "mean_act": 0.6380681545449439
-  },
-  {
-    "n_sheep": 2,
-    "sr": 0.0,
-    "mean_len": 1500.0,
-    "mean_min_pen": 13.609641806284587,
-    "mean_act": 1.1225489819858792
-  },
-  {
-    "n_sheep": 3,
-    "sr": 0.0,
-    "mean_len": 1500.0,
-    "mean_min_pen": 13.337443319956462,
-    "mean_act": 1.0186407331574738
-  }
-]
\ No newline at end of file
diff --git a/training/runs/expB_mixed/vecnorm.pkl b/training/runs/expB_mixed/vecnorm.pkl
deleted file mode 100644
index 9bb6497..0000000
Binary files a/training/runs/expB_mixed/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/expC_clustered.log b/training/runs/expC_clustered.log
deleted file mode 100644
index 424303f..0000000
--- a/training/runs/expC_clustered.log
+++ /dev/null
@@ -1,57 +0,0 @@
-Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-Run dir: runs/expC_clustered
-Curriculum: 1 → 3 sheep, 1,000,000 steps/stage
-
-[Stage n_sheep=1] training 1,000,000 steps
-           ... [trial 1 | 1 sheep | 100,000 steps | ret(last 50)=-17.04  sr=6%]
-           ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-17.39  sr=4%]
-           ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=-15.50  sr=4%]
-           ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=-2.07  sr=26%]
-           ... [trial 1 | 1 sheep | 500,000 steps | ret(last 50)=+3.81  sr=52%]
-           ... [trial 1 | 1 sheep | 600,000 steps | ret(last 50)=+8.03  sr=76%]
-           ... [trial 1 | 1 sheep | 700,000 steps | ret(last 50)=+9.49  sr=86%]
-           ... [trial 1 | 1 sheep | 800,000 steps | ret(last 50)=+9.42  sr=88%]
-           ... [trial 1 | 1 sheep | 900,000 steps | ret(last 50)=+9.49  sr=88%]
-           ... [trial 1 | 1 sheep | 1,000,000 steps | ret(last 50)=+10.34  sr=94%]
-[Stage n_sheep=1] evaluating 30 eps
-[Stage n_sheep=1] sr=83%  mean_len=519  mean_min_pen=3.5m  mean_act=0.25
-
-[Stage n_sheep=2] training 1,000,000 steps
-           ... [trial 1 | 2 sheep | 1,015,816 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 1 | 2 sheep | 1,115,816 steps | ret(last 50)=-0.13  sr=10%]
-           ... [trial 1 | 2 sheep | 1,215,816 steps | ret(last 50)=-1.23  sr=10%]
-           ... [trial 1 | 2 sheep | 1,315,816 steps | ret(last 50)=-0.10  sr=6%]
-           ... [trial 1 | 2 sheep | 1,415,816 steps | ret(last 50)=+4.10  sr=28%]
-           ... [trial 1 | 2 sheep | 1,515,816 steps | ret(last 50)=+6.24  sr=32%]
-           ... [trial 1 | 2 sheep | 1,615,816 steps | ret(last 50)=+8.48  sr=52%]
-           ... [trial 1 | 2 sheep | 1,715,816 steps | ret(last 50)=+14.14  sr=98%]
-           ... [trial 1 | 2 sheep | 1,815,816 steps | ret(last 50)=+14.33  sr=98%]
-           ... [trial 1 | 2 sheep | 1,915,816 steps | ret(last 50)=+14.02  sr=100%]
-           ... [trial 1 | 2 sheep | 2,015,816 steps | ret(last 50)=+14.05  sr=100%]
-[Stage n_sheep=2] evaluating 30 eps
-[Stage n_sheep=2] sr=100%  mean_len=695  mean_min_pen=3.4m  mean_act=0.58
-
-[Stage n_sheep=3] training 1,000,000 steps
-           ... [trial 1 | 3 sheep | 2,031,624 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 1 | 3 sheep | 2,131,624 steps | ret(last 50)=+10.43  sr=56%]
-           ... [trial 1 | 3 sheep | 2,231,624 steps | ret(last 50)=+13.91  sr=74%]
-           ... [trial 1 | 3 sheep | 2,331,624 steps | ret(last 50)=+13.98  sr=76%]
-           ... [trial 1 | 3 sheep | 2,431,624 steps | ret(last 50)=+12.67  sr=68%]
-           ... [trial 1 | 3 sheep | 2,531,624 steps | ret(last 50)=+15.79  sr=90%]
-           ... [trial 1 | 3 sheep | 2,631,624 steps | ret(last 50)=+16.29  sr=94%]
-           ... [trial 1 | 3 sheep | 2,731,624 steps | ret(last 50)=+15.47  sr=90%]
-           ... [trial 1 | 3 sheep | 2,831,624 steps | ret(last 50)=+16.67  sr=96%]
-           ... [trial 1 | 3 sheep | 2,931,624 steps | ret(last 50)=+17.50  sr=100%]
-           ... [trial 1 | 3 sheep | 3,031,624 steps | ret(last 50)=+16.49  sr=96%]
-[Stage n_sheep=3] evaluating 30 eps
-[Stage n_sheep=3] sr=90%  mean_len=794  mean_min_pen=3.7m  mean_act=0.47
-
-============================================================
-  REPLAY SUMMARY
-============================================================
-  n_sheep=1  sr= 83%  len=  519  min_pen=  3.5m  act=0.25
-  n_sheep=2  sr=100%  len=  695  min_pen=  3.4m  act=0.58
-  n_sheep=3  sr= 90%  len=  794  min_pen=  3.7m  act=0.47
-
-  Total time: 15.1 min
-  Artefacts:  runs/expC_clustered/
diff --git a/training/runs/expC_clustered/config.json b/training/runs/expC_clustered/config.json
deleted file mode 100644
index b2d15fe..0000000
--- a/training/runs/expC_clustered/config.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "W_PER_SHEEP": 1.0,
-  "W_ALIGN": 0.0,
-  "W_PEN_BONUS": 5.0,
-  "W_STEP_COST": 0.02,
-  "W_COMPLETE": 200.0,
-  "W_COMPACT": 1.5,
-  "ALIGN_SHAPE": "standoff",
-  "ALIGN_GATED": false,
-  "ent_coef": 0.02
-}
\ No newline at end of file
diff --git a/training/runs/expC_clustered/final_model.zip b/training/runs/expC_clustered/final_model.zip
deleted file mode 100644
index 86d9208..0000000
Binary files a/training/runs/expC_clustered/final_model.zip and /dev/null differ
diff --git a/training/runs/expC_clustered/stage_results.json b/training/runs/expC_clustered/stage_results.json
deleted file mode 100644
index 7614958..0000000
--- a/training/runs/expC_clustered/stage_results.json
+++ /dev/null
@@ -1,23 +0,0 @@
-[
-  {
-    "n_sheep": 1,
-    "sr": 0.8333333333333334,
-    "mean_len": 518.5333333333333,
-    "mean_min_pen": 3.5244259238243103,
-    "mean_act": 0.25044742608759274
-  },
-  {
-    "n_sheep": 2,
-    "sr": 1.0,
-    "mean_len": 694.9,
-    "mean_min_pen": 3.4314632336298625,
-    "mean_act": 0.5796192060058971
-  },
-  {
-    "n_sheep": 3,
-    "sr": 0.9,
-    "mean_len": 794.1333333333333,
-    "mean_min_pen": 3.6645382324854534,
-    "mean_act": 0.46590614892287907
-  }
-]
\ No newline at end of file
diff --git a/training/runs/expC_clustered/vecnorm.pkl b/training/runs/expC_clustered/vecnorm.pkl
deleted file mode 100644
index 0cffe9b..0000000
Binary files a/training/runs/expC_clustered/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/final_v2.log b/training/runs/final_v2.log
deleted file mode 100644
index 39cf38e..0000000
--- a/training/runs/final_v2.log
+++ /dev/null
@@ -1,219 +0,0 @@
-Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-Run dir: runs/final_v2
-Curriculum: 1 → 10 sheep, 1,500,000 steps/stage
-
-[Stage n_sheep=1] training 1,500,000 steps
-           ... [trial 1 | 1 sheep | 100,000 steps | ret(last 41)=-38.49  win_sr=10%  cum_sr=10%]
-           ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-32.87  win_sr=8%  cum_sr=9%]
-           ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=-33.60  win_sr=4%  cum_sr=7%]
-           ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=-34.78  win_sr=8%  cum_sr=7%]
-           ... [trial 1 | 1 sheep | 500,000 steps | ret(last 50)=-31.25  win_sr=12%  cum_sr=8%]
-           ... [trial 1 | 1 sheep | 600,000 steps | ret(last 50)=-32.87  win_sr=2%  cum_sr=7%]
-           ... [trial 1 | 1 sheep | 700,000 steps | ret(last 50)=-33.25  win_sr=6%  cum_sr=7%]
-           ... [trial 1 | 1 sheep | 800,000 steps | ret(last 50)=-27.80  win_sr=16%  cum_sr=8%]
-           ... [trial 1 | 1 sheep | 900,000 steps | ret(last 50)=-27.44  win_sr=14%  cum_sr=9%]
-           ... [trial 1 | 1 sheep | 1,000,000 steps | ret(last 50)=-30.52  win_sr=6%  cum_sr=9%]
-           ... [trial 1 | 1 sheep | 1,100,000 steps | ret(last 50)=-24.75  win_sr=20%  cum_sr=10%]
-           ... [trial 1 | 1 sheep | 1,200,000 steps | ret(last 50)=-29.94  win_sr=4%  cum_sr=10%]
-           ... [trial 1 | 1 sheep | 1,300,000 steps | ret(last 50)=-22.72  win_sr=22%  cum_sr=11%]
-           ... [trial 1 | 1 sheep | 1,400,000 steps | ret(last 50)=-9.84  win_sr=46%  cum_sr=14%]
-           ... [trial 1 | 1 sheep | 1,500,000 steps | ret(last 50)=+10.01  win_sr=96%  cum_sr=24%]
-[Stage n_sheep=1] evaluating 30 eps
-[Stage n_sheep=1] sr=97%  mean_len=351  mean_min_pen=3.9m  mean_act=0.28
-
-[Stage n_sheep=2] training 1,500,000 steps
-           ... [trial 1 | 2 sheep | 1,507,336 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 2 sheep | 1,607,336 steps | ret(last 43)=-4.11  win_sr=33%  cum_sr=33%]
-           ... [trial 1 | 2 sheep | 1,707,336 steps | ret(last 50)=-0.34  win_sr=36%  cum_sr=34%]
-           ... [trial 1 | 2 sheep | 1,807,336 steps | ret(last 50)=+14.73  win_sr=92%  cum_sr=62%]
-           ... [trial 1 | 2 sheep | 1,907,336 steps | ret(last 50)=+17.38  win_sr=100%  cum_sr=76%]
-           ... [trial 1 | 2 sheep | 2,007,336 steps | ret(last 50)=+16.80  win_sr=100%  cum_sr=83%]
-           ... [trial 1 | 2 sheep | 2,107,336 steps | ret(last 50)=+15.67  win_sr=100%  cum_sr=87%]
-           ... [trial 1 | 2 sheep | 2,207,336 steps | ret(last 50)=+15.39  win_sr=100%  cum_sr=90%]
-           ... [trial 1 | 2 sheep | 2,307,336 steps | ret(last 50)=+15.58  win_sr=100%  cum_sr=92%]
-           ... [trial 1 | 2 sheep | 2,407,336 steps | ret(last 50)=+15.01  win_sr=100%  cum_sr=93%]
-           ... [trial 1 | 2 sheep | 2,507,336 steps | ret(last 50)=+15.50  win_sr=100%  cum_sr=94%]
-           ... [trial 1 | 2 sheep | 2,607,336 steps | ret(last 50)=+15.21  win_sr=100%  cum_sr=95%]
-           ... [trial 1 | 2 sheep | 2,707,336 steps | ret(last 50)=+15.22  win_sr=100%  cum_sr=95%]
-           ... [trial 1 | 2 sheep | 2,807,336 steps | ret(last 50)=+15.05  win_sr=100%  cum_sr=96%]
-           ... [trial 1 | 2 sheep | 2,907,336 steps | ret(last 50)=+14.37  win_sr=100%  cum_sr=96%]
-           ... [trial 1 | 2 sheep | 3,007,336 steps | ret(last 50)=+14.70  win_sr=100%  cum_sr=97%]
-[Stage n_sheep=2] evaluating 30 eps
-[Stage n_sheep=2] sr=100%  mean_len=421  mean_min_pen=3.5m  mean_act=1.01
-
-[Stage n_sheep=3] training 1,500,000 steps
-           ... [trial 1 | 3 sheep | 3,014,664 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 3 sheep | 3,114,664 steps | ret(last 50)=+16.52  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 3 sheep | 3,214,664 steps | ret(last 50)=+16.74  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,314,664 steps | ret(last 50)=+17.09  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,414,664 steps | ret(last 50)=+16.90  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,514,664 steps | ret(last 50)=+16.97  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,614,664 steps | ret(last 50)=+17.20  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,714,664 steps | ret(last 50)=+17.09  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,814,664 steps | ret(last 50)=+17.12  win_sr=98%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,914,664 steps | ret(last 50)=+17.17  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,014,664 steps | ret(last 50)=+16.25  win_sr=98%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,114,664 steps | ret(last 50)=+17.04  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,214,664 steps | ret(last 50)=+16.31  win_sr=98%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,314,664 steps | ret(last 50)=+16.82  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,414,664 steps | ret(last 50)=+16.49  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,514,664 steps | ret(last 50)=+16.54  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=3] evaluating 30 eps
-[Stage n_sheep=3] sr=100%  mean_len=608  mean_min_pen=3.5m  mean_act=1.06
-
-[Stage n_sheep=4] training 1,500,000 steps
-           ... [trial 1 | 4 sheep | 4,521,992 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 4 sheep | 4,621,992 steps | ret(last 50)=+18.55  win_sr=98%  cum_sr=94%]
-           ... [trial 1 | 4 sheep | 4,721,992 steps | ret(last 50)=+19.17  win_sr=100%  cum_sr=97%]
-           ... [trial 1 | 4 sheep | 4,821,992 steps | ret(last 50)=+18.64  win_sr=100%  cum_sr=98%]
-           ... [trial 1 | 4 sheep | 4,921,992 steps | ret(last 50)=+19.06  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 4 sheep | 5,021,992 steps | ret(last 50)=+19.01  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 4 sheep | 5,121,992 steps | ret(last 50)=+19.23  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 4 sheep | 5,221,992 steps | ret(last 50)=+18.71  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 4 sheep | 5,321,992 steps | ret(last 50)=+18.81  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 4 sheep | 5,421,992 steps | ret(last 50)=+19.51  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 4 sheep | 5,521,992 steps | ret(last 50)=+19.01  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,621,992 steps | ret(last 50)=+19.21  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,721,992 steps | ret(last 50)=+18.62  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,821,992 steps | ret(last 50)=+18.57  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,921,992 steps | ret(last 50)=+19.22  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 6,021,992 steps | ret(last 50)=+18.73  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=4] evaluating 30 eps
-[Stage n_sheep=4] sr=100%  mean_len=874  mean_min_pen=3.3m  mean_act=1.23
-
-[Stage n_sheep=5] training 1,500,000 steps
-           ... [trial 1 | 5 sheep | 6,029,320 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 5 sheep | 6,129,320 steps | ret(last 50)=+22.70  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,229,320 steps | ret(last 50)=+20.82  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,329,320 steps | ret(last 50)=+20.84  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,429,320 steps | ret(last 50)=+21.70  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,529,320 steps | ret(last 50)=+21.25  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,629,320 steps | ret(last 50)=+20.61  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,729,320 steps | ret(last 50)=+21.10  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,829,320 steps | ret(last 50)=+21.42  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,929,320 steps | ret(last 50)=+21.39  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,029,320 steps | ret(last 50)=+20.80  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,129,320 steps | ret(last 50)=+21.19  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,229,320 steps | ret(last 50)=+20.92  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,329,320 steps | ret(last 50)=+20.97  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,429,320 steps | ret(last 50)=+20.48  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,529,320 steps | ret(last 50)=+21.36  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=5] evaluating 30 eps
-[Stage n_sheep=5] sr=97%  mean_len=945  mean_min_pen=3.4m  mean_act=1.33
-
-[Stage n_sheep=6] training 1,500,000 steps
-           ... [trial 1 | 6 sheep | 7,536,648 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 6 sheep | 7,636,648 steps | ret(last 50)=+22.41  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 7,736,648 steps | ret(last 50)=+23.84  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 7,836,648 steps | ret(last 50)=+22.95  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 7,936,648 steps | ret(last 50)=+23.97  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,036,648 steps | ret(last 50)=+24.02  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,136,648 steps | ret(last 50)=+23.42  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,236,648 steps | ret(last 50)=+24.15  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,336,648 steps | ret(last 50)=+23.32  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,436,648 steps | ret(last 50)=+23.46  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,536,648 steps | ret(last 50)=+23.80  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,636,648 steps | ret(last 50)=+24.41  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,736,648 steps | ret(last 50)=+23.86  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,836,648 steps | ret(last 50)=+23.57  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,936,648 steps | ret(last 50)=+23.74  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 9,036,648 steps | ret(last 50)=+22.87  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=6] evaluating 30 eps
-[Stage n_sheep=6] sr=100%  mean_len=1162  mean_min_pen=3.1m  mean_act=1.36
-
-[Stage n_sheep=7] training 1,500,000 steps
-           ... [trial 1 | 7 sheep | 9,043,976 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 7 sheep | 9,143,976 steps | ret(last 50)=+24.46  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,243,976 steps | ret(last 50)=+25.47  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,343,976 steps | ret(last 50)=+25.10  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,443,976 steps | ret(last 50)=+24.85  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,543,976 steps | ret(last 50)=+26.01  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,643,976 steps | ret(last 50)=+26.26  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,743,976 steps | ret(last 50)=+26.44  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,843,976 steps | ret(last 50)=+26.08  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,943,976 steps | ret(last 50)=+25.00  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,043,976 steps | ret(last 50)=+26.22  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,143,976 steps | ret(last 50)=+24.79  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,243,976 steps | ret(last 50)=+26.33  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,343,976 steps | ret(last 50)=+26.36  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,443,976 steps | ret(last 50)=+25.68  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,543,976 steps | ret(last 50)=+26.75  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=7] evaluating 30 eps
-[Stage n_sheep=7] sr=100%  mean_len=1253  mean_min_pen=2.7m  mean_act=1.38
-
-[Stage n_sheep=8] training 1,500,000 steps
-           ... [trial 1 | 8 sheep | 10,551,304 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 8 sheep | 10,651,304 steps | ret(last 50)=+28.19  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 10,751,304 steps | ret(last 50)=+28.80  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 10,851,304 steps | ret(last 50)=+27.81  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 10,951,304 steps | ret(last 50)=+27.31  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,051,304 steps | ret(last 50)=+27.67  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,151,304 steps | ret(last 50)=+27.14  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,251,304 steps | ret(last 50)=+29.60  win_sr=98%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,351,304 steps | ret(last 50)=+28.81  win_sr=98%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,451,304 steps | ret(last 50)=+27.76  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,551,304 steps | ret(last 50)=+27.28  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,651,304 steps | ret(last 50)=+29.04  win_sr=98%  cum_sr=99%]
-           ... [trial 1 | 8 sheep | 11,751,304 steps | ret(last 50)=+28.75  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,851,304 steps | ret(last 50)=+29.04  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,951,304 steps | ret(last 50)=+28.27  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 12,051,304 steps | ret(last 50)=+27.90  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=8] evaluating 30 eps
-[Stage n_sheep=8] sr=93%  mean_len=1495  mean_min_pen=2.6m  mean_act=1.39
-
-[Stage n_sheep=9] training 1,500,000 steps
-           ... [trial 1 | 9 sheep | 12,058,632 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 9 sheep | 12,158,632 steps | ret(last 50)=+30.67  win_sr=98%  cum_sr=98%]
-           ... [trial 1 | 9 sheep | 12,258,632 steps | ret(last 50)=+28.78  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 12,358,632 steps | ret(last 50)=+30.08  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 12,458,632 steps | ret(last 50)=+29.61  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 12,558,632 steps | ret(last 50)=+30.34  win_sr=98%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 12,658,632 steps | ret(last 50)=+29.48  win_sr=98%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 12,758,632 steps | ret(last 50)=+29.92  win_sr=98%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 12,858,632 steps | ret(last 50)=+29.26  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 12,958,632 steps | ret(last 50)=+30.36  win_sr=96%  cum_sr=98%]
-           ... [trial 1 | 9 sheep | 13,058,632 steps | ret(last 50)=+30.19  win_sr=100%  cum_sr=98%]
-           ... [trial 1 | 9 sheep | 13,158,632 steps | ret(last 50)=+29.24  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 13,258,632 steps | ret(last 50)=+30.40  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 13,358,632 steps | ret(last 50)=+31.65  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 13,458,632 steps | ret(last 50)=+30.77  win_sr=98%  cum_sr=99%]
-           ... [trial 1 | 9 sheep | 13,558,632 steps | ret(last 50)=+30.21  win_sr=94%  cum_sr=98%]
-[Stage n_sheep=9] evaluating 30 eps
-[Stage n_sheep=9] sr=97%  mean_len=1625  mean_min_pen=2.1m  mean_act=1.39
-
-[Stage n_sheep=10] training 1,500,000 steps
-           ... [trial 1 | 10 sheep | 13,565,960 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 10 sheep | 13,665,960 steps | ret(last 50)=+30.13  win_sr=90%  cum_sr=92%]
-           ... [trial 1 | 10 sheep | 13,765,960 steps | ret(last 50)=+31.84  win_sr=96%  cum_sr=92%]
-           ... [trial 1 | 10 sheep | 13,865,960 steps | ret(last 50)=+32.66  win_sr=88%  cum_sr=91%]
-           ... [trial 1 | 10 sheep | 13,965,960 steps | ret(last 50)=+32.56  win_sr=90%  cum_sr=91%]
-           ... [trial 1 | 10 sheep | 14,065,960 steps | ret(last 50)=+31.29  win_sr=98%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,165,960 steps | ret(last 50)=+32.72  win_sr=94%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,265,960 steps | ret(last 50)=+32.42  win_sr=96%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,365,960 steps | ret(last 50)=+33.96  win_sr=92%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,465,960 steps | ret(last 50)=+33.17  win_sr=98%  cum_sr=94%]
-           ... [trial 1 | 10 sheep | 14,565,960 steps | ret(last 50)=+31.48  win_sr=96%  cum_sr=94%]
-           ... [trial 1 | 10 sheep | 14,665,960 steps | ret(last 50)=+31.19  win_sr=90%  cum_sr=94%]
-           ... [trial 1 | 10 sheep | 14,765,960 steps | ret(last 50)=+32.87  win_sr=98%  cum_sr=94%]
-           ... [trial 1 | 10 sheep | 14,865,960 steps | ret(last 50)=+32.36  win_sr=94%  cum_sr=94%]
-           ... [trial 1 | 10 sheep | 14,965,960 steps | ret(last 50)=+31.14  win_sr=94%  cum_sr=94%]
-           ... [trial 1 | 10 sheep | 15,065,960 steps | ret(last 50)=+32.18  win_sr=96%  cum_sr=94%]
-[Stage n_sheep=10] evaluating 30 eps
-[Stage n_sheep=10] sr=97%  mean_len=1816  mean_min_pen=2.0m  mean_act=1.39
-
-============================================================
-  REPLAY SUMMARY
-============================================================
-  n_sheep=1  sr= 97%  len=  351  min_pen=  3.9m  act=0.28
-  n_sheep=2  sr=100%  len=  421  min_pen=  3.5m  act=1.01
-  n_sheep=3  sr=100%  len=  608  min_pen=  3.5m  act=1.06
-  n_sheep=4  sr=100%  len=  874  min_pen=  3.3m  act=1.23
-  n_sheep=5  sr= 97%  len=  945  min_pen=  3.4m  act=1.33
-  n_sheep=6  sr=100%  len= 1162  min_pen=  3.1m  act=1.36
-  n_sheep=7  sr=100%  len= 1253  min_pen=  2.7m  act=1.38
-  n_sheep=8  sr= 93%  len= 1495  min_pen=  2.6m  act=1.39
-  n_sheep=9  sr= 97%  len= 1625  min_pen=  2.1m  act=1.39
-  n_sheep=10  sr= 97%  len= 1816  min_pen=  2.0m  act=1.39
-
-  Total time: 90.3 min
-  Artefacts:  runs/final_v2/
diff --git a/training/runs/final_v2/config.json b/training/runs/final_v2/config.json
deleted file mode 100644
index b2d15fe..0000000
--- a/training/runs/final_v2/config.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "W_PER_SHEEP": 1.0,
-  "W_ALIGN": 0.0,
-  "W_PEN_BONUS": 5.0,
-  "W_STEP_COST": 0.02,
-  "W_COMPLETE": 200.0,
-  "W_COMPACT": 1.5,
-  "ALIGN_SHAPE": "standoff",
-  "ALIGN_GATED": false,
-  "ent_coef": 0.02
-}
\ No newline at end of file
diff --git a/training/runs/final_v2/final_model.zip b/training/runs/final_v2/final_model.zip
deleted file mode 100644
index 41dc86d..0000000
Binary files a/training/runs/final_v2/final_model.zip and /dev/null differ
diff --git a/training/runs/final_v2/stage_results.json b/training/runs/final_v2/stage_results.json
deleted file mode 100644
index a8f3266..0000000
--- a/training/runs/final_v2/stage_results.json
+++ /dev/null
@@ -1,72 +0,0 @@
-[
-  {
-    "n_sheep": 1,
-    "sr": 0.9666666666666667,
-    "mean_len": 350.96666666666664,
-    "mean_min_pen": 3.913520161310832,
-    "mean_act": 0.2797267940386975
-  },
-  {
-    "n_sheep": 2,
-    "sr": 1.0,
-    "mean_len": 421.46666666666664,
-    "mean_min_pen": 3.485754116376241,
-    "mean_act": 1.0053067604365706
-  },
-  {
-    "n_sheep": 3,
-    "sr": 1.0,
-    "mean_len": 608.5,
-    "mean_min_pen": 3.52824010848999,
-    "mean_act": 1.0576287743527575
-  },
-  {
-    "n_sheep": 4,
-    "sr": 1.0,
-    "mean_len": 874.1333333333333,
-    "mean_min_pen": 3.2648465514183043,
-    "mean_act": 1.2302308682249101
-  },
-  {
-    "n_sheep": 5,
-    "sr": 0.9666666666666667,
-    "mean_len": 945.1333333333333,
-    "mean_min_pen": 3.390091093381246,
-    "mean_act": 1.328577256075333
-  },
-  {
-    "n_sheep": 6,
-    "sr": 1.0,
-    "mean_len": 1162.1,
-    "mean_min_pen": 3.0996540347735086,
-    "mean_act": 1.3581346810990618
-  },
-  {
-    "n_sheep": 7,
-    "sr": 1.0,
-    "mean_len": 1252.6,
-    "mean_min_pen": 2.6753984689712524,
-    "mean_act": 1.3753795162019462
-  },
-  {
-    "n_sheep": 8,
-    "sr": 0.9333333333333333,
-    "mean_len": 1495.2333333333333,
-    "mean_min_pen": 2.560386610031128,
-    "mean_act": 1.3861974064434042
-  },
-  {
-    "n_sheep": 9,
-    "sr": 0.9666666666666667,
-    "mean_len": 1624.9,
-    "mean_min_pen": 2.130835851033529,
-    "mean_act": 1.387693840600181
-  },
-  {
-    "n_sheep": 10,
-    "sr": 0.9666666666666667,
-    "mean_len": 1816.5,
-    "mean_min_pen": 1.9940622925758362,
-    "mean_act": 1.3946097864970635
-  }
-]
\ No newline at end of file
diff --git a/training/runs/final_v2/vecnorm.pkl b/training/runs/final_v2/vecnorm.pkl
deleted file mode 100644
index 44319c8..0000000
Binary files a/training/runs/final_v2/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/final_v3.log b/training/runs/final_v3.log
deleted file mode 100644
index 385c3ed..0000000
--- a/training/runs/final_v3.log
+++ /dev/null
@@ -1,253 +0,0 @@
-Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-Run dir: runs/final_v3
-Curriculum: 1 → 10 sheep, 1,500,000 steps/stage
-
-[Stage n_sheep=1] training 1,500,000 steps
-           ... [trial 1 | 1 sheep | 100,000 steps | ret(last 40)=-28.61  win_sr=10%  cum_sr=10%]
-           ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-29.25  win_sr=12%  cum_sr=11%]
-           ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=-31.55  win_sr=6%  cum_sr=9%]
-           ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=-30.74  win_sr=10%  cum_sr=9%]
-           ... [trial 1 | 1 sheep | 500,000 steps | ret(last 50)=-32.89  win_sr=4%  cum_sr=8%]
-           ... [trial 1 | 1 sheep | 600,000 steps | ret(last 50)=-34.66  win_sr=4%  cum_sr=7%]
-           ... [trial 1 | 1 sheep | 700,000 steps | ret(last 50)=-31.44  win_sr=12%  cum_sr=8%]
-           ... [trial 1 | 1 sheep | 800,000 steps | ret(last 50)=-32.70  win_sr=6%  cum_sr=8%]
-           ... [trial 1 | 1 sheep | 900,000 steps | ret(last 50)=-35.48  win_sr=2%  cum_sr=7%]
-           ... [trial 1 | 1 sheep | 1,000,000 steps | ret(last 50)=-31.81  win_sr=10%  cum_sr=8%]
-           ... [trial 1 | 1 sheep | 1,100,000 steps | ret(last 50)=-28.53  win_sr=10%  cum_sr=8%]
-           ... [trial 1 | 1 sheep | 1,200,000 steps | ret(last 50)=-5.61  win_sr=62%  cum_sr=13%]
-           ... [trial 1 | 1 sheep | 1,300,000 steps | ret(last 50)=+11.97  win_sr=100%  cum_sr=34%]
-           ... [trial 1 | 1 sheep | 1,400,000 steps | ret(last 50)=+10.92  win_sr=96%  cum_sr=50%]
-           ... [trial 1 | 1 sheep | 1,500,000 steps | ret(last 50)=+11.97  win_sr=100%  cum_sr=63%]
-[Stage n_sheep=1] evaluating 30 eps
-[Stage n_sheep=1] sr=100%  mean_len=249  mean_min_pen=3.7m  mean_act=0.41
-
-[Stage n_sheep=2] training 1,500,000 steps
-           ... [trial 1 | 2 sheep | 1,507,336 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 2 sheep | 1,607,336 steps | ret(last 47)=-1.11  win_sr=45%  cum_sr=45%]
-           ... [trial 1 | 2 sheep | 1,707,336 steps | ret(last 50)=-8.90  win_sr=8%  cum_sr=27%]
-           ... [trial 1 | 2 sheep | 1,807,336 steps | ret(last 50)=-5.28  win_sr=16%  cum_sr=24%]
-           ... [trial 1 | 2 sheep | 1,907,336 steps | ret(last 50)=+3.16  win_sr=58%  cum_sr=33%]
-           ... [trial 1 | 2 sheep | 2,007,336 steps | ret(last 50)=+10.26  win_sr=84%  cum_sr=48%]
-           ... [trial 1 | 2 sheep | 2,107,336 steps | ret(last 50)=+14.27  win_sr=100%  cum_sr=64%]
-           ... [trial 1 | 2 sheep | 2,207,336 steps | ret(last 50)=+14.08  win_sr=100%  cum_sr=72%]
-           ... [trial 1 | 2 sheep | 2,307,336 steps | ret(last 50)=+14.38  win_sr=100%  cum_sr=77%]
-           ... [trial 1 | 2 sheep | 2,407,336 steps | ret(last 50)=+14.27  win_sr=100%  cum_sr=81%]
-           ... [trial 1 | 2 sheep | 2,507,336 steps | ret(last 50)=+14.37  win_sr=100%  cum_sr=84%]
-           ... [trial 1 | 2 sheep | 2,607,336 steps | ret(last 50)=+14.33  win_sr=100%  cum_sr=86%]
-           ... [trial 1 | 2 sheep | 2,707,336 steps | ret(last 50)=+14.04  win_sr=100%  cum_sr=87%]
-           ... [trial 1 | 2 sheep | 2,807,336 steps | ret(last 50)=+14.25  win_sr=100%  cum_sr=89%]
-           ... [trial 1 | 2 sheep | 2,907,336 steps | ret(last 50)=+14.61  win_sr=100%  cum_sr=90%]
-           ... [trial 1 | 2 sheep | 3,007,336 steps | ret(last 50)=+13.98  win_sr=98%  cum_sr=91%]
-[Stage n_sheep=2] evaluating 30 eps
-[Stage n_sheep=2] sr=100%  mean_len=548  mean_min_pen=3.5m  mean_act=0.92
-
-[Stage n_sheep=3] training 1,500,000 steps
-           ... [trial 1 | 3 sheep | 3,014,664 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 3 sheep | 3,114,664 steps | ret(last 50)=+16.10  win_sr=100%  cum_sr=99%]
-           ... [trial 1 | 3 sheep | 3,214,664 steps | ret(last 50)=+17.27  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,314,664 steps | ret(last 50)=+16.86  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,414,664 steps | ret(last 50)=+16.86  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,514,664 steps | ret(last 50)=+17.46  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,614,664 steps | ret(last 50)=+17.43  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,714,664 steps | ret(last 50)=+16.76  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,814,664 steps | ret(last 50)=+16.97  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 3,914,664 steps | ret(last 50)=+16.97  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,014,664 steps | ret(last 50)=+17.19  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,114,664 steps | ret(last 50)=+17.23  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,214,664 steps | ret(last 50)=+16.45  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,314,664 steps | ret(last 50)=+17.18  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,414,664 steps | ret(last 50)=+16.42  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 3 sheep | 4,514,664 steps | ret(last 50)=+16.32  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=3] evaluating 30 eps
-[Stage n_sheep=3] sr=100%  mean_len=640  mean_min_pen=3.5m  mean_act=1.06
-
-[Stage n_sheep=4] training 1,500,000 steps
-           ... [trial 1 | 4 sheep | 4,521,992 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 4 sheep | 4,621,992 steps | ret(last 50)=+18.61  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 4,721,992 steps | ret(last 50)=+18.82  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 4,821,992 steps | ret(last 50)=+18.91  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 4,921,992 steps | ret(last 50)=+18.55  win_sr=98%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,021,992 steps | ret(last 50)=+18.99  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,121,992 steps | ret(last 50)=+18.76  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,221,992 steps | ret(last 50)=+18.46  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,321,992 steps | ret(last 50)=+19.21  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,421,992 steps | ret(last 50)=+17.86  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,521,992 steps | ret(last 50)=+19.19  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,621,992 steps | ret(last 50)=+18.83  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,721,992 steps | ret(last 50)=+18.51  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,821,992 steps | ret(last 50)=+18.38  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 5,921,992 steps | ret(last 50)=+18.56  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 4 sheep | 6,021,992 steps | ret(last 50)=+18.82  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=4] evaluating 30 eps
-[Stage n_sheep=4] sr=100%  mean_len=762  mean_min_pen=3.5m  mean_act=1.26
-
-[Stage n_sheep=5] training 1,500,000 steps
-           ... [trial 1 | 5 sheep | 6,029,320 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 5 sheep | 6,129,320 steps | ret(last 50)=+20.46  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,229,320 steps | ret(last 50)=+20.41  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,329,320 steps | ret(last 50)=+20.58  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,429,320 steps | ret(last 50)=+21.10  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,529,320 steps | ret(last 50)=+20.48  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,629,320 steps | ret(last 50)=+20.56  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,729,320 steps | ret(last 50)=+20.51  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,829,320 steps | ret(last 50)=+20.70  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 6,929,320 steps | ret(last 50)=+20.83  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,029,320 steps | ret(last 50)=+21.52  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,129,320 steps | ret(last 50)=+21.62  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,229,320 steps | ret(last 50)=+21.22  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,329,320 steps | ret(last 50)=+21.17  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,429,320 steps | ret(last 50)=+21.00  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 5 sheep | 7,529,320 steps | ret(last 50)=+20.48  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=5] evaluating 30 eps
-[Stage n_sheep=5] sr=100%  mean_len=931  mean_min_pen=3.6m  mean_act=1.31
-
-[Stage n_sheep=6] training 1,500,000 steps
-           ... [trial 1 | 6 sheep | 7,536,648 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 6 sheep | 7,636,648 steps | ret(last 50)=+21.89  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 7,736,648 steps | ret(last 50)=+22.98  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 7,836,648 steps | ret(last 50)=+22.66  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 7,936,648 steps | ret(last 50)=+23.23  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,036,648 steps | ret(last 50)=+22.83  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,136,648 steps | ret(last 50)=+22.65  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,236,648 steps | ret(last 50)=+22.22  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,336,648 steps | ret(last 50)=+22.45  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,436,648 steps | ret(last 50)=+22.55  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,536,648 steps | ret(last 50)=+22.99  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,636,648 steps | ret(last 50)=+21.99  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,736,648 steps | ret(last 50)=+22.30  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,836,648 steps | ret(last 50)=+23.06  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 8,936,648 steps | ret(last 50)=+23.32  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 6 sheep | 9,036,648 steps | ret(last 50)=+21.80  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=6] evaluating 30 eps
-[Stage n_sheep=6] sr=100%  mean_len=1082  mean_min_pen=3.6m  mean_act=1.35
-
-[Stage n_sheep=7] training 1,500,000 steps
-           ... [trial 1 | 7 sheep | 9,043,976 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 7 sheep | 9,143,976 steps | ret(last 50)=+25.57  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,243,976 steps | ret(last 50)=+24.76  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,343,976 steps | ret(last 50)=+24.69  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,443,976 steps | ret(last 50)=+26.12  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,543,976 steps | ret(last 50)=+25.53  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,643,976 steps | ret(last 50)=+25.39  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,743,976 steps | ret(last 50)=+24.45  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,843,976 steps | ret(last 50)=+26.45  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 9,943,976 steps | ret(last 50)=+24.51  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,043,976 steps | ret(last 50)=+24.80  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,143,976 steps | ret(last 50)=+25.56  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,243,976 steps | ret(last 50)=+25.75  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,343,976 steps | ret(last 50)=+25.64  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,443,976 steps | ret(last 50)=+26.45  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 7 sheep | 10,543,976 steps | ret(last 50)=+25.19  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=7] evaluating 30 eps
-[Stage n_sheep=7] sr=100%  mean_len=1081  mean_min_pen=3.5m  mean_act=1.37
-
-[Stage n_sheep=8] training 1,500,000 steps
-           ... [trial 1 | 8 sheep | 10,551,304 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 8 sheep | 10,651,304 steps | ret(last 50)=+26.63  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 10,751,304 steps | ret(last 50)=+27.63  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 10,851,304 steps | ret(last 50)=+27.53  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 10,951,304 steps | ret(last 50)=+27.43  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,051,304 steps | ret(last 50)=+27.70  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,151,304 steps | ret(last 50)=+26.53  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,251,304 steps | ret(last 50)=+27.24  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,351,304 steps | ret(last 50)=+27.14  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,451,304 steps | ret(last 50)=+27.43  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,551,304 steps | ret(last 50)=+27.25  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,651,304 steps | ret(last 50)=+27.40  win_sr=98%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,751,304 steps | ret(last 50)=+27.35  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,851,304 steps | ret(last 50)=+26.33  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 11,951,304 steps | ret(last 50)=+26.89  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 8 sheep | 12,051,304 steps | ret(last 50)=+27.86  win_sr=100%  cum_sr=100%]
-[Stage n_sheep=8] evaluating 30 eps
-[Stage n_sheep=8] sr=100%  mean_len=1311  mean_min_pen=3.5m  mean_act=1.38
-
-[Stage n_sheep=9] training 1,500,000 steps
-           ... [trial 1 | 9 sheep | 12,058,632 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 9 sheep | 12,158,632 steps | ret(last 50)=+29.62  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 12,258,632 steps | ret(last 50)=+31.32  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 12,358,632 steps | ret(last 50)=+30.30  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 12,458,632 steps | ret(last 50)=+29.33  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 12,558,632 steps | ret(last 50)=+28.83  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 12,658,632 steps | ret(last 50)=+29.02  win_sr=98%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 12,758,632 steps | ret(last 50)=+29.60  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 12,858,632 steps | ret(last 50)=+29.88  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 12,958,632 steps | ret(last 50)=+30.12  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 13,058,632 steps | ret(last 50)=+28.80  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 13,158,632 steps | ret(last 50)=+30.33  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 13,258,632 steps | ret(last 50)=+27.85  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 13,358,632 steps | ret(last 50)=+28.21  win_sr=96%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 13,458,632 steps | ret(last 50)=+29.88  win_sr=100%  cum_sr=100%]
-           ... [trial 1 | 9 sheep | 13,558,632 steps | ret(last 50)=+29.06  win_sr=98%  cum_sr=100%]
-[Stage n_sheep=9] evaluating 30 eps
-[Stage n_sheep=9] sr=100%  mean_len=1435  mean_min_pen=3.6m  mean_act=1.39
-
-[Stage n_sheep=10] training 1,500,000 steps
-           ... [trial 1 | 10 sheep | 13,565,960 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | 10 sheep | 13,665,960 steps | ret(last 50)=+30.42  win_sr=96%  cum_sr=96%]
-           ... [trial 1 | 10 sheep | 13,765,960 steps | ret(last 50)=+29.97  win_sr=92%  cum_sr=95%]
-           ... [trial 1 | 10 sheep | 13,865,960 steps | ret(last 50)=+30.45  win_sr=82%  cum_sr=90%]
-           ... [trial 1 | 10 sheep | 13,965,960 steps | ret(last 50)=+29.82  win_sr=90%  cum_sr=91%]
-           ... [trial 1 | 10 sheep | 14,065,960 steps | ret(last 50)=+29.66  win_sr=90%  cum_sr=91%]
-           ... [trial 1 | 10 sheep | 14,165,960 steps | ret(last 50)=+31.57  win_sr=98%  cum_sr=92%]
-           ... [trial 1 | 10 sheep | 14,265,960 steps | ret(last 50)=+31.71  win_sr=96%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,365,960 steps | ret(last 50)=+31.75  win_sr=94%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,465,960 steps | ret(last 50)=+29.46  win_sr=88%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,565,960 steps | ret(last 50)=+29.62  win_sr=94%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,665,960 steps | ret(last 50)=+31.64  win_sr=98%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,765,960 steps | ret(last 50)=+30.86  win_sr=90%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,865,960 steps | ret(last 50)=+31.65  win_sr=90%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 14,965,960 steps | ret(last 50)=+31.75  win_sr=92%  cum_sr=93%]
-           ... [trial 1 | 10 sheep | 15,065,960 steps | ret(last 50)=+30.24  win_sr=100%  cum_sr=93%]
-[Stage n_sheep=10] evaluating 30 eps
-[Stage n_sheep=10] sr=90%  mean_len=1841  mean_min_pen=3.6m  mean_act=1.39
-
-[Consolidation] mixed n_sheep ∈ [1, 10], 2,000,000 steps
-           ... [trial 1 | consolidate | 15,073,288 steps | ret(last 0)=+nan  win_sr=nan%  cum_sr=nan%]
-           ... [trial 1 | consolidate | 15,173,288 steps | ret(last 50)=+20.69  win_sr=94%  cum_sr=95%]
-           ... [trial 1 | consolidate | 15,273,288 steps | ret(last 50)=+20.62  win_sr=90%  cum_sr=92%]
-           ... [trial 1 | consolidate | 15,373,288 steps | ret(last 50)=+20.25  win_sr=94%  cum_sr=93%]
-           ... [trial 1 | consolidate | 15,473,288 steps | ret(last 50)=+19.82  win_sr=96%  cum_sr=94%]
-           ... [trial 1 | consolidate | 15,573,288 steps | ret(last 50)=+20.56  win_sr=94%  cum_sr=94%]
-           ... [trial 1 | consolidate | 15,673,288 steps | ret(last 50)=+20.56  win_sr=92%  cum_sr=94%]
-           ... [trial 1 | consolidate | 15,773,288 steps | ret(last 50)=+19.43  win_sr=94%  cum_sr=95%]
-           ... [trial 1 | consolidate | 15,873,288 steps | ret(last 50)=+21.85  win_sr=98%  cum_sr=95%]
-           ... [trial 1 | consolidate | 15,973,288 steps | ret(last 50)=+21.84  win_sr=94%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,073,288 steps | ret(last 50)=+22.13  win_sr=98%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,173,288 steps | ret(last 50)=+21.89  win_sr=94%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,273,288 steps | ret(last 50)=+21.88  win_sr=98%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,373,288 steps | ret(last 50)=+20.81  win_sr=94%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,473,288 steps | ret(last 50)=+20.91  win_sr=98%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,573,288 steps | ret(last 50)=+21.13  win_sr=98%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,673,288 steps | ret(last 50)=+19.85  win_sr=100%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,773,288 steps | ret(last 50)=+22.30  win_sr=92%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,873,288 steps | ret(last 50)=+20.61  win_sr=96%  cum_sr=95%]
-           ... [trial 1 | consolidate | 16,973,288 steps | ret(last 50)=+21.93  win_sr=98%  cum_sr=96%]
-           ... [trial 1 | consolidate | 17,073,288 steps | ret(last 50)=+21.86  win_sr=98%  cum_sr=96%]
-[Consolidation] re-evaluating all sheep counts
-[Consolidation] n_sheep=1  sr=97%  mean_len=377  mean_min_pen=3.5m  mean_act=1.39
-[Consolidation] n_sheep=2  sr=47%  mean_len=1718  mean_min_pen=2.4m  mean_act=1.39
-[Consolidation] n_sheep=3  sr=93%  mean_len=970  mean_min_pen=3.2m  mean_act=1.39
-[Consolidation] n_sheep=4  sr=97%  mean_len=1008  mean_min_pen=3.3m  mean_act=1.39
-[Consolidation] n_sheep=5  sr=100%  mean_len=1176  mean_min_pen=3.3m  mean_act=1.39
-[Consolidation] n_sheep=6  sr=100%  mean_len=1305  mean_min_pen=3.3m  mean_act=1.39
-[Consolidation] n_sheep=7  sr=100%  mean_len=1300  mean_min_pen=3.4m  mean_act=1.39
-[Consolidation] n_sheep=8  sr=100%  mean_len=1461  mean_min_pen=3.5m  mean_act=1.39
-[Consolidation] n_sheep=9  sr=87%  mean_len=1607  mean_min_pen=3.8m  mean_act=1.39
-[Consolidation] n_sheep=10  sr=80%  mean_len=1801  mean_min_pen=3.7m  mean_act=1.39
-
-============================================================
-  REPLAY SUMMARY
-============================================================
-  n_sheep=1  sr= 97%  len=  377  min_pen=  3.5m  act=1.39
-  n_sheep=2  sr= 47%  len= 1718  min_pen=  2.4m  act=1.39
-  n_sheep=3  sr= 93%  len=  970  min_pen=  3.2m  act=1.39
-  n_sheep=4  sr= 97%  len= 1008  min_pen=  3.3m  act=1.39
-  n_sheep=5  sr=100%  len= 1176  min_pen=  3.3m  act=1.39
-  n_sheep=6  sr=100%  len= 1305  min_pen=  3.3m  act=1.39
-  n_sheep=7  sr=100%  len= 1300  min_pen=  3.4m  act=1.39
-  n_sheep=8  sr=100%  len= 1461  min_pen=  3.5m  act=1.39
-  n_sheep=9  sr= 87%  len= 1607  min_pen=  3.8m  act=1.39
-  n_sheep=10  sr= 80%  len= 1801  min_pen=  3.7m  act=1.39
-
-  Total time: 110.1 min
-  Artefacts:  runs/final_v3/
diff --git a/training/runs/final_v3/config.json b/training/runs/final_v3/config.json
deleted file mode 100644
index b2d15fe..0000000
--- a/training/runs/final_v3/config.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "W_PER_SHEEP": 1.0,
-  "W_ALIGN": 0.0,
-  "W_PEN_BONUS": 5.0,
-  "W_STEP_COST": 0.02,
-  "W_COMPLETE": 200.0,
-  "W_COMPACT": 1.5,
-  "ALIGN_SHAPE": "standoff",
-  "ALIGN_GATED": false,
-  "ent_coef": 0.02
-}
\ No newline at end of file
diff --git a/training/runs/final_v3/final_model.zip b/training/runs/final_v3/final_model.zip
deleted file mode 100644
index abad0d3..0000000
Binary files a/training/runs/final_v3/final_model.zip and /dev/null differ
diff --git a/training/runs/final_v3/stage_results.json b/training/runs/final_v3/stage_results.json
deleted file mode 100644
index e44f037..0000000
--- a/training/runs/final_v3/stage_results.json
+++ /dev/null
@@ -1,72 +0,0 @@
-[
-  {
-    "n_sheep": 1,
-    "sr": 0.9666666666666667,
-    "mean_len": 377.3666666666667,
-    "mean_min_pen": 3.5389957586924234,
-    "mean_act": 1.3908841227086732
-  },
-  {
-    "n_sheep": 2,
-    "sr": 0.4666666666666667,
-    "mean_len": 1717.6333333333334,
-    "mean_min_pen": 2.4164488633473713,
-    "mean_act": 1.3922284740020803
-  },
-  {
-    "n_sheep": 3,
-    "sr": 0.9333333333333333,
-    "mean_len": 970.2666666666667,
-    "mean_min_pen": 3.203955141703288,
-    "mean_act": 1.3945290882248416
-  },
-  {
-    "n_sheep": 4,
-    "sr": 0.9666666666666667,
-    "mean_len": 1008.0,
-    "mean_min_pen": 3.279213563601176,
-    "mean_act": 1.3918021049325862
-  },
-  {
-    "n_sheep": 5,
-    "sr": 1.0,
-    "mean_len": 1175.8666666666666,
-    "mean_min_pen": 3.3209743976593016,
-    "mean_act": 1.3925684957666513
-  },
-  {
-    "n_sheep": 6,
-    "sr": 1.0,
-    "mean_len": 1305.0,
-    "mean_min_pen": 3.312229561805725,
-    "mean_act": 1.391130207932886
-  },
-  {
-    "n_sheep": 7,
-    "sr": 1.0,
-    "mean_len": 1300.0,
-    "mean_min_pen": 3.363971138000488,
-    "mean_act": 1.392986050516367
-  },
-  {
-    "n_sheep": 8,
-    "sr": 1.0,
-    "mean_len": 1461.3666666666666,
-    "mean_min_pen": 3.4741388003031415,
-    "mean_act": 1.392040583461347
-  },
-  {
-    "n_sheep": 9,
-    "sr": 0.8666666666666667,
-    "mean_len": 1606.7333333333333,
-    "mean_min_pen": 3.835897175470988,
-    "mean_act": 1.3907199496534952
-  },
-  {
-    "n_sheep": 10,
-    "sr": 0.8,
-    "mean_len": 1800.9666666666667,
-    "mean_min_pen": 3.741190282503764,
-    "mean_act": 1.392501896076031
-  }
-]
\ No newline at end of file
diff --git a/training/runs/final_v3/vecnorm.pkl b/training/runs/final_v3/vecnorm.pkl
deleted file mode 100644
index 983de47..0000000
Binary files a/training/runs/final_v3/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/ppo_debug.log b/training/runs/ppo_debug.log
deleted file mode 100644
index 81b308e..0000000
--- a/training/runs/ppo_debug.log
+++ /dev/null
@@ -1,5569 +0,0 @@
-Using cpu device
-Logging to runs/ppo_debug/ppo_1
-------------------------------
-| time/              |       |
-|    fps             | 5496  |
-|    iterations      | 1     |
-|    time_elapsed    | 2     |
-|    total_timesteps | 16384 |
-------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 4317         |
-|    iterations           | 2            |
-|    time_elapsed         | 7            |
-|    total_timesteps      | 32768        |
-| train/                  |              |
-|    approx_kl            | 0.0036917897 |
-|    clip_fraction        | 0.0212       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.83        |
-|    explained_variance   | 0.352        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0118      |
-|    n_updates            | 10           |
-|    policy_gradient_loss | -0.000544    |
-|    std                  | 0.999        |
-|    value_loss           | 0.0658       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 3946         |
-|    iterations           | 3            |
-|    time_elapsed         | 12           |
-|    total_timesteps      | 49152        |
-| train/                  |              |
-|    approx_kl            | 0.0033213054 |
-|    clip_fraction        | 0.0266       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.83        |
-|    explained_variance   | 0.502        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0255      |
-|    n_updates            | 20           |
-|    policy_gradient_loss | -0.00158     |
-|    std                  | 0.997        |
-|    value_loss           | 0.08         |
-------------------------------------------
-/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.
-  warnings.warn(
-Eval num_timesteps=50000, episode_reward=-32.92 +/- 15.12
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -32.9       |
-| time/                   |             |
-|    total_timesteps      | 50000       |
-| train/                  |             |
-|    approx_kl            | 0.005147726 |
-|    clip_fraction        | 0.0478      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.84       |
-|    explained_variance   | 0.893       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0145     |
-|    n_updates            | 30          |
-|    policy_gradient_loss | -0.00318    |
-|    std                  | 1           |
-|    value_loss           | 0.0194      |
------------------------------------------
-New best mean reward!
-------------------------------
-| time/              |       |
-|    fps             | 2231  |
-|    iterations      | 4     |
-|    time_elapsed    | 29    |
-|    total_timesteps | 65536 |
-------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2444         |
-|    iterations           | 5            |
-|    time_elapsed         | 33           |
-|    total_timesteps      | 81920        |
-| train/                  |              |
-|    approx_kl            | 0.0054671075 |
-|    clip_fraction        | 0.0529       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.84        |
-|    explained_variance   | 0.914        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.021       |
-|    n_updates            | 40           |
-|    policy_gradient_loss | -0.00416     |
-|    std                  | 1            |
-|    value_loss           | 0.0247       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2616        |
-|    iterations           | 6           |
-|    time_elapsed         | 37          |
-|    total_timesteps      | 98304       |
-| train/                  |             |
-|    approx_kl            | 0.004603466 |
-|    clip_fraction        | 0.0379      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.83       |
-|    explained_variance   | 0.931       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0193     |
-|    n_updates            | 50          |
-|    policy_gradient_loss | -0.00284    |
-|    std                  | 0.995       |
-|    value_loss           | 0.0171      |
------------------------------------------
-/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.
-  warnings.warn(
-Eval num_timesteps=100000, episode_reward=-27.45 +/- 49.10
-Episode length: 1973.15 +/- 86.14
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.97e+03     |
-|    mean_reward          | -27.4        |
-| time/                   |              |
-|    total_timesteps      | 100000       |
-| train/                  |              |
-|    approx_kl            | 0.0053039393 |
-|    clip_fraction        | 0.0564       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.83        |
-|    explained_variance   | 0.878        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0325      |
-|    n_updates            | 60           |
-|    policy_gradient_loss | -0.00404     |
-|    std                  | 0.998        |
-|    value_loss           | 0.0118       |
-------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 2212   |
-|    iterations      | 7      |
-|    time_elapsed    | 51     |
-|    total_timesteps | 114688 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2332         |
-|    iterations           | 8            |
-|    time_elapsed         | 56           |
-|    total_timesteps      | 131072       |
-| train/                  |              |
-|    approx_kl            | 0.0048020086 |
-|    clip_fraction        | 0.0449       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.84        |
-|    explained_variance   | 0.839        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0375      |
-|    n_updates            | 70           |
-|    policy_gradient_loss | -0.00359     |
-|    std                  | 1            |
-|    value_loss           | 0.0102       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2429        |
-|    iterations           | 9           |
-|    time_elapsed         | 60          |
-|    total_timesteps      | 147456      |
-| train/                  |             |
-|    approx_kl            | 0.004460754 |
-|    clip_fraction        | 0.0349      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.85       |
-|    explained_variance   | 0.874       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0293     |
-|    n_updates            | 80          |
-|    policy_gradient_loss | -0.00294    |
-|    std                  | 1.01        |
-|    value_loss           | 0.0132      |
------------------------------------------
-Eval num_timesteps=150000, episode_reward=-33.46 +/- 39.53
-Episode length: 1990.60 +/- 40.97
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.99e+03    |
-|    mean_reward          | -33.5       |
-| time/                   |             |
-|    total_timesteps      | 150000      |
-| train/                  |             |
-|    approx_kl            | 0.003831089 |
-|    clip_fraction        | 0.0196      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.82       |
-|    explained_variance   | 0.381       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0191     |
-|    n_updates            | 90          |
-|    policy_gradient_loss | -0.00202    |
-|    std                  | 0.984       |
-|    value_loss           | 0.104       |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2147   |
-|    iterations      | 10     |
-|    time_elapsed    | 76     |
-|    total_timesteps | 163840 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2218         |
-|    iterations           | 11           |
-|    time_elapsed         | 81           |
-|    total_timesteps      | 180224       |
-| train/                  |              |
-|    approx_kl            | 0.0032510734 |
-|    clip_fraction        | 0.0246       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.82        |
-|    explained_variance   | 0.887        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0279      |
-|    n_updates            | 100          |
-|    policy_gradient_loss | -0.00207     |
-|    std                  | 0.993        |
-|    value_loss           | 0.045        |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2289         |
-|    iterations           | 12           |
-|    time_elapsed         | 85           |
-|    total_timesteps      | 196608       |
-| train/                  |              |
-|    approx_kl            | 0.0047060847 |
-|    clip_fraction        | 0.0387       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.83        |
-|    explained_variance   | 0.896        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.00931      |
-|    n_updates            | 110          |
-|    policy_gradient_loss | -0.00305     |
-|    std                  | 0.994        |
-|    value_loss           | 0.0489       |
-------------------------------------------
-Eval num_timesteps=200000, episode_reward=-18.47 +/- 55.53
-Episode length: 1938.95 +/- 147.97
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.94e+03     |
-|    mean_reward          | -18.5        |
-| time/                   |              |
-|    total_timesteps      | 200000       |
-| train/                  |              |
-|    approx_kl            | 0.0047602034 |
-|    clip_fraction        | 0.0421       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.86        |
-|    explained_variance   | 0.968        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0301      |
-|    n_updates            | 120          |
-|    policy_gradient_loss | -0.00281     |
-|    std                  | 1.01         |
-|    value_loss           | 0.0094       |
-------------------------------------------
-New best mean reward!
-
-[Diag @ 200,000 | n_sheep=1 | success=5%]
-  COMPACT_CANT_DRIVE         18/20
-  DROVE_NO_SHEEP             1/20
-  SUCCESS                    1/20
-  action_mag mean=0.269 p10=0.129 p90=0.447 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=3.86m best=1.91m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=11.22m best=2.44m
-  reward/step (mean): progress=-0.0022  alignment=+0.0006  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0026
--------------------------------
-| time/              |        |
-|    fps             | 1964   |
-|    iterations      | 13     |
-|    time_elapsed    | 108    |
-|    total_timesteps | 212992 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2034         |
-|    iterations           | 14           |
-|    time_elapsed         | 112          |
-|    total_timesteps      | 229376       |
-| train/                  |              |
-|    approx_kl            | 0.0041663316 |
-|    clip_fraction        | 0.0373       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.88        |
-|    explained_variance   | 0.901        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0251      |
-|    n_updates            | 130          |
-|    policy_gradient_loss | -0.00223     |
-|    std                  | 1.03         |
-|    value_loss           | 0.00752      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2102         |
-|    iterations           | 15           |
-|    time_elapsed         | 116          |
-|    total_timesteps      | 245760       |
-| train/                  |              |
-|    approx_kl            | 0.0042076977 |
-|    clip_fraction        | 0.032        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.91        |
-|    explained_variance   | 0.939        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0333      |
-|    n_updates            | 140          |
-|    policy_gradient_loss | -0.00281     |
-|    std                  | 1.04         |
-|    value_loss           | 0.00934      |
-------------------------------------------
-Eval num_timesteps=250000, episode_reward=-37.07 +/- 35.02
-Episode length: 1938.20 +/- 269.38
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.94e+03     |
-|    mean_reward          | -37.1        |
-| time/                   |              |
-|    total_timesteps      | 250000       |
-| train/                  |              |
-|    approx_kl            | 0.0028561926 |
-|    clip_fraction        | 0.0171       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.92        |
-|    explained_variance   | 0.822        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0292      |
-|    n_updates            | 150          |
-|    policy_gradient_loss | -0.00113     |
-|    std                  | 1.04         |
-|    value_loss           | 0.0473       |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1990   |
-|    iterations      | 16     |
-|    time_elapsed    | 131    |
-|    total_timesteps | 262144 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2042         |
-|    iterations           | 17           |
-|    time_elapsed         | 136          |
-|    total_timesteps      | 278528       |
-| train/                  |              |
-|    approx_kl            | 0.0054259067 |
-|    clip_fraction        | 0.0468       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.91        |
-|    explained_variance   | 0.891        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.032       |
-|    n_updates            | 160          |
-|    policy_gradient_loss | -0.00597     |
-|    std                  | 1.03         |
-|    value_loss           | 0.0128       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2085        |
-|    iterations           | 18          |
-|    time_elapsed         | 141         |
-|    total_timesteps      | 294912      |
-| train/                  |             |
-|    approx_kl            | 0.004205579 |
-|    clip_fraction        | 0.0291      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.91       |
-|    explained_variance   | 0.834       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0364     |
-|    n_updates            | 170         |
-|    policy_gradient_loss | -0.00307    |
-|    std                  | 1.03        |
-|    value_loss           | 0.0107      |
------------------------------------------
-Eval num_timesteps=300000, episode_reward=-25.41 +/- 48.70
-Episode length: 1886.45 +/- 435.99
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.89e+03     |
-|    mean_reward          | -25.4        |
-| time/                   |              |
-|    total_timesteps      | 300000       |
-| train/                  |              |
-|    approx_kl            | 0.0045948992 |
-|    clip_fraction        | 0.0354       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.9         |
-|    explained_variance   | 0.806        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0242      |
-|    n_updates            | 180          |
-|    policy_gradient_loss | -0.00236     |
-|    std                  | 1.03         |
-|    value_loss           | 0.0371       |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1981   |
-|    iterations      | 19     |
-|    time_elapsed    | 157    |
-|    total_timesteps | 311296 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2024        |
-|    iterations           | 20          |
-|    time_elapsed         | 161         |
-|    total_timesteps      | 327680      |
-| train/                  |             |
-|    approx_kl            | 0.005344864 |
-|    clip_fraction        | 0.0442      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.91       |
-|    explained_variance   | 0.877       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0369     |
-|    n_updates            | 190         |
-|    policy_gradient_loss | -0.00344    |
-|    std                  | 1.04        |
-|    value_loss           | 0.0104      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2066        |
-|    iterations           | 21          |
-|    time_elapsed         | 166         |
-|    total_timesteps      | 344064      |
-| train/                  |             |
-|    approx_kl            | 0.007574372 |
-|    clip_fraction        | 0.0753      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.92       |
-|    explained_variance   | 0.903       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0272     |
-|    n_updates            | 200         |
-|    policy_gradient_loss | -0.00726    |
-|    std                  | 1.04        |
-|    value_loss           | 0.0113      |
------------------------------------------
-Eval num_timesteps=350000, episode_reward=-21.14 +/- 37.01
-Episode length: 1959.80 +/- 175.23
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.96e+03     |
-|    mean_reward          | -21.1        |
-| time/                   |              |
-|    total_timesteps      | 350000       |
-| train/                  |              |
-|    approx_kl            | 0.0061714016 |
-|    clip_fraction        | 0.0569       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.91        |
-|    explained_variance   | 0.917        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.022       |
-|    n_updates            | 210          |
-|    policy_gradient_loss | -0.00598     |
-|    std                  | 1.04         |
-|    value_loss           | 0.0231       |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1984   |
-|    iterations      | 22     |
-|    time_elapsed    | 181    |
-|    total_timesteps | 360448 |
--------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 2026       |
-|    iterations           | 23         |
-|    time_elapsed         | 185        |
-|    total_timesteps      | 376832     |
-| train/                  |            |
-|    approx_kl            | 0.00587913 |
-|    clip_fraction        | 0.0501     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.92      |
-|    explained_variance   | 0.932      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0415    |
-|    n_updates            | 220        |
-|    policy_gradient_loss | -0.00484   |
-|    std                  | 1.04       |
-|    value_loss           | 0.0242     |
-----------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2064        |
-|    iterations           | 24          |
-|    time_elapsed         | 190         |
-|    total_timesteps      | 393216      |
-| train/                  |             |
-|    approx_kl            | 0.006933649 |
-|    clip_fraction        | 0.081       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.91       |
-|    explained_variance   | 0.918       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.032      |
-|    n_updates            | 230         |
-|    policy_gradient_loss | -0.00773    |
-|    std                  | 1.03        |
-|    value_loss           | 0.0233      |
------------------------------------------
-Eval num_timesteps=400000, episode_reward=-2.75 +/- 37.08
-Episode length: 1998.55 +/- 6.32
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -2.75        |
-| time/                   |              |
-|    total_timesteps      | 400000       |
-| train/                  |              |
-|    approx_kl            | 0.0064436095 |
-|    clip_fraction        | 0.0647       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.9         |
-|    explained_variance   | 0.853        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.0633       |
-|    n_updates            | 240          |
-|    policy_gradient_loss | -0.00551     |
-|    std                  | 1.03         |
-|    value_loss           | 0.128        |
-------------------------------------------
-New best mean reward!
-
-[Diag @ 400,000 | n_sheep=1 | success=0%]
-  DROVE_NO_SHEEP             13/20
-  COMPACT_CANT_DRIVE         7/20
-  action_mag mean=0.316 p10=0.057 p90=0.512 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=1.86m best=0.95m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=3.19m best=1.50m
-  reward/step (mean): progress=+0.0093  alignment=+0.0040  pen_bonus=+0.0000  step_cost=-0.0200  complete=+0.0000
--------------------------------
-| time/              |        |
-|    fps             | 1925   |
-|    iterations      | 25     |
-|    time_elapsed    | 212    |
-|    total_timesteps | 409600 |
--------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1961       |
-|    iterations           | 26         |
-|    time_elapsed         | 217        |
-|    total_timesteps      | 425984     |
-| train/                  |            |
-|    approx_kl            | 0.00806847 |
-|    clip_fraction        | 0.1        |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.88      |
-|    explained_variance   | 0.933      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0254    |
-|    n_updates            | 250        |
-|    policy_gradient_loss | -0.00871   |
-|    std                  | 1.02       |
-|    value_loss           | 0.0264     |
-----------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1997        |
-|    iterations           | 27          |
-|    time_elapsed         | 221         |
-|    total_timesteps      | 442368      |
-| train/                  |             |
-|    approx_kl            | 0.005784355 |
-|    clip_fraction        | 0.0531      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.86       |
-|    explained_variance   | 0.878       |
-|    learning_rate        | 0.0003      |
-|    loss                 | 0.00996     |
-|    n_updates            | 260         |
-|    policy_gradient_loss | -0.00485    |
-|    std                  | 1           |
-|    value_loss           | 0.0868      |
------------------------------------------
-Eval num_timesteps=450000, episode_reward=51.79 +/- 20.61
-Episode length: 1912.30 +/- 382.28
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.91e+03    |
-|    mean_reward          | 51.8        |
-| time/                   |             |
-|    total_timesteps      | 450000      |
-| train/                  |             |
-|    approx_kl            | 0.005881632 |
-|    clip_fraction        | 0.0639      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.83       |
-|    explained_variance   | 0.952       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0187     |
-|    n_updates            | 270         |
-|    policy_gradient_loss | -0.00655    |
-|    std                  | 0.991       |
-|    value_loss           | 0.0226      |
------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 1936   |
-|    iterations      | 28     |
-|    time_elapsed    | 236    |
-|    total_timesteps | 458752 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1965        |
-|    iterations           | 29          |
-|    time_elapsed         | 241         |
-|    total_timesteps      | 475136      |
-| train/                  |             |
-|    approx_kl            | 0.009020726 |
-|    clip_fraction        | 0.0982      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.81       |
-|    explained_variance   | 0.87        |
-|    learning_rate        | 0.0003      |
-|    loss                 | 0.0218      |
-|    n_updates            | 280         |
-|    policy_gradient_loss | -0.0061     |
-|    std                  | 0.984       |
-|    value_loss           | 0.209       |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1999        |
-|    iterations           | 30          |
-|    time_elapsed         | 245         |
-|    total_timesteps      | 491520      |
-| train/                  |             |
-|    approx_kl            | 0.011525536 |
-|    clip_fraction        | 0.136       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.79       |
-|    explained_variance   | 0.92        |
-|    learning_rate        | 0.0003      |
-|    loss                 | 0.0306      |
-|    n_updates            | 290         |
-|    policy_gradient_loss | -0.00896    |
-|    std                  | 0.97        |
-|    value_loss           | 0.0903      |
------------------------------------------
-Eval num_timesteps=500000, episode_reward=87.01 +/- 42.12
-Episode length: 1359.85 +/- 815.95
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.36e+03    |
-|    mean_reward          | 87          |
-| time/                   |             |
-|    total_timesteps      | 500000      |
-| train/                  |             |
-|    approx_kl            | 0.012545023 |
-|    clip_fraction        | 0.171       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.78       |
-|    explained_variance   | 0.956       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0369     |
-|    n_updates            | 300         |
-|    policy_gradient_loss | -0.0069     |
-|    std                  | 0.972       |
-|    value_loss           | 0.034       |
------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 1968   |
-|    iterations      | 31     |
-|    time_elapsed    | 258    |
-|    total_timesteps | 507904 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1996        |
-|    iterations           | 32          |
-|    time_elapsed         | 262         |
-|    total_timesteps      | 524288      |
-| train/                  |             |
-|    approx_kl            | 0.008305798 |
-|    clip_fraction        | 0.102       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.78       |
-|    explained_variance   | 0.975       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0285     |
-|    n_updates            | 310         |
-|    policy_gradient_loss | -0.00343    |
-|    std                  | 0.972       |
-|    value_loss           | 0.0162      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2021         |
-|    iterations           | 33           |
-|    time_elapsed         | 267          |
-|    total_timesteps      | 540672       |
-| train/                  |              |
-|    approx_kl            | 0.0074599315 |
-|    clip_fraction        | 0.0925       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.81        |
-|    explained_variance   | 0.976        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0282      |
-|    n_updates            | 320          |
-|    policy_gradient_loss | -0.0028      |
-|    std                  | 0.989        |
-|    value_loss           | 0.0136       |
-------------------------------------------
-Eval num_timesteps=550000, episode_reward=113.42 +/- 48.33
-Episode length: 926.05 +/- 792.99
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 926         |
-|    mean_reward          | 113         |
-| time/                   |             |
-|    total_timesteps      | 550000      |
-| train/                  |             |
-|    approx_kl            | 0.010888291 |
-|    clip_fraction        | 0.136       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.83       |
-|    explained_variance   | 0.981       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0226     |
-|    n_updates            | 330         |
-|    policy_gradient_loss | -0.00266    |
-|    std                  | 1           |
-|    value_loss           | 0.00643     |
------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 2005   |
-|    iterations      | 34     |
-|    time_elapsed    | 277    |
-|    total_timesteps | 557056 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2030        |
-|    iterations           | 35          |
-|    time_elapsed         | 282         |
-|    total_timesteps      | 573440      |
-| train/                  |             |
-|    approx_kl            | 0.009418717 |
-|    clip_fraction        | 0.121       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.84       |
-|    explained_variance   | 0.975       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0234     |
-|    n_updates            | 340         |
-|    policy_gradient_loss | -0.00417    |
-|    std                  | 1           |
-|    value_loss           | 0.0219      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2054        |
-|    iterations           | 36          |
-|    time_elapsed         | 287         |
-|    total_timesteps      | 589824      |
-| train/                  |             |
-|    approx_kl            | 0.009153167 |
-|    clip_fraction        | 0.132       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.86       |
-|    explained_variance   | 0.972       |
-|    learning_rate        | 0.0003      |
-|    loss                 | 0.00458     |
-|    n_updates            | 350         |
-|    policy_gradient_loss | -0.00925    |
-|    std                  | 1.01        |
-|    value_loss           | 0.0644      |
------------------------------------------
-Eval num_timesteps=600000, episode_reward=142.43 +/- 15.10
-Episode length: 292.00 +/- 114.85
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 292          |
-|    mean_reward          | 142          |
-| time/                   |              |
-|    total_timesteps      | 600000       |
-| train/                  |              |
-|    approx_kl            | 0.0073751104 |
-|    clip_fraction        | 0.0817       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.85        |
-|    explained_variance   | 0.967        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.0205       |
-|    n_updates            | 360          |
-|    policy_gradient_loss | -0.0078      |
-|    std                  | 1.01         |
-|    value_loss           | 0.0854       |
-------------------------------------------
-New best mean reward!
-
-[Diag @ 600,000 | n_sheep=1 | success=100%]
-  SUCCESS                    20/20
-  action_mag mean=0.339 p10=0.246 p90=0.609 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=1.68m best=0.23m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=3.54m best=2.70m
-  reward/step (mean): progress=+0.0996  alignment=+0.0271  pen_bonus=+0.0302  step_cost=-0.0200  complete=+0.3022
--------------------------------
-| time/              |        |
-|    fps             | 2059   |
-|    iterations      | 37     |
-|    time_elapsed    | 294    |
-|    total_timesteps | 606208 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2069        |
-|    iterations           | 38          |
-|    time_elapsed         | 300         |
-|    total_timesteps      | 622592      |
-| train/                  |             |
-|    approx_kl            | 0.006348365 |
-|    clip_fraction        | 0.0685      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.85       |
-|    explained_variance   | 0.954       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0107     |
-|    n_updates            | 370         |
-|    policy_gradient_loss | -0.00403    |
-|    std                  | 1           |
-|    value_loss           | 0.0629      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2085         |
-|    iterations           | 39           |
-|    time_elapsed         | 306          |
-|    total_timesteps      | 638976       |
-| train/                  |              |
-|    approx_kl            | 0.0073653567 |
-|    clip_fraction        | 0.089        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.83        |
-|    explained_variance   | 0.976        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0379      |
-|    n_updates            | 380          |
-|    policy_gradient_loss | -0.00635     |
-|    std                  | 0.993        |
-|    value_loss           | 0.0213       |
-------------------------------------------
-Eval num_timesteps=650000, episode_reward=148.63 +/- 11.08
-Episode length: 312.15 +/- 83.52
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 312          |
-|    mean_reward          | 149          |
-| time/                   |              |
-|    total_timesteps      | 650000       |
-| train/                  |              |
-|    approx_kl            | 0.0064217458 |
-|    clip_fraction        | 0.0662       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.81        |
-|    explained_variance   | 0.977        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0177      |
-|    n_updates            | 390          |
-|    policy_gradient_loss | -0.00451     |
-|    std                  | 0.983        |
-|    value_loss           | 0.0325       |
-------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 2092   |
-|    iterations      | 40     |
-|    time_elapsed    | 313    |
-|    total_timesteps | 655360 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2107        |
-|    iterations           | 41          |
-|    time_elapsed         | 318         |
-|    total_timesteps      | 671744      |
-| train/                  |             |
-|    approx_kl            | 0.007330196 |
-|    clip_fraction        | 0.0823      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.79       |
-|    explained_variance   | 0.985       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0257     |
-|    n_updates            | 400         |
-|    policy_gradient_loss | -0.00559    |
-|    std                  | 0.971       |
-|    value_loss           | 0.0108      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2123         |
-|    iterations           | 42           |
-|    time_elapsed         | 323          |
-|    total_timesteps      | 688128       |
-| train/                  |              |
-|    approx_kl            | 0.0076610697 |
-|    clip_fraction        | 0.0876       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.77        |
-|    explained_variance   | 0.99         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.037       |
-|    n_updates            | 410          |
-|    policy_gradient_loss | -0.00581     |
-|    std                  | 0.966        |
-|    value_loss           | 0.00623      |
-------------------------------------------
-Eval num_timesteps=700000, episode_reward=137.38 +/- 18.54
-Episode length: 255.10 +/- 119.47
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 255          |
-|    mean_reward          | 137          |
-| time/                   |              |
-|    total_timesteps      | 700000       |
-| train/                  |              |
-|    approx_kl            | 0.0072219693 |
-|    clip_fraction        | 0.0734       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.76        |
-|    explained_variance   | 0.989        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0383      |
-|    n_updates            | 420          |
-|    policy_gradient_loss | -0.00416     |
-|    std                  | 0.961        |
-|    value_loss           | 0.00951      |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2128   |
-|    iterations      | 43     |
-|    time_elapsed    | 331    |
-|    total_timesteps | 704512 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2144         |
-|    iterations           | 44           |
-|    time_elapsed         | 336          |
-|    total_timesteps      | 720896       |
-| train/                  |              |
-|    approx_kl            | 0.0075956425 |
-|    clip_fraction        | 0.0895       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.75        |
-|    explained_variance   | 0.993        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0433      |
-|    n_updates            | 430          |
-|    policy_gradient_loss | -0.00475     |
-|    std                  | 0.953        |
-|    value_loss           | 0.00343      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2160         |
-|    iterations           | 45           |
-|    time_elapsed         | 341          |
-|    total_timesteps      | 737280       |
-| train/                  |              |
-|    approx_kl            | 0.0062526334 |
-|    clip_fraction        | 0.0699       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.72        |
-|    explained_variance   | 0.99         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0329      |
-|    n_updates            | 440          |
-|    policy_gradient_loss | -0.00355     |
-|    std                  | 0.942        |
-|    value_loss           | 0.0113       |
-------------------------------------------
-Eval num_timesteps=750000, episode_reward=145.04 +/- 16.56
-Episode length: 291.10 +/- 132.25
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 291          |
-|    mean_reward          | 145          |
-| time/                   |              |
-|    total_timesteps      | 750000       |
-| train/                  |              |
-|    approx_kl            | 0.0058749127 |
-|    clip_fraction        | 0.0607       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.71        |
-|    explained_variance   | 0.993        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0281      |
-|    n_updates            | 450          |
-|    policy_gradient_loss | -0.00324     |
-|    std                  | 0.934        |
-|    value_loss           | 0.00811      |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2161   |
-|    iterations      | 46     |
-|    time_elapsed    | 348    |
-|    total_timesteps | 753664 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2176         |
-|    iterations           | 47           |
-|    time_elapsed         | 353          |
-|    total_timesteps      | 770048       |
-| train/                  |              |
-|    approx_kl            | 0.0070656985 |
-|    clip_fraction        | 0.0763       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.68        |
-|    explained_variance   | 0.996        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0322      |
-|    n_updates            | 460          |
-|    policy_gradient_loss | -0.00485     |
-|    std                  | 0.92         |
-|    value_loss           | 0.00234      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2193        |
-|    iterations           | 48          |
-|    time_elapsed         | 358         |
-|    total_timesteps      | 786432      |
-| train/                  |             |
-|    approx_kl            | 0.008987564 |
-|    clip_fraction        | 0.112       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.66       |
-|    explained_variance   | 0.997       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0471     |
-|    n_updates            | 470         |
-|    policy_gradient_loss | -0.00864    |
-|    std                  | 0.909       |
-|    value_loss           | 0.00178     |
------------------------------------------
-Eval num_timesteps=800000, episode_reward=141.03 +/- 13.75
-Episode length: 256.90 +/- 100.39
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 257         |
-|    mean_reward          | 141         |
-| time/                   |             |
-|    total_timesteps      | 800000      |
-| train/                  |             |
-|    approx_kl            | 0.008297143 |
-|    clip_fraction        | 0.0945      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.67       |
-|    explained_variance   | 0.989       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0173     |
-|    n_updates            | 480         |
-|    policy_gradient_loss | -0.00352    |
-|    std                  | 0.921       |
-|    value_loss           | 0.00934     |
------------------------------------------
-
-[Diag @ 800,000 | n_sheep=1 | success=100%]
-  SUCCESS                    20/20
-  action_mag mean=0.333 p10=0.244 p90=0.332 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=1.40m best=0.75m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=3.47m best=1.58m
-  reward/step (mean): progress=+0.1108  alignment=+0.0328  pen_bonus=+0.0366  step_cost=-0.0200  complete=+0.3664
-
-[Curriculum] leaving stage n_sheep=1 after 800,000 steps | training success rate (last 100 eps) = 100%
-[Curriculum] → 2 sheep at step 800,000
-
--------------------------------
-| time/              |        |
-|    fps             | 2187   |
-|    iterations      | 49     |
-|    time_elapsed    | 367    |
-|    total_timesteps | 802816 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2201        |
-|    iterations           | 50          |
-|    time_elapsed         | 372         |
-|    total_timesteps      | 819200      |
-| train/                  |             |
-|    approx_kl            | 0.006534174 |
-|    clip_fraction        | 0.0754      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.7        |
-|    explained_variance   | 0.968       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0252     |
-|    n_updates            | 490         |
-|    policy_gradient_loss | 0.00248     |
-|    std                  | 0.942       |
-|    value_loss           | 0.021       |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2213        |
-|    iterations           | 51          |
-|    time_elapsed         | 377         |
-|    total_timesteps      | 835584      |
-| train/                  |             |
-|    approx_kl            | 0.012509884 |
-|    clip_fraction        | 0.182       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.73       |
-|    explained_variance   | 0.51        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0127     |
-|    n_updates            | 500         |
-|    policy_gradient_loss | 0.00321     |
-|    std                  | 0.953       |
-|    value_loss           | 0.0093      |
------------------------------------------
-Eval num_timesteps=850000, episode_reward=-30.43 +/- 29.94
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -30.4       |
-| time/                   |             |
-|    total_timesteps      | 850000      |
-| train/                  |             |
-|    approx_kl            | 0.009752454 |
-|    clip_fraction        | 0.146       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.74       |
-|    explained_variance   | 0.865       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0289     |
-|    n_updates            | 510         |
-|    policy_gradient_loss | 0.00274     |
-|    std                  | 0.95        |
-|    value_loss           | 0.0117      |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2153   |
-|    iterations      | 52     |
-|    time_elapsed    | 395    |
-|    total_timesteps | 851968 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2166        |
-|    iterations           | 53          |
-|    time_elapsed         | 400         |
-|    total_timesteps      | 868352      |
-| train/                  |             |
-|    approx_kl            | 0.011746319 |
-|    clip_fraction        | 0.133       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.75       |
-|    explained_variance   | 0.953       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0316     |
-|    n_updates            | 520         |
-|    policy_gradient_loss | 0.00116     |
-|    std                  | 0.958       |
-|    value_loss           | 0.00603     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2179        |
-|    iterations           | 54          |
-|    time_elapsed         | 405         |
-|    total_timesteps      | 884736      |
-| train/                  |             |
-|    approx_kl            | 0.008340008 |
-|    clip_fraction        | 0.111       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.75       |
-|    explained_variance   | 0.959       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0317     |
-|    n_updates            | 530         |
-|    policy_gradient_loss | 0.000628    |
-|    std                  | 0.955       |
-|    value_loss           | 0.00663     |
------------------------------------------
-Eval num_timesteps=900000, episode_reward=-21.80 +/- 34.98
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -21.8       |
-| time/                   |             |
-|    total_timesteps      | 900000      |
-| train/                  |             |
-|    approx_kl            | 0.010461532 |
-|    clip_fraction        | 0.13        |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.74       |
-|    explained_variance   | 0.88        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.00905    |
-|    n_updates            | 540         |
-|    policy_gradient_loss | -0.000256   |
-|    std                  | 0.951       |
-|    value_loss           | 0.00567     |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2128   |
-|    iterations      | 55     |
-|    time_elapsed    | 423    |
-|    total_timesteps | 901120 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2139         |
-|    iterations           | 56           |
-|    time_elapsed         | 428          |
-|    total_timesteps      | 917504       |
-| train/                  |              |
-|    approx_kl            | 0.0071650296 |
-|    clip_fraction        | 0.0988       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.75        |
-|    explained_variance   | 0.931        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0294      |
-|    n_updates            | 550          |
-|    policy_gradient_loss | -0.000672    |
-|    std                  | 0.957        |
-|    value_loss           | 0.00545      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2152        |
-|    iterations           | 57          |
-|    time_elapsed         | 433         |
-|    total_timesteps      | 933888      |
-| train/                  |             |
-|    approx_kl            | 0.009678386 |
-|    clip_fraction        | 0.112       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.74       |
-|    explained_variance   | 0.927       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0308     |
-|    n_updates            | 560         |
-|    policy_gradient_loss | -0.000959   |
-|    std                  | 0.953       |
-|    value_loss           | 0.00409     |
------------------------------------------
-Eval num_timesteps=950000, episode_reward=-34.37 +/- 35.50
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -34.4       |
-| time/                   |             |
-|    total_timesteps      | 950000      |
-| train/                  |             |
-|    approx_kl            | 0.008903094 |
-|    clip_fraction        | 0.111       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.74       |
-|    explained_variance   | 0.939       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0259     |
-|    n_updates            | 570         |
-|    policy_gradient_loss | -0.000299   |
-|    std                  | 0.955       |
-|    value_loss           | 0.00432     |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2108   |
-|    iterations      | 58     |
-|    time_elapsed    | 450    |
-|    total_timesteps | 950272 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2117        |
-|    iterations           | 59          |
-|    time_elapsed         | 456         |
-|    total_timesteps      | 966656      |
-| train/                  |             |
-|    approx_kl            | 0.008592881 |
-|    clip_fraction        | 0.0954      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.74       |
-|    explained_variance   | 0.929       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0173     |
-|    n_updates            | 580         |
-|    policy_gradient_loss | 0.00103     |
-|    std                  | 0.95        |
-|    value_loss           | 0.00265     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2129        |
-|    iterations           | 60          |
-|    time_elapsed         | 461         |
-|    total_timesteps      | 983040      |
-| train/                  |             |
-|    approx_kl            | 0.010225108 |
-|    clip_fraction        | 0.108       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.74       |
-|    explained_variance   | 0.972       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0135     |
-|    n_updates            | 590         |
-|    policy_gradient_loss | -0.000738   |
-|    std                  | 0.954       |
-|    value_loss           | 0.0029      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2137        |
-|    iterations           | 61          |
-|    time_elapsed         | 467         |
-|    total_timesteps      | 999424      |
-| train/                  |             |
-|    approx_kl            | 0.008312117 |
-|    clip_fraction        | 0.0887      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.75       |
-|    explained_variance   | 0.898       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0262     |
-|    n_updates            | 600         |
-|    policy_gradient_loss | -0.000497   |
-|    std                  | 0.958       |
-|    value_loss           | 0.00511     |
------------------------------------------
-Eval num_timesteps=1000000, episode_reward=-32.64 +/- 38.38
-Episode length: 2000.00 +/- 0.00
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 2e+03      |
-|    mean_reward          | -32.6      |
-| time/                   |            |
-|    total_timesteps      | 1000000    |
-| train/                  |            |
-|    approx_kl            | 0.00942917 |
-|    clip_fraction        | 0.105      |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.76      |
-|    explained_variance   | 0.961      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0331    |
-|    n_updates            | 610        |
-|    policy_gradient_loss | -0.0023    |
-|    std                  | 0.966      |
-|    value_loss           | 0.00282    |
-----------------------------------------
-
-[Diag @ 1,000,000 | n_sheep=2 | success=0%]
-  COMPACT_CANT_DRIVE         14/20
-  NEVER_COMPACT              6/20
-  action_mag mean=0.216 p10=0.000 p90=0.805 (0=stopped, 1=full speed)
-  min_flock_radius mean=3.39m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=1.18m best=0.11m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.11m best=7.44m
-  reward/step (mean): progress=-0.0011  alignment=+0.0106  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 2057    |
-|    iterations      | 62      |
-|    time_elapsed    | 493     |
-|    total_timesteps | 1015808 |
---------------------------------
----------------------------------------
-| time/                   |           |
-|    fps                  | 2067      |
-|    iterations           | 63        |
-|    time_elapsed         | 499       |
-|    total_timesteps      | 1032192   |
-| train/                  |           |
-|    approx_kl            | 0.008683  |
-|    clip_fraction        | 0.0967    |
-|    clip_range           | 0.2       |
-|    entropy_loss         | -2.77     |
-|    explained_variance   | 0.93      |
-|    learning_rate        | 0.0003    |
-|    loss                 | -0.029    |
-|    n_updates            | 620       |
-|    policy_gradient_loss | -0.000765 |
-|    std                  | 0.965     |
-|    value_loss           | 0.00446   |
----------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2077        |
-|    iterations           | 64          |
-|    time_elapsed         | 504         |
-|    total_timesteps      | 1048576     |
-| train/                  |             |
-|    approx_kl            | 0.009014329 |
-|    clip_fraction        | 0.113       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.76       |
-|    explained_variance   | 0.984       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0279     |
-|    n_updates            | 630         |
-|    policy_gradient_loss | -0.00211    |
-|    std                  | 0.962       |
-|    value_loss           | 0.00312     |
------------------------------------------
-Eval num_timesteps=1050000, episode_reward=-31.51 +/- 42.52
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -31.5       |
-| time/                   |             |
-|    total_timesteps      | 1050000     |
-| train/                  |             |
-|    approx_kl            | 0.008500135 |
-|    clip_fraction        | 0.105       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.75       |
-|    explained_variance   | 0.968       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0306     |
-|    n_updates            | 640         |
-|    policy_gradient_loss | -0.00312    |
-|    std                  | 0.955       |
-|    value_loss           | 0.00288     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 2042    |
-|    iterations      | 65      |
-|    time_elapsed    | 521     |
-|    total_timesteps | 1064960 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2056         |
-|    iterations           | 66           |
-|    time_elapsed         | 525          |
-|    total_timesteps      | 1081344      |
-| train/                  |              |
-|    approx_kl            | 0.0069593494 |
-|    clip_fraction        | 0.0923       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.74        |
-|    explained_variance   | 0.835        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0291      |
-|    n_updates            | 650          |
-|    policy_gradient_loss | -0.000469    |
-|    std                  | 0.952        |
-|    value_loss           | 0.00186      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2064        |
-|    iterations           | 67          |
-|    time_elapsed         | 531         |
-|    total_timesteps      | 1097728     |
-| train/                  |             |
-|    approx_kl            | 0.007817726 |
-|    clip_fraction        | 0.0933      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.74       |
-|    explained_variance   | 0.922       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0206     |
-|    n_updates            | 660         |
-|    policy_gradient_loss | -0.00208    |
-|    std                  | 0.953       |
-|    value_loss           | 0.00234     |
------------------------------------------
-Eval num_timesteps=1100000, episode_reward=-22.82 +/- 33.61
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -22.8       |
-| time/                   |             |
-|    total_timesteps      | 1100000     |
-| train/                  |             |
-|    approx_kl            | 0.006177975 |
-|    clip_fraction        | 0.0806      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.74       |
-|    explained_variance   | 0.951       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.026      |
-|    n_updates            | 670         |
-|    policy_gradient_loss | -5.8e-05    |
-|    std                  | 0.951       |
-|    value_loss           | 0.00184     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 2035    |
-|    iterations      | 68      |
-|    time_elapsed    | 547     |
-|    total_timesteps | 1114112 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2048        |
-|    iterations           | 69          |
-|    time_elapsed         | 551         |
-|    total_timesteps      | 1130496     |
-| train/                  |             |
-|    approx_kl            | 0.009605391 |
-|    clip_fraction        | 0.102       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.74       |
-|    explained_variance   | 0.951       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0344     |
-|    n_updates            | 680         |
-|    policy_gradient_loss | -0.0022     |
-|    std                  | 0.957       |
-|    value_loss           | 0.00221     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2060         |
-|    iterations           | 70           |
-|    time_elapsed         | 556          |
-|    total_timesteps      | 1146880      |
-| train/                  |              |
-|    approx_kl            | 0.0064521013 |
-|    clip_fraction        | 0.0953       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.75        |
-|    explained_variance   | 0.898        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0348      |
-|    n_updates            | 690          |
-|    policy_gradient_loss | -0.00112     |
-|    std                  | 0.96         |
-|    value_loss           | 0.00221      |
-------------------------------------------
-Eval num_timesteps=1150000, episode_reward=-26.36 +/- 35.49
-Episode length: 2000.00 +/- 0.00
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 2e+03      |
-|    mean_reward          | -26.4      |
-| time/                   |            |
-|    total_timesteps      | 1150000    |
-| train/                  |            |
-|    approx_kl            | 0.00777065 |
-|    clip_fraction        | 0.0837     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.76      |
-|    explained_variance   | 0.907      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0198    |
-|    n_updates            | 700        |
-|    policy_gradient_loss | -0.000371  |
-|    std                  | 0.963      |
-|    value_loss           | 0.00182    |
-----------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 2031    |
-|    iterations      | 71      |
-|    time_elapsed    | 572     |
-|    total_timesteps | 1163264 |
---------------------------------
----------------------------------------
-| time/                   |           |
-|    fps                  | 2044      |
-|    iterations           | 72        |
-|    time_elapsed         | 577       |
-|    total_timesteps      | 1179648   |
-| train/                  |           |
-|    approx_kl            | 0.006344  |
-|    clip_fraction        | 0.0719    |
-|    clip_range           | 0.2       |
-|    entropy_loss         | -2.76     |
-|    explained_variance   | 0.908     |
-|    learning_rate        | 0.0003    |
-|    loss                 | -0.0347   |
-|    n_updates            | 710       |
-|    policy_gradient_loss | -0.000455 |
-|    std                  | 0.961     |
-|    value_loss           | 0.00145   |
----------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2054         |
-|    iterations           | 73           |
-|    time_elapsed         | 582          |
-|    total_timesteps      | 1196032      |
-| train/                  |              |
-|    approx_kl            | 0.0060829036 |
-|    clip_fraction        | 0.0854       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.75        |
-|    explained_variance   | 0.896        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0232      |
-|    n_updates            | 720          |
-|    policy_gradient_loss | -0.00108     |
-|    std                  | 0.957        |
-|    value_loss           | 0.00152      |
-------------------------------------------
-Eval num_timesteps=1200000, episode_reward=-14.33 +/- 30.83
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -14.3        |
-| time/                   |              |
-|    total_timesteps      | 1200000      |
-| train/                  |              |
-|    approx_kl            | 0.0073732347 |
-|    clip_fraction        | 0.0783       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.76        |
-|    explained_variance   | 0.948        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0267      |
-|    n_updates            | 730          |
-|    policy_gradient_loss | -0.00212     |
-|    std                  | 0.968        |
-|    value_loss           | 0.00253      |
-------------------------------------------
-
-[Diag @ 1,200,000 | n_sheep=2 | success=0%]
-  COMPACT_CANT_DRIVE         15/20
-  NEVER_COMPACT              5/20
-  action_mag mean=0.273 p10=0.004 p90=1.008 (0=stopped, 1=full speed)
-  min_flock_radius mean=3.94m best=0.97m  (target <5m to compact)
-  min_dog_to_com   mean=1.16m best=0.35m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.54m best=4.20m
-  reward/step (mean): progress=+0.0001  alignment=+0.0121  pen_bonus=+0.0000  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1998    |
-|    iterations      | 74      |
-|    time_elapsed    | 606     |
-|    total_timesteps | 1212416 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2008        |
-|    iterations           | 75          |
-|    time_elapsed         | 611         |
-|    total_timesteps      | 1228800     |
-| train/                  |             |
-|    approx_kl            | 0.006109112 |
-|    clip_fraction        | 0.0814      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.78       |
-|    explained_variance   | 0.86        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0205     |
-|    n_updates            | 740         |
-|    policy_gradient_loss | -0.000541   |
-|    std                  | 0.973       |
-|    value_loss           | 0.00171     |
------------------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 2016       |
-|    iterations           | 76         |
-|    time_elapsed         | 617        |
-|    total_timesteps      | 1245184    |
-| train/                  |            |
-|    approx_kl            | 0.00703271 |
-|    clip_fraction        | 0.0781     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.78      |
-|    explained_variance   | 0.934      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0394    |
-|    n_updates            | 750        |
-|    policy_gradient_loss | -0.00105   |
-|    std                  | 0.975      |
-|    value_loss           | 0.00168    |
-----------------------------------------
-Eval num_timesteps=1250000, episode_reward=-18.12 +/- 39.82
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -18.1        |
-| time/                   |              |
-|    total_timesteps      | 1250000      |
-| train/                  |              |
-|    approx_kl            | 0.0064994176 |
-|    clip_fraction        | 0.0698       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.8         |
-|    explained_variance   | 0.919        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0166      |
-|    n_updates            | 760          |
-|    policy_gradient_loss | -0.000919    |
-|    std                  | 0.985        |
-|    value_loss           | 0.000832     |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1989    |
-|    iterations      | 77      |
-|    time_elapsed    | 634     |
-|    total_timesteps | 1261568 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2001        |
-|    iterations           | 78          |
-|    time_elapsed         | 638         |
-|    total_timesteps      | 1277952     |
-| train/                  |             |
-|    approx_kl            | 0.008321709 |
-|    clip_fraction        | 0.0902      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.81       |
-|    explained_variance   | 0.874       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0295     |
-|    n_updates            | 770         |
-|    policy_gradient_loss | -0.00219    |
-|    std                  | 0.991       |
-|    value_loss           | 0.00127     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2010        |
-|    iterations           | 79          |
-|    time_elapsed         | 643         |
-|    total_timesteps      | 1294336     |
-| train/                  |             |
-|    approx_kl            | 0.009220061 |
-|    clip_fraction        | 0.112       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.82       |
-|    explained_variance   | 0.952       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0379     |
-|    n_updates            | 780         |
-|    policy_gradient_loss | -0.00411    |
-|    std                  | 0.994       |
-|    value_loss           | 0.00295     |
------------------------------------------
-Eval num_timesteps=1300000, episode_reward=-22.41 +/- 35.57
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -22.4        |
-| time/                   |              |
-|    total_timesteps      | 1300000      |
-| train/                  |              |
-|    approx_kl            | 0.0071307076 |
-|    clip_fraction        | 0.0826       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.83        |
-|    explained_variance   | 0.948        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0281      |
-|    n_updates            | 790          |
-|    policy_gradient_loss | -0.00178     |
-|    std                  | 0.995        |
-|    value_loss           | 0.00169      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1986    |
-|    iterations      | 80      |
-|    time_elapsed    | 659     |
-|    total_timesteps | 1310720 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1996        |
-|    iterations           | 81          |
-|    time_elapsed         | 664         |
-|    total_timesteps      | 1327104     |
-| train/                  |             |
-|    approx_kl            | 0.008566003 |
-|    clip_fraction        | 0.0857      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.84       |
-|    explained_variance   | 0.904       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0369     |
-|    n_updates            | 800         |
-|    policy_gradient_loss | -0.00199    |
-|    std                  | 1.01        |
-|    value_loss           | 0.00203     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2006         |
-|    iterations           | 82           |
-|    time_elapsed         | 669          |
-|    total_timesteps      | 1343488      |
-| train/                  |              |
-|    approx_kl            | 0.0082352655 |
-|    clip_fraction        | 0.0989       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.86        |
-|    explained_variance   | 0.918        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0297      |
-|    n_updates            | 810          |
-|    policy_gradient_loss | -0.0023      |
-|    std                  | 1.01         |
-|    value_loss           | 0.00203      |
-------------------------------------------
-Eval num_timesteps=1350000, episode_reward=-14.21 +/- 38.53
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -14.2        |
-| time/                   |              |
-|    total_timesteps      | 1350000      |
-| train/                  |              |
-|    approx_kl            | 0.0066830693 |
-|    clip_fraction        | 0.0831       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.86        |
-|    explained_variance   | 0.923        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0331      |
-|    n_updates            | 820          |
-|    policy_gradient_loss | -0.00226     |
-|    std                  | 1.01         |
-|    value_loss           | 0.00125      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1983    |
-|    iterations      | 83      |
-|    time_elapsed    | 685     |
-|    total_timesteps | 1359872 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1991        |
-|    iterations           | 84          |
-|    time_elapsed         | 691         |
-|    total_timesteps      | 1376256     |
-| train/                  |             |
-|    approx_kl            | 0.008341949 |
-|    clip_fraction        | 0.101       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.85       |
-|    explained_variance   | 0.928       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0156     |
-|    n_updates            | 830         |
-|    policy_gradient_loss | -0.00132    |
-|    std                  | 1.01        |
-|    value_loss           | 0.00407     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1999        |
-|    iterations           | 85          |
-|    time_elapsed         | 696         |
-|    total_timesteps      | 1392640     |
-| train/                  |             |
-|    approx_kl            | 0.010089031 |
-|    clip_fraction        | 0.109       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.84       |
-|    explained_variance   | 0.914       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0249     |
-|    n_updates            | 840         |
-|    policy_gradient_loss | -0.00202    |
-|    std                  | 0.999       |
-|    value_loss           | 0.00555     |
------------------------------------------
-Eval num_timesteps=1400000, episode_reward=-5.74 +/- 37.76
-Episode length: 2000.00 +/- 0.00
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 2e+03      |
-|    mean_reward          | -5.74      |
-| time/                   |            |
-|    total_timesteps      | 1400000    |
-| train/                  |            |
-|    approx_kl            | 0.00840036 |
-|    clip_fraction        | 0.112      |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.84      |
-|    explained_variance   | 0.915      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0267    |
-|    n_updates            | 850        |
-|    policy_gradient_loss | -0.00422   |
-|    std                  | 1          |
-|    value_loss           | 0.0017     |
-----------------------------------------
-
-[Diag @ 1,400,000 | n_sheep=2 | success=0%]
-  COMPACT_CANT_DRIVE         16/20
-  NEVER_COMPACT              4/20
-  action_mag mean=0.258 p10=0.000 p90=1.004 (0=stopped, 1=full speed)
-  min_flock_radius mean=3.30m best=0.61m  (target <5m to compact)
-  min_dog_to_com   mean=0.76m best=0.22m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.16m best=4.08m
-  reward/step (mean): progress=+0.0035  alignment=+0.0165  pen_bonus=+0.0000  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1954    |
-|    iterations      | 86      |
-|    time_elapsed    | 720     |
-|    total_timesteps | 1409024 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1964        |
-|    iterations           | 87          |
-|    time_elapsed         | 725         |
-|    total_timesteps      | 1425408     |
-| train/                  |             |
-|    approx_kl            | 0.007908808 |
-|    clip_fraction        | 0.0839      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.85       |
-|    explained_variance   | 0.755       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.018      |
-|    n_updates            | 860         |
-|    policy_gradient_loss | -0.00223    |
-|    std                  | 1.01        |
-|    value_loss           | 0.00248     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1972        |
-|    iterations           | 88          |
-|    time_elapsed         | 730         |
-|    total_timesteps      | 1441792     |
-| train/                  |             |
-|    approx_kl            | 0.007957449 |
-|    clip_fraction        | 0.0864      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.86       |
-|    explained_variance   | 0.868       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0315     |
-|    n_updates            | 870         |
-|    policy_gradient_loss | -0.00288    |
-|    std                  | 1.01        |
-|    value_loss           | 0.00145     |
------------------------------------------
-Eval num_timesteps=1450000, episode_reward=-13.10 +/- 29.51
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -13.1       |
-| time/                   |             |
-|    total_timesteps      | 1450000     |
-| train/                  |             |
-|    approx_kl            | 0.007803983 |
-|    clip_fraction        | 0.083       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.86       |
-|    explained_variance   | 0.83        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0212     |
-|    n_updates            | 880         |
-|    policy_gradient_loss | -0.00119    |
-|    std                  | 1.01        |
-|    value_loss           | 0.00191     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1952    |
-|    iterations      | 89      |
-|    time_elapsed    | 746     |
-|    total_timesteps | 1458176 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1961        |
-|    iterations           | 90          |
-|    time_elapsed         | 751         |
-|    total_timesteps      | 1474560     |
-| train/                  |             |
-|    approx_kl            | 0.010021031 |
-|    clip_fraction        | 0.097       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.88       |
-|    explained_variance   | 0.902       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0221     |
-|    n_updates            | 890         |
-|    policy_gradient_loss | -0.00294    |
-|    std                  | 1.02        |
-|    value_loss           | 0.00136     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1970         |
-|    iterations           | 91           |
-|    time_elapsed         | 756          |
-|    total_timesteps      | 1490944      |
-| train/                  |              |
-|    approx_kl            | 0.0076614916 |
-|    clip_fraction        | 0.0963       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.89        |
-|    explained_variance   | 0.945        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0273      |
-|    n_updates            | 900          |
-|    policy_gradient_loss | -0.00355     |
-|    std                  | 1.03         |
-|    value_loss           | 0.00181      |
-------------------------------------------
-Eval num_timesteps=1500000, episode_reward=5.01 +/- 34.23
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 5.01        |
-| time/                   |             |
-|    total_timesteps      | 1500000     |
-| train/                  |             |
-|    approx_kl            | 0.005815446 |
-|    clip_fraction        | 0.0675      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.9        |
-|    explained_variance   | 0.934       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0316     |
-|    n_updates            | 910         |
-|    policy_gradient_loss | -0.00215    |
-|    std                  | 1.03        |
-|    value_loss           | 0.00162     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1950    |
-|    iterations      | 92      |
-|    time_elapsed    | 772     |
-|    total_timesteps | 1507328 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1959         |
-|    iterations           | 93           |
-|    time_elapsed         | 777          |
-|    total_timesteps      | 1523712      |
-| train/                  |              |
-|    approx_kl            | 0.0071218535 |
-|    clip_fraction        | 0.0897       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.9         |
-|    explained_variance   | 0.937        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0219      |
-|    n_updates            | 920          |
-|    policy_gradient_loss | -0.00225     |
-|    std                  | 1.03         |
-|    value_loss           | 0.00463      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1967        |
-|    iterations           | 94          |
-|    time_elapsed         | 782         |
-|    total_timesteps      | 1540096     |
-| train/                  |             |
-|    approx_kl            | 0.006857206 |
-|    clip_fraction        | 0.0809      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.89       |
-|    explained_variance   | 0.933       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0252     |
-|    n_updates            | 930         |
-|    policy_gradient_loss | -0.00219    |
-|    std                  | 1.02        |
-|    value_loss           | 0.00436     |
------------------------------------------
-Eval num_timesteps=1550000, episode_reward=-4.04 +/- 33.69
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -4.04       |
-| time/                   |             |
-|    total_timesteps      | 1550000     |
-| train/                  |             |
-|    approx_kl            | 0.006146897 |
-|    clip_fraction        | 0.0821      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.87       |
-|    explained_variance   | 0.913       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0352     |
-|    n_updates            | 940         |
-|    policy_gradient_loss | -0.00258    |
-|    std                  | 1.02        |
-|    value_loss           | 0.00325     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1948    |
-|    iterations      | 95      |
-|    time_elapsed    | 798     |
-|    total_timesteps | 1556480 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1958         |
-|    iterations           | 96           |
-|    time_elapsed         | 803          |
-|    total_timesteps      | 1572864      |
-| train/                  |              |
-|    approx_kl            | 0.0069321445 |
-|    clip_fraction        | 0.0778       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.86        |
-|    explained_variance   | 0.94         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.013       |
-|    n_updates            | 950          |
-|    policy_gradient_loss | -0.00214     |
-|    std                  | 1.01         |
-|    value_loss           | 0.00162      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1965         |
-|    iterations           | 97           |
-|    time_elapsed         | 808          |
-|    total_timesteps      | 1589248      |
-| train/                  |              |
-|    approx_kl            | 0.0066491435 |
-|    clip_fraction        | 0.0714       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.88        |
-|    explained_variance   | 0.941        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0304      |
-|    n_updates            | 960          |
-|    policy_gradient_loss | -0.00212     |
-|    std                  | 1.03         |
-|    value_loss           | 0.0011       |
-------------------------------------------
-Eval num_timesteps=1600000, episode_reward=12.65 +/- 31.73
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 12.6         |
-| time/                   |              |
-|    total_timesteps      | 1600000      |
-| train/                  |              |
-|    approx_kl            | 0.0050257677 |
-|    clip_fraction        | 0.0588       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.9         |
-|    explained_variance   | 0.939        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0359      |
-|    n_updates            | 970          |
-|    policy_gradient_loss | -0.0013      |
-|    std                  | 1.04         |
-|    value_loss           | 0.00201      |
-------------------------------------------
-
-[Diag @ 1,600,000 | n_sheep=2 | success=0%]
-  COMPACT_CANT_DRIVE         13/20
-  NEVER_COMPACT              7/20
-  action_mag mean=0.252 p10=0.004 p90=0.980 (0=stopped, 1=full speed)
-  min_flock_radius mean=4.30m best=0.92m  (target <5m to compact)
-  min_dog_to_com   mean=0.74m best=0.38m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.76m best=5.49m
-  reward/step (mean): progress=-0.0006  alignment=+0.0287  pen_bonus=+0.0000  step_cost=-0.0200  complete=+0.0000
-
-[Curriculum] leaving stage n_sheep=2 after 800,000 steps | training success rate (last 100 eps) = 0%
-[Curriculum] → 3 sheep at step 1,600,000
-
---------------------------------
-| time/              |         |
-|    fps             | 1930    |
-|    iterations      | 98      |
-|    time_elapsed    | 831     |
-|    total_timesteps | 1605632 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1937         |
-|    iterations           | 99           |
-|    time_elapsed         | 837          |
-|    total_timesteps      | 1622016      |
-| train/                  |              |
-|    approx_kl            | 0.0085028205 |
-|    clip_fraction        | 0.0905       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.89        |
-|    explained_variance   | 0.909        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0346      |
-|    n_updates            | 980          |
-|    policy_gradient_loss | -0.00245     |
-|    std                  | 1.02         |
-|    value_loss           | 0.00492      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1945        |
-|    iterations           | 100         |
-|    time_elapsed         | 842         |
-|    total_timesteps      | 1638400     |
-| train/                  |             |
-|    approx_kl            | 0.009084044 |
-|    clip_fraction        | 0.118       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.91       |
-|    explained_variance   | 0.964       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0416     |
-|    n_updates            | 990         |
-|    policy_gradient_loss | 0.0025      |
-|    std                  | 1.04        |
-|    value_loss           | 0.00194     |
------------------------------------------
-Eval num_timesteps=1650000, episode_reward=3.05 +/- 36.42
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 3.05        |
-| time/                   |             |
-|    total_timesteps      | 1650000     |
-| train/                  |             |
-|    approx_kl            | 0.009275759 |
-|    clip_fraction        | 0.108       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.92       |
-|    explained_variance   | 0.965       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0336     |
-|    n_updates            | 1000        |
-|    policy_gradient_loss | 0.000149    |
-|    std                  | 1.04        |
-|    value_loss           | 0.00185     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1926    |
-|    iterations      | 101     |
-|    time_elapsed    | 859     |
-|    total_timesteps | 1654784 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1934        |
-|    iterations           | 102         |
-|    time_elapsed         | 864         |
-|    total_timesteps      | 1671168     |
-| train/                  |             |
-|    approx_kl            | 0.008650862 |
-|    clip_fraction        | 0.117       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.92       |
-|    explained_variance   | 0.938       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0279     |
-|    n_updates            | 1010        |
-|    policy_gradient_loss | -0.000545   |
-|    std                  | 1.04        |
-|    value_loss           | 0.00611     |
------------------------------------------
----------------------------------------
-| time/                   |           |
-|    fps                  | 1939      |
-|    iterations           | 103       |
-|    time_elapsed         | 869       |
-|    total_timesteps      | 1687552   |
-| train/                  |           |
-|    approx_kl            | 0.0080826 |
-|    clip_fraction        | 0.0992    |
-|    clip_range           | 0.2       |
-|    entropy_loss         | -2.93     |
-|    explained_variance   | 0.952     |
-|    learning_rate        | 0.0003    |
-|    loss                 | -0.0415   |
-|    n_updates            | 1020      |
-|    policy_gradient_loss | -0.00201  |
-|    std                  | 1.05      |
-|    value_loss           | 0.00251   |
----------------------------------------
-Eval num_timesteps=1700000, episode_reward=-4.66 +/- 36.05
-Episode length: 2000.00 +/- 0.00
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 2e+03      |
-|    mean_reward          | -4.66      |
-| time/                   |            |
-|    total_timesteps      | 1700000    |
-| train/                  |            |
-|    approx_kl            | 0.00786162 |
-|    clip_fraction        | 0.0921     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.95      |
-|    explained_variance   | 0.893      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0301    |
-|    n_updates            | 1030       |
-|    policy_gradient_loss | -0.000631  |
-|    std                  | 1.06       |
-|    value_loss           | 0.00158    |
-----------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1922    |
-|    iterations      | 104     |
-|    time_elapsed    | 886     |
-|    total_timesteps | 1703936 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1930        |
-|    iterations           | 105         |
-|    time_elapsed         | 891         |
-|    total_timesteps      | 1720320     |
-| train/                  |             |
-|    approx_kl            | 0.008055547 |
-|    clip_fraction        | 0.0842      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.96       |
-|    explained_variance   | 0.918       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.027      |
-|    n_updates            | 1040        |
-|    policy_gradient_loss | -6.56e-05   |
-|    std                  | 1.07        |
-|    value_loss           | 0.00193     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1937        |
-|    iterations           | 106         |
-|    time_elapsed         | 896         |
-|    total_timesteps      | 1736704     |
-| train/                  |             |
-|    approx_kl            | 0.008067045 |
-|    clip_fraction        | 0.087       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.97       |
-|    explained_variance   | 0.878       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0281     |
-|    n_updates            | 1050        |
-|    policy_gradient_loss | -0.00194    |
-|    std                  | 1.07        |
-|    value_loss           | 0.0082      |
------------------------------------------
-Eval num_timesteps=1750000, episode_reward=-0.31 +/- 42.66
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -0.309       |
-| time/                   |              |
-|    total_timesteps      | 1750000      |
-| train/                  |              |
-|    approx_kl            | 0.0066514863 |
-|    clip_fraction        | 0.0808       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.99        |
-|    explained_variance   | 0.888        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0335      |
-|    n_updates            | 1060         |
-|    policy_gradient_loss | -0.00108     |
-|    std                  | 1.08         |
-|    value_loss           | 0.00303      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1921    |
-|    iterations      | 107     |
-|    time_elapsed    | 912     |
-|    total_timesteps | 1753088 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1927        |
-|    iterations           | 108         |
-|    time_elapsed         | 917         |
-|    total_timesteps      | 1769472     |
-| train/                  |             |
-|    approx_kl            | 0.008252729 |
-|    clip_fraction        | 0.093       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3          |
-|    explained_variance   | 0.959       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0413     |
-|    n_updates            | 1070        |
-|    policy_gradient_loss | -0.00241    |
-|    std                  | 1.09        |
-|    value_loss           | 0.00122     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1935         |
-|    iterations           | 109          |
-|    time_elapsed         | 922          |
-|    total_timesteps      | 1785856      |
-| train/                  |              |
-|    approx_kl            | 0.0073527684 |
-|    clip_fraction        | 0.0822       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.01        |
-|    explained_variance   | 0.883        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.018       |
-|    n_updates            | 1080         |
-|    policy_gradient_loss | -0.00172     |
-|    std                  | 1.1          |
-|    value_loss           | 0.00172      |
-------------------------------------------
-Eval num_timesteps=1800000, episode_reward=8.99 +/- 39.35
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 8.99        |
-| time/                   |             |
-|    total_timesteps      | 1800000     |
-| train/                  |             |
-|    approx_kl            | 0.006149094 |
-|    clip_fraction        | 0.0771      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.03       |
-|    explained_variance   | 0.911       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0315     |
-|    n_updates            | 1090        |
-|    policy_gradient_loss | -0.000744   |
-|    std                  | 1.1         |
-|    value_loss           | 0.00456     |
------------------------------------------
-
-[Diag @ 1,800,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              19/20
-  COMPACT_CANT_DRIVE         1/20
-  action_mag mean=0.049 p10=0.007 p90=0.049 (0=stopped, 1=full speed)
-  min_flock_radius mean=7.79m best=4.73m  (target <5m to compact)
-  min_dog_to_com   mean=0.92m best=0.25m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=14.27m best=7.54m
-  reward/step (mean): progress=-0.0043  alignment=+0.0208  pen_bonus=+0.0000  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1899    |
-|    iterations      | 110     |
-|    time_elapsed    | 948     |
-|    total_timesteps | 1802240 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1906        |
-|    iterations           | 111         |
-|    time_elapsed         | 953         |
-|    total_timesteps      | 1818624     |
-| train/                  |             |
-|    approx_kl            | 0.007161974 |
-|    clip_fraction        | 0.0871      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.03       |
-|    explained_variance   | 0.914       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0359     |
-|    n_updates            | 1100        |
-|    policy_gradient_loss | -0.00186    |
-|    std                  | 1.1         |
-|    value_loss           | 0.00214     |
------------------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1914       |
-|    iterations           | 112        |
-|    time_elapsed         | 958        |
-|    total_timesteps      | 1835008    |
-| train/                  |            |
-|    approx_kl            | 0.00886854 |
-|    clip_fraction        | 0.103      |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -3.04      |
-|    explained_variance   | 0.94       |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.04      |
-|    n_updates            | 1110       |
-|    policy_gradient_loss | -0.00333   |
-|    std                  | 1.11       |
-|    value_loss           | 0.00456    |
-----------------------------------------
-Eval num_timesteps=1850000, episode_reward=14.49 +/- 36.35
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 14.5         |
-| time/                   |              |
-|    total_timesteps      | 1850000      |
-| train/                  |              |
-|    approx_kl            | 0.0058414284 |
-|    clip_fraction        | 0.0642       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.05        |
-|    explained_variance   | 0.871        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.033       |
-|    n_updates            | 1120         |
-|    policy_gradient_loss | -0.000891    |
-|    std                  | 1.11         |
-|    value_loss           | 0.00394      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1898    |
-|    iterations      | 113     |
-|    time_elapsed    | 975     |
-|    total_timesteps | 1851392 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1906        |
-|    iterations           | 114         |
-|    time_elapsed         | 979         |
-|    total_timesteps      | 1867776     |
-| train/                  |             |
-|    approx_kl            | 0.008916938 |
-|    clip_fraction        | 0.0916      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.05       |
-|    explained_variance   | 0.937       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0334     |
-|    n_updates            | 1130        |
-|    policy_gradient_loss | -0.00257    |
-|    std                  | 1.12        |
-|    value_loss           | 0.00285     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1913        |
-|    iterations           | 115         |
-|    time_elapsed         | 984         |
-|    total_timesteps      | 1884160     |
-| train/                  |             |
-|    approx_kl            | 0.008523149 |
-|    clip_fraction        | 0.0907      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.06       |
-|    explained_variance   | 0.954       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0339     |
-|    n_updates            | 1140        |
-|    policy_gradient_loss | -0.0034     |
-|    std                  | 1.12        |
-|    value_loss           | 0.00209     |
------------------------------------------
-Eval num_timesteps=1900000, episode_reward=9.85 +/- 42.18
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 9.85         |
-| time/                   |              |
-|    total_timesteps      | 1900000      |
-| train/                  |              |
-|    approx_kl            | 0.0075978916 |
-|    clip_fraction        | 0.0819       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.06        |
-|    explained_variance   | 0.96         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0313      |
-|    n_updates            | 1150         |
-|    policy_gradient_loss | -0.00272     |
-|    std                  | 1.12         |
-|    value_loss           | 0.00332      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1896    |
-|    iterations      | 116     |
-|    time_elapsed    | 1002    |
-|    total_timesteps | 1900544 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1902        |
-|    iterations           | 117         |
-|    time_elapsed         | 1007        |
-|    total_timesteps      | 1916928     |
-| train/                  |             |
-|    approx_kl            | 0.008376695 |
-|    clip_fraction        | 0.0935      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.07       |
-|    explained_variance   | 0.964       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0392     |
-|    n_updates            | 1160        |
-|    policy_gradient_loss | -0.00354    |
-|    std                  | 1.12        |
-|    value_loss           | 0.00203     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1909         |
-|    iterations           | 118          |
-|    time_elapsed         | 1012         |
-|    total_timesteps      | 1933312      |
-| train/                  |              |
-|    approx_kl            | 0.0077100536 |
-|    clip_fraction        | 0.0854       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.07        |
-|    explained_variance   | 0.933        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0467      |
-|    n_updates            | 1170         |
-|    policy_gradient_loss | -0.00421     |
-|    std                  | 1.12         |
-|    value_loss           | 0.00132      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1915        |
-|    iterations           | 119         |
-|    time_elapsed         | 1018        |
-|    total_timesteps      | 1949696     |
-| train/                  |             |
-|    approx_kl            | 0.006848542 |
-|    clip_fraction        | 0.0674      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.07       |
-|    explained_variance   | 0.959       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0335     |
-|    n_updates            | 1180        |
-|    policy_gradient_loss | -0.00229    |
-|    std                  | 1.13        |
-|    value_loss           | 0.00138     |
------------------------------------------
-Eval num_timesteps=1950000, episode_reward=29.72 +/- 38.42
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 29.7        |
-| time/                   |             |
-|    total_timesteps      | 1950000     |
-| train/                  |             |
-|    approx_kl            | 0.007300608 |
-|    clip_fraction        | 0.0824      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.07       |
-|    explained_variance   | 0.977       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0358     |
-|    n_updates            | 1190        |
-|    policy_gradient_loss | -0.00364    |
-|    std                  | 1.12        |
-|    value_loss           | 0.00159     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1899    |
-|    iterations      | 120     |
-|    time_elapsed    | 1034    |
-|    total_timesteps | 1966080 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1906         |
-|    iterations           | 121          |
-|    time_elapsed         | 1040         |
-|    total_timesteps      | 1982464      |
-| train/                  |              |
-|    approx_kl            | 0.0072772675 |
-|    clip_fraction        | 0.0703       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.07        |
-|    explained_variance   | 0.882        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0357      |
-|    n_updates            | 1200         |
-|    policy_gradient_loss | -0.00163     |
-|    std                  | 1.13         |
-|    value_loss           | 0.00471      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1912        |
-|    iterations           | 122         |
-|    time_elapsed         | 1045        |
-|    total_timesteps      | 1998848     |
-| train/                  |             |
-|    approx_kl            | 0.007866079 |
-|    clip_fraction        | 0.0898      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.07       |
-|    explained_variance   | 0.962       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0304     |
-|    n_updates            | 1210        |
-|    policy_gradient_loss | -0.0052     |
-|    std                  | 1.13        |
-|    value_loss           | 0.0014      |
------------------------------------------
-Eval num_timesteps=2000000, episode_reward=14.20 +/- 34.02
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 14.2         |
-| time/                   |              |
-|    total_timesteps      | 2000000      |
-| train/                  |              |
-|    approx_kl            | 0.0073383995 |
-|    clip_fraction        | 0.083        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.07        |
-|    explained_variance   | 0.95         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0369      |
-|    n_updates            | 1220         |
-|    policy_gradient_loss | -0.00296     |
-|    std                  | 1.12         |
-|    value_loss           | 0.00336      |
-------------------------------------------
-
-[Diag @ 2,000,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              12/20
-  COMPACT_CANT_DRIVE         8/20
-  action_mag mean=0.076 p10=0.007 p90=0.097 (0=stopped, 1=full speed)
-  min_flock_radius mean=5.33m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=1.01m best=0.16m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.40m best=6.50m
-  reward/step (mean): progress=+0.0041  alignment=+0.0263  pen_bonus=+0.0013  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1881    |
-|    iterations      | 123     |
-|    time_elapsed    | 1071    |
-|    total_timesteps | 2015232 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1887         |
-|    iterations           | 124          |
-|    time_elapsed         | 1076         |
-|    total_timesteps      | 2031616      |
-| train/                  |              |
-|    approx_kl            | 0.0060287267 |
-|    clip_fraction        | 0.0716       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.07        |
-|    explained_variance   | 0.902        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0402      |
-|    n_updates            | 1230         |
-|    policy_gradient_loss | -0.00308     |
-|    std                  | 1.13         |
-|    value_loss           | 0.00475      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1894         |
-|    iterations           | 125          |
-|    time_elapsed         | 1081         |
-|    total_timesteps      | 2048000      |
-| train/                  |              |
-|    approx_kl            | 0.0073304214 |
-|    clip_fraction        | 0.08         |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.95         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0436      |
-|    n_updates            | 1240         |
-|    policy_gradient_loss | -0.00373     |
-|    std                  | 1.13         |
-|    value_loss           | 0.00138      |
-------------------------------------------
-Eval num_timesteps=2050000, episode_reward=18.68 +/- 36.20
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 18.7         |
-| time/                   |              |
-|    total_timesteps      | 2050000      |
-| train/                  |              |
-|    approx_kl            | 0.0068036346 |
-|    clip_fraction        | 0.0768       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.07        |
-|    explained_variance   | 0.897        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0461      |
-|    n_updates            | 1250         |
-|    policy_gradient_loss | -0.00392     |
-|    std                  | 1.13         |
-|    value_loss           | 0.0013       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1880    |
-|    iterations      | 126     |
-|    time_elapsed    | 1097    |
-|    total_timesteps | 2064384 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1886        |
-|    iterations           | 127         |
-|    time_elapsed         | 1102        |
-|    total_timesteps      | 2080768     |
-| train/                  |             |
-|    approx_kl            | 0.006960577 |
-|    clip_fraction        | 0.0689      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.07       |
-|    explained_variance   | 0.917       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0302     |
-|    n_updates            | 1260        |
-|    policy_gradient_loss | -0.00248    |
-|    std                  | 1.12        |
-|    value_loss           | 0.00841     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1892        |
-|    iterations           | 128         |
-|    time_elapsed         | 1108        |
-|    total_timesteps      | 2097152     |
-| train/                  |             |
-|    approx_kl            | 0.007300884 |
-|    clip_fraction        | 0.0705      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.09       |
-|    explained_variance   | 0.915       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0338     |
-|    n_updates            | 1270        |
-|    policy_gradient_loss | -0.00351    |
-|    std                  | 1.14        |
-|    value_loss           | 0.00336     |
------------------------------------------
-Eval num_timesteps=2100000, episode_reward=37.33 +/- 41.91
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 37.3        |
-| time/                   |             |
-|    total_timesteps      | 2100000     |
-| train/                  |             |
-|    approx_kl            | 0.007571588 |
-|    clip_fraction        | 0.076       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.1        |
-|    explained_variance   | 0.907       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0278     |
-|    n_updates            | 1280        |
-|    policy_gradient_loss | -0.00336    |
-|    std                  | 1.14        |
-|    value_loss           | 0.00228     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1878    |
-|    iterations      | 129     |
-|    time_elapsed    | 1124    |
-|    total_timesteps | 2113536 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1884        |
-|    iterations           | 130         |
-|    time_elapsed         | 1130        |
-|    total_timesteps      | 2129920     |
-| train/                  |             |
-|    approx_kl            | 0.007885255 |
-|    clip_fraction        | 0.088       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.11       |
-|    explained_variance   | 0.939       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0388     |
-|    n_updates            | 1290        |
-|    policy_gradient_loss | -0.00498    |
-|    std                  | 1.15        |
-|    value_loss           | 0.00231     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1890         |
-|    iterations           | 131          |
-|    time_elapsed         | 1135         |
-|    total_timesteps      | 2146304      |
-| train/                  |              |
-|    approx_kl            | 0.0073760273 |
-|    clip_fraction        | 0.0769       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.11        |
-|    explained_variance   | 0.955        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0277      |
-|    n_updates            | 1300         |
-|    policy_gradient_loss | -0.00306     |
-|    std                  | 1.15         |
-|    value_loss           | 0.00294      |
-------------------------------------------
-Eval num_timesteps=2150000, episode_reward=31.84 +/- 38.92
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 31.8        |
-| time/                   |             |
-|    total_timesteps      | 2150000     |
-| train/                  |             |
-|    approx_kl            | 0.006736047 |
-|    clip_fraction        | 0.0685      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.12       |
-|    explained_variance   | 0.913       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0302     |
-|    n_updates            | 1310        |
-|    policy_gradient_loss | -0.0021     |
-|    std                  | 1.16        |
-|    value_loss           | 0.00422     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1872    |
-|    iterations      | 132     |
-|    time_elapsed    | 1155    |
-|    total_timesteps | 2162688 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1878        |
-|    iterations           | 133         |
-|    time_elapsed         | 1160        |
-|    total_timesteps      | 2179072     |
-| train/                  |             |
-|    approx_kl            | 0.006166819 |
-|    clip_fraction        | 0.0668      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.13       |
-|    explained_variance   | 0.956       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0473     |
-|    n_updates            | 1320        |
-|    policy_gradient_loss | -0.00364    |
-|    std                  | 1.16        |
-|    value_loss           | 0.00158     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1884         |
-|    iterations           | 134          |
-|    time_elapsed         | 1165         |
-|    total_timesteps      | 2195456      |
-| train/                  |              |
-|    approx_kl            | 0.0075986157 |
-|    clip_fraction        | 0.0769       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.14        |
-|    explained_variance   | 0.966        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0317      |
-|    n_updates            | 1330         |
-|    policy_gradient_loss | -0.00398     |
-|    std                  | 1.17         |
-|    value_loss           | 0.00307      |
-------------------------------------------
-Eval num_timesteps=2200000, episode_reward=26.98 +/- 37.84
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 27          |
-| time/                   |             |
-|    total_timesteps      | 2200000     |
-| train/                  |             |
-|    approx_kl            | 0.008170303 |
-|    clip_fraction        | 0.0981      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.14       |
-|    explained_variance   | 0.964       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0326     |
-|    n_updates            | 1340        |
-|    policy_gradient_loss | -0.00415    |
-|    std                  | 1.16        |
-|    value_loss           | 0.00349     |
------------------------------------------
-
-[Diag @ 2,200,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              16/20
-  COMPACT_CANT_DRIVE         4/20
-  action_mag mean=0.067 p10=0.003 p90=0.067 (0=stopped, 1=full speed)
-  min_flock_radius mean=7.25m best=1.61m  (target <5m to compact)
-  min_dog_to_com   mean=0.97m best=0.20m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.28m best=5.53m
-  reward/step (mean): progress=+0.0007  alignment=+0.0353  pen_bonus=+0.0008  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1832    |
-|    iterations      | 135     |
-|    time_elapsed    | 1206    |
-|    total_timesteps | 2211840 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1805        |
-|    iterations           | 136         |
-|    time_elapsed         | 1234        |
-|    total_timesteps      | 2228224     |
-| train/                  |             |
-|    approx_kl            | 0.006131858 |
-|    clip_fraction        | 0.067       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.13       |
-|    explained_variance   | 0.927       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0328     |
-|    n_updates            | 1350        |
-|    policy_gradient_loss | -0.0022     |
-|    std                  | 1.16        |
-|    value_loss           | 0.000981    |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1811         |
-|    iterations           | 137          |
-|    time_elapsed         | 1239         |
-|    total_timesteps      | 2244608      |
-| train/                  |              |
-|    approx_kl            | 0.0071705403 |
-|    clip_fraction        | 0.0699       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.12        |
-|    explained_variance   | 0.913        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0391      |
-|    n_updates            | 1360         |
-|    policy_gradient_loss | -0.0032      |
-|    std                  | 1.15         |
-|    value_loss           | 0.00639      |
-------------------------------------------
-Eval num_timesteps=2250000, episode_reward=28.55 +/- 29.67
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 28.5        |
-| time/                   |             |
-|    total_timesteps      | 2250000     |
-| train/                  |             |
-|    approx_kl            | 0.007929602 |
-|    clip_fraction        | 0.0812      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.14       |
-|    explained_variance   | 0.933       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0592     |
-|    n_updates            | 1370        |
-|    policy_gradient_loss | -0.00434    |
-|    std                  | 1.17        |
-|    value_loss           | 0.00337     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1800    |
-|    iterations      | 138     |
-|    time_elapsed    | 1255    |
-|    total_timesteps | 2260992 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1806         |
-|    iterations           | 139          |
-|    time_elapsed         | 1260         |
-|    total_timesteps      | 2277376      |
-| train/                  |              |
-|    approx_kl            | 0.0062256474 |
-|    clip_fraction        | 0.0592       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.15        |
-|    explained_variance   | 0.935        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0368      |
-|    n_updates            | 1380         |
-|    policy_gradient_loss | -0.00242     |
-|    std                  | 1.17         |
-|    value_loss           | 0.00787      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1812         |
-|    iterations           | 140          |
-|    time_elapsed         | 1265         |
-|    total_timesteps      | 2293760      |
-| train/                  |              |
-|    approx_kl            | 0.0075241774 |
-|    clip_fraction        | 0.0885       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.14        |
-|    explained_variance   | 0.948        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0385      |
-|    n_updates            | 1390         |
-|    policy_gradient_loss | -0.00346     |
-|    std                  | 1.16         |
-|    value_loss           | 0.00172      |
-------------------------------------------
-Eval num_timesteps=2300000, episode_reward=43.34 +/- 34.73
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 43.3         |
-| time/                   |              |
-|    total_timesteps      | 2300000      |
-| train/                  |              |
-|    approx_kl            | 0.0073855575 |
-|    clip_fraction        | 0.0753       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.12        |
-|    explained_variance   | 0.911        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0377      |
-|    n_updates            | 1400         |
-|    policy_gradient_loss | -0.0034      |
-|    std                  | 1.15         |
-|    value_loss           | 0.00645      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1801    |
-|    iterations      | 141     |
-|    time_elapsed    | 1282    |
-|    total_timesteps | 2310144 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1806        |
-|    iterations           | 142         |
-|    time_elapsed         | 1287        |
-|    total_timesteps      | 2326528     |
-| train/                  |             |
-|    approx_kl            | 0.007232903 |
-|    clip_fraction        | 0.0845      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.13       |
-|    explained_variance   | 0.956       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0346     |
-|    n_updates            | 1410        |
-|    policy_gradient_loss | -0.003      |
-|    std                  | 1.16        |
-|    value_loss           | 0.00134     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1812        |
-|    iterations           | 143         |
-|    time_elapsed         | 1292        |
-|    total_timesteps      | 2342912     |
-| train/                  |             |
-|    approx_kl            | 0.007283367 |
-|    clip_fraction        | 0.0785      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.14       |
-|    explained_variance   | 0.913       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0306     |
-|    n_updates            | 1420        |
-|    policy_gradient_loss | -0.00368    |
-|    std                  | 1.17        |
-|    value_loss           | 0.00385     |
------------------------------------------
-Eval num_timesteps=2350000, episode_reward=33.49 +/- 34.79
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 33.5        |
-| time/                   |             |
-|    total_timesteps      | 2350000     |
-| train/                  |             |
-|    approx_kl            | 0.006632698 |
-|    clip_fraction        | 0.0647      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.934       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0469     |
-|    n_updates            | 1430        |
-|    policy_gradient_loss | -0.00327    |
-|    std                  | 1.17        |
-|    value_loss           | 0.00793     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1800    |
-|    iterations      | 144     |
-|    time_elapsed    | 1310    |
-|    total_timesteps | 2359296 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1805        |
-|    iterations           | 145         |
-|    time_elapsed         | 1315        |
-|    total_timesteps      | 2375680     |
-| train/                  |             |
-|    approx_kl            | 0.008364577 |
-|    clip_fraction        | 0.089       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.957       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0464     |
-|    n_updates            | 1440        |
-|    policy_gradient_loss | -0.00453    |
-|    std                  | 1.17        |
-|    value_loss           | 0.00507     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1810        |
-|    iterations           | 146         |
-|    time_elapsed         | 1321        |
-|    total_timesteps      | 2392064     |
-| train/                  |             |
-|    approx_kl            | 0.007854694 |
-|    clip_fraction        | 0.0927      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.953       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0436     |
-|    n_updates            | 1450        |
-|    policy_gradient_loss | -0.00519    |
-|    std                  | 1.17        |
-|    value_loss           | 0.00289     |
------------------------------------------
-Eval num_timesteps=2400000, episode_reward=34.64 +/- 37.27
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 34.6         |
-| time/                   |              |
-|    total_timesteps      | 2400000      |
-| train/                  |              |
-|    approx_kl            | 0.0076201856 |
-|    clip_fraction        | 0.0844       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.15        |
-|    explained_variance   | 0.945        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0431      |
-|    n_updates            | 1460         |
-|    policy_gradient_loss | -0.00554     |
-|    std                  | 1.17         |
-|    value_loss           | 0.00196      |
-------------------------------------------
-
-[Diag @ 2,400,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              15/20
-  COMPACT_CANT_DRIVE         5/20
-  action_mag mean=0.058 p10=0.006 p90=0.053 (0=stopped, 1=full speed)
-  min_flock_radius mean=6.68m best=0.96m  (target <5m to compact)
-  min_dog_to_com   mean=0.92m best=0.16m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.18m best=5.62m
-  reward/step (mean): progress=+0.0034  alignment=+0.0352  pen_bonus=+0.0010  step_cost=-0.0200  complete=+0.0000
-
-[Curriculum] leaving stage n_sheep=3 after 800,000 steps | training success rate (last 100 eps) = 0%
-[Curriculum] → 4 sheep at step 2,400,000
-
---------------------------------
-| time/              |         |
-|    fps             | 1788    |
-|    iterations      | 147     |
-|    time_elapsed    | 1346    |
-|    total_timesteps | 2408448 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1794        |
-|    iterations           | 148         |
-|    time_elapsed         | 1351        |
-|    total_timesteps      | 2424832     |
-| train/                  |             |
-|    approx_kl            | 0.006801254 |
-|    clip_fraction        | 0.0797      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.16       |
-|    explained_variance   | 0.922       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0313     |
-|    n_updates            | 1470        |
-|    policy_gradient_loss | -0.00418    |
-|    std                  | 1.18        |
-|    value_loss           | 0.00724     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1798        |
-|    iterations           | 149         |
-|    time_elapsed         | 1357        |
-|    total_timesteps      | 2441216     |
-| train/                  |             |
-|    approx_kl            | 0.007604986 |
-|    clip_fraction        | 0.0758      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.18       |
-|    explained_variance   | 0.937       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0354     |
-|    n_updates            | 1480        |
-|    policy_gradient_loss | -0.00189    |
-|    std                  | 1.19        |
-|    value_loss           | 0.00591     |
------------------------------------------
-Eval num_timesteps=2450000, episode_reward=27.82 +/- 47.76
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 27.8         |
-| time/                   |              |
-|    total_timesteps      | 2450000      |
-| train/                  |              |
-|    approx_kl            | 0.0070674624 |
-|    clip_fraction        | 0.0749       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.2         |
-|    explained_variance   | 0.893        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0327      |
-|    n_updates            | 1490         |
-|    policy_gradient_loss | -0.00322     |
-|    std                  | 1.2          |
-|    value_loss           | 0.0105       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1788    |
-|    iterations      | 150     |
-|    time_elapsed    | 1374    |
-|    total_timesteps | 2457600 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1792        |
-|    iterations           | 151         |
-|    time_elapsed         | 1380        |
-|    total_timesteps      | 2473984     |
-| train/                  |             |
-|    approx_kl            | 0.008372683 |
-|    clip_fraction        | 0.0874      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.21       |
-|    explained_variance   | 0.932       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0381     |
-|    n_updates            | 1500        |
-|    policy_gradient_loss | -0.00471    |
-|    std                  | 1.21        |
-|    value_loss           | 0.00563     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1796        |
-|    iterations           | 152         |
-|    time_elapsed         | 1385        |
-|    total_timesteps      | 2490368     |
-| train/                  |             |
-|    approx_kl            | 0.007761459 |
-|    clip_fraction        | 0.0794      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.23       |
-|    explained_variance   | 0.929       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0345     |
-|    n_updates            | 1510        |
-|    policy_gradient_loss | -0.00402    |
-|    std                  | 1.22        |
-|    value_loss           | 0.00736     |
------------------------------------------
-Eval num_timesteps=2500000, episode_reward=25.79 +/- 28.60
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 25.8         |
-| time/                   |              |
-|    total_timesteps      | 2500000      |
-| train/                  |              |
-|    approx_kl            | 0.0070840344 |
-|    clip_fraction        | 0.0711       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.22        |
-|    explained_variance   | 0.9          |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0322      |
-|    n_updates            | 1520         |
-|    policy_gradient_loss | -0.00397     |
-|    std                  | 1.21         |
-|    value_loss           | 0.00517      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1785    |
-|    iterations      | 153     |
-|    time_elapsed    | 1404    |
-|    total_timesteps | 2506752 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1788         |
-|    iterations           | 154          |
-|    time_elapsed         | 1410         |
-|    total_timesteps      | 2523136      |
-| train/                  |              |
-|    approx_kl            | 0.0062630484 |
-|    clip_fraction        | 0.069        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.22        |
-|    explained_variance   | 0.93         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0363      |
-|    n_updates            | 1530         |
-|    policy_gradient_loss | -0.00382     |
-|    std                  | 1.21         |
-|    value_loss           | 0.00546      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1792        |
-|    iterations           | 155         |
-|    time_elapsed         | 1416        |
-|    total_timesteps      | 2539520     |
-| train/                  |             |
-|    approx_kl            | 0.007609036 |
-|    clip_fraction        | 0.0815      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.23       |
-|    explained_variance   | 0.832       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0404     |
-|    n_updates            | 1540        |
-|    policy_gradient_loss | -0.00347    |
-|    std                  | 1.22        |
-|    value_loss           | 0.00902     |
------------------------------------------
-Eval num_timesteps=2550000, episode_reward=26.76 +/- 38.76
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 26.8         |
-| time/                   |              |
-|    total_timesteps      | 2550000      |
-| train/                  |              |
-|    approx_kl            | 0.0070117847 |
-|    clip_fraction        | 0.0808       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.23        |
-|    explained_variance   | 0.863        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0357      |
-|    n_updates            | 1550         |
-|    policy_gradient_loss | -0.00279     |
-|    std                  | 1.22         |
-|    value_loss           | 0.0114       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1780    |
-|    iterations      | 156     |
-|    time_elapsed    | 1435    |
-|    total_timesteps | 2555904 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1786         |
-|    iterations           | 157          |
-|    time_elapsed         | 1440         |
-|    total_timesteps      | 2572288      |
-| train/                  |              |
-|    approx_kl            | 0.0070258966 |
-|    clip_fraction        | 0.0817       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.23        |
-|    explained_variance   | 0.941        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.039       |
-|    n_updates            | 1560         |
-|    policy_gradient_loss | -0.00488     |
-|    std                  | 1.22         |
-|    value_loss           | 0.00696      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1791        |
-|    iterations           | 158         |
-|    time_elapsed         | 1445        |
-|    total_timesteps      | 2588672     |
-| train/                  |             |
-|    approx_kl            | 0.007600763 |
-|    clip_fraction        | 0.0842      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.23       |
-|    explained_variance   | 0.912       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0363     |
-|    n_updates            | 1570        |
-|    policy_gradient_loss | -0.00544    |
-|    std                  | 1.22        |
-|    value_loss           | 0.00556     |
------------------------------------------
-Eval num_timesteps=2600000, episode_reward=19.53 +/- 46.34
-Episode length: 2000.00 +/- 0.00
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 2e+03      |
-|    mean_reward          | 19.5       |
-| time/                   |            |
-|    total_timesteps      | 2600000    |
-| train/                  |            |
-|    approx_kl            | 0.00714178 |
-|    clip_fraction        | 0.0783     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -3.23      |
-|    explained_variance   | 0.92       |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0352    |
-|    n_updates            | 1580       |
-|    policy_gradient_loss | -0.00468   |
-|    std                  | 1.22       |
-|    value_loss           | 0.00364    |
-----------------------------------------
-
-[Diag @ 2,600,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              20/20
-  action_mag mean=0.061 p10=0.006 p90=0.047 (0=stopped, 1=full speed)
-  min_flock_radius mean=7.84m best=5.75m  (target <5m to compact)
-  min_dog_to_com   mean=0.66m best=0.09m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.60m best=6.52m
-  reward/step (mean): progress=-0.0028  alignment=+0.0337  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1768    |
-|    iterations      | 159     |
-|    time_elapsed    | 1473    |
-|    total_timesteps | 2605056 |
---------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1771       |
-|    iterations           | 160        |
-|    time_elapsed         | 1479       |
-|    total_timesteps      | 2621440    |
-| train/                  |            |
-|    approx_kl            | 0.00681924 |
-|    clip_fraction        | 0.0779     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -3.23      |
-|    explained_variance   | 0.946      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0409    |
-|    n_updates            | 1590       |
-|    policy_gradient_loss | -0.00346   |
-|    std                  | 1.22       |
-|    value_loss           | 0.00377    |
-----------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1775        |
-|    iterations           | 161         |
-|    time_elapsed         | 1485        |
-|    total_timesteps      | 2637824     |
-| train/                  |             |
-|    approx_kl            | 0.008016385 |
-|    clip_fraction        | 0.0888      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.24       |
-|    explained_variance   | 0.931       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0311     |
-|    n_updates            | 1600        |
-|    policy_gradient_loss | -0.00526    |
-|    std                  | 1.22        |
-|    value_loss           | 0.00681     |
------------------------------------------
-Eval num_timesteps=2650000, episode_reward=28.98 +/- 40.07
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 29          |
-| time/                   |             |
-|    total_timesteps      | 2650000     |
-| train/                  |             |
-|    approx_kl            | 0.006836592 |
-|    clip_fraction        | 0.0778      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.24       |
-|    explained_variance   | 0.9         |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0304     |
-|    n_updates            | 1610        |
-|    policy_gradient_loss | -0.00255    |
-|    std                  | 1.23        |
-|    value_loss           | 0.00574     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1765    |
-|    iterations      | 162     |
-|    time_elapsed    | 1503    |
-|    total_timesteps | 2654208 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1770         |
-|    iterations           | 163          |
-|    time_elapsed         | 1508         |
-|    total_timesteps      | 2670592      |
-| train/                  |              |
-|    approx_kl            | 0.0072684484 |
-|    clip_fraction        | 0.0764       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.23        |
-|    explained_variance   | 0.948        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0295      |
-|    n_updates            | 1620         |
-|    policy_gradient_loss | -0.00325     |
-|    std                  | 1.22         |
-|    value_loss           | 0.00254      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1775        |
-|    iterations           | 164         |
-|    time_elapsed         | 1513        |
-|    total_timesteps      | 2686976     |
-| train/                  |             |
-|    approx_kl            | 0.007457966 |
-|    clip_fraction        | 0.0845      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.23       |
-|    explained_variance   | 0.919       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0473     |
-|    n_updates            | 1630        |
-|    policy_gradient_loss | -0.00505    |
-|    std                  | 1.22        |
-|    value_loss           | 0.004       |
------------------------------------------
-Eval num_timesteps=2700000, episode_reward=33.96 +/- 32.11
-Episode length: 2000.00 +/- 0.00
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 2e+03      |
-|    mean_reward          | 34         |
-| time/                   |            |
-|    total_timesteps      | 2700000    |
-| train/                  |            |
-|    approx_kl            | 0.00796853 |
-|    clip_fraction        | 0.0782     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -3.22      |
-|    explained_variance   | 0.959      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0336    |
-|    n_updates            | 1640       |
-|    policy_gradient_loss | -0.00288   |
-|    std                  | 1.21       |
-|    value_loss           | 0.00235    |
-----------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1761    |
-|    iterations      | 165     |
-|    time_elapsed    | 1534    |
-|    total_timesteps | 2703360 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1764         |
-|    iterations           | 166          |
-|    time_elapsed         | 1541         |
-|    total_timesteps      | 2719744      |
-| train/                  |              |
-|    approx_kl            | 0.0073700505 |
-|    clip_fraction        | 0.0857       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.21        |
-|    explained_variance   | 0.875        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0255      |
-|    n_updates            | 1650         |
-|    policy_gradient_loss | -0.00495     |
-|    std                  | 1.21         |
-|    value_loss           | 0.00846      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1768        |
-|    iterations           | 167         |
-|    time_elapsed         | 1546        |
-|    total_timesteps      | 2736128     |
-| train/                  |             |
-|    approx_kl            | 0.007965144 |
-|    clip_fraction        | 0.0858      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.22       |
-|    explained_variance   | 0.898       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0451     |
-|    n_updates            | 1660        |
-|    policy_gradient_loss | -0.00518    |
-|    std                  | 1.22        |
-|    value_loss           | 0.00395     |
------------------------------------------
-Eval num_timesteps=2750000, episode_reward=23.58 +/- 34.37
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 23.6         |
-| time/                   |              |
-|    total_timesteps      | 2750000      |
-| train/                  |              |
-|    approx_kl            | 0.0065765316 |
-|    clip_fraction        | 0.0682       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.23        |
-|    explained_variance   | 0.934        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0429      |
-|    n_updates            | 1670         |
-|    policy_gradient_loss | -0.00379     |
-|    std                  | 1.23         |
-|    value_loss           | 0.00677      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1756    |
-|    iterations      | 168     |
-|    time_elapsed    | 1566    |
-|    total_timesteps | 2752512 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1761         |
-|    iterations           | 169          |
-|    time_elapsed         | 1571         |
-|    total_timesteps      | 2768896      |
-| train/                  |              |
-|    approx_kl            | 0.0066236854 |
-|    clip_fraction        | 0.0619       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.25        |
-|    explained_variance   | 0.935        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0365      |
-|    n_updates            | 1680         |
-|    policy_gradient_loss | -0.00239     |
-|    std                  | 1.23         |
-|    value_loss           | 0.00922      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1766        |
-|    iterations           | 170         |
-|    time_elapsed         | 1576        |
-|    total_timesteps      | 2785280     |
-| train/                  |             |
-|    approx_kl            | 0.007887056 |
-|    clip_fraction        | 0.0836      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.25       |
-|    explained_variance   | 0.899       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0353     |
-|    n_updates            | 1690        |
-|    policy_gradient_loss | -0.0053     |
-|    std                  | 1.24        |
-|    value_loss           | 0.00635     |
------------------------------------------
-Eval num_timesteps=2800000, episode_reward=33.57 +/- 35.56
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 33.6         |
-| time/                   |              |
-|    total_timesteps      | 2800000      |
-| train/                  |              |
-|    approx_kl            | 0.0067548407 |
-|    clip_fraction        | 0.0804       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.25        |
-|    explained_variance   | 0.887        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0408      |
-|    n_updates            | 1700         |
-|    policy_gradient_loss | -0.00444     |
-|    std                  | 1.24         |
-|    value_loss           | 0.0101       |
-------------------------------------------
-
-[Diag @ 2,800,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              19/20
-  COMPACT_CANT_DRIVE         1/20
-  action_mag mean=0.050 p10=0.003 p90=0.039 (0=stopped, 1=full speed)
-  min_flock_radius mean=8.42m best=4.84m  (target <5m to compact)
-  min_dog_to_com   mean=0.73m best=0.12m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=14.29m best=7.66m
-  reward/step (mean): progress=-0.0027  alignment=+0.0365  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1746    |
-|    iterations      | 171     |
-|    time_elapsed    | 1604    |
-|    total_timesteps | 2801664 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1750         |
-|    iterations           | 172          |
-|    time_elapsed         | 1609         |
-|    total_timesteps      | 2818048      |
-| train/                  |              |
-|    approx_kl            | 0.0069283517 |
-|    clip_fraction        | 0.0847       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.24        |
-|    explained_variance   | 0.899        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0476      |
-|    n_updates            | 1710         |
-|    policy_gradient_loss | -0.00499     |
-|    std                  | 1.23         |
-|    value_loss           | 0.00708      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1754        |
-|    iterations           | 173         |
-|    time_elapsed         | 1615        |
-|    total_timesteps      | 2834432     |
-| train/                  |             |
-|    approx_kl            | 0.008303071 |
-|    clip_fraction        | 0.082       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.25       |
-|    explained_variance   | 0.911       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0484     |
-|    n_updates            | 1720        |
-|    policy_gradient_loss | -0.00388    |
-|    std                  | 1.23        |
-|    value_loss           | 0.0061      |
------------------------------------------
-Eval num_timesteps=2850000, episode_reward=34.42 +/- 32.01
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 34.4         |
-| time/                   |              |
-|    total_timesteps      | 2850000      |
-| train/                  |              |
-|    approx_kl            | 0.0063731004 |
-|    clip_fraction        | 0.069        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.26        |
-|    explained_variance   | 0.951        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.029       |
-|    n_updates            | 1730         |
-|    policy_gradient_loss | -0.00384     |
-|    std                  | 1.25         |
-|    value_loss           | 0.00528      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1745    |
-|    iterations      | 174     |
-|    time_elapsed    | 1633    |
-|    total_timesteps | 2850816 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1749        |
-|    iterations           | 175         |
-|    time_elapsed         | 1638        |
-|    total_timesteps      | 2867200     |
-| train/                  |             |
-|    approx_kl            | 0.008163793 |
-|    clip_fraction        | 0.0812      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.28       |
-|    explained_variance   | 0.935       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0374     |
-|    n_updates            | 1740        |
-|    policy_gradient_loss | -0.0032     |
-|    std                  | 1.26        |
-|    value_loss           | 0.00432     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1754         |
-|    iterations           | 176          |
-|    time_elapsed         | 1643         |
-|    total_timesteps      | 2883584      |
-| train/                  |              |
-|    approx_kl            | 0.0063439216 |
-|    clip_fraction        | 0.0743       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.29        |
-|    explained_variance   | 0.89         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0372      |
-|    n_updates            | 1750         |
-|    policy_gradient_loss | -0.00403     |
-|    std                  | 1.26         |
-|    value_loss           | 0.00654      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1759        |
-|    iterations           | 177         |
-|    time_elapsed         | 1648        |
-|    total_timesteps      | 2899968     |
-| train/                  |             |
-|    approx_kl            | 0.006967159 |
-|    clip_fraction        | 0.0761      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.29       |
-|    explained_variance   | 0.929       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0462     |
-|    n_updates            | 1760        |
-|    policy_gradient_loss | -0.00382    |
-|    std                  | 1.26        |
-|    value_loss           | 0.00381     |
------------------------------------------
-Eval num_timesteps=2900000, episode_reward=40.78 +/- 43.99
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 40.8         |
-| time/                   |              |
-|    total_timesteps      | 2900000      |
-| train/                  |              |
-|    approx_kl            | 0.0075211767 |
-|    clip_fraction        | 0.0727       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.29        |
-|    explained_variance   | 0.955        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0178      |
-|    n_updates            | 1770         |
-|    policy_gradient_loss | -0.00285     |
-|    std                  | 1.27         |
-|    value_loss           | 0.00798      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1751    |
-|    iterations      | 178     |
-|    time_elapsed    | 1664    |
-|    total_timesteps | 2916352 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1756        |
-|    iterations           | 179         |
-|    time_elapsed         | 1669        |
-|    total_timesteps      | 2932736     |
-| train/                  |             |
-|    approx_kl            | 0.006763531 |
-|    clip_fraction        | 0.0678      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.3        |
-|    explained_variance   | 0.91        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0349     |
-|    n_updates            | 1780        |
-|    policy_gradient_loss | -0.00361    |
-|    std                  | 1.27        |
-|    value_loss           | 0.00528     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1760         |
-|    iterations           | 180          |
-|    time_elapsed         | 1675         |
-|    total_timesteps      | 2949120      |
-| train/                  |              |
-|    approx_kl            | 0.0067441636 |
-|    clip_fraction        | 0.0732       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.3         |
-|    explained_variance   | 0.888        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0261      |
-|    n_updates            | 1790         |
-|    policy_gradient_loss | -0.00291     |
-|    std                  | 1.27         |
-|    value_loss           | 0.00582      |
-------------------------------------------
-Eval num_timesteps=2950000, episode_reward=48.39 +/- 31.91
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 48.4         |
-| time/                   |              |
-|    total_timesteps      | 2950000      |
-| train/                  |              |
-|    approx_kl            | 0.0076025603 |
-|    clip_fraction        | 0.0858       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.31        |
-|    explained_variance   | 0.92         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0394      |
-|    n_updates            | 1800         |
-|    policy_gradient_loss | -0.00443     |
-|    std                  | 1.27         |
-|    value_loss           | 0.00647      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1751    |
-|    iterations      | 181     |
-|    time_elapsed    | 1693    |
-|    total_timesteps | 2965504 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1754        |
-|    iterations           | 182         |
-|    time_elapsed         | 1699        |
-|    total_timesteps      | 2981888     |
-| train/                  |             |
-|    approx_kl            | 0.008041672 |
-|    clip_fraction        | 0.0795      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.3        |
-|    explained_variance   | 0.939       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0344     |
-|    n_updates            | 1810        |
-|    policy_gradient_loss | -0.00456    |
-|    std                  | 1.27        |
-|    value_loss           | 0.00404     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1758         |
-|    iterations           | 183          |
-|    time_elapsed         | 1704         |
-|    total_timesteps      | 2998272      |
-| train/                  |              |
-|    approx_kl            | 0.0066829836 |
-|    clip_fraction        | 0.0712       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.32        |
-|    explained_variance   | 0.921        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0361      |
-|    n_updates            | 1820         |
-|    policy_gradient_loss | -0.00379     |
-|    std                  | 1.28         |
-|    value_loss           | 0.00818      |
-------------------------------------------
-Eval num_timesteps=3000000, episode_reward=33.06 +/- 47.57
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 33.1        |
-| time/                   |             |
-|    total_timesteps      | 3000000     |
-| train/                  |             |
-|    approx_kl            | 0.006152373 |
-|    clip_fraction        | 0.0633      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.33       |
-|    explained_variance   | 0.912       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0316     |
-|    n_updates            | 1830        |
-|    policy_gradient_loss | -0.00335    |
-|    std                  | 1.29        |
-|    value_loss           | 0.00404     |
------------------------------------------
-
-[Diag @ 3,000,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              20/20
-  action_mag mean=0.049 p10=0.005 p90=0.046 (0=stopped, 1=full speed)
-  min_flock_radius mean=8.21m best=5.29m  (target <5m to compact)
-  min_dog_to_com   mean=0.76m best=0.22m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.62m best=4.77m
-  reward/step (mean): progress=+0.0089  alignment=+0.0386  pen_bonus=+0.0008  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1740    |
-|    iterations      | 184     |
-|    time_elapsed    | 1731    |
-|    total_timesteps | 3014656 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1745        |
-|    iterations           | 185         |
-|    time_elapsed         | 1736        |
-|    total_timesteps      | 3031040     |
-| train/                  |             |
-|    approx_kl            | 0.006385569 |
-|    clip_fraction        | 0.0703      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.34       |
-|    explained_variance   | 0.919       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0313     |
-|    n_updates            | 1840        |
-|    policy_gradient_loss | -0.00274    |
-|    std                  | 1.3         |
-|    value_loss           | 0.00503     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1748        |
-|    iterations           | 186         |
-|    time_elapsed         | 1743        |
-|    total_timesteps      | 3047424     |
-| train/                  |             |
-|    approx_kl            | 0.007695101 |
-|    clip_fraction        | 0.0784      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.36       |
-|    explained_variance   | 0.935       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0244     |
-|    n_updates            | 1850        |
-|    policy_gradient_loss | -0.00342    |
-|    std                  | 1.31        |
-|    value_loss           | 0.0051      |
------------------------------------------
-Eval num_timesteps=3050000, episode_reward=45.25 +/- 31.57
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 45.2         |
-| time/                   |              |
-|    total_timesteps      | 3050000      |
-| train/                  |              |
-|    approx_kl            | 0.0067556566 |
-|    clip_fraction        | 0.082        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.37        |
-|    explained_variance   | 0.868        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0349      |
-|    n_updates            | 1860         |
-|    policy_gradient_loss | -0.00353     |
-|    std                  | 1.31         |
-|    value_loss           | 0.00931      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1738    |
-|    iterations      | 187     |
-|    time_elapsed    | 1762    |
-|    total_timesteps | 3063808 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1741        |
-|    iterations           | 188         |
-|    time_elapsed         | 1768        |
-|    total_timesteps      | 3080192     |
-| train/                  |             |
-|    approx_kl            | 0.008263266 |
-|    clip_fraction        | 0.0792      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.36       |
-|    explained_variance   | 0.924       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0411     |
-|    n_updates            | 1870        |
-|    policy_gradient_loss | -0.00382    |
-|    std                  | 1.31        |
-|    value_loss           | 0.00429     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1746        |
-|    iterations           | 189         |
-|    time_elapsed         | 1773        |
-|    total_timesteps      | 3096576     |
-| train/                  |             |
-|    approx_kl            | 0.008488305 |
-|    clip_fraction        | 0.08        |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.37       |
-|    explained_variance   | 0.925       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0292     |
-|    n_updates            | 1880        |
-|    policy_gradient_loss | -0.00441    |
-|    std                  | 1.31        |
-|    value_loss           | 0.00748     |
------------------------------------------
-Eval num_timesteps=3100000, episode_reward=30.63 +/- 33.70
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 30.6         |
-| time/                   |              |
-|    total_timesteps      | 3100000      |
-| train/                  |              |
-|    approx_kl            | 0.0065515246 |
-|    clip_fraction        | 0.0736       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.35        |
-|    explained_variance   | 0.932        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.00192      |
-|    n_updates            | 1890         |
-|    policy_gradient_loss | -0.00334     |
-|    std                  | 1.3          |
-|    value_loss           | 0.00902      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1737    |
-|    iterations      | 190     |
-|    time_elapsed    | 1791    |
-|    total_timesteps | 3112960 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1741         |
-|    iterations           | 191          |
-|    time_elapsed         | 1796         |
-|    total_timesteps      | 3129344      |
-| train/                  |              |
-|    approx_kl            | 0.0068135276 |
-|    clip_fraction        | 0.0721       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.35        |
-|    explained_variance   | 0.933        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.036       |
-|    n_updates            | 1900         |
-|    policy_gradient_loss | -0.00403     |
-|    std                  | 1.29         |
-|    value_loss           | 0.00616      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1731         |
-|    iterations           | 192          |
-|    time_elapsed         | 1817         |
-|    total_timesteps      | 3145728      |
-| train/                  |              |
-|    approx_kl            | 0.0061126407 |
-|    clip_fraction        | 0.0615       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.35        |
-|    explained_variance   | 0.921        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0355      |
-|    n_updates            | 1910         |
-|    policy_gradient_loss | -0.00318     |
-|    std                  | 1.3          |
-|    value_loss           | 0.0104       |
-------------------------------------------
-Eval num_timesteps=3150000, episode_reward=33.88 +/- 34.31
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 33.9        |
-| time/                   |             |
-|    total_timesteps      | 3150000     |
-| train/                  |             |
-|    approx_kl            | 0.007734685 |
-|    clip_fraction        | 0.0778      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.35       |
-|    explained_variance   | 0.899       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0323     |
-|    n_updates            | 1920        |
-|    policy_gradient_loss | -0.00432    |
-|    std                  | 1.3         |
-|    value_loss           | 0.0091      |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1714    |
-|    iterations      | 193     |
-|    time_elapsed    | 1844    |
-|    total_timesteps | 3162112 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1717        |
-|    iterations           | 194         |
-|    time_elapsed         | 1850        |
-|    total_timesteps      | 3178496     |
-| train/                  |             |
-|    approx_kl            | 0.007997783 |
-|    clip_fraction        | 0.0782      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.35       |
-|    explained_variance   | 0.91        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0525     |
-|    n_updates            | 1930        |
-|    policy_gradient_loss | -0.00523    |
-|    std                  | 1.3         |
-|    value_loss           | 0.00283     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1720        |
-|    iterations           | 195         |
-|    time_elapsed         | 1857        |
-|    total_timesteps      | 3194880     |
-| train/                  |             |
-|    approx_kl            | 0.007701534 |
-|    clip_fraction        | 0.0712      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.34       |
-|    explained_variance   | 0.927       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0367     |
-|    n_updates            | 1940        |
-|    policy_gradient_loss | -0.00288    |
-|    std                  | 1.3         |
-|    value_loss           | 0.0126      |
------------------------------------------
-Eval num_timesteps=3200000, episode_reward=46.55 +/- 34.01
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 46.6        |
-| time/                   |             |
-|    total_timesteps      | 3200000     |
-| train/                  |             |
-|    approx_kl            | 0.006747664 |
-|    clip_fraction        | 0.0766      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.35       |
-|    explained_variance   | 0.93        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0411     |
-|    n_updates            | 1950        |
-|    policy_gradient_loss | -0.00404    |
-|    std                  | 1.3         |
-|    value_loss           | 0.00409     |
------------------------------------------
-
-[Diag @ 3,200,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              20/20
-  action_mag mean=0.078 p10=0.005 p90=0.057 (0=stopped, 1=full speed)
-  min_flock_radius mean=8.76m best=6.32m  (target <5m to compact)
-  min_dog_to_com   mean=0.81m best=0.36m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.75m best=6.91m
-  reward/step (mean): progress=-0.0020  alignment=+0.0384  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0000
-
-[Curriculum] leaving stage n_sheep=4 after 800,000 steps | training success rate (last 100 eps) = 0%
-[Curriculum] → 5 sheep at step 3,200,000
-
---------------------------------
-| time/              |         |
-|    fps             | 1704    |
-|    iterations      | 196     |
-|    time_elapsed    | 1884    |
-|    total_timesteps | 3211264 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1707         |
-|    iterations           | 197          |
-|    time_elapsed         | 1889         |
-|    total_timesteps      | 3227648      |
-| train/                  |              |
-|    approx_kl            | 0.0068222135 |
-|    clip_fraction        | 0.0816       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.36        |
-|    explained_variance   | 0.922        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0386      |
-|    n_updates            | 1960         |
-|    policy_gradient_loss | -0.00374     |
-|    std                  | 1.31         |
-|    value_loss           | 0.0112       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1711        |
-|    iterations           | 198         |
-|    time_elapsed         | 1895        |
-|    total_timesteps      | 3244032     |
-| train/                  |             |
-|    approx_kl            | 0.006939999 |
-|    clip_fraction        | 0.0829      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.36       |
-|    explained_variance   | 0.955       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0439     |
-|    n_updates            | 1970        |
-|    policy_gradient_loss | -0.00433    |
-|    std                  | 1.31        |
-|    value_loss           | 0.00895     |
------------------------------------------
-Eval num_timesteps=3250000, episode_reward=21.19 +/- 37.18
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 21.2        |
-| time/                   |             |
-|    total_timesteps      | 3250000     |
-| train/                  |             |
-|    approx_kl            | 0.007944042 |
-|    clip_fraction        | 0.0812      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.37       |
-|    explained_variance   | 0.925       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0379     |
-|    n_updates            | 1980        |
-|    policy_gradient_loss | -0.00306    |
-|    std                  | 1.31        |
-|    value_loss           | 0.00578     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1702    |
-|    iterations      | 199     |
-|    time_elapsed    | 1914    |
-|    total_timesteps | 3260416 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1706        |
-|    iterations           | 200         |
-|    time_elapsed         | 1920        |
-|    total_timesteps      | 3276800     |
-| train/                  |             |
-|    approx_kl            | 0.007009124 |
-|    clip_fraction        | 0.0786      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.36       |
-|    explained_variance   | 0.945       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0398     |
-|    n_updates            | 1990        |
-|    policy_gradient_loss | -0.00469    |
-|    std                  | 1.31        |
-|    value_loss           | 0.00344     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1709        |
-|    iterations           | 201         |
-|    time_elapsed         | 1926        |
-|    total_timesteps      | 3293184     |
-| train/                  |             |
-|    approx_kl            | 0.007446406 |
-|    clip_fraction        | 0.0736      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.36       |
-|    explained_variance   | 0.957       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0493     |
-|    n_updates            | 2000        |
-|    policy_gradient_loss | -0.00431    |
-|    std                  | 1.31        |
-|    value_loss           | 0.00262     |
------------------------------------------
-Eval num_timesteps=3300000, episode_reward=18.42 +/- 36.17
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 18.4        |
-| time/                   |             |
-|    total_timesteps      | 3300000     |
-| train/                  |             |
-|    approx_kl            | 0.007855328 |
-|    clip_fraction        | 0.0783      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.38       |
-|    explained_variance   | 0.951       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0381     |
-|    n_updates            | 2010        |
-|    policy_gradient_loss | -0.00422    |
-|    std                  | 1.32        |
-|    value_loss           | 0.00379     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1701    |
-|    iterations      | 202     |
-|    time_elapsed    | 1945    |
-|    total_timesteps | 3309568 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1704         |
-|    iterations           | 203          |
-|    time_elapsed         | 1951         |
-|    total_timesteps      | 3325952      |
-| train/                  |              |
-|    approx_kl            | 0.0073990654 |
-|    clip_fraction        | 0.0773       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.38        |
-|    explained_variance   | 0.89         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0319      |
-|    n_updates            | 2020         |
-|    policy_gradient_loss | -0.00507     |
-|    std                  | 1.32         |
-|    value_loss           | 0.0165       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1707         |
-|    iterations           | 204          |
-|    time_elapsed         | 1956         |
-|    total_timesteps      | 3342336      |
-| train/                  |              |
-|    approx_kl            | 0.0076738494 |
-|    clip_fraction        | 0.0913       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.38        |
-|    explained_variance   | 0.914        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0326      |
-|    n_updates            | 2030         |
-|    policy_gradient_loss | -0.00611     |
-|    std                  | 1.32         |
-|    value_loss           | 0.00854      |
-------------------------------------------
-Eval num_timesteps=3350000, episode_reward=39.75 +/- 38.09
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 39.8        |
-| time/                   |             |
-|    total_timesteps      | 3350000     |
-| train/                  |             |
-|    approx_kl            | 0.007704767 |
-|    clip_fraction        | 0.0813      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.39       |
-|    explained_variance   | 0.822       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0351     |
-|    n_updates            | 2040        |
-|    policy_gradient_loss | -0.0056     |
-|    std                  | 1.33        |
-|    value_loss           | 0.0095      |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1700    |
-|    iterations      | 205     |
-|    time_elapsed    | 1974    |
-|    total_timesteps | 3358720 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1703        |
-|    iterations           | 206         |
-|    time_elapsed         | 1980        |
-|    total_timesteps      | 3375104     |
-| train/                  |             |
-|    approx_kl            | 0.006841295 |
-|    clip_fraction        | 0.0682      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.39       |
-|    explained_variance   | 0.973       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.04       |
-|    n_updates            | 2050        |
-|    policy_gradient_loss | -0.00457    |
-|    std                  | 1.33        |
-|    value_loss           | 0.00456     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1707         |
-|    iterations           | 207          |
-|    time_elapsed         | 1986         |
-|    total_timesteps      | 3391488      |
-| train/                  |              |
-|    approx_kl            | 0.0063885115 |
-|    clip_fraction        | 0.0749       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.4         |
-|    explained_variance   | 0.962        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.041       |
-|    n_updates            | 2060         |
-|    policy_gradient_loss | -0.00455     |
-|    std                  | 1.34         |
-|    value_loss           | 0.00373      |
-------------------------------------------
-Eval num_timesteps=3400000, episode_reward=26.62 +/- 43.12
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 26.6        |
-| time/                   |             |
-|    total_timesteps      | 3400000     |
-| train/                  |             |
-|    approx_kl            | 0.006273965 |
-|    clip_fraction        | 0.0709      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.4        |
-|    explained_variance   | 0.956       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0465     |
-|    n_updates            | 2070        |
-|    policy_gradient_loss | -0.00249    |
-|    std                  | 1.33        |
-|    value_loss           | 0.00679     |
------------------------------------------
-
-[Diag @ 3,400,000 | n_sheep=5 | success=0%]
-  NEVER_COMPACT              20/20
-  action_mag mean=0.089 p10=0.005 p90=0.074 (0=stopped, 1=full speed)
-  min_flock_radius mean=9.14m best=5.59m  (target <5m to compact)
-  min_dog_to_com   mean=0.69m best=0.10m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.77m best=5.15m
-  reward/step (mean): progress=-0.0015  alignment=+0.0368  pen_bonus=+0.0020  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1691    |
-|    iterations      | 208     |
-|    time_elapsed    | 2014    |
-|    total_timesteps | 3407872 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1695        |
-|    iterations           | 209         |
-|    time_elapsed         | 2019        |
-|    total_timesteps      | 3424256     |
-| train/                  |             |
-|    approx_kl            | 0.006433293 |
-|    clip_fraction        | 0.0727      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.4        |
-|    explained_variance   | 0.932       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0268     |
-|    n_updates            | 2080        |
-|    policy_gradient_loss | -0.00365    |
-|    std                  | 1.33        |
-|    value_loss           | 0.00657     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1698        |
-|    iterations           | 210         |
-|    time_elapsed         | 2025        |
-|    total_timesteps      | 3440640     |
-| train/                  |             |
-|    approx_kl            | 0.007235542 |
-|    clip_fraction        | 0.0839      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.39       |
-|    explained_variance   | 0.935       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0344     |
-|    n_updates            | 2090        |
-|    policy_gradient_loss | -0.00417    |
-|    std                  | 1.32        |
-|    value_loss           | 0.0137      |
------------------------------------------
-Eval num_timesteps=3450000, episode_reward=35.54 +/- 43.01
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 35.5        |
-| time/                   |             |
-|    total_timesteps      | 3450000     |
-| train/                  |             |
-|    approx_kl            | 0.007782845 |
-|    clip_fraction        | 0.0859      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.4        |
-|    explained_variance   | 0.924       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.044      |
-|    n_updates            | 2100        |
-|    policy_gradient_loss | -0.00561    |
-|    std                  | 1.34        |
-|    value_loss           | 0.0043      |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1690    |
-|    iterations      | 211     |
-|    time_elapsed    | 2044    |
-|    total_timesteps | 3457024 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1693         |
-|    iterations           | 212          |
-|    time_elapsed         | 2050         |
-|    total_timesteps      | 3473408      |
-| train/                  |              |
-|    approx_kl            | 0.0075765867 |
-|    clip_fraction        | 0.0746       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.41        |
-|    explained_variance   | 0.896        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0293      |
-|    n_updates            | 2110         |
-|    policy_gradient_loss | -0.00406     |
-|    std                  | 1.33         |
-|    value_loss           | 0.011        |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1696         |
-|    iterations           | 213          |
-|    time_elapsed         | 2056         |
-|    total_timesteps      | 3489792      |
-| train/                  |              |
-|    approx_kl            | 0.0072322125 |
-|    clip_fraction        | 0.071        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.41        |
-|    explained_variance   | 0.949        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0498      |
-|    n_updates            | 2120         |
-|    policy_gradient_loss | -0.00421     |
-|    std                  | 1.34         |
-|    value_loss           | 0.006        |
-------------------------------------------
-Eval num_timesteps=3500000, episode_reward=54.69 +/- 47.39
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 54.7         |
-| time/                   |              |
-|    total_timesteps      | 3500000      |
-| train/                  |              |
-|    approx_kl            | 0.0073479656 |
-|    clip_fraction        | 0.0778       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.4         |
-|    explained_variance   | 0.824        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0408      |
-|    n_updates            | 2130         |
-|    policy_gradient_loss | -0.00465     |
-|    std                  | 1.32         |
-|    value_loss           | 0.00657      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1688    |
-|    iterations      | 214     |
-|    time_elapsed    | 2076    |
-|    total_timesteps | 3506176 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1692        |
-|    iterations           | 215         |
-|    time_elapsed         | 2081        |
-|    total_timesteps      | 3522560     |
-| train/                  |             |
-|    approx_kl            | 0.007274649 |
-|    clip_fraction        | 0.0798      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.39       |
-|    explained_variance   | 0.951       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0356     |
-|    n_updates            | 2140        |
-|    policy_gradient_loss | -0.00383    |
-|    std                  | 1.33        |
-|    value_loss           | 0.00355     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1695         |
-|    iterations           | 216          |
-|    time_elapsed         | 2087         |
-|    total_timesteps      | 3538944      |
-| train/                  |              |
-|    approx_kl            | 0.0068056686 |
-|    clip_fraction        | 0.0726       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.38        |
-|    explained_variance   | 0.955        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0428      |
-|    n_updates            | 2150         |
-|    policy_gradient_loss | -0.00356     |
-|    std                  | 1.32         |
-|    value_loss           | 0.00378      |
-------------------------------------------
-Eval num_timesteps=3550000, episode_reward=8.69 +/- 39.03
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 8.69        |
-| time/                   |             |
-|    total_timesteps      | 3550000     |
-| train/                  |             |
-|    approx_kl            | 0.008211401 |
-|    clip_fraction        | 0.0801      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.38       |
-|    explained_variance   | 0.972       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0366     |
-|    n_updates            | 2160        |
-|    policy_gradient_loss | -0.00453    |
-|    std                  | 1.32        |
-|    value_loss           | 0.00445     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1687    |
-|    iterations      | 217     |
-|    time_elapsed    | 2106    |
-|    total_timesteps | 3555328 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1690        |
-|    iterations           | 218         |
-|    time_elapsed         | 2112        |
-|    total_timesteps      | 3571712     |
-| train/                  |             |
-|    approx_kl            | 0.008278061 |
-|    clip_fraction        | 0.0871      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.38       |
-|    explained_variance   | 0.931       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0324     |
-|    n_updates            | 2170        |
-|    policy_gradient_loss | -0.00486    |
-|    std                  | 1.32        |
-|    value_loss           | 0.00377     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1693        |
-|    iterations           | 219         |
-|    time_elapsed         | 2119        |
-|    total_timesteps      | 3588096     |
-| train/                  |             |
-|    approx_kl            | 0.007908824 |
-|    clip_fraction        | 0.0777      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.39       |
-|    explained_variance   | 0.951       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0353     |
-|    n_updates            | 2180        |
-|    policy_gradient_loss | -0.00318    |
-|    std                  | 1.32        |
-|    value_loss           | 0.00768     |
------------------------------------------
-Eval num_timesteps=3600000, episode_reward=26.00 +/- 35.20
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 26           |
-| time/                   |              |
-|    total_timesteps      | 3600000      |
-| train/                  |              |
-|    approx_kl            | 0.0068260087 |
-|    clip_fraction        | 0.0761       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.38        |
-|    explained_variance   | 0.946        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0257      |
-|    n_updates            | 2190         |
-|    policy_gradient_loss | -0.00375     |
-|    std                  | 1.32         |
-|    value_loss           | 0.00745      |
-------------------------------------------
-
-[Diag @ 3,600,000 | n_sheep=5 | success=0%]
-  NEVER_COMPACT              20/20
-  action_mag mean=0.114 p10=0.006 p90=0.281 (0=stopped, 1=full speed)
-  min_flock_radius mean=9.62m best=5.04m  (target <5m to compact)
-  min_dog_to_com   mean=0.77m best=0.40m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.31m best=6.37m
-  reward/step (mean): progress=+0.0071  alignment=+0.0385  pen_bonus=+0.0008  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1677    |
-|    iterations      | 220     |
-|    time_elapsed    | 2148    |
-|    total_timesteps | 3604480 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1680         |
-|    iterations           | 221          |
-|    time_elapsed         | 2154         |
-|    total_timesteps      | 3620864      |
-| train/                  |              |
-|    approx_kl            | 0.0084966235 |
-|    clip_fraction        | 0.0849       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.38        |
-|    explained_variance   | 0.936        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0498      |
-|    n_updates            | 2200         |
-|    policy_gradient_loss | -0.00478     |
-|    std                  | 1.32         |
-|    value_loss           | 0.00856      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1683        |
-|    iterations           | 222         |
-|    time_elapsed         | 2160        |
-|    total_timesteps      | 3637248     |
-| train/                  |             |
-|    approx_kl            | 0.007236682 |
-|    clip_fraction        | 0.072       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.37       |
-|    explained_variance   | 0.956       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0436     |
-|    n_updates            | 2210        |
-|    policy_gradient_loss | -0.0054     |
-|    std                  | 1.31        |
-|    value_loss           | 0.00748     |
------------------------------------------
-Eval num_timesteps=3650000, episode_reward=48.26 +/- 45.24
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 48.3         |
-| time/                   |              |
-|    total_timesteps      | 3650000      |
-| train/                  |              |
-|    approx_kl            | 0.0076099336 |
-|    clip_fraction        | 0.0694       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.37        |
-|    explained_variance   | 0.942        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.037       |
-|    n_updates            | 2220         |
-|    policy_gradient_loss | -0.00369     |
-|    std                  | 1.31         |
-|    value_loss           | 0.00888      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1676    |
-|    iterations      | 223     |
-|    time_elapsed    | 2179    |
-|    total_timesteps | 3653632 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1679        |
-|    iterations           | 224         |
-|    time_elapsed         | 2185        |
-|    total_timesteps      | 3670016     |
-| train/                  |             |
-|    approx_kl            | 0.007888832 |
-|    clip_fraction        | 0.0783      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.37       |
-|    explained_variance   | 0.914       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0298     |
-|    n_updates            | 2230        |
-|    policy_gradient_loss | -0.00449    |
-|    std                  | 1.32        |
-|    value_loss           | 0.00867     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1682         |
-|    iterations           | 225          |
-|    time_elapsed         | 2190         |
-|    total_timesteps      | 3686400      |
-| train/                  |              |
-|    approx_kl            | 0.0069514583 |
-|    clip_fraction        | 0.0791       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.38        |
-|    explained_variance   | 0.946        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0283      |
-|    n_updates            | 2240         |
-|    policy_gradient_loss | -0.00427     |
-|    std                  | 1.32         |
-|    value_loss           | 0.00382      |
-------------------------------------------
-Eval num_timesteps=3700000, episode_reward=19.29 +/- 50.45
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 19.3        |
-| time/                   |             |
-|    total_timesteps      | 3700000     |
-| train/                  |             |
-|    approx_kl            | 0.008142319 |
-|    clip_fraction        | 0.0865      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.37       |
-|    explained_variance   | 0.92        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0467     |
-|    n_updates            | 2250        |
-|    policy_gradient_loss | -0.00506    |
-|    std                  | 1.31        |
-|    value_loss           | 0.00547     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1674    |
-|    iterations      | 226     |
-|    time_elapsed    | 2210    |
-|    total_timesteps | 3702784 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1677         |
-|    iterations           | 227          |
-|    time_elapsed         | 2216         |
-|    total_timesteps      | 3719168      |
-| train/                  |              |
-|    approx_kl            | 0.0077144434 |
-|    clip_fraction        | 0.0783       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.36        |
-|    explained_variance   | 0.931        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0331      |
-|    n_updates            | 2260         |
-|    policy_gradient_loss | -0.00529     |
-|    std                  | 1.31         |
-|    value_loss           | 0.00486      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1667        |
-|    iterations           | 228         |
-|    time_elapsed         | 2239        |
-|    total_timesteps      | 3735552     |
-| train/                  |             |
-|    approx_kl            | 0.007820845 |
-|    clip_fraction        | 0.087       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.37       |
-|    explained_variance   | 0.95        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0321     |
-|    n_updates            | 2270        |
-|    policy_gradient_loss | -0.00493    |
-|    std                  | 1.31        |
-|    value_loss           | 0.00531     |
------------------------------------------
-Eval num_timesteps=3750000, episode_reward=35.91 +/- 47.57
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 35.9        |
-| time/                   |             |
-|    total_timesteps      | 3750000     |
-| train/                  |             |
-|    approx_kl            | 0.008380983 |
-|    clip_fraction        | 0.0868      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.37       |
-|    explained_variance   | 0.927       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0318     |
-|    n_updates            | 2280        |
-|    policy_gradient_loss | -0.0046     |
-|    std                  | 1.32        |
-|    value_loss           | 0.00684     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1639    |
-|    iterations      | 229     |
-|    time_elapsed    | 2289    |
-|    total_timesteps | 3751936 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1642        |
-|    iterations           | 230         |
-|    time_elapsed         | 2294        |
-|    total_timesteps      | 3768320     |
-| train/                  |             |
-|    approx_kl            | 0.007415652 |
-|    clip_fraction        | 0.0758      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.37       |
-|    explained_variance   | 0.953       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0354     |
-|    n_updates            | 2290        |
-|    policy_gradient_loss | -0.00557    |
-|    std                  | 1.31        |
-|    value_loss           | 0.0122      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1646         |
-|    iterations           | 231          |
-|    time_elapsed         | 2299         |
-|    total_timesteps      | 3784704      |
-| train/                  |              |
-|    approx_kl            | 0.0071868873 |
-|    clip_fraction        | 0.0736       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.38        |
-|    explained_variance   | 0.954        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0457      |
-|    n_updates            | 2300         |
-|    policy_gradient_loss | -0.00442     |
-|    std                  | 1.33         |
-|    value_loss           | 0.0201       |
-------------------------------------------
-Eval num_timesteps=3800000, episode_reward=31.58 +/- 50.62
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 31.6         |
-| time/                   |              |
-|    total_timesteps      | 3800000      |
-| train/                  |              |
-|    approx_kl            | 0.0074889637 |
-|    clip_fraction        | 0.0805       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.4         |
-|    explained_variance   | 0.95         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0355      |
-|    n_updates            | 2310         |
-|    policy_gradient_loss | -0.00474     |
-|    std                  | 1.33         |
-|    value_loss           | 0.00892      |
-------------------------------------------
-
-[Diag @ 3,800,000 | n_sheep=5 | success=0%]
-  NEVER_COMPACT              19/20
-  COMPACT_CANT_DRIVE         1/20
-  action_mag mean=0.128 p10=0.005 p90=0.475 (0=stopped, 1=full speed)
-  min_flock_radius mean=8.35m best=4.80m  (target <5m to compact)
-  min_dog_to_com   mean=0.71m best=0.23m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.72m best=8.54m
-  reward/step (mean): progress=+0.0063  alignment=+0.0388  pen_bonus=+0.0010  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1633    |
-|    iterations      | 232     |
-|    time_elapsed    | 2326    |
-|    total_timesteps | 3801088 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1636         |
-|    iterations           | 233          |
-|    time_elapsed         | 2332         |
-|    total_timesteps      | 3817472      |
-| train/                  |              |
-|    approx_kl            | 0.0070604184 |
-|    clip_fraction        | 0.0765       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.4         |
-|    explained_variance   | 0.953        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0398      |
-|    n_updates            | 2320         |
-|    policy_gradient_loss | -0.00453     |
-|    std                  | 1.33         |
-|    value_loss           | 0.00675      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1640        |
-|    iterations           | 234         |
-|    time_elapsed         | 2336        |
-|    total_timesteps      | 3833856     |
-| train/                  |             |
-|    approx_kl            | 0.007709453 |
-|    clip_fraction        | 0.0816      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.4        |
-|    explained_variance   | 0.943       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0359     |
-|    n_updates            | 2330        |
-|    policy_gradient_loss | -0.00423    |
-|    std                  | 1.34        |
-|    value_loss           | 0.00754     |
------------------------------------------
-Eval num_timesteps=3850000, episode_reward=42.98 +/- 33.36
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 43          |
-| time/                   |             |
-|    total_timesteps      | 3850000     |
-| train/                  |             |
-|    approx_kl            | 0.007679659 |
-|    clip_fraction        | 0.0858      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.4        |
-|    explained_variance   | 0.961       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.032      |
-|    n_updates            | 2340        |
-|    policy_gradient_loss | -0.00716    |
-|    std                  | 1.33        |
-|    value_loss           | 0.00907     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1635    |
-|    iterations      | 235     |
-|    time_elapsed    | 2354    |
-|    total_timesteps | 3850240 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1638         |
-|    iterations           | 236          |
-|    time_elapsed         | 2360         |
-|    total_timesteps      | 3866624      |
-| train/                  |              |
-|    approx_kl            | 0.0077598644 |
-|    clip_fraction        | 0.0848       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.38        |
-|    explained_variance   | 0.96         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0468      |
-|    n_updates            | 2350         |
-|    policy_gradient_loss | -0.005       |
-|    std                  | 1.33         |
-|    value_loss           | 0.0101       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1641         |
-|    iterations           | 237          |
-|    time_elapsed         | 2366         |
-|    total_timesteps      | 3883008      |
-| train/                  |              |
-|    approx_kl            | 0.0068941545 |
-|    clip_fraction        | 0.0673       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.39        |
-|    explained_variance   | 0.96         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0398      |
-|    n_updates            | 2360         |
-|    policy_gradient_loss | -0.0047      |
-|    std                  | 1.33         |
-|    value_loss           | 0.0113       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1643         |
-|    iterations           | 238          |
-|    time_elapsed         | 2372         |
-|    total_timesteps      | 3899392      |
-| train/                  |              |
-|    approx_kl            | 0.0073663425 |
-|    clip_fraction        | 0.0785       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.41        |
-|    explained_variance   | 0.963        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0319      |
-|    n_updates            | 2370         |
-|    policy_gradient_loss | -0.00458     |
-|    std                  | 1.35         |
-|    value_loss           | 0.0036       |
-------------------------------------------
-Eval num_timesteps=3900000, episode_reward=33.74 +/- 40.96
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 33.7        |
-| time/                   |             |
-|    total_timesteps      | 3900000     |
-| train/                  |             |
-|    approx_kl            | 0.007122398 |
-|    clip_fraction        | 0.0759      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.41       |
-|    explained_variance   | 0.972       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0383     |
-|    n_updates            | 2380        |
-|    policy_gradient_loss | -0.00446    |
-|    std                  | 1.35        |
-|    value_loss           | 0.00445     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1637    |
-|    iterations      | 239     |
-|    time_elapsed    | 2391    |
-|    total_timesteps | 3915776 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1640        |
-|    iterations           | 240         |
-|    time_elapsed         | 2396        |
-|    total_timesteps      | 3932160     |
-| train/                  |             |
-|    approx_kl            | 0.008265208 |
-|    clip_fraction        | 0.0845      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.41       |
-|    explained_variance   | 0.926       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0361     |
-|    n_updates            | 2390        |
-|    policy_gradient_loss | -0.00536    |
-|    std                  | 1.34        |
-|    value_loss           | 0.00846     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1643        |
-|    iterations           | 241         |
-|    time_elapsed         | 2402        |
-|    total_timesteps      | 3948544     |
-| train/                  |             |
-|    approx_kl            | 0.008583728 |
-|    clip_fraction        | 0.0893      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.42       |
-|    explained_variance   | 0.915       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0297     |
-|    n_updates            | 2400        |
-|    policy_gradient_loss | -0.00592    |
-|    std                  | 1.35        |
-|    value_loss           | 0.0068      |
------------------------------------------
-Eval num_timesteps=3950000, episode_reward=46.06 +/- 34.67
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | 46.1         |
-| time/                   |              |
-|    total_timesteps      | 3950000      |
-| train/                  |              |
-|    approx_kl            | 0.0060660206 |
-|    clip_fraction        | 0.0654       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.42        |
-|    explained_variance   | 0.942        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0359      |
-|    n_updates            | 2410         |
-|    policy_gradient_loss | -0.0038      |
-|    std                  | 1.35         |
-|    value_loss           | 0.00296      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1637    |
-|    iterations      | 242     |
-|    time_elapsed    | 2421    |
-|    total_timesteps | 3964928 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1639        |
-|    iterations           | 243         |
-|    time_elapsed         | 2427        |
-|    total_timesteps      | 3981312     |
-| train/                  |             |
-|    approx_kl            | 0.007591601 |
-|    clip_fraction        | 0.0808      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.4        |
-|    explained_variance   | 0.964       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0386     |
-|    n_updates            | 2420        |
-|    policy_gradient_loss | -0.00575    |
-|    std                  | 1.34        |
-|    value_loss           | 0.00714     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1642        |
-|    iterations           | 244         |
-|    time_elapsed         | 2433        |
-|    total_timesteps      | 3997696     |
-| train/                  |             |
-|    approx_kl            | 0.006255053 |
-|    clip_fraction        | 0.0663      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.41       |
-|    explained_variance   | 0.939       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0304     |
-|    n_updates            | 2430        |
-|    policy_gradient_loss | -0.00497    |
-|    std                  | 1.35        |
-|    value_loss           | 0.00585     |
------------------------------------------
-Eval num_timesteps=4000000, episode_reward=19.52 +/- 38.43
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | 19.5        |
-| time/                   |             |
-|    total_timesteps      | 4000000     |
-| train/                  |             |
-|    approx_kl            | 0.008279499 |
-|    clip_fraction        | 0.0814      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.4        |
-|    explained_variance   | 0.958       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0379     |
-|    n_updates            | 2440        |
-|    policy_gradient_loss | -0.00568    |
-|    std                  | 1.34        |
-|    value_loss           | 0.00469     |
------------------------------------------
-
-[Diag @ 4,000,000 | n_sheep=5 | success=0%]
-  NEVER_COMPACT              20/20
-  action_mag mean=0.158 p10=0.006 p90=0.744 (0=stopped, 1=full speed)
-  min_flock_radius mean=8.94m best=6.34m  (target <5m to compact)
-  min_dog_to_com   mean=0.82m best=0.49m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.86m best=7.80m
-  reward/step (mean): progress=+0.0029  alignment=+0.0397  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1630    |
-|    iterations      | 245     |
-|    time_elapsed    | 2461    |
-|    total_timesteps | 4014080 |
---------------------------------
-
-Training complete. Artefacts saved to runs/ppo_debug/
diff --git a/training/runs/ppo_debug/best_model/best_model.zip b/training/runs/ppo_debug/best_model/best_model.zip
deleted file mode 100644
index 2618c2c..0000000
Binary files a/training/runs/ppo_debug/best_model/best_model.zip and /dev/null differ
diff --git a/training/runs/ppo_debug/evaluations.npz b/training/runs/ppo_debug/evaluations.npz
deleted file mode 100644
index 84fd19d..0000000
Binary files a/training/runs/ppo_debug/evaluations.npz and /dev/null differ
diff --git a/training/runs/ppo_debug/final_model.zip b/training/runs/ppo_debug/final_model.zip
deleted file mode 100644
index e3be97e..0000000
Binary files a/training/runs/ppo_debug/final_model.zip and /dev/null differ
diff --git a/training/runs/ppo_debug/vecnorm.pkl b/training/runs/ppo_debug/vecnorm.pkl
deleted file mode 100644
index c17b706..0000000
Binary files a/training/runs/ppo_debug/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/ppo_fix_check.log b/training/runs/ppo_fix_check.log
deleted file mode 100644
index 39ace5a..0000000
--- a/training/runs/ppo_fix_check.log
+++ /dev/null
@@ -1,3388 +0,0 @@
-Using cpu device
-Logging to runs/ppo_fix_check/ppo_1
-------------------------------
-| time/              |       |
-|    fps             | 5021  |
-|    iterations      | 1     |
-|    time_elapsed    | 3     |
-|    total_timesteps | 16384 |
-------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 4241         |
-|    iterations           | 2            |
-|    time_elapsed         | 7            |
-|    total_timesteps      | 32768        |
-| train/                  |              |
-|    approx_kl            | 0.0047510993 |
-|    clip_fraction        | 0.0344       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.85        |
-|    explained_variance   | 0.786        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.00995     |
-|    n_updates            | 10           |
-|    policy_gradient_loss | -0.00156     |
-|    std                  | 1.01         |
-|    value_loss           | 0.0657       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 4026         |
-|    iterations           | 3            |
-|    time_elapsed         | 12           |
-|    total_timesteps      | 49152        |
-| train/                  |              |
-|    approx_kl            | 0.0032065492 |
-|    clip_fraction        | 0.0328       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.88        |
-|    explained_variance   | 0.868        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0327      |
-|    n_updates            | 20           |
-|    policy_gradient_loss | -0.00152     |
-|    std                  | 1.02         |
-|    value_loss           | 0.0172       |
-------------------------------------------
-/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.
-  warnings.warn(
-Eval num_timesteps=50000, episode_reward=-25.33 +/- 56.30
-Episode length: 1859.00 +/- 393.69
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.86e+03     |
-|    mean_reward          | -25.3        |
-| time/                   |              |
-|    total_timesteps      | 50000        |
-| train/                  |              |
-|    approx_kl            | 0.0038272792 |
-|    clip_fraction        | 0.0312       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.89        |
-|    explained_variance   | 0.891        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0224      |
-|    n_updates            | 30           |
-|    policy_gradient_loss | -0.0019      |
-|    std                  | 1.02         |
-|    value_loss           | 0.0227       |
-------------------------------------------
-New best mean reward!
-------------------------------
-| time/              |       |
-|    fps             | 2387  |
-|    iterations      | 4     |
-|    time_elapsed    | 27    |
-|    total_timesteps | 65536 |
-------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2563         |
-|    iterations           | 5            |
-|    time_elapsed         | 31           |
-|    total_timesteps      | 81920        |
-| train/                  |              |
-|    approx_kl            | 0.0040233894 |
-|    clip_fraction        | 0.0323       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.87        |
-|    explained_variance   | 0.878        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0251      |
-|    n_updates            | 40           |
-|    policy_gradient_loss | -0.00247     |
-|    std                  | 1.01         |
-|    value_loss           | 0.0169       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2719        |
-|    iterations           | 6           |
-|    time_elapsed         | 36          |
-|    total_timesteps      | 98304       |
-| train/                  |             |
-|    approx_kl            | 0.003573698 |
-|    clip_fraction        | 0.0316      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.86       |
-|    explained_variance   | 0.865       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0219     |
-|    n_updates            | 50          |
-|    policy_gradient_loss | -0.0019     |
-|    std                  | 1.01        |
-|    value_loss           | 0.022       |
------------------------------------------
-/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.
-  warnings.warn(
-Eval num_timesteps=100000, episode_reward=-29.60 +/- 36.59
-Episode length: 1939.35 +/- 264.37
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.94e+03     |
-|    mean_reward          | -29.6        |
-| time/                   |              |
-|    total_timesteps      | 100000       |
-| train/                  |              |
-|    approx_kl            | 0.0046861977 |
-|    clip_fraction        | 0.039        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.86        |
-|    explained_variance   | 0.815        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0257      |
-|    n_updates            | 60           |
-|    policy_gradient_loss | -0.00203     |
-|    std                  | 1.01         |
-|    value_loss           | 0.0201       |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2191   |
-|    iterations      | 7      |
-|    time_elapsed    | 52     |
-|    total_timesteps | 114688 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2314        |
-|    iterations           | 8           |
-|    time_elapsed         | 56          |
-|    total_timesteps      | 131072      |
-| train/                  |             |
-|    approx_kl            | 0.005258695 |
-|    clip_fraction        | 0.0503      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.86       |
-|    explained_variance   | 0.807       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0211     |
-|    n_updates            | 70          |
-|    policy_gradient_loss | -0.00398    |
-|    std                  | 1.01        |
-|    value_loss           | 0.0164      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2359         |
-|    iterations           | 9            |
-|    time_elapsed         | 62           |
-|    total_timesteps      | 147456       |
-| train/                  |              |
-|    approx_kl            | 0.0043328116 |
-|    clip_fraction        | 0.0332       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.86        |
-|    explained_variance   | 0.811        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0259      |
-|    n_updates            | 80           |
-|    policy_gradient_loss | -0.00173     |
-|    std                  | 1.01         |
-|    value_loss           | 0.0121       |
-------------------------------------------
-Eval num_timesteps=150000, episode_reward=-33.97 +/- 37.15
-Episode length: 1954.85 +/- 196.80
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.95e+03    |
-|    mean_reward          | -34         |
-| time/                   |             |
-|    total_timesteps      | 150000      |
-| train/                  |             |
-|    approx_kl            | 0.005169191 |
-|    clip_fraction        | 0.0506      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.85       |
-|    explained_variance   | 0.649       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0287     |
-|    n_updates            | 90          |
-|    policy_gradient_loss | -0.00384    |
-|    std                  | 1           |
-|    value_loss           | 0.0162      |
------------------------------------------
-
-[Diag @ 150,000 | n_sheep=1 | success=15%]
-  COMPACT_CANT_DRIVE         16/20
-  SUCCESS                    3/20
-  DROVE_NO_SHEEP             1/20
-  action_mag mean=0.239 p10=0.071 p90=0.433 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=4.80m best=1.70m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=10.22m best=1.50m
-  reward/step (mean): progress=+0.0013  alignment=+0.0000  pen_bonus=+0.0008  step_cost=-0.0200  complete=+0.0078
--------------------------------
-| time/              |        |
-|    fps             | 1935   |
-|    iterations      | 10     |
-|    time_elapsed    | 84     |
-|    total_timesteps | 163840 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2014         |
-|    iterations           | 11           |
-|    time_elapsed         | 89           |
-|    total_timesteps      | 180224       |
-| train/                  |              |
-|    approx_kl            | 0.0039950563 |
-|    clip_fraction        | 0.0276       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.83        |
-|    explained_variance   | 0.623        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0128      |
-|    n_updates            | 100          |
-|    policy_gradient_loss | -0.00208     |
-|    std                  | 0.995        |
-|    value_loss           | 0.0959       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2093         |
-|    iterations           | 12           |
-|    time_elapsed         | 93           |
-|    total_timesteps      | 196608       |
-| train/                  |              |
-|    approx_kl            | 0.0036244316 |
-|    clip_fraction        | 0.0299       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.83        |
-|    explained_variance   | 0.916        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0251      |
-|    n_updates            | 110          |
-|    policy_gradient_loss | -0.00229     |
-|    std                  | 0.991        |
-|    value_loss           | 0.0118       |
-------------------------------------------
-Eval num_timesteps=200000, episode_reward=-36.37 +/- 39.41
-Episode length: 1950.95 +/- 213.80
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.95e+03    |
-|    mean_reward          | -36.4       |
-| time/                   |             |
-|    total_timesteps      | 200000      |
-| train/                  |             |
-|    approx_kl            | 0.003325508 |
-|    clip_fraction        | 0.0223      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.83       |
-|    explained_variance   | 0.858       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0279     |
-|    n_updates            | 120         |
-|    policy_gradient_loss | -0.0007     |
-|    std                  | 0.999       |
-|    value_loss           | 0.0493      |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1964   |
-|    iterations      | 13     |
-|    time_elapsed    | 108    |
-|    total_timesteps | 212992 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2034        |
-|    iterations           | 14          |
-|    time_elapsed         | 112         |
-|    total_timesteps      | 229376      |
-| train/                  |             |
-|    approx_kl            | 0.004660043 |
-|    clip_fraction        | 0.0403      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.85       |
-|    explained_variance   | 0.719       |
-|    learning_rate        | 0.0003      |
-|    loss                 | 0.128       |
-|    n_updates            | 130         |
-|    policy_gradient_loss | -0.00265    |
-|    std                  | 1.01        |
-|    value_loss           | 0.073       |
------------------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 2103       |
-|    iterations           | 15         |
-|    time_elapsed         | 116        |
-|    total_timesteps      | 245760     |
-| train/                  |            |
-|    approx_kl            | 0.00501227 |
-|    clip_fraction        | 0.0499     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.88      |
-|    explained_variance   | 0.847      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0237    |
-|    n_updates            | 140        |
-|    policy_gradient_loss | -0.00264   |
-|    std                  | 1.02       |
-|    value_loss           | 0.0415     |
-----------------------------------------
-Eval num_timesteps=250000, episode_reward=-44.92 +/- 15.63
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -44.9        |
-| time/                   |              |
-|    total_timesteps      | 250000       |
-| train/                  |              |
-|    approx_kl            | 0.0055294414 |
-|    clip_fraction        | 0.06         |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.89        |
-|    explained_variance   | 0.951        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0274      |
-|    n_updates            | 150          |
-|    policy_gradient_loss | -0.00491     |
-|    std                  | 1.03         |
-|    value_loss           | 0.014        |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1999   |
-|    iterations      | 16     |
-|    time_elapsed    | 131    |
-|    total_timesteps | 262144 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2051         |
-|    iterations           | 17           |
-|    time_elapsed         | 135          |
-|    total_timesteps      | 278528       |
-| train/                  |              |
-|    approx_kl            | 0.0051201656 |
-|    clip_fraction        | 0.0301       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.88        |
-|    explained_variance   | 0.941        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.148        |
-|    n_updates            | 160          |
-|    policy_gradient_loss | -0.00199     |
-|    std                  | 1.02         |
-|    value_loss           | 0.099        |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2096        |
-|    iterations           | 18          |
-|    time_elapsed         | 140         |
-|    total_timesteps      | 294912      |
-| train/                  |             |
-|    approx_kl            | 0.004261789 |
-|    clip_fraction        | 0.0328      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.88       |
-|    explained_variance   | 0.942       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0314     |
-|    n_updates            | 170         |
-|    policy_gradient_loss | -0.00243    |
-|    std                  | 1.02        |
-|    value_loss           | 0.0117      |
------------------------------------------
-Eval num_timesteps=300000, episode_reward=-44.79 +/- 17.68
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -44.8       |
-| time/                   |             |
-|    total_timesteps      | 300000      |
-| train/                  |             |
-|    approx_kl            | 0.004783842 |
-|    clip_fraction        | 0.0296      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.87       |
-|    explained_variance   | 0.892       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0219     |
-|    n_updates            | 180         |
-|    policy_gradient_loss | -0.00159    |
-|    std                  | 1.01        |
-|    value_loss           | 0.0497      |
------------------------------------------
-
-[Diag @ 300,000 | n_sheep=1 | success=0%]
-  COMPACT_CANT_DRIVE         17/20
-  DROVE_NO_SHEEP             3/20
-  action_mag mean=0.241 p10=0.109 p90=0.389 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=4.77m best=2.12m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=9.31m best=1.50m
-  reward/step (mean): progress=+0.0016  alignment=+0.0000  pen_bonus=+0.0000  step_cost=-0.0200  complete=+0.0000
--------------------------------
-| time/              |        |
-|    fps             | 1905   |
-|    iterations      | 19     |
-|    time_elapsed    | 163    |
-|    total_timesteps | 311296 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1949         |
-|    iterations           | 20           |
-|    time_elapsed         | 168          |
-|    total_timesteps      | 327680       |
-| train/                  |              |
-|    approx_kl            | 0.0033368056 |
-|    clip_fraction        | 0.0258       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.87        |
-|    explained_variance   | 0.794        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0211      |
-|    n_updates            | 190          |
-|    policy_gradient_loss | -0.00105     |
-|    std                  | 1.02         |
-|    value_loss           | 0.0769       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1992         |
-|    iterations           | 21           |
-|    time_elapsed         | 172          |
-|    total_timesteps      | 344064       |
-| train/                  |              |
-|    approx_kl            | 0.0046488494 |
-|    clip_fraction        | 0.0352       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.87        |
-|    explained_variance   | 0.927        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0274      |
-|    n_updates            | 200          |
-|    policy_gradient_loss | -0.00331     |
-|    std                  | 1.02         |
-|    value_loss           | 0.0165       |
-------------------------------------------
-Eval num_timesteps=350000, episode_reward=-24.90 +/- 50.25
-Episode length: 1976.75 +/- 82.03
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.98e+03     |
-|    mean_reward          | -24.9        |
-| time/                   |              |
-|    total_timesteps      | 350000       |
-| train/                  |              |
-|    approx_kl            | 0.0041725934 |
-|    clip_fraction        | 0.0299       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.88        |
-|    explained_variance   | 0.944        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.026       |
-|    n_updates            | 210          |
-|    policy_gradient_loss | -0.0026      |
-|    std                  | 1.02         |
-|    value_loss           | 0.00665      |
-------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 1921   |
-|    iterations      | 22     |
-|    time_elapsed    | 187    |
-|    total_timesteps | 360448 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1963        |
-|    iterations           | 23          |
-|    time_elapsed         | 191         |
-|    total_timesteps      | 376832      |
-| train/                  |             |
-|    approx_kl            | 0.005180447 |
-|    clip_fraction        | 0.0532      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.87       |
-|    explained_variance   | 0.956       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0255     |
-|    n_updates            | 220         |
-|    policy_gradient_loss | -0.00352    |
-|    std                  | 1.02        |
-|    value_loss           | 0.0142      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1990        |
-|    iterations           | 24          |
-|    time_elapsed         | 197         |
-|    total_timesteps      | 393216      |
-| train/                  |             |
-|    approx_kl            | 0.004661506 |
-|    clip_fraction        | 0.0443      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.87       |
-|    explained_variance   | 0.967       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0331     |
-|    n_updates            | 230         |
-|    policy_gradient_loss | -0.00441    |
-|    std                  | 1.02        |
-|    value_loss           | 0.0112      |
------------------------------------------
-Eval num_timesteps=400000, episode_reward=-26.04 +/- 47.69
-Episode length: 1890.85 +/- 367.20
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.89e+03    |
-|    mean_reward          | -26         |
-| time/                   |             |
-|    total_timesteps      | 400000      |
-| train/                  |             |
-|    approx_kl            | 0.005491742 |
-|    clip_fraction        | 0.0538      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.89       |
-|    explained_variance   | 0.941       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.042      |
-|    n_updates            | 240         |
-|    policy_gradient_loss | -0.00297    |
-|    std                  | 1.03        |
-|    value_loss           | 0.00877     |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1927   |
-|    iterations      | 25     |
-|    time_elapsed    | 212    |
-|    total_timesteps | 409600 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1966         |
-|    iterations           | 26           |
-|    time_elapsed         | 216          |
-|    total_timesteps      | 425984       |
-| train/                  |              |
-|    approx_kl            | 0.0045445506 |
-|    clip_fraction        | 0.0385       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.91        |
-|    explained_variance   | 0.941        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0343      |
-|    n_updates            | 250          |
-|    policy_gradient_loss | -0.00307     |
-|    std                  | 1.04         |
-|    value_loss           | 0.00818      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2004         |
-|    iterations           | 27           |
-|    time_elapsed         | 220          |
-|    total_timesteps      | 442368       |
-| train/                  |              |
-|    approx_kl            | 0.0045271795 |
-|    clip_fraction        | 0.0373       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.94        |
-|    explained_variance   | 0.97         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0361      |
-|    n_updates            | 260          |
-|    policy_gradient_loss | -0.00236     |
-|    std                  | 1.05         |
-|    value_loss           | 0.0091       |
-------------------------------------------
-Eval num_timesteps=450000, episode_reward=-24.58 +/- 48.73
-Episode length: 1907.85 +/- 276.46
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.91e+03     |
-|    mean_reward          | -24.6        |
-| time/                   |              |
-|    total_timesteps      | 450000       |
-| train/                  |              |
-|    approx_kl            | 0.0052676853 |
-|    clip_fraction        | 0.0498       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.948        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0261      |
-|    n_updates            | 270          |
-|    policy_gradient_loss | -0.00236     |
-|    std                  | 1.07         |
-|    value_loss           | 0.0286       |
-------------------------------------------
-New best mean reward!
-
-[Diag @ 450,000 | n_sheep=1 | success=5%]
-  COMPACT_CANT_DRIVE         18/20
-  DROVE_NO_SHEEP             1/20
-  SUCCESS                    1/20
-  action_mag mean=0.272 p10=0.139 p90=0.407 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=4.81m best=1.54m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.36m best=1.96m
-  reward/step (mean): progress=+0.0012  alignment=+0.0000  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0025
--------------------------------
-| time/              |        |
-|    fps             | 1893   |
-|    iterations      | 28     |
-|    time_elapsed    | 242    |
-|    total_timesteps | 458752 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1928        |
-|    iterations           | 29          |
-|    time_elapsed         | 246         |
-|    total_timesteps      | 475136      |
-| train/                  |             |
-|    approx_kl            | 0.004465497 |
-|    clip_fraction        | 0.0376      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.97       |
-|    explained_variance   | 0.948       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0307     |
-|    n_updates            | 280         |
-|    policy_gradient_loss | -0.00259    |
-|    std                  | 1.07        |
-|    value_loss           | 0.0213      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1961         |
-|    iterations           | 30           |
-|    time_elapsed         | 250          |
-|    total_timesteps      | 491520       |
-| train/                  |              |
-|    approx_kl            | 0.0054338034 |
-|    clip_fraction        | 0.0512       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.97        |
-|    explained_variance   | 0.967        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.021       |
-|    n_updates            | 290          |
-|    policy_gradient_loss | -0.00296     |
-|    std                  | 1.07         |
-|    value_loss           | 0.0138       |
-------------------------------------------
-Eval num_timesteps=500000, episode_reward=-44.13 +/- 20.75
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -44.1       |
-| time/                   |             |
-|    total_timesteps      | 500000      |
-| train/                  |             |
-|    approx_kl            | 0.006292434 |
-|    clip_fraction        | 0.0572      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.97       |
-|    explained_variance   | 0.937       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0398     |
-|    n_updates            | 300         |
-|    policy_gradient_loss | -0.00516    |
-|    std                  | 1.07        |
-|    value_loss           | 0.00832     |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1913   |
-|    iterations      | 31     |
-|    time_elapsed    | 265    |
-|    total_timesteps | 507904 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1940         |
-|    iterations           | 32           |
-|    time_elapsed         | 270          |
-|    total_timesteps      | 524288       |
-| train/                  |              |
-|    approx_kl            | 0.0063960385 |
-|    clip_fraction        | 0.0702       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.942        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0341      |
-|    n_updates            | 310          |
-|    policy_gradient_loss | -0.00436     |
-|    std                  | 1.06         |
-|    value_loss           | 0.0189       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1968         |
-|    iterations           | 33           |
-|    time_elapsed         | 274          |
-|    total_timesteps      | 540672       |
-| train/                  |              |
-|    approx_kl            | 0.0070166546 |
-|    clip_fraction        | 0.0888       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.955        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0376      |
-|    n_updates            | 320          |
-|    policy_gradient_loss | -0.00631     |
-|    std                  | 1.06         |
-|    value_loss           | 0.00861      |
-------------------------------------------
-Eval num_timesteps=550000, episode_reward=-38.60 +/- 14.53
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -38.6        |
-| time/                   |              |
-|    total_timesteps      | 550000       |
-| train/                  |              |
-|    approx_kl            | 0.0068266992 |
-|    clip_fraction        | 0.075        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.959        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0252      |
-|    n_updates            | 330          |
-|    policy_gradient_loss | -0.00593     |
-|    std                  | 1.07         |
-|    value_loss           | 0.0131       |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1922   |
-|    iterations      | 34     |
-|    time_elapsed    | 289    |
-|    total_timesteps | 557056 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1950        |
-|    iterations           | 35          |
-|    time_elapsed         | 294         |
-|    total_timesteps      | 573440      |
-| train/                  |             |
-|    approx_kl            | 0.006152669 |
-|    clip_fraction        | 0.0626      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.97       |
-|    explained_variance   | 0.954       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0376     |
-|    n_updates            | 340         |
-|    policy_gradient_loss | -0.00514    |
-|    std                  | 1.07        |
-|    value_loss           | 0.0187      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1977        |
-|    iterations           | 36          |
-|    time_elapsed         | 298         |
-|    total_timesteps      | 589824      |
-| train/                  |             |
-|    approx_kl            | 0.006685758 |
-|    clip_fraction        | 0.0729      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.97       |
-|    explained_variance   | 0.958       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0387     |
-|    n_updates            | 350         |
-|    policy_gradient_loss | -0.00632    |
-|    std                  | 1.07        |
-|    value_loss           | 0.0118      |
------------------------------------------
-Eval num_timesteps=600000, episode_reward=-31.39 +/- 8.94
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -31.4       |
-| time/                   |             |
-|    total_timesteps      | 600000      |
-| train/                  |             |
-|    approx_kl            | 0.008094068 |
-|    clip_fraction        | 0.0985      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.97       |
-|    explained_variance   | 0.937       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0439     |
-|    n_updates            | 360         |
-|    policy_gradient_loss | -0.00782    |
-|    std                  | 1.07        |
-|    value_loss           | 0.0116      |
------------------------------------------
-
-[Diag @ 600,000 | n_sheep=1 | success=5%]
-  COMPACT_CANT_DRIVE         16/20
-  DROVE_NO_SHEEP             3/20
-  SUCCESS                    1/20
-  action_mag mean=0.150 p10=0.000 p90=0.392 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=3.64m best=0.68m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=10.60m best=1.50m
-  reward/step (mean): progress=+0.0025  alignment=+0.0000  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0026
-
-[Curriculum] leaving stage n_sheep=1 after 600,000 steps | training success rate (last 100 eps) = 9%
-[Curriculum] → 2 sheep at step 600,000
-
--------------------------------
-| time/              |        |
-|    fps             | 1894   |
-|    iterations      | 37     |
-|    time_elapsed    | 319    |
-|    total_timesteps | 606208 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1917         |
-|    iterations           | 38           |
-|    time_elapsed         | 324          |
-|    total_timesteps      | 622592       |
-| train/                  |              |
-|    approx_kl            | 0.0067913756 |
-|    clip_fraction        | 0.0689       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.97        |
-|    explained_variance   | 0.861        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.0772       |
-|    n_updates            | 370          |
-|    policy_gradient_loss | -0.00184     |
-|    std                  | 1.07         |
-|    value_loss           | 0.101        |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1938         |
-|    iterations           | 39           |
-|    time_elapsed         | 329          |
-|    total_timesteps      | 638976       |
-| train/                  |              |
-|    approx_kl            | 0.0061344057 |
-|    clip_fraction        | 0.0666       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.98        |
-|    explained_variance   | 0.928        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0147      |
-|    n_updates            | 380          |
-|    policy_gradient_loss | -0.00148     |
-|    std                  | 1.08         |
-|    value_loss           | 0.0386       |
-------------------------------------------
-Eval num_timesteps=650000, episode_reward=-42.39 +/- 31.99
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -42.4        |
-| time/                   |              |
-|    total_timesteps      | 650000       |
-| train/                  |              |
-|    approx_kl            | 0.0061708866 |
-|    clip_fraction        | 0.06         |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.98        |
-|    explained_variance   | 0.918        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0203      |
-|    n_updates            | 390          |
-|    policy_gradient_loss | -0.00313     |
-|    std                  | 1.07         |
-|    value_loss           | 0.0242       |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1896   |
-|    iterations      | 40     |
-|    time_elapsed    | 345    |
-|    total_timesteps | 655360 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1918        |
-|    iterations           | 41          |
-|    time_elapsed         | 350         |
-|    total_timesteps      | 671744      |
-| train/                  |             |
-|    approx_kl            | 0.007122565 |
-|    clip_fraction        | 0.0765      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.98       |
-|    explained_variance   | 0.855       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.00749    |
-|    n_updates            | 400         |
-|    policy_gradient_loss | -0.00529    |
-|    std                  | 1.07        |
-|    value_loss           | 0.0596      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1941         |
-|    iterations           | 42           |
-|    time_elapsed         | 354          |
-|    total_timesteps      | 688128       |
-| train/                  |              |
-|    approx_kl            | 0.0078532845 |
-|    clip_fraction        | 0.0975       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.98        |
-|    explained_variance   | 0.89         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0188      |
-|    n_updates            | 410          |
-|    policy_gradient_loss | -0.00699     |
-|    std                  | 1.07         |
-|    value_loss           | 0.0207       |
-------------------------------------------
-Eval num_timesteps=700000, episode_reward=-39.79 +/- 29.60
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -39.8        |
-| time/                   |              |
-|    total_timesteps      | 700000       |
-| train/                  |              |
-|    approx_kl            | 0.0073551387 |
-|    clip_fraction        | 0.084        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.97        |
-|    explained_variance   | 0.824        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.0126       |
-|    n_updates            | 420          |
-|    policy_gradient_loss | -0.0064      |
-|    std                  | 1.06         |
-|    value_loss           | 0.0438       |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1904   |
-|    iterations      | 43     |
-|    time_elapsed    | 370    |
-|    total_timesteps | 704512 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1922        |
-|    iterations           | 44          |
-|    time_elapsed         | 375         |
-|    total_timesteps      | 720896      |
-| train/                  |             |
-|    approx_kl            | 0.006614036 |
-|    clip_fraction        | 0.0611      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.95       |
-|    explained_variance   | 0.881       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0207     |
-|    n_updates            | 430         |
-|    policy_gradient_loss | -0.00371    |
-|    std                  | 1.06        |
-|    value_loss           | 0.0244      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1940         |
-|    iterations           | 45           |
-|    time_elapsed         | 380          |
-|    total_timesteps      | 737280       |
-| train/                  |              |
-|    approx_kl            | 0.0060790265 |
-|    clip_fraction        | 0.0591       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.95        |
-|    explained_variance   | 0.885        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0284      |
-|    n_updates            | 440          |
-|    policy_gradient_loss | -0.00447     |
-|    std                  | 1.06         |
-|    value_loss           | 0.0206       |
-------------------------------------------
-Eval num_timesteps=750000, episode_reward=-40.21 +/- 27.55
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -40.2        |
-| time/                   |              |
-|    total_timesteps      | 750000       |
-| train/                  |              |
-|    approx_kl            | 0.0066163363 |
-|    clip_fraction        | 0.0691       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.924        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.032       |
-|    n_updates            | 450          |
-|    policy_gradient_loss | -0.0043      |
-|    std                  | 1.06         |
-|    value_loss           | 0.0127       |
-------------------------------------------
-
-[Diag @ 750,000 | n_sheep=2 | success=0%]
-  COMPACT_CANT_DRIVE         14/20
-  NEVER_COMPACT              5/20
-  DROVE_NO_SHEEP             1/20
-  action_mag mean=0.313 p10=0.081 p90=0.638 (0=stopped, 1=full speed)
-  min_flock_radius mean=2.72m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=3.96m best=0.02m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.68m best=2.17m
-  reward/step (mean): progress=-0.0005  alignment=+0.0000  pen_bonus=+0.0008  step_cost=-0.0200  complete=+0.0000
--------------------------------
-| time/              |        |
-|    fps             | 1866   |
-|    iterations      | 46     |
-|    time_elapsed    | 403    |
-|    total_timesteps | 753664 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1887        |
-|    iterations           | 47          |
-|    time_elapsed         | 407         |
-|    total_timesteps      | 770048      |
-| train/                  |             |
-|    approx_kl            | 0.005094421 |
-|    clip_fraction        | 0.0496      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.96       |
-|    explained_variance   | 0.917       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0237     |
-|    n_updates            | 460         |
-|    policy_gradient_loss | -0.00332    |
-|    std                  | 1.06        |
-|    value_loss           | 0.0275      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1906        |
-|    iterations           | 48          |
-|    time_elapsed         | 412         |
-|    total_timesteps      | 786432      |
-| train/                  |             |
-|    approx_kl            | 0.006302662 |
-|    clip_fraction        | 0.0571      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.94       |
-|    explained_variance   | 0.944       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0353     |
-|    n_updates            | 470         |
-|    policy_gradient_loss | -0.00424    |
-|    std                  | 1.05        |
-|    value_loss           | 0.0201      |
------------------------------------------
-Eval num_timesteps=800000, episode_reward=-31.43 +/- 45.97
-Episode length: 1953.35 +/- 203.34
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.95e+03     |
-|    mean_reward          | -31.4        |
-| time/                   |              |
-|    total_timesteps      | 800000       |
-| train/                  |              |
-|    approx_kl            | 0.0055750986 |
-|    clip_fraction        | 0.0494       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.95        |
-|    explained_variance   | 0.959        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0262      |
-|    n_updates            | 480          |
-|    policy_gradient_loss | -0.00386     |
-|    std                  | 1.06         |
-|    value_loss           | 0.0218       |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1878   |
-|    iterations      | 49     |
-|    time_elapsed    | 427    |
-|    total_timesteps | 802816 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1897         |
-|    iterations           | 50           |
-|    time_elapsed         | 431          |
-|    total_timesteps      | 819200       |
-| train/                  |              |
-|    approx_kl            | 0.0057711033 |
-|    clip_fraction        | 0.0568       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.95        |
-|    explained_variance   | 0.838        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0362      |
-|    n_updates            | 490          |
-|    policy_gradient_loss | -0.00438     |
-|    std                  | 1.06         |
-|    value_loss           | 0.00952      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1914         |
-|    iterations           | 51           |
-|    time_elapsed         | 436          |
-|    total_timesteps      | 835584       |
-| train/                  |              |
-|    approx_kl            | 0.0073408587 |
-|    clip_fraction        | 0.077        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.931        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0283      |
-|    n_updates            | 500          |
-|    policy_gradient_loss | -0.00553     |
-|    std                  | 1.07         |
-|    value_loss           | 0.0142       |
-------------------------------------------
-Eval num_timesteps=850000, episode_reward=-37.98 +/- 27.04
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -38          |
-| time/                   |              |
-|    total_timesteps      | 850000       |
-| train/                  |              |
-|    approx_kl            | 0.0055803536 |
-|    clip_fraction        | 0.0536       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.931        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0338      |
-|    n_updates            | 510          |
-|    policy_gradient_loss | -0.00469     |
-|    std                  | 1.06         |
-|    value_loss           | 0.0156       |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1884   |
-|    iterations      | 52     |
-|    time_elapsed    | 452    |
-|    total_timesteps | 851968 |
--------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1899       |
-|    iterations           | 53         |
-|    time_elapsed         | 457        |
-|    total_timesteps      | 868352     |
-| train/                  |            |
-|    approx_kl            | 0.00585186 |
-|    clip_fraction        | 0.0638     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.97      |
-|    explained_variance   | 0.83       |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0333    |
-|    n_updates            | 520        |
-|    policy_gradient_loss | -0.00395   |
-|    std                  | 1.07       |
-|    value_loss           | 0.0322     |
-----------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1915         |
-|    iterations           | 54           |
-|    time_elapsed         | 461          |
-|    total_timesteps      | 884736       |
-| train/                  |              |
-|    approx_kl            | 0.0055105407 |
-|    clip_fraction        | 0.045        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.845        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0283      |
-|    n_updates            | 530          |
-|    policy_gradient_loss | -0.00367     |
-|    std                  | 1.06         |
-|    value_loss           | 0.0109       |
-------------------------------------------
-Eval num_timesteps=900000, episode_reward=-41.53 +/- 35.40
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -41.5        |
-| time/                   |              |
-|    total_timesteps      | 900000       |
-| train/                  |              |
-|    approx_kl            | 0.0064837057 |
-|    clip_fraction        | 0.0625       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.909        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0394      |
-|    n_updates            | 540          |
-|    policy_gradient_loss | -0.00409     |
-|    std                  | 1.06         |
-|    value_loss           | 0.0147       |
-------------------------------------------
-
-[Diag @ 900,000 | n_sheep=2 | success=0%]
-  COMPACT_CANT_DRIVE         12/20
-  NEVER_COMPACT              8/20
-  action_mag mean=0.276 p10=0.038 p90=0.580 (0=stopped, 1=full speed)
-  min_flock_radius mean=4.30m best=0.98m  (target <5m to compact)
-  min_dog_to_com   mean=3.24m best=0.24m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.15m best=5.60m
-  reward/step (mean): progress=-0.0048  alignment=+0.0000  pen_bonus=+0.0000  step_cost=-0.0200  complete=+0.0000
--------------------------------
-| time/              |        |
-|    fps             | 1857   |
-|    iterations      | 55     |
-|    time_elapsed    | 485    |
-|    total_timesteps | 901120 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1874        |
-|    iterations           | 56          |
-|    time_elapsed         | 489         |
-|    total_timesteps      | 917504      |
-| train/                  |             |
-|    approx_kl            | 0.006582682 |
-|    clip_fraction        | 0.0662      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.97       |
-|    explained_variance   | 0.961       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.039      |
-|    n_updates            | 550         |
-|    policy_gradient_loss | -0.00462    |
-|    std                  | 1.07        |
-|    value_loss           | 0.0103      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1888         |
-|    iterations           | 57           |
-|    time_elapsed         | 494          |
-|    total_timesteps      | 933888       |
-| train/                  |              |
-|    approx_kl            | 0.0059698187 |
-|    clip_fraction        | 0.0573       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.97        |
-|    explained_variance   | 0.907        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0291      |
-|    n_updates            | 560          |
-|    policy_gradient_loss | -0.00446     |
-|    std                  | 1.07         |
-|    value_loss           | 0.0113       |
-------------------------------------------
-Eval num_timesteps=950000, episode_reward=-26.73 +/- 22.82
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -26.7       |
-| time/                   |             |
-|    total_timesteps      | 950000      |
-| train/                  |             |
-|    approx_kl            | 0.006601461 |
-|    clip_fraction        | 0.0594      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.96       |
-|    explained_variance   | 0.872       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.034      |
-|    n_updates            | 570         |
-|    policy_gradient_loss | -0.00455    |
-|    std                  | 1.06        |
-|    value_loss           | 0.00901     |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1856   |
-|    iterations      | 58     |
-|    time_elapsed    | 511    |
-|    total_timesteps | 950272 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1869        |
-|    iterations           | 59          |
-|    time_elapsed         | 517         |
-|    total_timesteps      | 966656      |
-| train/                  |             |
-|    approx_kl            | 0.005824944 |
-|    clip_fraction        | 0.0624      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.96       |
-|    explained_variance   | 0.789       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0214     |
-|    n_updates            | 580         |
-|    policy_gradient_loss | -0.00363    |
-|    std                  | 1.07        |
-|    value_loss           | 0.0359      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1882        |
-|    iterations           | 60          |
-|    time_elapsed         | 522         |
-|    total_timesteps      | 983040      |
-| train/                  |             |
-|    approx_kl            | 0.005888001 |
-|    clip_fraction        | 0.0573      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.98       |
-|    explained_variance   | 0.887       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0391     |
-|    n_updates            | 590         |
-|    policy_gradient_loss | -0.00371    |
-|    std                  | 1.07        |
-|    value_loss           | 0.00935     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1895        |
-|    iterations           | 61          |
-|    time_elapsed         | 527         |
-|    total_timesteps      | 999424      |
-| train/                  |             |
-|    approx_kl            | 0.005874036 |
-|    clip_fraction        | 0.0611      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.98       |
-|    explained_variance   | 0.871       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0246     |
-|    n_updates            | 600         |
-|    policy_gradient_loss | -0.00492    |
-|    std                  | 1.07        |
-|    value_loss           | 0.00877     |
------------------------------------------
-Eval num_timesteps=1000000, episode_reward=-22.72 +/- 33.15
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -22.7        |
-| time/                   |              |
-|    total_timesteps      | 1000000      |
-| train/                  |              |
-|    approx_kl            | 0.0060388125 |
-|    clip_fraction        | 0.0637       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.97        |
-|    explained_variance   | 0.737        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0511      |
-|    n_updates            | 610          |
-|    policy_gradient_loss | -0.00387     |
-|    std                  | 1.07         |
-|    value_loss           | 0.0538       |
-------------------------------------------
-New best mean reward!
---------------------------------
-| time/              |         |
-|    fps             | 1869    |
-|    iterations      | 62      |
-|    time_elapsed    | 543     |
-|    total_timesteps | 1015808 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1882        |
-|    iterations           | 63          |
-|    time_elapsed         | 548         |
-|    total_timesteps      | 1032192     |
-| train/                  |             |
-|    approx_kl            | 0.007320485 |
-|    clip_fraction        | 0.0723      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.99       |
-|    explained_variance   | 0.946       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0342     |
-|    n_updates            | 620         |
-|    policy_gradient_loss | -0.0052     |
-|    std                  | 1.08        |
-|    value_loss           | 0.0174      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1894         |
-|    iterations           | 64           |
-|    time_elapsed         | 553          |
-|    total_timesteps      | 1048576      |
-| train/                  |              |
-|    approx_kl            | 0.0066477214 |
-|    clip_fraction        | 0.0621       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3           |
-|    explained_variance   | 0.919        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0301      |
-|    n_updates            | 630          |
-|    policy_gradient_loss | -0.00449     |
-|    std                  | 1.08         |
-|    value_loss           | 0.0109       |
-------------------------------------------
-Eval num_timesteps=1050000, episode_reward=-39.86 +/- 28.77
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -39.9        |
-| time/                   |              |
-|    total_timesteps      | 1050000      |
-| train/                  |              |
-|    approx_kl            | 0.0066243596 |
-|    clip_fraction        | 0.0772       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.99        |
-|    explained_variance   | 0.861        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0313      |
-|    n_updates            | 640          |
-|    policy_gradient_loss | -0.00462     |
-|    std                  | 1.07         |
-|    value_loss           | 0.0324       |
-------------------------------------------
-
-[Diag @ 1,050,000 | n_sheep=2 | success=0%]
-  COMPACT_CANT_DRIVE         18/20
-  NEVER_COMPACT              2/20
-  action_mag mean=0.200 p10=0.022 p90=0.478 (0=stopped, 1=full speed)
-  min_flock_radius mean=2.29m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=3.23m best=0.05m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.84m best=3.77m
-  reward/step (mean): progress=+0.0016  alignment=+0.0000  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1843    |
-|    iterations      | 65      |
-|    time_elapsed    | 577     |
-|    total_timesteps | 1064960 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1855         |
-|    iterations           | 66           |
-|    time_elapsed         | 582          |
-|    total_timesteps      | 1081344      |
-| train/                  |              |
-|    approx_kl            | 0.0066154073 |
-|    clip_fraction        | 0.0657       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.99        |
-|    explained_variance   | 0.836        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.029       |
-|    n_updates            | 650          |
-|    policy_gradient_loss | -0.0049      |
-|    std                  | 1.08         |
-|    value_loss           | 0.0135       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1865         |
-|    iterations           | 67           |
-|    time_elapsed         | 588          |
-|    total_timesteps      | 1097728      |
-| train/                  |              |
-|    approx_kl            | 0.0059733046 |
-|    clip_fraction        | 0.0634       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.01        |
-|    explained_variance   | 0.852        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0254      |
-|    n_updates            | 660          |
-|    policy_gradient_loss | -0.00452     |
-|    std                  | 1.09         |
-|    value_loss           | 0.0395       |
-------------------------------------------
-Eval num_timesteps=1100000, episode_reward=-33.30 +/- 26.65
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -33.3        |
-| time/                   |              |
-|    total_timesteps      | 1100000      |
-| train/                  |              |
-|    approx_kl            | 0.0054050894 |
-|    clip_fraction        | 0.048        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.02        |
-|    explained_variance   | 0.851        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0348      |
-|    n_updates            | 670          |
-|    policy_gradient_loss | -0.00385     |
-|    std                  | 1.1          |
-|    value_loss           | 0.0247       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1843    |
-|    iterations      | 68      |
-|    time_elapsed    | 604     |
-|    total_timesteps | 1114112 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1856         |
-|    iterations           | 69           |
-|    time_elapsed         | 608          |
-|    total_timesteps      | 1130496      |
-| train/                  |              |
-|    approx_kl            | 0.0073612374 |
-|    clip_fraction        | 0.076        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.01        |
-|    explained_variance   | 0.885        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0424      |
-|    n_updates            | 680          |
-|    policy_gradient_loss | -0.00512     |
-|    std                  | 1.09         |
-|    value_loss           | 0.0278       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1869         |
-|    iterations           | 70           |
-|    time_elapsed         | 613          |
-|    total_timesteps      | 1146880      |
-| train/                  |              |
-|    approx_kl            | 0.0063554104 |
-|    clip_fraction        | 0.067        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.01        |
-|    explained_variance   | 0.915        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0302      |
-|    n_updates            | 690          |
-|    policy_gradient_loss | -0.00577     |
-|    std                  | 1.09         |
-|    value_loss           | 0.0116       |
-------------------------------------------
-Eval num_timesteps=1150000, episode_reward=-26.91 +/- 26.08
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -26.9       |
-| time/                   |             |
-|    total_timesteps      | 1150000     |
-| train/                  |             |
-|    approx_kl            | 0.006060633 |
-|    clip_fraction        | 0.0603      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.02       |
-|    explained_variance   | 0.905       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0374     |
-|    n_updates            | 700         |
-|    policy_gradient_loss | -0.00442    |
-|    std                  | 1.1         |
-|    value_loss           | 0.0101      |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1847    |
-|    iterations      | 71      |
-|    time_elapsed    | 629     |
-|    total_timesteps | 1163264 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1859         |
-|    iterations           | 72           |
-|    time_elapsed         | 634          |
-|    total_timesteps      | 1179648      |
-| train/                  |              |
-|    approx_kl            | 0.0070389216 |
-|    clip_fraction        | 0.0728       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.03        |
-|    explained_variance   | 0.854        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0409      |
-|    n_updates            | 710          |
-|    policy_gradient_loss | -0.00505     |
-|    std                  | 1.1          |
-|    value_loss           | 0.0196       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1871         |
-|    iterations           | 73           |
-|    time_elapsed         | 638          |
-|    total_timesteps      | 1196032      |
-| train/                  |              |
-|    approx_kl            | 0.0055403598 |
-|    clip_fraction        | 0.0567       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.03        |
-|    explained_variance   | 0.906        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0324      |
-|    n_updates            | 720          |
-|    policy_gradient_loss | -0.00494     |
-|    std                  | 1.1          |
-|    value_loss           | 0.0109       |
-------------------------------------------
-Eval num_timesteps=1200000, episode_reward=-23.57 +/- 26.30
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -23.6        |
-| time/                   |              |
-|    total_timesteps      | 1200000      |
-| train/                  |              |
-|    approx_kl            | 0.0055604624 |
-|    clip_fraction        | 0.0522       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.02        |
-|    explained_variance   | 0.819        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.00379     |
-|    n_updates            | 730          |
-|    policy_gradient_loss | -0.00374     |
-|    std                  | 1.1          |
-|    value_loss           | 0.0453       |
-------------------------------------------
-
-[Diag @ 1,200,000 | n_sheep=2 | success=0%]
-  COMPACT_CANT_DRIVE         15/20
-  NEVER_COMPACT              4/20
-  DROVE_NO_SHEEP             1/20
-  action_mag mean=0.399 p10=0.067 p90=0.794 (0=stopped, 1=full speed)
-  min_flock_radius mean=2.96m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=2.17m best=0.14m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=11.07m best=2.66m
-  reward/step (mean): progress=+0.0064  alignment=+0.0000  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0000
-
-[Curriculum] leaving stage n_sheep=2 after 600,000 steps | training success rate (last 100 eps) = 0%
-[Curriculum] → 3 sheep at step 1,200,000
-
---------------------------------
-| time/              |         |
-|    fps             | 1828    |
-|    iterations      | 74      |
-|    time_elapsed    | 663     |
-|    total_timesteps | 1212416 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1839        |
-|    iterations           | 75          |
-|    time_elapsed         | 668         |
-|    total_timesteps      | 1228800     |
-| train/                  |             |
-|    approx_kl            | 0.007044647 |
-|    clip_fraction        | 0.0819      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.02       |
-|    explained_variance   | 0.902       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.00823    |
-|    n_updates            | 740         |
-|    policy_gradient_loss | -0.00327    |
-|    std                  | 1.1         |
-|    value_loss           | 0.042       |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1849         |
-|    iterations           | 76           |
-|    time_elapsed         | 673          |
-|    total_timesteps      | 1245184      |
-| train/                  |              |
-|    approx_kl            | 0.0064169513 |
-|    clip_fraction        | 0.0699       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.03        |
-|    explained_variance   | 0.928        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0323      |
-|    n_updates            | 750          |
-|    policy_gradient_loss | -0.00459     |
-|    std                  | 1.1          |
-|    value_loss           | 0.0102       |
-------------------------------------------
-Eval num_timesteps=1250000, episode_reward=-27.97 +/- 37.55
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -28         |
-| time/                   |             |
-|    total_timesteps      | 1250000     |
-| train/                  |             |
-|    approx_kl            | 0.006859841 |
-|    clip_fraction        | 0.0783      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.04       |
-|    explained_variance   | 0.94        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0368     |
-|    n_updates            | 760         |
-|    policy_gradient_loss | -0.00472    |
-|    std                  | 1.11        |
-|    value_loss           | 0.00931     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1825    |
-|    iterations      | 77      |
-|    time_elapsed    | 691     |
-|    total_timesteps | 1261568 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1836         |
-|    iterations           | 78           |
-|    time_elapsed         | 696          |
-|    total_timesteps      | 1277952      |
-| train/                  |              |
-|    approx_kl            | 0.0066901552 |
-|    clip_fraction        | 0.0704       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.04        |
-|    explained_variance   | 0.942        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0329      |
-|    n_updates            | 770          |
-|    policy_gradient_loss | -0.00458     |
-|    std                  | 1.11         |
-|    value_loss           | 0.00938      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1845        |
-|    iterations           | 79          |
-|    time_elapsed         | 701         |
-|    total_timesteps      | 1294336     |
-| train/                  |             |
-|    approx_kl            | 0.007008245 |
-|    clip_fraction        | 0.082       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.03       |
-|    explained_variance   | 0.899       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0194     |
-|    n_updates            | 780         |
-|    policy_gradient_loss | -0.00426    |
-|    std                  | 1.1         |
-|    value_loss           | 0.052       |
------------------------------------------
-Eval num_timesteps=1300000, episode_reward=-41.12 +/- 37.68
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -41.1        |
-| time/                   |              |
-|    total_timesteps      | 1300000      |
-| train/                  |              |
-|    approx_kl            | 0.0070775724 |
-|    clip_fraction        | 0.0742       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.03        |
-|    explained_variance   | 0.942        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0238      |
-|    n_updates            | 790          |
-|    policy_gradient_loss | -0.0052      |
-|    std                  | 1.11         |
-|    value_loss           | 0.00657      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1823    |
-|    iterations      | 80      |
-|    time_elapsed    | 718     |
-|    total_timesteps | 1310720 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1832        |
-|    iterations           | 81          |
-|    time_elapsed         | 724         |
-|    total_timesteps      | 1327104     |
-| train/                  |             |
-|    approx_kl            | 0.008046751 |
-|    clip_fraction        | 0.0851      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.04       |
-|    explained_variance   | 0.897       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0384     |
-|    n_updates            | 800         |
-|    policy_gradient_loss | -0.0057     |
-|    std                  | 1.11        |
-|    value_loss           | 0.009       |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1840        |
-|    iterations           | 82          |
-|    time_elapsed         | 730         |
-|    total_timesteps      | 1343488     |
-| train/                  |             |
-|    approx_kl            | 0.006007643 |
-|    clip_fraction        | 0.0548      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.06       |
-|    explained_variance   | 0.871       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0251     |
-|    n_updates            | 810         |
-|    policy_gradient_loss | -0.00416    |
-|    std                  | 1.12        |
-|    value_loss           | 0.0179      |
------------------------------------------
-Eval num_timesteps=1350000, episode_reward=-24.46 +/- 41.24
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -24.5        |
-| time/                   |              |
-|    total_timesteps      | 1350000      |
-| train/                  |              |
-|    approx_kl            | 0.0065572546 |
-|    clip_fraction        | 0.0698       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.877        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0219      |
-|    n_updates            | 820          |
-|    policy_gradient_loss | -0.00456     |
-|    std                  | 1.13         |
-|    value_loss           | 0.0242       |
-------------------------------------------
-
-[Diag @ 1,350,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              14/20
-  COMPACT_CANT_DRIVE         6/20
-  action_mag mean=0.195 p10=0.018 p90=0.576 (0=stopped, 1=full speed)
-  min_flock_radius mean=6.32m best=1.36m  (target <5m to compact)
-  min_dog_to_com   mean=4.15m best=0.61m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=11.37m best=4.88m
-  reward/step (mean): progress=+0.0029  alignment=+0.0000  pen_bonus=+0.0000  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1798    |
-|    iterations      | 83      |
-|    time_elapsed    | 756     |
-|    total_timesteps | 1359872 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1809         |
-|    iterations           | 84           |
-|    time_elapsed         | 760          |
-|    total_timesteps      | 1376256      |
-| train/                  |              |
-|    approx_kl            | 0.0072198315 |
-|    clip_fraction        | 0.0764       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.909        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0208      |
-|    n_updates            | 830          |
-|    policy_gradient_loss | -0.00626     |
-|    std                  | 1.13         |
-|    value_loss           | 0.0106       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1817         |
-|    iterations           | 85           |
-|    time_elapsed         | 766          |
-|    total_timesteps      | 1392640      |
-| train/                  |              |
-|    approx_kl            | 0.0070813587 |
-|    clip_fraction        | 0.0733       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.907        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0324      |
-|    n_updates            | 840          |
-|    policy_gradient_loss | -0.00505     |
-|    std                  | 1.13         |
-|    value_loss           | 0.0166       |
-------------------------------------------
-Eval num_timesteps=1400000, episode_reward=-36.32 +/- 33.15
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -36.3        |
-| time/                   |              |
-|    total_timesteps      | 1400000      |
-| train/                  |              |
-|    approx_kl            | 0.0067584305 |
-|    clip_fraction        | 0.08         |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.906        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0308      |
-|    n_updates            | 850          |
-|    policy_gradient_loss | -0.0054      |
-|    std                  | 1.13         |
-|    value_loss           | 0.0112       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1798    |
-|    iterations      | 86      |
-|    time_elapsed    | 783     |
-|    total_timesteps | 1409024 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1807        |
-|    iterations           | 87          |
-|    time_elapsed         | 788         |
-|    total_timesteps      | 1425408     |
-| train/                  |             |
-|    approx_kl            | 0.007411341 |
-|    clip_fraction        | 0.0716      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.09       |
-|    explained_variance   | 0.904       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0322     |
-|    n_updates            | 860         |
-|    policy_gradient_loss | -0.00641    |
-|    std                  | 1.14        |
-|    value_loss           | 0.0191      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1815         |
-|    iterations           | 88           |
-|    time_elapsed         | 794          |
-|    total_timesteps      | 1441792      |
-| train/                  |              |
-|    approx_kl            | 0.0077011855 |
-|    clip_fraction        | 0.0774       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.09        |
-|    explained_variance   | 0.914        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0316      |
-|    n_updates            | 870          |
-|    policy_gradient_loss | -0.00545     |
-|    std                  | 1.13         |
-|    value_loss           | 0.0148       |
-------------------------------------------
-Eval num_timesteps=1450000, episode_reward=-40.58 +/- 38.17
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -40.6       |
-| time/                   |             |
-|    total_timesteps      | 1450000     |
-| train/                  |             |
-|    approx_kl            | 0.007694071 |
-|    clip_fraction        | 0.0816      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.07       |
-|    explained_variance   | 0.937       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.036      |
-|    n_updates            | 880         |
-|    policy_gradient_loss | -0.0054     |
-|    std                  | 1.12        |
-|    value_loss           | 0.0111      |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1796    |
-|    iterations      | 89      |
-|    time_elapsed    | 811     |
-|    total_timesteps | 1458176 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1805        |
-|    iterations           | 90          |
-|    time_elapsed         | 816         |
-|    total_timesteps      | 1474560     |
-| train/                  |             |
-|    approx_kl            | 0.007034345 |
-|    clip_fraction        | 0.0693      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.07       |
-|    explained_variance   | 0.924       |
-|    learning_rate        | 0.0003      |
-|    loss                 | 0.0472      |
-|    n_updates            | 890         |
-|    policy_gradient_loss | -0.00472    |
-|    std                  | 1.13        |
-|    value_loss           | 0.0352      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1815         |
-|    iterations           | 91           |
-|    time_elapsed         | 821          |
-|    total_timesteps      | 1490944      |
-| train/                  |              |
-|    approx_kl            | 0.0078114523 |
-|    clip_fraction        | 0.0917       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.942        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0461      |
-|    n_updates            | 900          |
-|    policy_gradient_loss | -0.00668     |
-|    std                  | 1.13         |
-|    value_loss           | 0.00844      |
-------------------------------------------
-Eval num_timesteps=1500000, episode_reward=-19.66 +/- 25.98
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -19.7        |
-| time/                   |              |
-|    total_timesteps      | 1500000      |
-| train/                  |              |
-|    approx_kl            | 0.0067999987 |
-|    clip_fraction        | 0.0606       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.893        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0283      |
-|    n_updates            | 910          |
-|    policy_gradient_loss | -0.00385     |
-|    std                  | 1.12         |
-|    value_loss           | 0.0409       |
-------------------------------------------
-New best mean reward!
-
-[Diag @ 1,500,000 | n_sheep=3 | success=0%]
-  COMPACT_CANT_DRIVE         11/20
-  NEVER_COMPACT              7/20
-  DROVE_NO_SHEEP             2/20
-  action_mag mean=0.185 p10=0.015 p90=0.426 (0=stopped, 1=full speed)
-  min_flock_radius mean=4.43m best=1.38m  (target <5m to compact)
-  min_dog_to_com   mean=2.89m best=0.07m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=11.88m best=2.23m
-  reward/step (mean): progress=+0.0008  alignment=+0.0000  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1781    |
-|    iterations      | 92      |
-|    time_elapsed    | 846     |
-|    total_timesteps | 1507328 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1789         |
-|    iterations           | 93           |
-|    time_elapsed         | 851          |
-|    total_timesteps      | 1523712      |
-| train/                  |              |
-|    approx_kl            | 0.0069550863 |
-|    clip_fraction        | 0.0787       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.897        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0204      |
-|    n_updates            | 920          |
-|    policy_gradient_loss | -0.00394     |
-|    std                  | 1.13         |
-|    value_loss           | 0.0324       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1798        |
-|    iterations           | 94          |
-|    time_elapsed         | 856         |
-|    total_timesteps      | 1540096     |
-| train/                  |             |
-|    approx_kl            | 0.006749108 |
-|    clip_fraction        | 0.0787      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.08       |
-|    explained_variance   | 0.929       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0338     |
-|    n_updates            | 930         |
-|    policy_gradient_loss | -0.00534    |
-|    std                  | 1.13        |
-|    value_loss           | 0.00967     |
------------------------------------------
-Eval num_timesteps=1550000, episode_reward=-26.47 +/- 25.94
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -26.5        |
-| time/                   |              |
-|    total_timesteps      | 1550000      |
-| train/                  |              |
-|    approx_kl            | 0.0073381998 |
-|    clip_fraction        | 0.0679       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.919        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0259      |
-|    n_updates            | 940          |
-|    policy_gradient_loss | -0.00554     |
-|    std                  | 1.13         |
-|    value_loss           | 0.00999      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1782    |
-|    iterations      | 95      |
-|    time_elapsed    | 873     |
-|    total_timesteps | 1556480 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1790         |
-|    iterations           | 96           |
-|    time_elapsed         | 878          |
-|    total_timesteps      | 1572864      |
-| train/                  |              |
-|    approx_kl            | 0.0071112993 |
-|    clip_fraction        | 0.0781       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.929        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0324      |
-|    n_updates            | 950          |
-|    policy_gradient_loss | -0.00428     |
-|    std                  | 1.13         |
-|    value_loss           | 0.0246       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1798         |
-|    iterations           | 97           |
-|    time_elapsed         | 883          |
-|    total_timesteps      | 1589248      |
-| train/                  |              |
-|    approx_kl            | 0.0077134473 |
-|    clip_fraction        | 0.0784       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.08        |
-|    explained_variance   | 0.917        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0365      |
-|    n_updates            | 960          |
-|    policy_gradient_loss | -0.00445     |
-|    std                  | 1.13         |
-|    value_loss           | 0.0122       |
-------------------------------------------
-Eval num_timesteps=1600000, episode_reward=-35.13 +/- 31.01
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -35.1        |
-| time/                   |              |
-|    total_timesteps      | 1600000      |
-| train/                  |              |
-|    approx_kl            | 0.0070123896 |
-|    clip_fraction        | 0.0712       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.07        |
-|    explained_variance   | 0.919        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.026       |
-|    n_updates            | 970          |
-|    policy_gradient_loss | -0.00519     |
-|    std                  | 1.13         |
-|    value_loss           | 0.0171       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1781    |
-|    iterations      | 98      |
-|    time_elapsed    | 901     |
-|    total_timesteps | 1605632 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1789        |
-|    iterations           | 99          |
-|    time_elapsed         | 906         |
-|    total_timesteps      | 1622016     |
-| train/                  |             |
-|    approx_kl            | 0.007990176 |
-|    clip_fraction        | 0.0845      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.07       |
-|    explained_variance   | 0.873       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.04       |
-|    n_updates            | 980         |
-|    policy_gradient_loss | -0.0045     |
-|    std                  | 1.13        |
-|    value_loss           | 0.0153      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1798        |
-|    iterations           | 100         |
-|    time_elapsed         | 911         |
-|    total_timesteps      | 1638400     |
-| train/                  |             |
-|    approx_kl            | 0.006477687 |
-|    clip_fraction        | 0.0593      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.07       |
-|    explained_variance   | 0.946       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0396     |
-|    n_updates            | 990         |
-|    policy_gradient_loss | -0.00442    |
-|    std                  | 1.13        |
-|    value_loss           | 0.0107      |
------------------------------------------
-Eval num_timesteps=1650000, episode_reward=-31.86 +/- 47.05
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -31.9       |
-| time/                   |             |
-|    total_timesteps      | 1650000     |
-| train/                  |             |
-|    approx_kl            | 0.006796476 |
-|    clip_fraction        | 0.0672      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.08       |
-|    explained_variance   | 0.929       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0264     |
-|    n_updates            | 1000        |
-|    policy_gradient_loss | -0.00375    |
-|    std                  | 1.13        |
-|    value_loss           | 0.0385      |
------------------------------------------
-
-[Diag @ 1,650,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              11/20
-  COMPACT_CANT_DRIVE         9/20
-  action_mag mean=0.154 p10=0.005 p90=0.398 (0=stopped, 1=full speed)
-  min_flock_radius mean=5.81m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=3.22m best=0.52m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.42m best=7.08m
-  reward/step (mean): progress=+0.0061  alignment=+0.0000  pen_bonus=+0.0010  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1768    |
-|    iterations      | 101     |
-|    time_elapsed    | 935     |
-|    total_timesteps | 1654784 |
---------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1774       |
-|    iterations           | 102        |
-|    time_elapsed         | 941        |
-|    total_timesteps      | 1671168    |
-| train/                  |            |
-|    approx_kl            | 0.00682881 |
-|    clip_fraction        | 0.0694     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -3.08      |
-|    explained_variance   | 0.939      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0233    |
-|    n_updates            | 1010       |
-|    policy_gradient_loss | -0.00461   |
-|    std                  | 1.13       |
-|    value_loss           | 0.0183     |
-----------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1779         |
-|    iterations           | 103          |
-|    time_elapsed         | 948          |
-|    total_timesteps      | 1687552      |
-| train/                  |              |
-|    approx_kl            | 0.0071003223 |
-|    clip_fraction        | 0.0782       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.1         |
-|    explained_variance   | 0.923        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0398      |
-|    n_updates            | 1020         |
-|    policy_gradient_loss | -0.00491     |
-|    std                  | 1.15         |
-|    value_loss           | 0.0101       |
-------------------------------------------
-Eval num_timesteps=1700000, episode_reward=-32.11 +/- 36.59
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -32.1        |
-| time/                   |              |
-|    total_timesteps      | 1700000      |
-| train/                  |              |
-|    approx_kl            | 0.0064870613 |
-|    clip_fraction        | 0.0624       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.13        |
-|    explained_variance   | 0.909        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0365      |
-|    n_updates            | 1030         |
-|    policy_gradient_loss | -0.00404     |
-|    std                  | 1.17         |
-|    value_loss           | 0.00855      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1762    |
-|    iterations      | 104     |
-|    time_elapsed    | 966     |
-|    total_timesteps | 1703936 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1769        |
-|    iterations           | 105         |
-|    time_elapsed         | 972         |
-|    total_timesteps      | 1720320     |
-| train/                  |             |
-|    approx_kl            | 0.007349294 |
-|    clip_fraction        | 0.0833      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.926       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0358     |
-|    n_updates            | 1040        |
-|    policy_gradient_loss | -0.00514    |
-|    std                  | 1.17        |
-|    value_loss           | 0.00848     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1777         |
-|    iterations           | 106          |
-|    time_elapsed         | 976          |
-|    total_timesteps      | 1736704      |
-| train/                  |              |
-|    approx_kl            | 0.0070306472 |
-|    clip_fraction        | 0.0814       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.15        |
-|    explained_variance   | 0.887        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0359      |
-|    n_updates            | 1050         |
-|    policy_gradient_loss | -0.00489     |
-|    std                  | 1.17         |
-|    value_loss           | 0.0134       |
-------------------------------------------
-Eval num_timesteps=1750000, episode_reward=-34.24 +/- 43.23
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -34.2       |
-| time/                   |             |
-|    total_timesteps      | 1750000     |
-| train/                  |             |
-|    approx_kl            | 0.008487761 |
-|    clip_fraction        | 0.102       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.962       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0369     |
-|    n_updates            | 1060        |
-|    policy_gradient_loss | -0.0077     |
-|    std                  | 1.17        |
-|    value_loss           | 0.00786     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1762    |
-|    iterations      | 107     |
-|    time_elapsed    | 994     |
-|    total_timesteps | 1753088 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1766         |
-|    iterations           | 108          |
-|    time_elapsed         | 1001         |
-|    total_timesteps      | 1769472      |
-| train/                  |              |
-|    approx_kl            | 0.0074267983 |
-|    clip_fraction        | 0.0742       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.15        |
-|    explained_variance   | 0.939        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0404      |
-|    n_updates            | 1070         |
-|    policy_gradient_loss | -0.00575     |
-|    std                  | 1.18         |
-|    value_loss           | 0.0158       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1772         |
-|    iterations           | 109          |
-|    time_elapsed         | 1007         |
-|    total_timesteps      | 1785856      |
-| train/                  |              |
-|    approx_kl            | 0.0075380025 |
-|    clip_fraction        | 0.074        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.15        |
-|    explained_variance   | 0.961        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.034       |
-|    n_updates            | 1080         |
-|    policy_gradient_loss | -0.00553     |
-|    std                  | 1.17         |
-|    value_loss           | 0.00651      |
-------------------------------------------
-Eval num_timesteps=1800000, episode_reward=-31.16 +/- 37.32
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -31.2       |
-| time/                   |             |
-|    total_timesteps      | 1800000     |
-| train/                  |             |
-|    approx_kl            | 0.007386248 |
-|    clip_fraction        | 0.0843      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.922       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0419     |
-|    n_updates            | 1090        |
-|    policy_gradient_loss | -0.00596    |
-|    std                  | 1.17        |
-|    value_loss           | 0.00858     |
------------------------------------------
-
-[Diag @ 1,800,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              17/20
-  COMPACT_CANT_DRIVE         3/20
-  action_mag mean=0.164 p10=0.007 p90=0.418 (0=stopped, 1=full speed)
-  min_flock_radius mean=7.52m best=2.00m  (target <5m to compact)
-  min_dog_to_com   mean=2.24m best=0.21m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.87m best=3.90m
-  reward/step (mean): progress=-0.0007  alignment=+0.0000  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
-
-[Curriculum] leaving stage n_sheep=3 after 600,000 steps | training success rate (last 100 eps) = 0%
-[Curriculum] → 4 sheep at step 1,800,000
-
---------------------------------
-| time/              |         |
-|    fps             | 1743    |
-|    iterations      | 110     |
-|    time_elapsed    | 1033    |
-|    total_timesteps | 1802240 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1749        |
-|    iterations           | 111         |
-|    time_elapsed         | 1039        |
-|    total_timesteps      | 1818624     |
-| train/                  |             |
-|    approx_kl            | 0.009158293 |
-|    clip_fraction        | 0.0991      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.893       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0414     |
-|    n_updates            | 1100        |
-|    policy_gradient_loss | -0.00701    |
-|    std                  | 1.17        |
-|    value_loss           | 0.0237      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1755        |
-|    iterations           | 112         |
-|    time_elapsed         | 1045        |
-|    total_timesteps      | 1835008     |
-| train/                  |             |
-|    approx_kl            | 0.007241189 |
-|    clip_fraction        | 0.0831      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.874       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0241     |
-|    n_updates            | 1110        |
-|    policy_gradient_loss | -0.00634    |
-|    std                  | 1.17        |
-|    value_loss           | 0.0226      |
------------------------------------------
-Eval num_timesteps=1850000, episode_reward=-29.45 +/- 31.10
-Episode length: 2000.00 +/- 0.00
----------------------------------------
-| eval/                   |           |
-|    mean_ep_length       | 2e+03     |
-|    mean_reward          | -29.5     |
-| time/                   |           |
-|    total_timesteps      | 1850000   |
-| train/                  |           |
-|    approx_kl            | 0.0078688 |
-|    clip_fraction        | 0.0777    |
-|    clip_range           | 0.2       |
-|    entropy_loss         | -3.15     |
-|    explained_variance   | 0.895     |
-|    learning_rate        | 0.0003    |
-|    loss                 | -0.036    |
-|    n_updates            | 1120      |
-|    policy_gradient_loss | -0.00602  |
-|    std                  | 1.17      |
-|    value_loss           | 0.0128    |
----------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1742    |
-|    iterations      | 113     |
-|    time_elapsed    | 1062    |
-|    total_timesteps | 1851392 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1749        |
-|    iterations           | 114         |
-|    time_elapsed         | 1067        |
-|    total_timesteps      | 1867776     |
-| train/                  |             |
-|    approx_kl            | 0.008158936 |
-|    clip_fraction        | 0.0963      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.14       |
-|    explained_variance   | 0.897       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0324     |
-|    n_updates            | 1130        |
-|    policy_gradient_loss | -0.00854    |
-|    std                  | 1.17        |
-|    value_loss           | 0.0144      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1754         |
-|    iterations           | 115          |
-|    time_elapsed         | 1073         |
-|    total_timesteps      | 1884160      |
-| train/                  |              |
-|    approx_kl            | 0.0074978825 |
-|    clip_fraction        | 0.0844       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.14        |
-|    explained_variance   | 0.92         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0246      |
-|    n_updates            | 1140         |
-|    policy_gradient_loss | -0.00578     |
-|    std                  | 1.16         |
-|    value_loss           | 0.0134       |
-------------------------------------------
-Eval num_timesteps=1900000, episode_reward=-38.21 +/- 31.08
-Episode length: 2000.00 +/- 0.00
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 2e+03      |
-|    mean_reward          | -38.2      |
-| time/                   |            |
-|    total_timesteps      | 1900000    |
-| train/                  |            |
-|    approx_kl            | 0.00678163 |
-|    clip_fraction        | 0.0711     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -3.15      |
-|    explained_variance   | 0.892      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0345    |
-|    n_updates            | 1150       |
-|    policy_gradient_loss | -0.00409   |
-|    std                  | 1.18       |
-|    value_loss           | 0.0221     |
-----------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1740    |
-|    iterations      | 116     |
-|    time_elapsed    | 1091    |
-|    total_timesteps | 1900544 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1746        |
-|    iterations           | 117         |
-|    time_elapsed         | 1097        |
-|    total_timesteps      | 1916928     |
-| train/                  |             |
-|    approx_kl            | 0.006992462 |
-|    clip_fraction        | 0.0731      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.16       |
-|    explained_variance   | 0.895       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0243     |
-|    n_updates            | 1160        |
-|    policy_gradient_loss | -0.00588    |
-|    std                  | 1.18        |
-|    value_loss           | 0.0145      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1750         |
-|    iterations           | 118          |
-|    time_elapsed         | 1104         |
-|    total_timesteps      | 1933312      |
-| train/                  |              |
-|    approx_kl            | 0.0069225584 |
-|    clip_fraction        | 0.068        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.15        |
-|    explained_variance   | 0.905        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0297      |
-|    n_updates            | 1170         |
-|    policy_gradient_loss | -0.00516     |
-|    std                  | 1.17         |
-|    value_loss           | 0.0153       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1756        |
-|    iterations           | 119         |
-|    time_elapsed         | 1109        |
-|    total_timesteps      | 1949696     |
-| train/                  |             |
-|    approx_kl            | 0.005966103 |
-|    clip_fraction        | 0.059       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.896       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0337     |
-|    n_updates            | 1180        |
-|    policy_gradient_loss | -0.00413    |
-|    std                  | 1.17        |
-|    value_loss           | 0.0091      |
------------------------------------------
-Eval num_timesteps=1950000, episode_reward=-59.72 +/- 38.15
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -59.7        |
-| time/                   |              |
-|    total_timesteps      | 1950000      |
-| train/                  |              |
-|    approx_kl            | 0.0067311125 |
-|    clip_fraction        | 0.0733       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.16        |
-|    explained_variance   | 0.861        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0147      |
-|    n_updates            | 1190         |
-|    policy_gradient_loss | -0.00459     |
-|    std                  | 1.18         |
-|    value_loss           | 0.0083       |
-------------------------------------------
-
-[Diag @ 1,950,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              14/20
-  COMPACT_CANT_DRIVE         6/20
-  action_mag mean=0.325 p10=0.025 p90=0.778 (0=stopped, 1=full speed)
-  min_flock_radius mean=7.27m best=2.17m  (target <5m to compact)
-  min_dog_to_com   mean=3.74m best=0.07m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.01m best=6.24m
-  reward/step (mean): progress=+0.0026  alignment=+0.0000  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1728    |
-|    iterations      | 120     |
-|    time_elapsed    | 1137    |
-|    total_timesteps | 1966080 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1734         |
-|    iterations           | 121          |
-|    time_elapsed         | 1143         |
-|    total_timesteps      | 1982464      |
-| train/                  |              |
-|    approx_kl            | 0.0061555626 |
-|    clip_fraction        | 0.0631       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.17        |
-|    explained_variance   | 0.932        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0328      |
-|    n_updates            | 1200         |
-|    policy_gradient_loss | -0.00446     |
-|    std                  | 1.19         |
-|    value_loss           | 0.0133       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1739         |
-|    iterations           | 122          |
-|    time_elapsed         | 1149         |
-|    total_timesteps      | 1998848      |
-| train/                  |              |
-|    approx_kl            | 0.0060347347 |
-|    clip_fraction        | 0.057        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.18        |
-|    explained_variance   | 0.841        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0352      |
-|    n_updates            | 1210         |
-|    policy_gradient_loss | -0.00322     |
-|    std                  | 1.19         |
-|    value_loss           | 0.0104       |
-------------------------------------------
-Eval num_timesteps=2000000, episode_reward=-37.97 +/- 46.26
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -38          |
-| time/                   |              |
-|    total_timesteps      | 2000000      |
-| train/                  |              |
-|    approx_kl            | 0.0063244104 |
-|    clip_fraction        | 0.0675       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.18        |
-|    explained_variance   | 0.865        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0217      |
-|    n_updates            | 1220         |
-|    policy_gradient_loss | -0.00489     |
-|    std                  | 1.2          |
-|    value_loss           | 0.0219       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1725    |
-|    iterations      | 123     |
-|    time_elapsed    | 1167    |
-|    total_timesteps | 2015232 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1730        |
-|    iterations           | 124         |
-|    time_elapsed         | 1173        |
-|    total_timesteps      | 2031616     |
-| train/                  |             |
-|    approx_kl            | 0.007022621 |
-|    clip_fraction        | 0.0816      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.19       |
-|    explained_variance   | 0.949       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0248     |
-|    n_updates            | 1230        |
-|    policy_gradient_loss | -0.0053     |
-|    std                  | 1.19        |
-|    value_loss           | 0.00677     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1735        |
-|    iterations           | 125         |
-|    time_elapsed         | 1179        |
-|    total_timesteps      | 2048000     |
-| train/                  |             |
-|    approx_kl            | 0.006686856 |
-|    clip_fraction        | 0.0653      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.18       |
-|    explained_variance   | 0.928       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0333     |
-|    n_updates            | 1240        |
-|    policy_gradient_loss | -0.00445    |
-|    std                  | 1.19        |
-|    value_loss           | 0.00651     |
------------------------------------------
-Eval num_timesteps=2050000, episode_reward=-27.67 +/- 36.42
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -27.7       |
-| time/                   |             |
-|    total_timesteps      | 2050000     |
-| train/                  |             |
-|    approx_kl            | 0.006721792 |
-|    clip_fraction        | 0.0675      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.2        |
-|    explained_variance   | 0.921       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0278     |
-|    n_updates            | 1250        |
-|    policy_gradient_loss | -0.00408    |
-|    std                  | 1.21        |
-|    value_loss           | 0.00793     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1721    |
-|    iterations      | 126     |
-|    time_elapsed    | 1198    |
-|    total_timesteps | 2064384 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1726        |
-|    iterations           | 127         |
-|    time_elapsed         | 1205        |
-|    total_timesteps      | 2080768     |
-| train/                  |             |
-|    approx_kl            | 0.006730888 |
-|    clip_fraction        | 0.0617      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.23       |
-|    explained_variance   | 0.911       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0276     |
-|    n_updates            | 1260        |
-|    policy_gradient_loss | -0.00378    |
-|    std                  | 1.22        |
-|    value_loss           | 0.00964     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1732        |
-|    iterations           | 128         |
-|    time_elapsed         | 1210        |
-|    total_timesteps      | 2097152     |
-| train/                  |             |
-|    approx_kl            | 0.007725292 |
-|    clip_fraction        | 0.0775      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.23       |
-|    explained_variance   | 0.913       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0371     |
-|    n_updates            | 1270        |
-|    policy_gradient_loss | -0.006      |
-|    std                  | 1.22        |
-|    value_loss           | 0.0109      |
------------------------------------------
-Eval num_timesteps=2100000, episode_reward=-40.56 +/- 44.37
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -40.6        |
-| time/                   |              |
-|    total_timesteps      | 2100000      |
-| train/                  |              |
-|    approx_kl            | 0.0067186276 |
-|    clip_fraction        | 0.0644       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.24        |
-|    explained_variance   | 0.845        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0357      |
-|    n_updates            | 1280         |
-|    policy_gradient_loss | -0.00433     |
-|    std                  | 1.23         |
-|    value_loss           | 0.0263       |
-------------------------------------------
-
-[Diag @ 2,100,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              12/20
-  COMPACT_CANT_DRIVE         8/20
-  action_mag mean=0.384 p10=0.018 p90=0.884 (0=stopped, 1=full speed)
-  min_flock_radius mean=6.36m best=2.11m  (target <5m to compact)
-  min_dog_to_com   mean=2.94m best=0.40m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.34m best=5.56m
-  reward/step (mean): progress=-0.0084  alignment=+0.0000  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1706    |
-|    iterations      | 129     |
-|    time_elapsed    | 1238    |
-|    total_timesteps | 2113536 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1712        |
-|    iterations           | 130         |
-|    time_elapsed         | 1243        |
-|    total_timesteps      | 2129920     |
-| train/                  |             |
-|    approx_kl            | 0.006317258 |
-|    clip_fraction        | 0.0623      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.26       |
-|    explained_variance   | 0.912       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0419     |
-|    n_updates            | 1290        |
-|    policy_gradient_loss | -0.00427    |
-|    std                  | 1.24        |
-|    value_loss           | 0.00859     |
------------------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1716       |
-|    iterations           | 131        |
-|    time_elapsed         | 1250       |
-|    total_timesteps      | 2146304    |
-| train/                  |            |
-|    approx_kl            | 0.00636432 |
-|    clip_fraction        | 0.0698     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -3.28      |
-|    explained_variance   | 0.851      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0266    |
-|    n_updates            | 1300       |
-|    policy_gradient_loss | -0.00374   |
-|    std                  | 1.25       |
-|    value_loss           | 0.0299     |
-----------------------------------------
-Eval num_timesteps=2150000, episode_reward=-63.32 +/- 33.74
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -63.3        |
-| time/                   |              |
-|    total_timesteps      | 2150000      |
-| train/                  |              |
-|    approx_kl            | 0.0060345423 |
-|    clip_fraction        | 0.0563       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.27        |
-|    explained_variance   | 0.898        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0404      |
-|    n_updates            | 1310         |
-|    policy_gradient_loss | -0.00356     |
-|    std                  | 1.24         |
-|    value_loss           | 0.0205       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1704    |
-|    iterations      | 132     |
-|    time_elapsed    | 1268    |
-|    total_timesteps | 2162688 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1709        |
-|    iterations           | 133         |
-|    time_elapsed         | 1274        |
-|    total_timesteps      | 2179072     |
-| train/                  |             |
-|    approx_kl            | 0.007027424 |
-|    clip_fraction        | 0.0693      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.25       |
-|    explained_variance   | 0.9         |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0315     |
-|    n_updates            | 1320        |
-|    policy_gradient_loss | -0.00521    |
-|    std                  | 1.23        |
-|    value_loss           | 0.0194      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1715        |
-|    iterations           | 134         |
-|    time_elapsed         | 1279        |
-|    total_timesteps      | 2195456     |
-| train/                  |             |
-|    approx_kl            | 0.006112649 |
-|    clip_fraction        | 0.0635      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.24       |
-|    explained_variance   | 0.957       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0339     |
-|    n_updates            | 1330        |
-|    policy_gradient_loss | -0.00383    |
-|    std                  | 1.23        |
-|    value_loss           | 0.00861     |
------------------------------------------
-Eval num_timesteps=2200000, episode_reward=-31.28 +/- 44.80
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -31.3        |
-| time/                   |              |
-|    total_timesteps      | 2200000      |
-| train/                  |              |
-|    approx_kl            | 0.0070182728 |
-|    clip_fraction        | 0.076        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.26        |
-|    explained_variance   | 0.883        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0412      |
-|    n_updates            | 1340         |
-|    policy_gradient_loss | -0.00534     |
-|    std                  | 1.25         |
-|    value_loss           | 0.013        |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1704    |
-|    iterations      | 135     |
-|    time_elapsed    | 1297    |
-|    total_timesteps | 2211840 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1708         |
-|    iterations           | 136          |
-|    time_elapsed         | 1304         |
-|    total_timesteps      | 2228224      |
-| train/                  |              |
-|    approx_kl            | 0.0062820893 |
-|    clip_fraction        | 0.062        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.26        |
-|    explained_variance   | 0.924        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0377      |
-|    n_updates            | 1350         |
-|    policy_gradient_loss | -0.00497     |
-|    std                  | 1.24         |
-|    value_loss           | 0.00797      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1713         |
-|    iterations           | 137          |
-|    time_elapsed         | 1310         |
-|    total_timesteps      | 2244608      |
-| train/                  |              |
-|    approx_kl            | 0.0072454046 |
-|    clip_fraction        | 0.0747       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.25        |
-|    explained_variance   | 0.94         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0366      |
-|    n_updates            | 1360         |
-|    policy_gradient_loss | -0.00572     |
-|    std                  | 1.23         |
-|    value_loss           | 0.00852      |
-------------------------------------------
-Eval num_timesteps=2250000, episode_reward=-36.00 +/- 38.67
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -36         |
-| time/                   |             |
-|    total_timesteps      | 2250000     |
-| train/                  |             |
-|    approx_kl            | 0.005690419 |
-|    clip_fraction        | 0.0546      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.25       |
-|    explained_variance   | 0.957       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0376     |
-|    n_updates            | 1370        |
-|    policy_gradient_loss | -0.00425    |
-|    std                  | 1.23        |
-|    value_loss           | 0.00524     |
------------------------------------------
-
-[Diag @ 2,250,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              13/20
-  COMPACT_CANT_DRIVE         7/20
-  action_mag mean=0.416 p10=0.038 p90=0.887 (0=stopped, 1=full speed)
-  min_flock_radius mean=6.62m best=2.03m  (target <5m to compact)
-  min_dog_to_com   mean=3.54m best=0.40m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=14.24m best=9.65m
-  reward/step (mean): progress=-0.0070  alignment=+0.0000  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1690    |
-|    iterations      | 138     |
-|    time_elapsed    | 1337    |
-|    total_timesteps | 2260992 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1696         |
-|    iterations           | 139          |
-|    time_elapsed         | 1342         |
-|    total_timesteps      | 2277376      |
-| train/                  |              |
-|    approx_kl            | 0.0072061084 |
-|    clip_fraction        | 0.0728       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.25        |
-|    explained_variance   | 0.954        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0312      |
-|    n_updates            | 1380         |
-|    policy_gradient_loss | -0.00512     |
-|    std                  | 1.23         |
-|    value_loss           | 0.006        |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1702         |
-|    iterations           | 140          |
-|    time_elapsed         | 1347         |
-|    total_timesteps      | 2293760      |
-| train/                  |              |
-|    approx_kl            | 0.0066916933 |
-|    clip_fraction        | 0.0626       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.24        |
-|    explained_variance   | 0.939        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0408      |
-|    n_updates            | 1390         |
-|    policy_gradient_loss | -0.00463     |
-|    std                  | 1.23         |
-|    value_loss           | 0.00827      |
-------------------------------------------
-Eval num_timesteps=2300000, episode_reward=-43.65 +/- 42.86
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -43.7        |
-| time/                   |              |
-|    total_timesteps      | 2300000      |
-| train/                  |              |
-|    approx_kl            | 0.0062987795 |
-|    clip_fraction        | 0.0609       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.26        |
-|    explained_variance   | 0.898        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0316      |
-|    n_updates            | 1400         |
-|    policy_gradient_loss | -0.00442     |
-|    std                  | 1.25         |
-|    value_loss           | 0.00955      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1691    |
-|    iterations      | 141     |
-|    time_elapsed    | 1365    |
-|    total_timesteps | 2310144 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1696        |
-|    iterations           | 142         |
-|    time_elapsed         | 1371        |
-|    total_timesteps      | 2326528     |
-| train/                  |             |
-|    approx_kl            | 0.005443076 |
-|    clip_fraction        | 0.054       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.27       |
-|    explained_variance   | 0.877       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0296     |
-|    n_updates            | 1410        |
-|    policy_gradient_loss | -0.00375    |
-|    std                  | 1.24        |
-|    value_loss           | 0.00928     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1701        |
-|    iterations           | 143         |
-|    time_elapsed         | 1376        |
-|    total_timesteps      | 2342912     |
-| train/                  |             |
-|    approx_kl            | 0.004740049 |
-|    clip_fraction        | 0.0456      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.26       |
-|    explained_variance   | 0.922       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0318     |
-|    n_updates            | 1420        |
-|    policy_gradient_loss | -0.00351    |
-|    std                  | 1.24        |
-|    value_loss           | 0.0156      |
------------------------------------------
-Eval num_timesteps=2350000, episode_reward=-37.57 +/- 37.78
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -37.6        |
-| time/                   |              |
-|    total_timesteps      | 2350000      |
-| train/                  |              |
-|    approx_kl            | 0.0056120222 |
-|    clip_fraction        | 0.0542       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.27        |
-|    explained_variance   | 0.911        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0272      |
-|    n_updates            | 1430         |
-|    policy_gradient_loss | -0.0035      |
-|    std                  | 1.25         |
-|    value_loss           | 0.00811      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1690    |
-|    iterations      | 144     |
-|    time_elapsed    | 1395    |
-|    total_timesteps | 2359296 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1695         |
-|    iterations           | 145          |
-|    time_elapsed         | 1401         |
-|    total_timesteps      | 2375680      |
-| train/                  |              |
-|    approx_kl            | 0.0064737825 |
-|    clip_fraction        | 0.0697       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.28        |
-|    explained_variance   | 0.93         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.036       |
-|    n_updates            | 1440         |
-|    policy_gradient_loss | -0.00403     |
-|    std                  | 1.25         |
-|    value_loss           | 0.00488      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1699         |
-|    iterations           | 146          |
-|    time_elapsed         | 1407         |
-|    total_timesteps      | 2392064      |
-| train/                  |              |
-|    approx_kl            | 0.0050720195 |
-|    clip_fraction        | 0.0466       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.29        |
-|    explained_variance   | 0.902        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0374      |
-|    n_updates            | 1450         |
-|    policy_gradient_loss | -0.00283     |
-|    std                  | 1.26         |
-|    value_loss           | 0.00958      |
-------------------------------------------
-Eval num_timesteps=2400000, episode_reward=-42.55 +/- 37.89
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -42.6       |
-| time/                   |             |
-|    total_timesteps      | 2400000     |
-| train/                  |             |
-|    approx_kl            | 0.005990128 |
-|    clip_fraction        | 0.0565      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.31       |
-|    explained_variance   | 0.869       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0448     |
-|    n_updates            | 1460        |
-|    policy_gradient_loss | -0.0051     |
-|    std                  | 1.27        |
-|    value_loss           | 0.00854     |
------------------------------------------
-
-[Diag @ 2,400,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              15/20
-  COMPACT_CANT_DRIVE         5/20
-  action_mag mean=0.424 p10=0.025 p90=0.948 (0=stopped, 1=full speed)
-  min_flock_radius mean=7.66m best=1.63m  (target <5m to compact)
-  min_dog_to_com   mean=4.77m best=0.32m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=14.47m best=8.96m
-  reward/step (mean): progress=-0.0008  alignment=+0.0000  pen_bonus=+0.0003  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1677    |
-|    iterations      | 147     |
-|    time_elapsed    | 1435    |
-|    total_timesteps | 2408448 |
---------------------------------
-
-Training complete. Artefacts saved to runs/ppo_fix_check/
diff --git a/training/runs/ppo_fix_check/best_model/best_model.zip b/training/runs/ppo_fix_check/best_model/best_model.zip
deleted file mode 100644
index 8533c33..0000000
Binary files a/training/runs/ppo_fix_check/best_model/best_model.zip and /dev/null differ
diff --git a/training/runs/ppo_fix_check/evaluations.npz b/training/runs/ppo_fix_check/evaluations.npz
deleted file mode 100644
index 9ae65e5..0000000
Binary files a/training/runs/ppo_fix_check/evaluations.npz and /dev/null differ
diff --git a/training/runs/ppo_fix_check/final_model.zip b/training/runs/ppo_fix_check/final_model.zip
deleted file mode 100644
index 7e1248e..0000000
Binary files a/training/runs/ppo_fix_check/final_model.zip and /dev/null differ
diff --git a/training/runs/ppo_fix_check/vecnorm.pkl b/training/runs/ppo_fix_check/vecnorm.pkl
deleted file mode 100644
index f51753c..0000000
Binary files a/training/runs/ppo_fix_check/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/ppo_fix_check2.log b/training/runs/ppo_fix_check2.log
deleted file mode 100644
index a345ff5..0000000
--- a/training/runs/ppo_fix_check2.log
+++ /dev/null
@@ -1,3391 +0,0 @@
-Using cpu device
-Logging to runs/ppo_fix_check2/ppo_1
-------------------------------
-| time/              |       |
-|    fps             | 4605  |
-|    iterations      | 1     |
-|    time_elapsed    | 3     |
-|    total_timesteps | 16384 |
-------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 4011         |
-|    iterations           | 2            |
-|    time_elapsed         | 8            |
-|    total_timesteps      | 32768        |
-| train/                  |              |
-|    approx_kl            | 0.0033352287 |
-|    clip_fraction        | 0.0253       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.83        |
-|    explained_variance   | 0.271        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.00687     |
-|    n_updates            | 10           |
-|    policy_gradient_loss | -0.00103     |
-|    std                  | 0.996        |
-|    value_loss           | 0.0684       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 3789        |
-|    iterations           | 3           |
-|    time_elapsed         | 12          |
-|    total_timesteps      | 49152       |
-| train/                  |             |
-|    approx_kl            | 0.005950423 |
-|    clip_fraction        | 0.0552      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.83       |
-|    explained_variance   | 0.527       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0153     |
-|    n_updates            | 20          |
-|    policy_gradient_loss | -0.0029     |
-|    std                  | 0.997       |
-|    value_loss           | 0.0663      |
------------------------------------------
-/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.
-  warnings.warn(
-Eval num_timesteps=50000, episode_reward=-25.68 +/- 59.67
-Episode length: 1815.95 +/- 456.88
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.82e+03     |
-|    mean_reward          | -25.7        |
-| time/                   |              |
-|    total_timesteps      | 50000        |
-| train/                  |              |
-|    approx_kl            | 0.0040030424 |
-|    clip_fraction        | 0.0356       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.85        |
-|    explained_variance   | 0.421        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.149        |
-|    n_updates            | 30           |
-|    policy_gradient_loss | -0.00198     |
-|    std                  | 1.01         |
-|    value_loss           | 0.114        |
-------------------------------------------
-New best mean reward!
-------------------------------
-| time/              |       |
-|    fps             | 2351  |
-|    iterations      | 4     |
-|    time_elapsed    | 27    |
-|    total_timesteps | 65536 |
-------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2446        |
-|    iterations           | 5           |
-|    time_elapsed         | 33          |
-|    total_timesteps      | 81920       |
-| train/                  |             |
-|    approx_kl            | 0.005522004 |
-|    clip_fraction        | 0.0604      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.86       |
-|    explained_variance   | 0.737       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0301     |
-|    n_updates            | 40          |
-|    policy_gradient_loss | -0.00434    |
-|    std                  | 1.01        |
-|    value_loss           | 0.0164      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2617         |
-|    iterations           | 6            |
-|    time_elapsed         | 37           |
-|    total_timesteps      | 98304        |
-| train/                  |              |
-|    approx_kl            | 0.0052388343 |
-|    clip_fraction        | 0.0463       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.86        |
-|    explained_variance   | 0.626        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0294      |
-|    n_updates            | 50           |
-|    policy_gradient_loss | -0.00297     |
-|    std                  | 1.01         |
-|    value_loss           | 0.0597       |
-------------------------------------------
-/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.
-  warnings.warn(
-Eval num_timesteps=100000, episode_reward=-22.76 +/- 46.60
-Episode length: 1900.95 +/- 430.60
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.9e+03     |
-|    mean_reward          | -22.8       |
-| time/                   |             |
-|    total_timesteps      | 100000      |
-| train/                  |             |
-|    approx_kl            | 0.005612197 |
-|    clip_fraction        | 0.0475      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.86       |
-|    explained_variance   | 0.747       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0261     |
-|    n_updates            | 60          |
-|    policy_gradient_loss | -0.00393    |
-|    std                  | 1.01        |
-|    value_loss           | 0.0517      |
------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 2178   |
-|    iterations      | 7      |
-|    time_elapsed    | 52     |
-|    total_timesteps | 114688 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2294         |
-|    iterations           | 8            |
-|    time_elapsed         | 57           |
-|    total_timesteps      | 131072       |
-| train/                  |              |
-|    approx_kl            | 0.0057119504 |
-|    clip_fraction        | 0.0541       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.85        |
-|    explained_variance   | 0.896        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0144      |
-|    n_updates            | 70           |
-|    policy_gradient_loss | -0.00364     |
-|    std                  | 1            |
-|    value_loss           | 0.0738       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2393        |
-|    iterations           | 9           |
-|    time_elapsed         | 61          |
-|    total_timesteps      | 147456      |
-| train/                  |             |
-|    approx_kl            | 0.005940904 |
-|    clip_fraction        | 0.0565      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.85       |
-|    explained_variance   | 0.89        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0283     |
-|    n_updates            | 80          |
-|    policy_gradient_loss | -0.00245    |
-|    std                  | 1.01        |
-|    value_loss           | 0.0761      |
------------------------------------------
-Eval num_timesteps=150000, episode_reward=-29.37 +/- 28.32
-Episode length: 1997.50 +/- 10.90
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -29.4       |
-| time/                   |             |
-|    total_timesteps      | 150000      |
-| train/                  |             |
-|    approx_kl            | 0.004531667 |
-|    clip_fraction        | 0.0392      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.85       |
-|    explained_variance   | 0.958       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0343     |
-|    n_updates            | 90          |
-|    policy_gradient_loss | -0.00379    |
-|    std                  | 1.01        |
-|    value_loss           | 0.00995     |
------------------------------------------
-
-[Diag @ 150,000 | n_sheep=1 | success=0%]
-  COMPACT_CANT_DRIVE         17/20
-  DROVE_NO_SHEEP             3/20
-  action_mag mean=0.089 p10=0.003 p90=0.274 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=4.40m best=2.07m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=11.66m best=1.50m
-  reward/step (mean): progress=+0.0004  alignment=+0.0000  pen_bonus=+0.0000  step_cost=-0.0200  complete=+0.0000
--------------------------------
-| time/              |        |
-|    fps             | 1950   |
-|    iterations      | 10     |
-|    time_elapsed    | 84     |
-|    total_timesteps | 163840 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2020         |
-|    iterations           | 11           |
-|    time_elapsed         | 89           |
-|    total_timesteps      | 180224       |
-| train/                  |              |
-|    approx_kl            | 0.0061831754 |
-|    clip_fraction        | 0.068        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.86        |
-|    explained_variance   | 0.975        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0349      |
-|    n_updates            | 100          |
-|    policy_gradient_loss | -0.00607     |
-|    std                  | 1.02         |
-|    value_loss           | 0.0156       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2084        |
-|    iterations           | 12          |
-|    time_elapsed         | 94          |
-|    total_timesteps      | 196608      |
-| train/                  |             |
-|    approx_kl            | 0.009407628 |
-|    clip_fraction        | 0.123       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.87       |
-|    explained_variance   | 0.899       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0305     |
-|    n_updates            | 110         |
-|    policy_gradient_loss | -0.00932    |
-|    std                  | 1.02        |
-|    value_loss           | 0.0223      |
------------------------------------------
-Eval num_timesteps=200000, episode_reward=-12.36 +/- 51.37
-Episode length: 1880.20 +/- 355.04
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.88e+03    |
-|    mean_reward          | -12.4       |
-| time/                   |             |
-|    total_timesteps      | 200000      |
-| train/                  |             |
-|    approx_kl            | 0.008270489 |
-|    clip_fraction        | 0.0945      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.85       |
-|    explained_variance   | 0.945       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0339     |
-|    n_updates            | 120         |
-|    policy_gradient_loss | -0.00809    |
-|    std                  | 1           |
-|    value_loss           | 0.0162      |
------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 1936   |
-|    iterations      | 13     |
-|    time_elapsed    | 109    |
-|    total_timesteps | 212992 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1989        |
-|    iterations           | 14          |
-|    time_elapsed         | 115         |
-|    total_timesteps      | 229376      |
-| train/                  |             |
-|    approx_kl            | 0.008541125 |
-|    clip_fraction        | 0.112       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.83       |
-|    explained_variance   | 0.944       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0184     |
-|    n_updates            | 130         |
-|    policy_gradient_loss | -0.00846    |
-|    std                  | 0.994       |
-|    value_loss           | 0.0284      |
------------------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 2037       |
-|    iterations           | 15         |
-|    time_elapsed         | 120        |
-|    total_timesteps      | 245760     |
-| train/                  |            |
-|    approx_kl            | 0.00763176 |
-|    clip_fraction        | 0.0894     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.81      |
-|    explained_variance   | 0.9        |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0128    |
-|    n_updates            | 140        |
-|    policy_gradient_loss | -0.00655   |
-|    std                  | 0.987      |
-|    value_loss           | 0.071      |
-----------------------------------------
-Eval num_timesteps=250000, episode_reward=45.82 +/- 68.33
-Episode length: 1391.70 +/- 757.58
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.39e+03    |
-|    mean_reward          | 45.8        |
-| time/                   |             |
-|    total_timesteps      | 250000      |
-| train/                  |             |
-|    approx_kl            | 0.009210973 |
-|    clip_fraction        | 0.11        |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.81       |
-|    explained_variance   | 0.95        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0401     |
-|    n_updates            | 150         |
-|    policy_gradient_loss | -0.0082     |
-|    std                  | 0.986       |
-|    value_loss           | 0.0202      |
------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 1958   |
-|    iterations      | 16     |
-|    time_elapsed    | 133    |
-|    total_timesteps | 262144 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2005        |
-|    iterations           | 17          |
-|    time_elapsed         | 138         |
-|    total_timesteps      | 278528      |
-| train/                  |             |
-|    approx_kl            | 0.008197077 |
-|    clip_fraction        | 0.096       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.79       |
-|    explained_variance   | 0.949       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0375     |
-|    n_updates            | 160         |
-|    policy_gradient_loss | -0.00834    |
-|    std                  | 0.976       |
-|    value_loss           | 0.0207      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2061        |
-|    iterations           | 18          |
-|    time_elapsed         | 143         |
-|    total_timesteps      | 294912      |
-| train/                  |             |
-|    approx_kl            | 0.006078005 |
-|    clip_fraction        | 0.0598      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.78       |
-|    explained_variance   | 0.965       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0188     |
-|    n_updates            | 170         |
-|    policy_gradient_loss | -0.00464    |
-|    std                  | 0.969       |
-|    value_loss           | 0.0178      |
------------------------------------------
-Eval num_timesteps=300000, episode_reward=56.19 +/- 63.26
-Episode length: 1246.75 +/- 843.82
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.25e+03     |
-|    mean_reward          | 56.2         |
-| time/                   |              |
-|    total_timesteps      | 300000       |
-| train/                  |              |
-|    approx_kl            | 0.0056289425 |
-|    clip_fraction        | 0.0523       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.76        |
-|    explained_variance   | 0.969        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0246      |
-|    n_updates            | 180          |
-|    policy_gradient_loss | -0.00378     |
-|    std                  | 0.961        |
-|    value_loss           | 0.0174       |
-------------------------------------------
-New best mean reward!
-
-[Diag @ 300,000 | n_sheep=1 | success=40%]
-  DROVE_NO_SHEEP             11/20
-  SUCCESS                    8/20
-  COMPACT_CANT_DRIVE         1/20
-  action_mag mean=0.076 p10=0.000 p90=0.193 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=2.83m best=0.24m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=2.99m best=1.50m
-  reward/step (mean): progress=+0.0236  alignment=+0.0012  pen_bonus=+0.0029  step_cost=-0.0200  complete=+0.0291
--------------------------------
-| time/              |        |
-|    fps             | 1939   |
-|    iterations      | 19     |
-|    time_elapsed    | 160    |
-|    total_timesteps | 311296 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1983        |
-|    iterations           | 20          |
-|    time_elapsed         | 165         |
-|    total_timesteps      | 327680      |
-| train/                  |             |
-|    approx_kl            | 0.005042998 |
-|    clip_fraction        | 0.05        |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.73       |
-|    explained_variance   | 0.941       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0242     |
-|    n_updates            | 190         |
-|    policy_gradient_loss | -0.00399    |
-|    std                  | 0.947       |
-|    value_loss           | 0.00505     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2018         |
-|    iterations           | 21           |
-|    time_elapsed         | 170          |
-|    total_timesteps      | 344064       |
-| train/                  |              |
-|    approx_kl            | 0.0054986854 |
-|    clip_fraction        | 0.0569       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.72        |
-|    explained_variance   | 0.942        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0248      |
-|    n_updates            | 200          |
-|    policy_gradient_loss | -0.00415     |
-|    std                  | 0.941        |
-|    value_loss           | 0.00784      |
-------------------------------------------
-Eval num_timesteps=350000, episode_reward=25.08 +/- 61.55
-Episode length: 1562.00 +/- 761.23
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.56e+03     |
-|    mean_reward          | 25.1         |
-| time/                   |              |
-|    total_timesteps      | 350000       |
-| train/                  |              |
-|    approx_kl            | 0.0046333643 |
-|    clip_fraction        | 0.0476       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.71        |
-|    explained_variance   | 0.934        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0244      |
-|    n_updates            | 210          |
-|    policy_gradient_loss | -0.00237     |
-|    std                  | 0.934        |
-|    value_loss           | 0.00827      |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1950   |
-|    iterations      | 22     |
-|    time_elapsed    | 184    |
-|    total_timesteps | 360448 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1990        |
-|    iterations           | 23          |
-|    time_elapsed         | 189         |
-|    total_timesteps      | 376832      |
-| train/                  |             |
-|    approx_kl            | 0.006686668 |
-|    clip_fraction        | 0.0757      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.7        |
-|    explained_variance   | 0.963       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0423     |
-|    n_updates            | 220         |
-|    policy_gradient_loss | -0.00244    |
-|    std                  | 0.936       |
-|    value_loss           | 0.00575     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2027        |
-|    iterations           | 24          |
-|    time_elapsed         | 193         |
-|    total_timesteps      | 393216      |
-| train/                  |             |
-|    approx_kl            | 0.009116547 |
-|    clip_fraction        | 0.103       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.71       |
-|    explained_variance   | 0.97        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0353     |
-|    n_updates            | 230         |
-|    policy_gradient_loss | -0.0042     |
-|    std                  | 0.941       |
-|    value_loss           | 0.006       |
------------------------------------------
-Eval num_timesteps=400000, episode_reward=56.91 +/- 71.91
-Episode length: 1225.25 +/- 861.21
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.23e+03     |
-|    mean_reward          | 56.9         |
-| time/                   |              |
-|    total_timesteps      | 400000       |
-| train/                  |              |
-|    approx_kl            | 0.0061917743 |
-|    clip_fraction        | 0.0658       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.72        |
-|    explained_variance   | 0.975        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0378      |
-|    n_updates            | 240          |
-|    policy_gradient_loss | -0.00282     |
-|    std                  | 0.943        |
-|    value_loss           | 0.00633      |
-------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 1981   |
-|    iterations      | 25     |
-|    time_elapsed    | 206    |
-|    total_timesteps | 409600 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2011        |
-|    iterations           | 26          |
-|    time_elapsed         | 211         |
-|    total_timesteps      | 425984      |
-| train/                  |             |
-|    approx_kl            | 0.007945089 |
-|    clip_fraction        | 0.1         |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.73       |
-|    explained_variance   | 0.978       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0343     |
-|    n_updates            | 250         |
-|    policy_gradient_loss | -0.00475    |
-|    std                  | 0.95        |
-|    value_loss           | 0.00708     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2044        |
-|    iterations           | 27          |
-|    time_elapsed         | 216         |
-|    total_timesteps      | 442368      |
-| train/                  |             |
-|    approx_kl            | 0.013059773 |
-|    clip_fraction        | 0.152       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.76       |
-|    explained_variance   | 0.984       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0421     |
-|    n_updates            | 260         |
-|    policy_gradient_loss | -0.00542    |
-|    std                  | 0.967       |
-|    value_loss           | 0.00331     |
------------------------------------------
-Eval num_timesteps=450000, episode_reward=58.80 +/- 74.46
-Episode length: 1123.15 +/- 881.85
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.12e+03     |
-|    mean_reward          | 58.8         |
-| time/                   |              |
-|    total_timesteps      | 450000       |
-| train/                  |              |
-|    approx_kl            | 0.0085322345 |
-|    clip_fraction        | 0.0967       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.77        |
-|    explained_variance   | 0.98         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0264      |
-|    n_updates            | 270          |
-|    policy_gradient_loss | -0.00612     |
-|    std                  | 0.963        |
-|    value_loss           | 0.00919      |
-------------------------------------------
-New best mean reward!
-
-[Diag @ 450,000 | n_sheep=1 | success=65%]
-  SUCCESS                    13/20
-  DROVE_NO_SHEEP             4/20
-  COMPACT_CANT_DRIVE         3/20
-  action_mag mean=0.105 p10=0.000 p90=0.272 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=1.67m best=0.43m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=3.26m best=2.29m
-  reward/step (mean): progress=+0.0326  alignment=+0.0024  pen_bonus=+0.0076  step_cost=-0.0200  complete=+0.0762
--------------------------------
-| time/              |        |
-|    fps             | 1974   |
-|    iterations      | 28     |
-|    time_elapsed    | 232    |
-|    total_timesteps | 458752 |
--------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 2005       |
-|    iterations           | 29         |
-|    time_elapsed         | 236        |
-|    total_timesteps      | 475136     |
-| train/                  |            |
-|    approx_kl            | 0.01203198 |
-|    clip_fraction        | 0.146      |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.79      |
-|    explained_variance   | 0.963      |
-|    learning_rate        | 0.0003     |
-|    loss                 | 0.00738    |
-|    n_updates            | 280        |
-|    policy_gradient_loss | -0.0128    |
-|    std                  | 0.982      |
-|    value_loss           | 0.0749     |
-----------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2037         |
-|    iterations           | 30           |
-|    time_elapsed         | 241          |
-|    total_timesteps      | 491520       |
-| train/                  |              |
-|    approx_kl            | 0.0078244675 |
-|    clip_fraction        | 0.0856       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.8         |
-|    explained_variance   | 0.937        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.0631       |
-|    n_updates            | 290          |
-|    policy_gradient_loss | -0.00651     |
-|    std                  | 0.977        |
-|    value_loss           | 0.131        |
-------------------------------------------
-Eval num_timesteps=500000, episode_reward=135.29 +/- 9.81
-Episode length: 287.30 +/- 88.71
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 287        |
-|    mean_reward          | 135        |
-| time/                   |            |
-|    total_timesteps      | 500000     |
-| train/                  |            |
-|    approx_kl            | 0.00837522 |
-|    clip_fraction        | 0.0866     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.77      |
-|    explained_variance   | 0.948      |
-|    learning_rate        | 0.0003     |
-|    loss                 | 0.041      |
-|    n_updates            | 300        |
-|    policy_gradient_loss | -0.00532   |
-|    std                  | 0.962      |
-|    value_loss           | 0.0898     |
-----------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 2048   |
-|    iterations      | 31     |
-|    time_elapsed    | 247    |
-|    total_timesteps | 507904 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2070         |
-|    iterations           | 32           |
-|    time_elapsed         | 253          |
-|    total_timesteps      | 524288       |
-| train/                  |              |
-|    approx_kl            | 0.0067581255 |
-|    clip_fraction        | 0.0543       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.75        |
-|    explained_variance   | 0.932        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.0518       |
-|    n_updates            | 310          |
-|    policy_gradient_loss | -0.00297     |
-|    std                  | 0.954        |
-|    value_loss           | 0.111        |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2090         |
-|    iterations           | 33           |
-|    time_elapsed         | 258          |
-|    total_timesteps      | 540672       |
-| train/                  |              |
-|    approx_kl            | 0.0066835573 |
-|    clip_fraction        | 0.0597       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.74        |
-|    explained_variance   | 0.934        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.00545      |
-|    n_updates            | 320          |
-|    policy_gradient_loss | -0.00508     |
-|    std                  | 0.949        |
-|    value_loss           | 0.101        |
-------------------------------------------
-Eval num_timesteps=550000, episode_reward=136.08 +/- 11.93
-Episode length: 285.80 +/- 123.59
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 286          |
-|    mean_reward          | 136          |
-| time/                   |              |
-|    total_timesteps      | 550000       |
-| train/                  |              |
-|    approx_kl            | 0.0062076193 |
-|    clip_fraction        | 0.0672       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.71        |
-|    explained_variance   | 0.942        |
-|    learning_rate        | 0.0003       |
-|    loss                 | 0.0229       |
-|    n_updates            | 330          |
-|    policy_gradient_loss | -0.00616     |
-|    std                  | 0.933        |
-|    value_loss           | 0.0813       |
-------------------------------------------
-New best mean reward!
--------------------------------
-| time/              |        |
-|    fps             | 2104   |
-|    iterations      | 34     |
-|    time_elapsed    | 264    |
-|    total_timesteps | 557056 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2130         |
-|    iterations           | 35           |
-|    time_elapsed         | 269          |
-|    total_timesteps      | 573440       |
-| train/                  |              |
-|    approx_kl            | 0.0064913128 |
-|    clip_fraction        | 0.0631       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.67        |
-|    explained_variance   | 0.971        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0199      |
-|    n_updates            | 340          |
-|    policy_gradient_loss | -0.00631     |
-|    std                  | 0.917        |
-|    value_loss           | 0.0185       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2155         |
-|    iterations           | 36           |
-|    time_elapsed         | 273          |
-|    total_timesteps      | 589824       |
-| train/                  |              |
-|    approx_kl            | 0.0067110434 |
-|    clip_fraction        | 0.0719       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.63        |
-|    explained_variance   | 0.98         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0343      |
-|    n_updates            | 350          |
-|    policy_gradient_loss | -0.0069      |
-|    std                  | 0.897        |
-|    value_loss           | 0.0113       |
-------------------------------------------
-Eval num_timesteps=600000, episode_reward=135.45 +/- 12.96
-Episode length: 273.05 +/- 118.26
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 273          |
-|    mean_reward          | 135          |
-| time/                   |              |
-|    total_timesteps      | 600000       |
-| train/                  |              |
-|    approx_kl            | 0.0054842415 |
-|    clip_fraction        | 0.0564       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.59        |
-|    explained_variance   | 0.983        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.033       |
-|    n_updates            | 360          |
-|    policy_gradient_loss | -0.0042      |
-|    std                  | 0.883        |
-|    value_loss           | 0.00479      |
-------------------------------------------
-
-[Diag @ 600,000 | n_sheep=1 | success=100%]
-  SUCCESS                    20/20
-  action_mag mean=0.343 p10=0.232 p90=0.548 (0=stopped, 1=full speed)
-  min_flock_radius mean=0.00m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=1.53m best=0.76m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=3.49m best=2.84m
-  reward/step (mean): progress=+0.1066  alignment=+0.0088  pen_bonus=+0.0357  step_cost=-0.0200  complete=+0.3567
-
-[Curriculum] leaving stage n_sheep=1 after 600,000 steps | training success rate (last 100 eps) = 100%
-[Curriculum] → 2 sheep at step 600,000
-
--------------------------------
-| time/              |        |
-|    fps             | 2156   |
-|    iterations      | 37     |
-|    time_elapsed    | 281    |
-|    total_timesteps | 606208 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2173        |
-|    iterations           | 38          |
-|    time_elapsed         | 286         |
-|    total_timesteps      | 622592      |
-| train/                  |             |
-|    approx_kl            | 0.011170821 |
-|    clip_fraction        | 0.117       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.59       |
-|    explained_variance   | 0.924       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0137     |
-|    n_updates            | 370         |
-|    policy_gradient_loss | 0.00714     |
-|    std                  | 0.886       |
-|    value_loss           | 0.0417      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2192        |
-|    iterations           | 39          |
-|    time_elapsed         | 291         |
-|    total_timesteps      | 638976      |
-| train/                  |             |
-|    approx_kl            | 0.012632904 |
-|    clip_fraction        | 0.156       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.6        |
-|    explained_variance   | 0.858       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.00445    |
-|    n_updates            | 380         |
-|    policy_gradient_loss | 0.00112     |
-|    std                  | 0.892       |
-|    value_loss           | 0.0156      |
------------------------------------------
-Eval num_timesteps=650000, episode_reward=-38.36 +/- 29.94
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -38.4       |
-| time/                   |             |
-|    total_timesteps      | 650000      |
-| train/                  |             |
-|    approx_kl            | 0.012015635 |
-|    clip_fraction        | 0.133       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.62       |
-|    explained_variance   | 0.946       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0168     |
-|    n_updates            | 390         |
-|    policy_gradient_loss | -0.000726   |
-|    std                  | 0.904       |
-|    value_loss           | 0.0126      |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2131   |
-|    iterations      | 40     |
-|    time_elapsed    | 307    |
-|    total_timesteps | 655360 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2145        |
-|    iterations           | 41          |
-|    time_elapsed         | 313         |
-|    total_timesteps      | 671744      |
-| train/                  |             |
-|    approx_kl            | 0.009391339 |
-|    clip_fraction        | 0.121       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.63       |
-|    explained_variance   | 0.955       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0164     |
-|    n_updates            | 400         |
-|    policy_gradient_loss | -0.00177    |
-|    std                  | 0.905       |
-|    value_loss           | 0.00536     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2156         |
-|    iterations           | 42           |
-|    time_elapsed         | 319          |
-|    total_timesteps      | 688128       |
-| train/                  |              |
-|    approx_kl            | 0.0077482145 |
-|    clip_fraction        | 0.0977       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.64        |
-|    explained_variance   | 0.895        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.023       |
-|    n_updates            | 410          |
-|    policy_gradient_loss | -0.00158     |
-|    std                  | 0.908        |
-|    value_loss           | 0.0068       |
-------------------------------------------
-Eval num_timesteps=700000, episode_reward=-16.26 +/- 48.54
-Episode length: 1934.20 +/- 286.82
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.93e+03    |
-|    mean_reward          | -16.3       |
-| time/                   |             |
-|    total_timesteps      | 700000      |
-| train/                  |             |
-|    approx_kl            | 0.007948186 |
-|    clip_fraction        | 0.0933      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.64       |
-|    explained_variance   | 0.934       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0205     |
-|    n_updates            | 420         |
-|    policy_gradient_loss | -0.00233    |
-|    std                  | 0.904       |
-|    value_loss           | 0.00556     |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2093   |
-|    iterations      | 43     |
-|    time_elapsed    | 336    |
-|    total_timesteps | 704512 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2109         |
-|    iterations           | 44           |
-|    time_elapsed         | 341          |
-|    total_timesteps      | 720896       |
-| train/                  |              |
-|    approx_kl            | 0.0077707805 |
-|    clip_fraction        | 0.101        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.64        |
-|    explained_variance   | 0.929        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.00469     |
-|    n_updates            | 430          |
-|    policy_gradient_loss | -0.00226     |
-|    std                  | 0.909        |
-|    value_loss           | 0.0031       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2129         |
-|    iterations           | 45           |
-|    time_elapsed         | 346          |
-|    total_timesteps      | 737280       |
-| train/                  |              |
-|    approx_kl            | 0.0063995067 |
-|    clip_fraction        | 0.0823       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.66        |
-|    explained_variance   | 0.951        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0249      |
-|    n_updates            | 440          |
-|    policy_gradient_loss | -0.00261     |
-|    std                  | 0.922        |
-|    value_loss           | 0.00343      |
-------------------------------------------
-Eval num_timesteps=750000, episode_reward=-12.10 +/- 56.78
-Episode length: 1850.50 +/- 449.09
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.85e+03     |
-|    mean_reward          | -12.1        |
-| time/                   |              |
-|    total_timesteps      | 750000       |
-| train/                  |              |
-|    approx_kl            | 0.0069549307 |
-|    clip_fraction        | 0.0847       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.68        |
-|    explained_variance   | 0.862        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0192      |
-|    n_updates            | 450          |
-|    policy_gradient_loss | -0.00165     |
-|    std                  | 0.929        |
-|    value_loss           | 0.0032       |
-------------------------------------------
-
-[Diag @ 750,000 | n_sheep=2 | success=5%]
-  COMPACT_CANT_DRIVE         9/20
-  NEVER_COMPACT              9/20
-  PARTIAL_1of2               1/20
-  SUCCESS                    1/20
-  action_mag mean=0.261 p10=0.002 p90=0.983 (0=stopped, 1=full speed)
-  min_flock_radius mean=3.93m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=0.79m best=0.07m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.43m best=1.62m
-  reward/step (mean): progress=-0.0058  alignment=+0.0087  pen_bonus=+0.0008  step_cost=-0.0200  complete=+0.0025
--------------------------------
-| time/              |        |
-|    fps             | 2043   |
-|    iterations      | 46     |
-|    time_elapsed    | 368    |
-|    total_timesteps | 753664 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2062        |
-|    iterations           | 47          |
-|    time_elapsed         | 373         |
-|    total_timesteps      | 770048      |
-| train/                  |             |
-|    approx_kl            | 0.008165602 |
-|    clip_fraction        | 0.0997      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.69       |
-|    explained_variance   | 0.931       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0461     |
-|    n_updates            | 460         |
-|    policy_gradient_loss | -0.00412    |
-|    std                  | 0.932       |
-|    value_loss           | 0.00308     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2074        |
-|    iterations           | 48          |
-|    time_elapsed         | 379         |
-|    total_timesteps      | 786432      |
-| train/                  |             |
-|    approx_kl            | 0.006088208 |
-|    clip_fraction        | 0.0805      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.71       |
-|    explained_variance   | 0.917       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.034      |
-|    n_updates            | 470         |
-|    policy_gradient_loss | -0.000257   |
-|    std                  | 0.943       |
-|    value_loss           | 0.00533     |
------------------------------------------
-Eval num_timesteps=800000, episode_reward=-32.78 +/- 23.33
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -32.8        |
-| time/                   |              |
-|    total_timesteps      | 800000       |
-| train/                  |              |
-|    approx_kl            | 0.0069386996 |
-|    clip_fraction        | 0.0883       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.73        |
-|    explained_variance   | 0.954        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0361      |
-|    n_updates            | 480          |
-|    policy_gradient_loss | -0.00228     |
-|    std                  | 0.948        |
-|    value_loss           | 0.00495      |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2028   |
-|    iterations      | 49     |
-|    time_elapsed    | 395    |
-|    total_timesteps | 802816 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2045         |
-|    iterations           | 50           |
-|    time_elapsed         | 400          |
-|    total_timesteps      | 819200       |
-| train/                  |              |
-|    approx_kl            | 0.0070893797 |
-|    clip_fraction        | 0.0687       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.74        |
-|    explained_variance   | 0.955        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.035       |
-|    n_updates            | 490          |
-|    policy_gradient_loss | -0.00221     |
-|    std                  | 0.954        |
-|    value_loss           | 0.00229      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 2060         |
-|    iterations           | 51           |
-|    time_elapsed         | 405          |
-|    total_timesteps      | 835584       |
-| train/                  |              |
-|    approx_kl            | 0.0068652867 |
-|    clip_fraction        | 0.0787       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.75        |
-|    explained_variance   | 0.863        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0337      |
-|    n_updates            | 500          |
-|    policy_gradient_loss | -0.00277     |
-|    std                  | 0.959        |
-|    value_loss           | 0.00229      |
-------------------------------------------
-Eval num_timesteps=850000, episode_reward=-14.34 +/- 48.77
-Episode length: 1998.40 +/- 6.97
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -14.3       |
-| time/                   |             |
-|    total_timesteps      | 850000      |
-| train/                  |             |
-|    approx_kl            | 0.007872021 |
-|    clip_fraction        | 0.0815      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.76       |
-|    explained_variance   | 0.852       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0358     |
-|    n_updates            | 510         |
-|    policy_gradient_loss | -0.00365    |
-|    std                  | 0.966       |
-|    value_loss           | 0.00272     |
------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 2018   |
-|    iterations      | 52     |
-|    time_elapsed    | 422    |
-|    total_timesteps | 851968 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2032        |
-|    iterations           | 53          |
-|    time_elapsed         | 427         |
-|    total_timesteps      | 868352      |
-| train/                  |             |
-|    approx_kl            | 0.007002457 |
-|    clip_fraction        | 0.0752      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.78       |
-|    explained_variance   | 0.879       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0414     |
-|    n_updates            | 520         |
-|    policy_gradient_loss | -0.00242    |
-|    std                  | 0.977       |
-|    value_loss           | 0.00166     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2047        |
-|    iterations           | 54          |
-|    time_elapsed         | 432         |
-|    total_timesteps      | 884736      |
-| train/                  |             |
-|    approx_kl            | 0.007822147 |
-|    clip_fraction        | 0.0813      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.8        |
-|    explained_variance   | 0.871       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0376     |
-|    n_updates            | 530         |
-|    policy_gradient_loss | -0.00362    |
-|    std                  | 0.984       |
-|    value_loss           | 0.00212     |
------------------------------------------
-Eval num_timesteps=900000, episode_reward=-20.41 +/- 60.01
-Episode length: 1929.40 +/- 284.99
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 1.93e+03   |
-|    mean_reward          | -20.4      |
-| time/                   |            |
-|    total_timesteps      | 900000     |
-| train/                  |            |
-|    approx_kl            | 0.00738756 |
-|    clip_fraction        | 0.0793     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.81      |
-|    explained_variance   | 0.808      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0355    |
-|    n_updates            | 540        |
-|    policy_gradient_loss | -0.00195   |
-|    std                  | 0.988      |
-|    value_loss           | 0.00721    |
-----------------------------------------
-
-[Diag @ 900,000 | n_sheep=2 | success=5%]
-  COMPACT_CANT_DRIVE         11/20
-  NEVER_COMPACT              8/20
-  SUCCESS                    1/20
-  action_mag mean=0.203 p10=0.007 p90=0.704 (0=stopped, 1=full speed)
-  min_flock_radius mean=3.40m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=0.60m best=0.11m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=14.01m best=3.61m
-  reward/step (mean): progress=-0.0040  alignment=+0.0071  pen_bonus=+0.0008  step_cost=-0.0200  complete=+0.0026
--------------------------------
-| time/              |        |
-|    fps             | 1977   |
-|    iterations      | 55     |
-|    time_elapsed    | 455    |
-|    total_timesteps | 901120 |
--------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1990        |
-|    iterations           | 56          |
-|    time_elapsed         | 460         |
-|    total_timesteps      | 917504      |
-| train/                  |             |
-|    approx_kl            | 0.007000256 |
-|    clip_fraction        | 0.0831      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.8        |
-|    explained_variance   | 0.889       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0285     |
-|    n_updates            | 550         |
-|    policy_gradient_loss | -0.00402    |
-|    std                  | 0.984       |
-|    value_loss           | 0.00171     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2005        |
-|    iterations           | 57          |
-|    time_elapsed         | 465         |
-|    total_timesteps      | 933888      |
-| train/                  |             |
-|    approx_kl            | 0.007749311 |
-|    clip_fraction        | 0.0755      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.83       |
-|    explained_variance   | 0.599       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.032      |
-|    n_updates            | 560         |
-|    policy_gradient_loss | -0.00239    |
-|    std                  | 1.01        |
-|    value_loss           | 0.00351     |
------------------------------------------
-Eval num_timesteps=950000, episode_reward=-13.16 +/- 44.70
-Episode length: 1949.30 +/- 221.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.95e+03     |
-|    mean_reward          | -13.2        |
-| time/                   |              |
-|    total_timesteps      | 950000       |
-| train/                  |              |
-|    approx_kl            | 0.0075328955 |
-|    clip_fraction        | 0.0829       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.85        |
-|    explained_variance   | 0.783        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0306      |
-|    n_updates            | 570          |
-|    policy_gradient_loss | -0.00352     |
-|    std                  | 1.01         |
-|    value_loss           | 0.00319      |
-------------------------------------------
--------------------------------
-| time/              |        |
-|    fps             | 1971   |
-|    iterations      | 58     |
-|    time_elapsed    | 482    |
-|    total_timesteps | 950272 |
--------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1981         |
-|    iterations           | 59           |
-|    time_elapsed         | 487          |
-|    total_timesteps      | 966656       |
-| train/                  |              |
-|    approx_kl            | 0.0072506005 |
-|    clip_fraction        | 0.0835       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.86        |
-|    explained_variance   | 0.929        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0291      |
-|    n_updates            | 580          |
-|    policy_gradient_loss | -0.00173     |
-|    std                  | 1.01         |
-|    value_loss           | 0.00491      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1991         |
-|    iterations           | 60           |
-|    time_elapsed         | 493          |
-|    total_timesteps      | 983040       |
-| train/                  |              |
-|    approx_kl            | 0.0068104668 |
-|    clip_fraction        | 0.0799       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.87        |
-|    explained_variance   | 0.813        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0282      |
-|    n_updates            | 590          |
-|    policy_gradient_loss | -0.00162     |
-|    std                  | 1.02         |
-|    value_loss           | 0.00477      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 2005        |
-|    iterations           | 61          |
-|    time_elapsed         | 498         |
-|    total_timesteps      | 999424      |
-| train/                  |             |
-|    approx_kl            | 0.007103944 |
-|    clip_fraction        | 0.0774      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.88       |
-|    explained_variance   | 0.942       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0322     |
-|    n_updates            | 600         |
-|    policy_gradient_loss | -0.00143    |
-|    std                  | 1.03        |
-|    value_loss           | 0.0033      |
------------------------------------------
-Eval num_timesteps=1000000, episode_reward=-25.58 +/- 49.00
-Episode length: 1999.50 +/- 2.18
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -25.6        |
-| time/                   |              |
-|    total_timesteps      | 1000000      |
-| train/                  |              |
-|    approx_kl            | 0.0075788023 |
-|    clip_fraction        | 0.088        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.9         |
-|    explained_variance   | 0.864        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0352      |
-|    n_updates            | 610          |
-|    policy_gradient_loss | -0.003       |
-|    std                  | 1.04         |
-|    value_loss           | 0.00192      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1971    |
-|    iterations      | 62      |
-|    time_elapsed    | 515     |
-|    total_timesteps | 1015808 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1983        |
-|    iterations           | 63          |
-|    time_elapsed         | 520         |
-|    total_timesteps      | 1032192     |
-| train/                  |             |
-|    approx_kl            | 0.009131588 |
-|    clip_fraction        | 0.0902      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.89       |
-|    explained_variance   | 0.941       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0476     |
-|    n_updates            | 620         |
-|    policy_gradient_loss | -0.00341    |
-|    std                  | 1.03        |
-|    value_loss           | 0.00705     |
------------------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1995       |
-|    iterations           | 64         |
-|    time_elapsed         | 525        |
-|    total_timesteps      | 1048576    |
-| train/                  |            |
-|    approx_kl            | 0.00746674 |
-|    clip_fraction        | 0.0838     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.89      |
-|    explained_variance   | 0.958      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.022     |
-|    n_updates            | 630        |
-|    policy_gradient_loss | -0.00392   |
-|    std                  | 1.03       |
-|    value_loss           | 0.00592    |
-----------------------------------------
-Eval num_timesteps=1050000, episode_reward=-12.04 +/- 64.56
-Episode length: 1889.90 +/- 333.38
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 1.89e+03     |
-|    mean_reward          | -12          |
-| time/                   |              |
-|    total_timesteps      | 1050000      |
-| train/                  |              |
-|    approx_kl            | 0.0058071706 |
-|    clip_fraction        | 0.0721       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.9         |
-|    explained_variance   | 0.932        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0188      |
-|    n_updates            | 640          |
-|    policy_gradient_loss | -0.00235     |
-|    std                  | 1.03         |
-|    value_loss           | 0.00513      |
-------------------------------------------
-
-[Diag @ 1,050,000 | n_sheep=2 | success=5%]
-  COMPACT_CANT_DRIVE         10/20
-  NEVER_COMPACT              9/20
-  SUCCESS                    1/20
-  action_mag mean=0.190 p10=0.001 p90=0.686 (0=stopped, 1=full speed)
-  min_flock_radius mean=4.60m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=0.54m best=0.21m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.05m best=3.62m
-  reward/step (mean): progress=-0.0023  alignment=+0.0072  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0025
---------------------------------
-| time/              |         |
-|    fps             | 1931    |
-|    iterations      | 65      |
-|    time_elapsed    | 551     |
-|    total_timesteps | 1064960 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1944        |
-|    iterations           | 66          |
-|    time_elapsed         | 556         |
-|    total_timesteps      | 1081344     |
-| train/                  |             |
-|    approx_kl            | 0.006802067 |
-|    clip_fraction        | 0.0701      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.92       |
-|    explained_variance   | 0.937       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0304     |
-|    n_updates            | 650         |
-|    policy_gradient_loss | -0.0019     |
-|    std                  | 1.04        |
-|    value_loss           | 0.00206     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1956        |
-|    iterations           | 67          |
-|    time_elapsed         | 561         |
-|    total_timesteps      | 1097728     |
-| train/                  |             |
-|    approx_kl            | 0.007102525 |
-|    clip_fraction        | 0.074       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.92       |
-|    explained_variance   | 0.953       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.00869    |
-|    n_updates            | 660         |
-|    policy_gradient_loss | -0.00208    |
-|    std                  | 1.04        |
-|    value_loss           | 0.00579     |
------------------------------------------
-Eval num_timesteps=1100000, episode_reward=-29.51 +/- 23.80
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -29.5       |
-| time/                   |             |
-|    total_timesteps      | 1100000     |
-| train/                  |             |
-|    approx_kl            | 0.006372301 |
-|    clip_fraction        | 0.0669      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.94       |
-|    explained_variance   | 0.829       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0349     |
-|    n_updates            | 670         |
-|    policy_gradient_loss | -0.00135    |
-|    std                  | 1.06        |
-|    value_loss           | 0.00208     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1932    |
-|    iterations      | 68      |
-|    time_elapsed    | 576     |
-|    total_timesteps | 1114112 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1942        |
-|    iterations           | 69          |
-|    time_elapsed         | 581         |
-|    total_timesteps      | 1130496     |
-| train/                  |             |
-|    approx_kl            | 0.007083354 |
-|    clip_fraction        | 0.0839      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.95       |
-|    explained_variance   | 0.845       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0464     |
-|    n_updates            | 680         |
-|    policy_gradient_loss | -0.00298    |
-|    std                  | 1.06        |
-|    value_loss           | 0.00747     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1954        |
-|    iterations           | 70          |
-|    time_elapsed         | 586         |
-|    total_timesteps      | 1146880     |
-| train/                  |             |
-|    approx_kl            | 0.007034454 |
-|    clip_fraction        | 0.0875      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.96       |
-|    explained_variance   | 0.892       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0382     |
-|    n_updates            | 690         |
-|    policy_gradient_loss | -0.00359    |
-|    std                  | 1.06        |
-|    value_loss           | 0.00208     |
------------------------------------------
-Eval num_timesteps=1150000, episode_reward=-20.98 +/- 49.18
-Episode length: 1959.70 +/- 175.66
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 1.96e+03    |
-|    mean_reward          | -21         |
-| time/                   |             |
-|    total_timesteps      | 1150000     |
-| train/                  |             |
-|    approx_kl            | 0.006192833 |
-|    clip_fraction        | 0.0626      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.94       |
-|    explained_variance   | 0.951       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0224     |
-|    n_updates            | 700         |
-|    policy_gradient_loss | -0.00299    |
-|    std                  | 1.05        |
-|    value_loss           | 0.00883     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1926    |
-|    iterations      | 71      |
-|    time_elapsed    | 603     |
-|    total_timesteps | 1163264 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1937        |
-|    iterations           | 72          |
-|    time_elapsed         | 608         |
-|    total_timesteps      | 1179648     |
-| train/                  |             |
-|    approx_kl            | 0.008185772 |
-|    clip_fraction        | 0.0969      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.96       |
-|    explained_variance   | 0.944       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0278     |
-|    n_updates            | 710         |
-|    policy_gradient_loss | -0.00316    |
-|    std                  | 1.07        |
-|    value_loss           | 0.00421     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1947         |
-|    iterations           | 73           |
-|    time_elapsed         | 614          |
-|    total_timesteps      | 1196032      |
-| train/                  |              |
-|    approx_kl            | 0.0063469247 |
-|    clip_fraction        | 0.065        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -2.96        |
-|    explained_variance   | 0.912        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0239      |
-|    n_updates            | 720          |
-|    policy_gradient_loss | -0.00224     |
-|    std                  | 1.06         |
-|    value_loss           | 0.0054       |
-------------------------------------------
-Eval num_timesteps=1200000, episode_reward=-29.34 +/- 18.71
-Episode length: 2000.00 +/- 0.00
-----------------------------------------
-| eval/                   |            |
-|    mean_ep_length       | 2e+03      |
-|    mean_reward          | -29.3      |
-| time/                   |            |
-|    total_timesteps      | 1200000    |
-| train/                  |            |
-|    approx_kl            | 0.00778389 |
-|    clip_fraction        | 0.0734     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.95      |
-|    explained_variance   | 0.961      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0435    |
-|    n_updates            | 730        |
-|    policy_gradient_loss | -0.00184   |
-|    std                  | 1.06       |
-|    value_loss           | 0.0048     |
-----------------------------------------
-
-[Diag @ 1,200,000 | n_sheep=2 | success=10%]
-  NEVER_COMPACT              9/20
-  COMPACT_CANT_DRIVE         9/20
-  SUCCESS                    2/20
-  action_mag mean=0.198 p10=0.002 p90=0.744 (0=stopped, 1=full speed)
-  min_flock_radius mean=3.94m best=0.00m  (target <5m to compact)
-  min_dog_to_com   mean=0.50m best=0.14m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=11.36m best=3.58m
-  reward/step (mean): progress=-0.0002  alignment=+0.0073  pen_bonus=+0.0013  step_cost=-0.0200  complete=+0.0053
-
-[Curriculum] leaving stage n_sheep=2 after 600,000 steps | training success rate (last 100 eps) = 5%
-[Curriculum] → 3 sheep at step 1,200,000
-
---------------------------------
-| time/              |         |
-|    fps             | 1898    |
-|    iterations      | 74      |
-|    time_elapsed    | 638     |
-|    total_timesteps | 1212416 |
---------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1909       |
-|    iterations           | 75         |
-|    time_elapsed         | 643        |
-|    total_timesteps      | 1228800    |
-| train/                  |            |
-|    approx_kl            | 0.00918101 |
-|    clip_fraction        | 0.106      |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -2.95      |
-|    explained_variance   | 0.919      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0112    |
-|    n_updates            | 740        |
-|    policy_gradient_loss | -0.00123   |
-|    std                  | 1.06       |
-|    value_loss           | 0.0427     |
-----------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1917        |
-|    iterations           | 76          |
-|    time_elapsed         | 649         |
-|    total_timesteps      | 1245184     |
-| train/                  |             |
-|    approx_kl            | 0.010076641 |
-|    clip_fraction        | 0.137       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.94       |
-|    explained_variance   | 0.919       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0229     |
-|    n_updates            | 750         |
-|    policy_gradient_loss | -0.000617   |
-|    std                  | 1.05        |
-|    value_loss           | 0.0222      |
------------------------------------------
-Eval num_timesteps=1250000, episode_reward=-38.73 +/- 33.85
-Episode length: 2000.00 +/- 0.00
----------------------------------------
-| eval/                   |           |
-|    mean_ep_length       | 2e+03     |
-|    mean_reward          | -38.7     |
-| time/                   |           |
-|    total_timesteps      | 1250000   |
-| train/                  |           |
-|    approx_kl            | 0.0084493 |
-|    clip_fraction        | 0.109     |
-|    clip_range           | 0.2       |
-|    entropy_loss         | -2.96     |
-|    explained_variance   | 0.96      |
-|    learning_rate        | 0.0003    |
-|    loss                 | -0.0259   |
-|    n_updates            | 760       |
-|    policy_gradient_loss | -0.00168  |
-|    std                  | 1.06      |
-|    value_loss           | 0.0024    |
----------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1890    |
-|    iterations      | 77      |
-|    time_elapsed    | 667     |
-|    total_timesteps | 1261568 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1899        |
-|    iterations           | 78          |
-|    time_elapsed         | 672         |
-|    total_timesteps      | 1277952     |
-| train/                  |             |
-|    approx_kl            | 0.008724872 |
-|    clip_fraction        | 0.109       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.98       |
-|    explained_variance   | 0.931       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0293     |
-|    n_updates            | 770         |
-|    policy_gradient_loss | -0.00204    |
-|    std                  | 1.08        |
-|    value_loss           | 0.0067      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1906        |
-|    iterations           | 79          |
-|    time_elapsed         | 678         |
-|    total_timesteps      | 1294336     |
-| train/                  |             |
-|    approx_kl            | 0.008191848 |
-|    clip_fraction        | 0.096       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -2.99       |
-|    explained_variance   | 0.963       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0247     |
-|    n_updates            | 780         |
-|    policy_gradient_loss | -0.002      |
-|    std                  | 1.08        |
-|    value_loss           | 0.00632     |
------------------------------------------
-Eval num_timesteps=1300000, episode_reward=-26.68 +/- 27.12
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -26.7       |
-| time/                   |             |
-|    total_timesteps      | 1300000     |
-| train/                  |             |
-|    approx_kl            | 0.006018152 |
-|    clip_fraction        | 0.0869      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3          |
-|    explained_variance   | 0.96        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0311     |
-|    n_updates            | 790         |
-|    policy_gradient_loss | -0.00129    |
-|    std                  | 1.09        |
-|    value_loss           | 0.00189     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1881    |
-|    iterations      | 80      |
-|    time_elapsed    | 696     |
-|    total_timesteps | 1310720 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1892         |
-|    iterations           | 81           |
-|    time_elapsed         | 701          |
-|    total_timesteps      | 1327104      |
-| train/                  |              |
-|    approx_kl            | 0.0077671953 |
-|    clip_fraction        | 0.082        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.01        |
-|    explained_variance   | 0.972        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0308      |
-|    n_updates            | 800          |
-|    policy_gradient_loss | -0.00219     |
-|    std                  | 1.09         |
-|    value_loss           | 0.00177      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1902        |
-|    iterations           | 82          |
-|    time_elapsed         | 706         |
-|    total_timesteps      | 1343488     |
-| train/                  |             |
-|    approx_kl            | 0.008806022 |
-|    clip_fraction        | 0.0947      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.02       |
-|    explained_variance   | 0.962       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0426     |
-|    n_updates            | 810         |
-|    policy_gradient_loss | -0.00231    |
-|    std                  | 1.1         |
-|    value_loss           | 0.00235     |
------------------------------------------
-Eval num_timesteps=1350000, episode_reward=-24.30 +/- 32.03
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -24.3       |
-| time/                   |             |
-|    total_timesteps      | 1350000     |
-| train/                  |             |
-|    approx_kl            | 0.007263833 |
-|    clip_fraction        | 0.0797      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.03       |
-|    explained_variance   | 0.957       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0338     |
-|    n_updates            | 820         |
-|    policy_gradient_loss | -0.00251    |
-|    std                  | 1.11        |
-|    value_loss           | 0.00397     |
------------------------------------------
-
-[Diag @ 1,350,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              16/20
-  COMPACT_CANT_DRIVE         4/20
-  action_mag mean=0.058 p10=0.004 p90=0.054 (0=stopped, 1=full speed)
-  min_flock_radius mean=6.77m best=1.04m  (target <5m to compact)
-  min_dog_to_com   mean=0.58m best=0.28m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.71m best=4.27m
-  reward/step (mean): progress=-0.0038  alignment=+0.0015  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1859    |
-|    iterations      | 83      |
-|    time_elapsed    | 731     |
-|    total_timesteps | 1359872 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1870        |
-|    iterations           | 84          |
-|    time_elapsed         | 735         |
-|    total_timesteps      | 1376256     |
-| train/                  |             |
-|    approx_kl            | 0.007816839 |
-|    clip_fraction        | 0.0812      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.05       |
-|    explained_variance   | 0.946       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0285     |
-|    n_updates            | 830         |
-|    policy_gradient_loss | -0.00277    |
-|    std                  | 1.11        |
-|    value_loss           | 0.0018      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1880         |
-|    iterations           | 85           |
-|    time_elapsed         | 740          |
-|    total_timesteps      | 1392640      |
-| train/                  |              |
-|    approx_kl            | 0.0064534983 |
-|    clip_fraction        | 0.0774       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.06        |
-|    explained_variance   | 0.958        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0305      |
-|    n_updates            | 840          |
-|    policy_gradient_loss | -0.00158     |
-|    std                  | 1.12         |
-|    value_loss           | 0.00988      |
-------------------------------------------
-Eval num_timesteps=1400000, episode_reward=-39.10 +/- 41.08
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -39.1        |
-| time/                   |              |
-|    total_timesteps      | 1400000      |
-| train/                  |              |
-|    approx_kl            | 0.0069560152 |
-|    clip_fraction        | 0.0835       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.07        |
-|    explained_variance   | 0.96         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0302      |
-|    n_updates            | 850          |
-|    policy_gradient_loss | -0.00283     |
-|    std                  | 1.12         |
-|    value_loss           | 0.00307      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1857    |
-|    iterations      | 86      |
-|    time_elapsed    | 758     |
-|    total_timesteps | 1409024 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1864        |
-|    iterations           | 87          |
-|    time_elapsed         | 764         |
-|    total_timesteps      | 1425408     |
-| train/                  |             |
-|    approx_kl            | 0.007682803 |
-|    clip_fraction        | 0.0931      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.09       |
-|    explained_variance   | 0.902       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0322     |
-|    n_updates            | 860         |
-|    policy_gradient_loss | -0.00224    |
-|    std                  | 1.14        |
-|    value_loss           | 0.013       |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1869         |
-|    iterations           | 88           |
-|    time_elapsed         | 771          |
-|    total_timesteps      | 1441792      |
-| train/                  |              |
-|    approx_kl            | 0.0063949013 |
-|    clip_fraction        | 0.0786       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.1         |
-|    explained_variance   | 0.953        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0401      |
-|    n_updates            | 870          |
-|    policy_gradient_loss | -0.00134     |
-|    std                  | 1.14         |
-|    value_loss           | 0.00193      |
-------------------------------------------
-Eval num_timesteps=1450000, episode_reward=-28.59 +/- 25.61
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -28.6       |
-| time/                   |             |
-|    total_timesteps      | 1450000     |
-| train/                  |             |
-|    approx_kl            | 0.007503539 |
-|    clip_fraction        | 0.0774      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.13       |
-|    explained_variance   | 0.951       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0378     |
-|    n_updates            | 880         |
-|    policy_gradient_loss | -0.00309    |
-|    std                  | 1.16        |
-|    value_loss           | 0.00551     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1845    |
-|    iterations      | 89      |
-|    time_elapsed    | 789     |
-|    total_timesteps | 1458176 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1852         |
-|    iterations           | 90           |
-|    time_elapsed         | 796          |
-|    total_timesteps      | 1474560      |
-| train/                  |              |
-|    approx_kl            | 0.0075057503 |
-|    clip_fraction        | 0.0793       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.15        |
-|    explained_variance   | 0.955        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0439      |
-|    n_updates            | 890          |
-|    policy_gradient_loss | -0.00264     |
-|    std                  | 1.17         |
-|    value_loss           | 0.00265      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1857         |
-|    iterations           | 91           |
-|    time_elapsed         | 802          |
-|    total_timesteps      | 1490944      |
-| train/                  |              |
-|    approx_kl            | 0.0068523246 |
-|    clip_fraction        | 0.0755       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.15        |
-|    explained_variance   | 0.935        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0282      |
-|    n_updates            | 900          |
-|    policy_gradient_loss | -0.00292     |
-|    std                  | 1.17         |
-|    value_loss           | 0.00268      |
-------------------------------------------
-Eval num_timesteps=1500000, episode_reward=-40.66 +/- 25.29
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -40.7       |
-| time/                   |             |
-|    total_timesteps      | 1500000     |
-| train/                  |             |
-|    approx_kl            | 0.007249858 |
-|    clip_fraction        | 0.0857      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.15       |
-|    explained_variance   | 0.952       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0366     |
-|    n_updates            | 910         |
-|    policy_gradient_loss | -0.00319    |
-|    std                  | 1.17        |
-|    value_loss           | 0.00564     |
------------------------------------------
-
-[Diag @ 1,500,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              14/20
-  COMPACT_CANT_DRIVE         6/20
-  action_mag mean=0.050 p10=0.005 p90=0.049 (0=stopped, 1=full speed)
-  min_flock_radius mean=6.53m best=0.98m  (target <5m to compact)
-  min_dog_to_com   mean=0.46m best=0.06m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.38m best=5.44m
-  reward/step (mean): progress=+0.0039  alignment=+0.0011  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1819    |
-|    iterations      | 92      |
-|    time_elapsed    | 828     |
-|    total_timesteps | 1507328 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1828        |
-|    iterations           | 93          |
-|    time_elapsed         | 833         |
-|    total_timesteps      | 1523712     |
-| train/                  |             |
-|    approx_kl            | 0.007471386 |
-|    clip_fraction        | 0.0834      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.16       |
-|    explained_variance   | 0.929       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0275     |
-|    n_updates            | 920         |
-|    policy_gradient_loss | -0.00192    |
-|    std                  | 1.17        |
-|    value_loss           | 0.00791     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1835        |
-|    iterations           | 94          |
-|    time_elapsed         | 838         |
-|    total_timesteps      | 1540096     |
-| train/                  |             |
-|    approx_kl            | 0.007296456 |
-|    clip_fraction        | 0.0765      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.17       |
-|    explained_variance   | 0.95        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0484     |
-|    n_updates            | 930         |
-|    policy_gradient_loss | -0.00366    |
-|    std                  | 1.18        |
-|    value_loss           | 0.00788     |
------------------------------------------
-Eval num_timesteps=1550000, episode_reward=-34.66 +/- 25.47
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -34.7       |
-| time/                   |             |
-|    total_timesteps      | 1550000     |
-| train/                  |             |
-|    approx_kl            | 0.007654687 |
-|    clip_fraction        | 0.095       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.18       |
-|    explained_variance   | 0.92        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0386     |
-|    n_updates            | 940         |
-|    policy_gradient_loss | -0.00316    |
-|    std                  | 1.19        |
-|    value_loss           | 0.00363     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1817    |
-|    iterations      | 95      |
-|    time_elapsed    | 856     |
-|    total_timesteps | 1556480 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1823        |
-|    iterations           | 96          |
-|    time_elapsed         | 862         |
-|    total_timesteps      | 1572864     |
-| train/                  |             |
-|    approx_kl            | 0.007030643 |
-|    clip_fraction        | 0.0881      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.18       |
-|    explained_variance   | 0.944       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0346     |
-|    n_updates            | 950         |
-|    policy_gradient_loss | -0.00321    |
-|    std                  | 1.19        |
-|    value_loss           | 0.00208     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1828         |
-|    iterations           | 97           |
-|    time_elapsed         | 869          |
-|    total_timesteps      | 1589248      |
-| train/                  |              |
-|    approx_kl            | 0.0071562277 |
-|    clip_fraction        | 0.0834       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.19        |
-|    explained_variance   | 0.955        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0196      |
-|    n_updates            | 960          |
-|    policy_gradient_loss | -0.00259     |
-|    std                  | 1.2          |
-|    value_loss           | 0.00773      |
-------------------------------------------
-Eval num_timesteps=1600000, episode_reward=-33.49 +/- 36.88
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -33.5        |
-| time/                   |              |
-|    total_timesteps      | 1600000      |
-| train/                  |              |
-|    approx_kl            | 0.0069667175 |
-|    clip_fraction        | 0.0741       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.2         |
-|    explained_variance   | 0.94         |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0313      |
-|    n_updates            | 970          |
-|    policy_gradient_loss | -0.00399     |
-|    std                  | 1.2          |
-|    value_loss           | 0.00419      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1810    |
-|    iterations      | 98      |
-|    time_elapsed    | 886     |
-|    total_timesteps | 1605632 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1819         |
-|    iterations           | 99           |
-|    time_elapsed         | 891          |
-|    total_timesteps      | 1622016      |
-| train/                  |              |
-|    approx_kl            | 0.0061995042 |
-|    clip_fraction        | 0.0767       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.21        |
-|    explained_variance   | 0.968        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.036       |
-|    n_updates            | 980          |
-|    policy_gradient_loss | -0.00289     |
-|    std                  | 1.2          |
-|    value_loss           | 0.00241      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1826        |
-|    iterations           | 100         |
-|    time_elapsed         | 896         |
-|    total_timesteps      | 1638400     |
-| train/                  |             |
-|    approx_kl            | 0.006502889 |
-|    clip_fraction        | 0.0714      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.22       |
-|    explained_variance   | 0.976       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0445     |
-|    n_updates            | 990         |
-|    policy_gradient_loss | -0.00314    |
-|    std                  | 1.21        |
-|    value_loss           | 0.00218     |
------------------------------------------
-Eval num_timesteps=1650000, episode_reward=-38.00 +/- 30.02
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -38         |
-| time/                   |             |
-|    total_timesteps      | 1650000     |
-| train/                  |             |
-|    approx_kl            | 0.006163503 |
-|    clip_fraction        | 0.0739      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.22       |
-|    explained_variance   | 0.955       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0391     |
-|    n_updates            | 1000        |
-|    policy_gradient_loss | -0.00257    |
-|    std                  | 1.22        |
-|    value_loss           | 0.0027      |
------------------------------------------
-
-[Diag @ 1,650,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              16/20
-  COMPACT_CANT_DRIVE         4/20
-  action_mag mean=0.054 p10=0.002 p90=0.051 (0=stopped, 1=full speed)
-  min_flock_radius mean=6.63m best=3.72m  (target <5m to compact)
-  min_dog_to_com   mean=0.60m best=0.09m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.17m best=5.44m
-  reward/step (mean): progress=+0.0032  alignment=+0.0015  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1793    |
-|    iterations      | 101     |
-|    time_elapsed    | 922     |
-|    total_timesteps | 1654784 |
---------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1800       |
-|    iterations           | 102        |
-|    time_elapsed         | 927        |
-|    total_timesteps      | 1671168    |
-| train/                  |            |
-|    approx_kl            | 0.00634938 |
-|    clip_fraction        | 0.073      |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -3.23      |
-|    explained_variance   | 0.97       |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0462    |
-|    n_updates            | 1010       |
-|    policy_gradient_loss | -0.00394   |
-|    std                  | 1.22       |
-|    value_loss           | 0.00334    |
-----------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1807         |
-|    iterations           | 103          |
-|    time_elapsed         | 933          |
-|    total_timesteps      | 1687552      |
-| train/                  |              |
-|    approx_kl            | 0.0072235917 |
-|    clip_fraction        | 0.0774       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.23        |
-|    explained_variance   | 0.957        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0284      |
-|    n_updates            | 1020         |
-|    policy_gradient_loss | -0.00292     |
-|    std                  | 1.22         |
-|    value_loss           | 0.00807      |
-------------------------------------------
-Eval num_timesteps=1700000, episode_reward=-32.26 +/- 31.96
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -32.3        |
-| time/                   |              |
-|    total_timesteps      | 1700000      |
-| train/                  |              |
-|    approx_kl            | 0.0060304543 |
-|    clip_fraction        | 0.0721       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.23        |
-|    explained_variance   | 0.929        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0427      |
-|    n_updates            | 1030         |
-|    policy_gradient_loss | -0.00306     |
-|    std                  | 1.21         |
-|    value_loss           | 0.00208      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1789    |
-|    iterations      | 104     |
-|    time_elapsed    | 952     |
-|    total_timesteps | 1703936 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1795        |
-|    iterations           | 105         |
-|    time_elapsed         | 958         |
-|    total_timesteps      | 1720320     |
-| train/                  |             |
-|    approx_kl            | 0.006440907 |
-|    clip_fraction        | 0.0642      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.22       |
-|    explained_variance   | 0.947       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0317     |
-|    n_updates            | 1040        |
-|    policy_gradient_loss | -0.00158    |
-|    std                  | 1.21        |
-|    value_loss           | 0.00165     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1801        |
-|    iterations           | 106         |
-|    time_elapsed         | 963         |
-|    total_timesteps      | 1736704     |
-| train/                  |             |
-|    approx_kl            | 0.006897255 |
-|    clip_fraction        | 0.0738      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.2        |
-|    explained_variance   | 0.939       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0408     |
-|    n_updates            | 1050        |
-|    policy_gradient_loss | -0.00349    |
-|    std                  | 1.19        |
-|    value_loss           | 0.00814     |
------------------------------------------
-Eval num_timesteps=1750000, episode_reward=-40.58 +/- 28.91
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -40.6        |
-| time/                   |              |
-|    total_timesteps      | 1750000      |
-| train/                  |              |
-|    approx_kl            | 0.0070952754 |
-|    clip_fraction        | 0.0742       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.19        |
-|    explained_variance   | 0.957        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0308      |
-|    n_updates            | 1060         |
-|    policy_gradient_loss | -0.0037      |
-|    std                  | 1.19         |
-|    value_loss           | 0.0191       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1784    |
-|    iterations      | 107     |
-|    time_elapsed    | 982     |
-|    total_timesteps | 1753088 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1791        |
-|    iterations           | 108         |
-|    time_elapsed         | 987         |
-|    total_timesteps      | 1769472     |
-| train/                  |             |
-|    approx_kl            | 0.006444447 |
-|    clip_fraction        | 0.0736      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.2        |
-|    explained_variance   | 0.968       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0362     |
-|    n_updates            | 1070        |
-|    policy_gradient_loss | -0.00409    |
-|    std                  | 1.2         |
-|    value_loss           | 0.00395     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1797        |
-|    iterations           | 109         |
-|    time_elapsed         | 993         |
-|    total_timesteps      | 1785856     |
-| train/                  |             |
-|    approx_kl            | 0.007391736 |
-|    clip_fraction        | 0.0758      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.22       |
-|    explained_variance   | 0.96        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0341     |
-|    n_updates            | 1080        |
-|    policy_gradient_loss | -0.00272    |
-|    std                  | 1.21        |
-|    value_loss           | 0.00221     |
------------------------------------------
-Eval num_timesteps=1800000, episode_reward=-29.06 +/- 30.98
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -29.1       |
-| time/                   |             |
-|    total_timesteps      | 1800000     |
-| train/                  |             |
-|    approx_kl            | 0.006899439 |
-|    clip_fraction        | 0.0695      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.25       |
-|    explained_variance   | 0.965       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0317     |
-|    n_updates            | 1090        |
-|    policy_gradient_loss | -0.00226    |
-|    std                  | 1.23        |
-|    value_loss           | 0.00615     |
------------------------------------------
-
-[Diag @ 1,800,000 | n_sheep=3 | success=0%]
-  NEVER_COMPACT              11/20
-  COMPACT_CANT_DRIVE         9/20
-  action_mag mean=0.054 p10=0.003 p90=0.057 (0=stopped, 1=full speed)
-  min_flock_radius mean=6.01m best=1.13m  (target <5m to compact)
-  min_dog_to_com   mean=0.51m best=0.11m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=12.52m best=3.21m
-  reward/step (mean): progress=+0.0050  alignment=+0.0017  pen_bonus=+0.0008  step_cost=-0.0200  complete=+0.0000
-
-[Curriculum] leaving stage n_sheep=3 after 600,000 steps | training success rate (last 100 eps) = 0%
-[Curriculum] → 4 sheep at step 1,800,000
-
---------------------------------
-| time/              |         |
-|    fps             | 1769    |
-|    iterations      | 110     |
-|    time_elapsed    | 1018    |
-|    total_timesteps | 1802240 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1776        |
-|    iterations           | 111         |
-|    time_elapsed         | 1023        |
-|    total_timesteps      | 1818624     |
-| train/                  |             |
-|    approx_kl            | 0.006710761 |
-|    clip_fraction        | 0.0761      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.25       |
-|    explained_variance   | 0.867       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.031      |
-|    n_updates            | 1100        |
-|    policy_gradient_loss | -0.00311    |
-|    std                  | 1.23        |
-|    value_loss           | 0.0186      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1783        |
-|    iterations           | 112         |
-|    time_elapsed         | 1028        |
-|    total_timesteps      | 1835008     |
-| train/                  |             |
-|    approx_kl            | 0.006202608 |
-|    clip_fraction        | 0.0682      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.25       |
-|    explained_variance   | 0.954       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0245     |
-|    n_updates            | 1110        |
-|    policy_gradient_loss | -0.00429    |
-|    std                  | 1.23        |
-|    value_loss           | 0.00641     |
------------------------------------------
-Eval num_timesteps=1850000, episode_reward=-35.87 +/- 42.36
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -35.9       |
-| time/                   |             |
-|    total_timesteps      | 1850000     |
-| train/                  |             |
-|    approx_kl            | 0.008398036 |
-|    clip_fraction        | 0.086       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.28       |
-|    explained_variance   | 0.938       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0514     |
-|    n_updates            | 1120        |
-|    policy_gradient_loss | -0.00497    |
-|    std                  | 1.25        |
-|    value_loss           | 0.00614     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1768    |
-|    iterations      | 113     |
-|    time_elapsed    | 1046    |
-|    total_timesteps | 1851392 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1775        |
-|    iterations           | 114         |
-|    time_elapsed         | 1052        |
-|    total_timesteps      | 1867776     |
-| train/                  |             |
-|    approx_kl            | 0.007641702 |
-|    clip_fraction        | 0.0742      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.31       |
-|    explained_variance   | 0.935       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.046      |
-|    n_updates            | 1130        |
-|    policy_gradient_loss | -0.00349    |
-|    std                  | 1.28        |
-|    value_loss           | 0.0228      |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1781         |
-|    iterations           | 115          |
-|    time_elapsed         | 1057         |
-|    total_timesteps      | 1884160      |
-| train/                  |              |
-|    approx_kl            | 0.0073437546 |
-|    clip_fraction        | 0.0747       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.34        |
-|    explained_variance   | 0.928        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0498      |
-|    n_updates            | 1140         |
-|    policy_gradient_loss | -0.00496     |
-|    std                  | 1.29         |
-|    value_loss           | 0.00764      |
-------------------------------------------
-Eval num_timesteps=1900000, episode_reward=-41.88 +/- 27.01
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -41.9       |
-| time/                   |             |
-|    total_timesteps      | 1900000     |
-| train/                  |             |
-|    approx_kl            | 0.006885264 |
-|    clip_fraction        | 0.0728      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.36       |
-|    explained_variance   | 0.934       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0503     |
-|    n_updates            | 1150        |
-|    policy_gradient_loss | -0.00384    |
-|    std                  | 1.3         |
-|    value_loss           | 0.00423     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1767    |
-|    iterations      | 116     |
-|    time_elapsed    | 1075    |
-|    total_timesteps | 1900544 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1773         |
-|    iterations           | 117          |
-|    time_elapsed         | 1080         |
-|    total_timesteps      | 1916928      |
-| train/                  |              |
-|    approx_kl            | 0.0077611385 |
-|    clip_fraction        | 0.0792       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.38        |
-|    explained_variance   | 0.931        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0374      |
-|    n_updates            | 1160         |
-|    policy_gradient_loss | -0.00399     |
-|    std                  | 1.31         |
-|    value_loss           | 0.00292      |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1780        |
-|    iterations           | 118         |
-|    time_elapsed         | 1085        |
-|    total_timesteps      | 1933312     |
-| train/                  |             |
-|    approx_kl            | 0.006831214 |
-|    clip_fraction        | 0.0758      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.4        |
-|    explained_variance   | 0.963       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0175     |
-|    n_updates            | 1170        |
-|    policy_gradient_loss | -0.00471    |
-|    std                  | 1.33        |
-|    value_loss           | 0.00235     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1786        |
-|    iterations           | 119         |
-|    time_elapsed         | 1091        |
-|    total_timesteps      | 1949696     |
-| train/                  |             |
-|    approx_kl            | 0.006474304 |
-|    clip_fraction        | 0.0666      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.43       |
-|    explained_variance   | 0.931       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0318     |
-|    n_updates            | 1180        |
-|    policy_gradient_loss | -0.00285    |
-|    std                  | 1.35        |
-|    value_loss           | 0.00699     |
------------------------------------------
-Eval num_timesteps=1950000, episode_reward=-35.80 +/- 28.95
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -35.8       |
-| time/                   |             |
-|    total_timesteps      | 1950000     |
-| train/                  |             |
-|    approx_kl            | 0.008532442 |
-|    clip_fraction        | 0.0746      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.43       |
-|    explained_variance   | 0.958       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.00337    |
-|    n_updates            | 1190        |
-|    policy_gradient_loss | -0.00376    |
-|    std                  | 1.34        |
-|    value_loss           | 0.0156      |
------------------------------------------
-
-[Diag @ 1,950,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              19/20
-  COMPACT_CANT_DRIVE         1/20
-  action_mag mean=0.049 p10=0.007 p90=0.044 (0=stopped, 1=full speed)
-  min_flock_radius mean=8.95m best=4.96m  (target <5m to compact)
-  min_dog_to_com   mean=0.39m best=0.07m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=14.18m best=9.30m
-  reward/step (mean): progress=-0.0121  alignment=+0.0010  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1759    |
-|    iterations      | 120     |
-|    time_elapsed    | 1117    |
-|    total_timesteps | 1966080 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1766        |
-|    iterations           | 121         |
-|    time_elapsed         | 1122        |
-|    total_timesteps      | 1982464     |
-| train/                  |             |
-|    approx_kl            | 0.006549825 |
-|    clip_fraction        | 0.0665      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.43       |
-|    explained_variance   | 0.966       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0345     |
-|    n_updates            | 1200        |
-|    policy_gradient_loss | -0.00349    |
-|    std                  | 1.34        |
-|    value_loss           | 0.00315     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1773         |
-|    iterations           | 122          |
-|    time_elapsed         | 1127         |
-|    total_timesteps      | 1998848      |
-| train/                  |              |
-|    approx_kl            | 0.0062008686 |
-|    clip_fraction        | 0.0699       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.44        |
-|    explained_variance   | 0.959        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0512      |
-|    n_updates            | 1210         |
-|    policy_gradient_loss | -0.00291     |
-|    std                  | 1.35         |
-|    value_loss           | 0.00544      |
-------------------------------------------
-Eval num_timesteps=2000000, episode_reward=-45.28 +/- 26.78
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -45.3       |
-| time/                   |             |
-|    total_timesteps      | 2000000     |
-| train/                  |             |
-|    approx_kl            | 0.006553275 |
-|    clip_fraction        | 0.0739      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.45       |
-|    explained_variance   | 0.924       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0416     |
-|    n_updates            | 1220        |
-|    policy_gradient_loss | -0.00427    |
-|    std                  | 1.36        |
-|    value_loss           | 0.0127      |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1761    |
-|    iterations      | 123     |
-|    time_elapsed    | 1144    |
-|    total_timesteps | 2015232 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1767         |
-|    iterations           | 124          |
-|    time_elapsed         | 1149         |
-|    total_timesteps      | 2031616      |
-| train/                  |              |
-|    approx_kl            | 0.0059226304 |
-|    clip_fraction        | 0.0653       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.46        |
-|    explained_variance   | 0.947        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.025       |
-|    n_updates            | 1230         |
-|    policy_gradient_loss | -0.00273     |
-|    std                  | 1.36         |
-|    value_loss           | 0.00879      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1775         |
-|    iterations           | 125          |
-|    time_elapsed         | 1153         |
-|    total_timesteps      | 2048000      |
-| train/                  |              |
-|    approx_kl            | 0.0076779695 |
-|    clip_fraction        | 0.0729       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.47        |
-|    explained_variance   | 0.931        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0382      |
-|    n_updates            | 1240         |
-|    policy_gradient_loss | -0.00385     |
-|    std                  | 1.37         |
-|    value_loss           | 0.00692      |
-------------------------------------------
-Eval num_timesteps=2050000, episode_reward=-44.22 +/- 28.52
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -44.2        |
-| time/                   |              |
-|    total_timesteps      | 2050000      |
-| train/                  |              |
-|    approx_kl            | 0.0073502595 |
-|    clip_fraction        | 0.0822       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.49        |
-|    explained_variance   | 0.946        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0342      |
-|    n_updates            | 1250         |
-|    policy_gradient_loss | -0.00592     |
-|    std                  | 1.39         |
-|    value_loss           | 0.00555      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1764    |
-|    iterations      | 126     |
-|    time_elapsed    | 1170    |
-|    total_timesteps | 2064384 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1770        |
-|    iterations           | 127         |
-|    time_elapsed         | 1175        |
-|    total_timesteps      | 2080768     |
-| train/                  |             |
-|    approx_kl            | 0.006628736 |
-|    clip_fraction        | 0.0767      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.51       |
-|    explained_variance   | 0.95        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.035      |
-|    n_updates            | 1260        |
-|    policy_gradient_loss | -0.00457    |
-|    std                  | 1.4         |
-|    value_loss           | 0.00416     |
------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1776         |
-|    iterations           | 128          |
-|    time_elapsed         | 1180         |
-|    total_timesteps      | 2097152      |
-| train/                  |              |
-|    approx_kl            | 0.0068027405 |
-|    clip_fraction        | 0.0719       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.53        |
-|    explained_variance   | 0.891        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0391      |
-|    n_updates            | 1270         |
-|    policy_gradient_loss | -0.00312     |
-|    std                  | 1.42         |
-|    value_loss           | 0.00492      |
-------------------------------------------
-Eval num_timesteps=2100000, episode_reward=-39.37 +/- 34.76
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -39.4       |
-| time/                   |             |
-|    total_timesteps      | 2100000     |
-| train/                  |             |
-|    approx_kl            | 0.005523986 |
-|    clip_fraction        | 0.0604      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.54       |
-|    explained_variance   | 0.938       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0364     |
-|    n_updates            | 1280        |
-|    policy_gradient_loss | -0.00281    |
-|    std                  | 1.42        |
-|    value_loss           | 0.015       |
------------------------------------------
-
-[Diag @ 2,100,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              20/20
-  action_mag mean=0.047 p10=0.002 p90=0.041 (0=stopped, 1=full speed)
-  min_flock_radius mean=8.62m best=5.89m  (target <5m to compact)
-  min_dog_to_com   mean=0.46m best=0.04m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=14.19m best=7.53m
-  reward/step (mean): progress=-0.0012  alignment=+0.0012  pen_bonus=+0.0010  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1751    |
-|    iterations      | 129     |
-|    time_elapsed    | 1206    |
-|    total_timesteps | 2113536 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1756        |
-|    iterations           | 130         |
-|    time_elapsed         | 1212        |
-|    total_timesteps      | 2129920     |
-| train/                  |             |
-|    approx_kl            | 0.007766474 |
-|    clip_fraction        | 0.0823      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.53       |
-|    explained_variance   | 0.96        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0396     |
-|    n_updates            | 1290        |
-|    policy_gradient_loss | -0.00492    |
-|    std                  | 1.41        |
-|    value_loss           | 0.00554     |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1762        |
-|    iterations           | 131         |
-|    time_elapsed         | 1217        |
-|    total_timesteps      | 2146304     |
-| train/                  |             |
-|    approx_kl            | 0.006704482 |
-|    clip_fraction        | 0.0748      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.53       |
-|    explained_variance   | 0.97        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0466     |
-|    n_updates            | 1300        |
-|    policy_gradient_loss | -0.00339    |
-|    std                  | 1.42        |
-|    value_loss           | 0.00432     |
------------------------------------------
-Eval num_timesteps=2150000, episode_reward=-43.17 +/- 26.95
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -43.2        |
-| time/                   |              |
-|    total_timesteps      | 2150000      |
-| train/                  |              |
-|    approx_kl            | 0.0065447316 |
-|    clip_fraction        | 0.0751       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.53        |
-|    explained_variance   | 0.888        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0369      |
-|    n_updates            | 1310         |
-|    policy_gradient_loss | -0.00369     |
-|    std                  | 1.41         |
-|    value_loss           | 0.0165       |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1750    |
-|    iterations      | 132     |
-|    time_elapsed    | 1235    |
-|    total_timesteps | 2162688 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1755         |
-|    iterations           | 133          |
-|    time_elapsed         | 1241         |
-|    total_timesteps      | 2179072      |
-| train/                  |              |
-|    approx_kl            | 0.0070872563 |
-|    clip_fraction        | 0.075        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.54        |
-|    explained_variance   | 0.954        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0427      |
-|    n_updates            | 1320         |
-|    policy_gradient_loss | -0.00406     |
-|    std                  | 1.42         |
-|    value_loss           | 0.00977      |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1762         |
-|    iterations           | 134          |
-|    time_elapsed         | 1245         |
-|    total_timesteps      | 2195456      |
-| train/                  |              |
-|    approx_kl            | 0.0073371828 |
-|    clip_fraction        | 0.077        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.55        |
-|    explained_variance   | 0.939        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0303      |
-|    n_updates            | 1330         |
-|    policy_gradient_loss | -0.00371     |
-|    std                  | 1.43         |
-|    value_loss           | 0.00862      |
-------------------------------------------
-Eval num_timesteps=2200000, episode_reward=-40.81 +/- 44.39
-Episode length: 2000.00 +/- 0.00
-------------------------------------------
-| eval/                   |              |
-|    mean_ep_length       | 2e+03        |
-|    mean_reward          | -40.8        |
-| time/                   |              |
-|    total_timesteps      | 2200000      |
-| train/                  |              |
-|    approx_kl            | 0.0072064474 |
-|    clip_fraction        | 0.0714       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.58        |
-|    explained_variance   | 0.951        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0517      |
-|    n_updates            | 1340         |
-|    policy_gradient_loss | -0.00405     |
-|    std                  | 1.45         |
-|    value_loss           | 0.00351      |
-------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1751    |
-|    iterations      | 135     |
-|    time_elapsed    | 1262    |
-|    total_timesteps | 2211840 |
---------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1758        |
-|    iterations           | 136         |
-|    time_elapsed         | 1267        |
-|    total_timesteps      | 2228224     |
-| train/                  |             |
-|    approx_kl            | 0.008551812 |
-|    clip_fraction        | 0.0911      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.58       |
-|    explained_variance   | 0.929       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0258     |
-|    n_updates            | 1350        |
-|    policy_gradient_loss | -0.00599    |
-|    std                  | 1.45        |
-|    value_loss           | 0.0034      |
------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1764        |
-|    iterations           | 137         |
-|    time_elapsed         | 1271        |
-|    total_timesteps      | 2244608     |
-| train/                  |             |
-|    approx_kl            | 0.006960677 |
-|    clip_fraction        | 0.0702      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.59       |
-|    explained_variance   | 0.9         |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0396     |
-|    n_updates            | 1360        |
-|    policy_gradient_loss | -0.00412    |
-|    std                  | 1.46        |
-|    value_loss           | 0.00429     |
------------------------------------------
-Eval num_timesteps=2250000, episode_reward=-37.92 +/- 31.68
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -37.9       |
-| time/                   |             |
-|    total_timesteps      | 2250000     |
-| train/                  |             |
-|    approx_kl            | 0.005949891 |
-|    clip_fraction        | 0.0683      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.59       |
-|    explained_variance   | 0.948       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0381     |
-|    n_updates            | 1370        |
-|    policy_gradient_loss | -0.00328    |
-|    std                  | 1.46        |
-|    value_loss           | 0.0113      |
------------------------------------------
-
-[Diag @ 2,250,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              19/20
-  COMPACT_CANT_DRIVE         1/20
-  action_mag mean=0.068 p10=0.004 p90=0.045 (0=stopped, 1=full speed)
-  min_flock_radius mean=7.87m best=3.57m  (target <5m to compact)
-  min_dog_to_com   mean=0.45m best=0.15m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=14.06m best=6.95m
-  reward/step (mean): progress=-0.0035  alignment=+0.0020  pen_bonus=+0.0008  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1743    |
-|    iterations      | 138     |
-|    time_elapsed    | 1297    |
-|    total_timesteps | 2260992 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1749         |
-|    iterations           | 139          |
-|    time_elapsed         | 1301         |
-|    total_timesteps      | 2277376      |
-| train/                  |              |
-|    approx_kl            | 0.0071727796 |
-|    clip_fraction        | 0.0784       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.6         |
-|    explained_variance   | 0.943        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0387      |
-|    n_updates            | 1380         |
-|    policy_gradient_loss | -0.0042      |
-|    std                  | 1.46         |
-|    value_loss           | 0.0113       |
-------------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1755        |
-|    iterations           | 140         |
-|    time_elapsed         | 1306        |
-|    total_timesteps      | 2293760     |
-| train/                  |             |
-|    approx_kl            | 0.006800391 |
-|    clip_fraction        | 0.0662      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.59       |
-|    explained_variance   | 0.931       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0283     |
-|    n_updates            | 1390        |
-|    policy_gradient_loss | -0.00421    |
-|    std                  | 1.46        |
-|    value_loss           | 0.00659     |
------------------------------------------
-Eval num_timesteps=2300000, episode_reward=-47.47 +/- 37.24
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -47.5       |
-| time/                   |             |
-|    total_timesteps      | 2300000     |
-| train/                  |             |
-|    approx_kl            | 0.008103053 |
-|    clip_fraction        | 0.081       |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.59       |
-|    explained_variance   | 0.945       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0433     |
-|    n_updates            | 1400        |
-|    policy_gradient_loss | -0.00404    |
-|    std                  | 1.46        |
-|    value_loss           | 0.00796     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1745    |
-|    iterations      | 141     |
-|    time_elapsed    | 1323    |
-|    total_timesteps | 2310144 |
---------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1751         |
-|    iterations           | 142          |
-|    time_elapsed         | 1328         |
-|    total_timesteps      | 2326528      |
-| train/                  |              |
-|    approx_kl            | 0.0061590094 |
-|    clip_fraction        | 0.066        |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.61        |
-|    explained_variance   | 0.957        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0436      |
-|    n_updates            | 1410         |
-|    policy_gradient_loss | -0.00287     |
-|    std                  | 1.47         |
-|    value_loss           | 0.0102       |
-------------------------------------------
-------------------------------------------
-| time/                   |              |
-|    fps                  | 1757         |
-|    iterations           | 143          |
-|    time_elapsed         | 1332         |
-|    total_timesteps      | 2342912      |
-| train/                  |              |
-|    approx_kl            | 0.0070403973 |
-|    clip_fraction        | 0.0733       |
-|    clip_range           | 0.2          |
-|    entropy_loss         | -3.62        |
-|    explained_variance   | 0.863        |
-|    learning_rate        | 0.0003       |
-|    loss                 | -0.0356      |
-|    n_updates            | 1420         |
-|    policy_gradient_loss | -0.00525     |
-|    std                  | 1.48         |
-|    value_loss           | 0.0103       |
-------------------------------------------
-Eval num_timesteps=2350000, episode_reward=-47.95 +/- 27.60
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -48         |
-| time/                   |             |
-|    total_timesteps      | 2350000     |
-| train/                  |             |
-|    approx_kl            | 0.007505033 |
-|    clip_fraction        | 0.0729      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.64       |
-|    explained_variance   | 0.94        |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0473     |
-|    n_updates            | 1430        |
-|    policy_gradient_loss | -0.00385    |
-|    std                  | 1.5         |
-|    value_loss           | 0.00449     |
------------------------------------------
---------------------------------
-| time/              |         |
-|    fps             | 1747    |
-|    iterations      | 144     |
-|    time_elapsed    | 1350    |
-|    total_timesteps | 2359296 |
---------------------------------
-----------------------------------------
-| time/                   |            |
-|    fps                  | 1752       |
-|    iterations           | 145        |
-|    time_elapsed         | 1355       |
-|    total_timesteps      | 2375680    |
-| train/                  |            |
-|    approx_kl            | 0.00724002 |
-|    clip_fraction        | 0.0739     |
-|    clip_range           | 0.2        |
-|    entropy_loss         | -3.65      |
-|    explained_variance   | 0.948      |
-|    learning_rate        | 0.0003     |
-|    loss                 | -0.0419    |
-|    n_updates            | 1440       |
-|    policy_gradient_loss | -0.00426   |
-|    std                  | 1.5        |
-|    value_loss           | 0.00886    |
-----------------------------------------
------------------------------------------
-| time/                   |             |
-|    fps                  | 1758        |
-|    iterations           | 146         |
-|    time_elapsed         | 1360        |
-|    total_timesteps      | 2392064     |
-| train/                  |             |
-|    approx_kl            | 0.007578165 |
-|    clip_fraction        | 0.0713      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.64       |
-|    explained_variance   | 0.859       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0427     |
-|    n_updates            | 1450        |
-|    policy_gradient_loss | -0.0049     |
-|    std                  | 1.49        |
-|    value_loss           | 0.00429     |
------------------------------------------
-Eval num_timesteps=2400000, episode_reward=-47.88 +/- 34.39
-Episode length: 2000.00 +/- 0.00
------------------------------------------
-| eval/                   |             |
-|    mean_ep_length       | 2e+03       |
-|    mean_reward          | -47.9       |
-| time/                   |             |
-|    total_timesteps      | 2400000     |
-| train/                  |             |
-|    approx_kl            | 0.006707498 |
-|    clip_fraction        | 0.0692      |
-|    clip_range           | 0.2         |
-|    entropy_loss         | -3.65       |
-|    explained_variance   | 0.861       |
-|    learning_rate        | 0.0003      |
-|    loss                 | -0.0426     |
-|    n_updates            | 1460        |
-|    policy_gradient_loss | -0.00411    |
-|    std                  | 1.5         |
-|    value_loss           | 0.00639     |
------------------------------------------
-
-[Diag @ 2,400,000 | n_sheep=4 | success=0%]
-  NEVER_COMPACT              19/20
-  COMPACT_CANT_DRIVE         1/20
-  action_mag mean=0.052 p10=0.005 p90=0.045 (0=stopped, 1=full speed)
-  min_flock_radius mean=8.79m best=3.32m  (target <5m to compact)
-  min_dog_to_com   mean=0.45m best=0.20m  (FLEE_DIST=7m)
-  min_com_to_pen   mean=13.96m best=9.02m
-  reward/step (mean): progress=-0.0047  alignment=+0.0013  pen_bonus=+0.0005  step_cost=-0.0200  complete=+0.0000
---------------------------------
-| time/              |         |
-|    fps             | 1737    |
-|    iterations      | 147     |
-|    time_elapsed    | 1386    |
-|    total_timesteps | 2408448 |
---------------------------------
-
-Training complete. Artefacts saved to runs/ppo_fix_check2/
diff --git a/training/runs/ppo_fix_check2/best_model/best_model.zip b/training/runs/ppo_fix_check2/best_model/best_model.zip
deleted file mode 100644
index b07d85b..0000000
Binary files a/training/runs/ppo_fix_check2/best_model/best_model.zip and /dev/null differ
diff --git a/training/runs/ppo_fix_check2/evaluations.npz b/training/runs/ppo_fix_check2/evaluations.npz
deleted file mode 100644
index cc6f67e..0000000
Binary files a/training/runs/ppo_fix_check2/evaluations.npz and /dev/null differ
diff --git a/training/runs/ppo_fix_check2/final_model.zip b/training/runs/ppo_fix_check2/final_model.zip
deleted file mode 100644
index ac482b3..0000000
Binary files a/training/runs/ppo_fix_check2/final_model.zip and /dev/null differ
diff --git a/training/runs/ppo_fix_check2/vecnorm.pkl b/training/runs/ppo_fix_check2/vecnorm.pkl
deleted file mode 100644
index 20a640e..0000000
Binary files a/training/runs/ppo_fix_check2/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/ppo_v2/best_model/best_model.zip b/training/runs/ppo_v2/best_model/best_model.zip
deleted file mode 100644
index 9d6f244..0000000
Binary files a/training/runs/ppo_v2/best_model/best_model.zip and /dev/null differ
diff --git a/training/runs/ppo_v2/evaluations.npz b/training/runs/ppo_v2/evaluations.npz
deleted file mode 100644
index 5f2a578..0000000
Binary files a/training/runs/ppo_v2/evaluations.npz and /dev/null differ
diff --git a/training/runs/ppo_v2/final_model.zip b/training/runs/ppo_v2/final_model.zip
deleted file mode 100644
index 49e3adf..0000000
Binary files a/training/runs/ppo_v2/final_model.zip and /dev/null differ
diff --git a/training/runs/ppo_v2/vecnorm.pkl b/training/runs/ppo_v2/vecnorm.pkl
deleted file mode 100644
index 76f9df6..0000000
Binary files a/training/runs/ppo_v2/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/ppo_v2_cont/best_model/best_model.zip b/training/runs/ppo_v2_cont/best_model/best_model.zip
deleted file mode 100644
index fa1a968..0000000
Binary files a/training/runs/ppo_v2_cont/best_model/best_model.zip and /dev/null differ
diff --git a/training/runs/ppo_v2_cont/evaluations.npz b/training/runs/ppo_v2_cont/evaluations.npz
deleted file mode 100644
index 634b804..0000000
Binary files a/training/runs/ppo_v2_cont/evaluations.npz and /dev/null differ
diff --git a/training/runs/ppo_v2_cont/final_model.zip b/training/runs/ppo_v2_cont/final_model.zip
deleted file mode 100644
index 49a0296..0000000
Binary files a/training/runs/ppo_v2_cont/final_model.zip and /dev/null differ
diff --git a/training/runs/ppo_v2_cont/vecnorm.pkl b/training/runs/ppo_v2_cont/vecnorm.pkl
deleted file mode 100644
index d7b6d54..0000000
Binary files a/training/runs/ppo_v2_cont/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/ppo_v3/best_model/best_model.zip b/training/runs/ppo_v3/best_model/best_model.zip
deleted file mode 100644
index 82d0259..0000000
Binary files a/training/runs/ppo_v3/best_model/best_model.zip and /dev/null differ
diff --git a/training/runs/ppo_v3/evaluations.npz b/training/runs/ppo_v3/evaluations.npz
deleted file mode 100644
index 1d5ee82..0000000
Binary files a/training/runs/ppo_v3/evaluations.npz and /dev/null differ
diff --git a/training/runs/ppo_v3/final_model.zip b/training/runs/ppo_v3/final_model.zip
deleted file mode 100644
index ce84843..0000000
Binary files a/training/runs/ppo_v3/final_model.zip and /dev/null differ
diff --git a/training/runs/ppo_v3/vecnorm.pkl b/training/runs/ppo_v3/vecnorm.pkl
deleted file mode 100644
index 4729c11..0000000
Binary files a/training/runs/ppo_v3/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/replay_20260425_152857/config.json b/training/runs/replay_20260425_152857/config.json
deleted file mode 100644
index b2d15fe..0000000
--- a/training/runs/replay_20260425_152857/config.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "W_PER_SHEEP": 1.0,
-  "W_ALIGN": 0.0,
-  "W_PEN_BONUS": 5.0,
-  "W_STEP_COST": 0.02,
-  "W_COMPLETE": 200.0,
-  "W_COMPACT": 1.5,
-  "ALIGN_SHAPE": "standoff",
-  "ALIGN_GATED": false,
-  "ent_coef": 0.02
-}
\ No newline at end of file
diff --git a/training/runs/replay_20260425_152857/final_model.zip b/training/runs/replay_20260425_152857/final_model.zip
deleted file mode 100644
index b326e4c..0000000
Binary files a/training/runs/replay_20260425_152857/final_model.zip and /dev/null differ
diff --git a/training/runs/replay_20260425_152857/stage_results.json b/training/runs/replay_20260425_152857/stage_results.json
deleted file mode 100644
index c4e1ec0..0000000
--- a/training/runs/replay_20260425_152857/stage_results.json
+++ /dev/null
@@ -1,23 +0,0 @@
-[
-  {
-    "n_sheep": 1,
-    "sr": 1.0,
-    "mean_len": 267.6333333333333,
-    "mean_min_pen": 3.7235233147939044,
-    "mean_act": 0.3746675180125346
-  },
-  {
-    "n_sheep": 2,
-    "sr": 0.06666666666666667,
-    "mean_len": 1458.6666666666667,
-    "mean_min_pen": 14.14484707514445,
-    "mean_act": 0.284232099657656
-  },
-  {
-    "n_sheep": 3,
-    "sr": 0.0,
-    "mean_len": 1500.0,
-    "mean_min_pen": 12.514182837804158,
-    "mean_act": 1.2590703022670828
-  }
-]
\ No newline at end of file
diff --git a/training/runs/replay_20260425_152857/vecnorm.pkl b/training/runs/replay_20260425_152857/vecnorm.pkl
deleted file mode 100644
index 0a57434..0000000
Binary files a/training/runs/replay_20260425_152857/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/replay_best.log b/training/runs/replay_best.log
deleted file mode 100644
index 5fd9f21..0000000
--- a/training/runs/replay_best.log
+++ /dev/null
@@ -1,72 +0,0 @@
-Config: {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-Run dir: runs/replay_20260425_152857
-Curriculum: 1 → 3 sheep, 1,500,000 steps/stage
-
-[Stage n_sheep=1] training 1,500,000 steps
-           ... [trial 1 | 1 sheep | 100,000 steps | ret(last 50)=-20.83  sr=6%]
-           ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-21.40  sr=4%]
-           ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=-22.31  sr=0%]
-           ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=-19.13  sr=4%]
-           ... [trial 1 | 1 sheep | 500,000 steps | ret(last 50)=-18.79  sr=8%]
-           ... [trial 1 | 1 sheep | 600,000 steps | ret(last 50)=-10.15  sr=8%]
-           ... [trial 1 | 1 sheep | 700,000 steps | ret(last 50)=+10.14  sr=82%]
-           ... [trial 1 | 1 sheep | 800,000 steps | ret(last 50)=+11.90  sr=100%]
-           ... [trial 1 | 1 sheep | 900,000 steps | ret(last 50)=+11.32  sr=100%]
-           ... [trial 1 | 1 sheep | 1,000,000 steps | ret(last 50)=+11.36  sr=100%]
-           ... [trial 1 | 1 sheep | 1,100,000 steps | ret(last 50)=+11.18  sr=100%]
-           ... [trial 1 | 1 sheep | 1,200,000 steps | ret(last 50)=+11.08  sr=100%]
-           ... [trial 1 | 1 sheep | 1,300,000 steps | ret(last 50)=+11.14  sr=100%]
-           ... [trial 1 | 1 sheep | 1,400,000 steps | ret(last 50)=+11.10  sr=100%]
-           ... [trial 1 | 1 sheep | 1,500,000 steps | ret(last 50)=+10.99  sr=100%]
-[Stage n_sheep=1] evaluating 30 eps
-[Stage n_sheep=1] sr=100%  mean_len=268  mean_min_pen=3.7m  mean_act=0.37
-
-[Stage n_sheep=2] training 1,500,000 steps
-           ... [trial 1 | 2 sheep | 1,507,336 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 1 | 2 sheep | 1,607,336 steps | ret(last 50)=-3.10  sr=2%]
-           ... [trial 1 | 2 sheep | 1,707,336 steps | ret(last 50)=-3.41  sr=2%]
-           ... [trial 1 | 2 sheep | 1,807,336 steps | ret(last 50)=-3.11  sr=6%]
-           ... [trial 1 | 2 sheep | 1,907,336 steps | ret(last 50)=-2.65  sr=8%]
-           ... [trial 1 | 2 sheep | 2,007,336 steps | ret(last 50)=-4.11  sr=2%]
-           ... [trial 1 | 2 sheep | 2,107,336 steps | ret(last 50)=-3.19  sr=6%]
-           ... [trial 1 | 2 sheep | 2,207,336 steps | ret(last 50)=-3.45  sr=4%]
-           ... [trial 1 | 2 sheep | 2,307,336 steps | ret(last 50)=-4.13  sr=0%]
-           ... [trial 1 | 2 sheep | 2,407,336 steps | ret(last 50)=-3.47  sr=8%]
-           ... [trial 1 | 2 sheep | 2,507,336 steps | ret(last 50)=-3.83  sr=4%]
-           ... [trial 1 | 2 sheep | 2,607,336 steps | ret(last 50)=-4.58  sr=0%]
-           ... [trial 1 | 2 sheep | 2,707,336 steps | ret(last 50)=-3.94  sr=2%]
-           ... [trial 1 | 2 sheep | 2,807,336 steps | ret(last 50)=-4.15  sr=2%]
-           ... [trial 1 | 2 sheep | 2,907,336 steps | ret(last 50)=-3.95  sr=4%]
-           ... [trial 1 | 2 sheep | 3,007,336 steps | ret(last 50)=-4.44  sr=0%]
-[Stage n_sheep=2] evaluating 30 eps
-[Stage n_sheep=2] sr=7%  mean_len=1459  mean_min_pen=14.1m  mean_act=0.28
-
-[Stage n_sheep=3] training 1,500,000 steps
-           ... [trial 1 | 3 sheep | 3,014,664 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 1 | 3 sheep | 3,114,664 steps | ret(last 50)=-4.16  sr=0%]
-           ... [trial 1 | 3 sheep | 3,214,664 steps | ret(last 50)=-4.94  sr=0%]
-           ... [trial 1 | 3 sheep | 3,314,664 steps | ret(last 50)=-4.42  sr=0%]
-           ... [trial 1 | 3 sheep | 3,414,664 steps | ret(last 50)=-4.69  sr=0%]
-           ... [trial 1 | 3 sheep | 3,514,664 steps | ret(last 50)=-3.72  sr=0%]
-           ... [trial 1 | 3 sheep | 3,614,664 steps | ret(last 50)=-5.04  sr=0%]
-           ... [trial 1 | 3 sheep | 3,714,664 steps | ret(last 50)=-4.26  sr=0%]
-           ... [trial 1 | 3 sheep | 3,814,664 steps | ret(last 50)=-4.70  sr=0%]
-           ... [trial 1 | 3 sheep | 3,914,664 steps | ret(last 50)=-4.61  sr=0%]
-           ... [trial 1 | 3 sheep | 4,014,664 steps | ret(last 50)=-4.19  sr=0%]
-           ... [trial 1 | 3 sheep | 4,114,664 steps | ret(last 50)=-4.35  sr=0%]
-           ... [trial 1 | 3 sheep | 4,214,664 steps | ret(last 50)=-4.41  sr=0%]
-           ... [trial 1 | 3 sheep | 4,314,664 steps | ret(last 50)=-4.42  sr=0%]
-           ... [trial 1 | 3 sheep | 4,414,664 steps | ret(last 50)=-4.77  sr=0%]
-           ... [trial 1 | 3 sheep | 4,514,664 steps | ret(last 50)=-4.49  sr=0%]
-[Stage n_sheep=3] evaluating 30 eps
-[Stage n_sheep=3] sr=0%  mean_len=1500  mean_min_pen=12.5m  mean_act=1.26
-
-============================================================
-  REPLAY SUMMARY
-============================================================
-  n_sheep=1  sr=100%  len=  268  min_pen=  3.7m  act=0.37
-  n_sheep=2  sr=  7%  len= 1459  min_pen= 14.1m  act=0.28
-  n_sheep=3  sr=  0%  len= 1500  min_pen= 12.5m  act=1.26
-
-  Total time: 26.9 min
-  Artefacts:  runs/replay_20260425_152857/
diff --git a/training/runs/smoke_stage1/model.zip b/training/runs/smoke_stage1/model.zip
deleted file mode 100644
index 232a47b..0000000
Binary files a/training/runs/smoke_stage1/model.zip and /dev/null differ
diff --git a/training/runs/smoke_stage1/timeseries.png b/training/runs/smoke_stage1/timeseries.png
deleted file mode 100644
index 9d6bf7c..0000000
Binary files a/training/runs/smoke_stage1/timeseries.png and /dev/null differ
diff --git a/training/runs/smoke_stage1/trajectory.png b/training/runs/smoke_stage1/trajectory.png
deleted file mode 100644
index 965f743..0000000
Binary files a/training/runs/smoke_stage1/trajectory.png and /dev/null differ
diff --git a/training/runs/smoke_stage1/vecnorm.pkl b/training/runs/smoke_stage1/vecnorm.pkl
deleted file mode 100644
index 731c388..0000000
Binary files a/training/runs/smoke_stage1/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/smoke_stage2/model.zip b/training/runs/smoke_stage2/model.zip
deleted file mode 100644
index 7c746b0..0000000
Binary files a/training/runs/smoke_stage2/model.zip and /dev/null differ
diff --git a/training/runs/smoke_stage2/timeseries.png b/training/runs/smoke_stage2/timeseries.png
deleted file mode 100644
index 2165716..0000000
Binary files a/training/runs/smoke_stage2/timeseries.png and /dev/null differ
diff --git a/training/runs/smoke_stage2/trajectory.png b/training/runs/smoke_stage2/trajectory.png
deleted file mode 100644
index 52340b9..0000000
Binary files a/training/runs/smoke_stage2/trajectory.png and /dev/null differ
diff --git a/training/runs/smoke_stage2/vecnorm.pkl b/training/runs/smoke_stage2/vecnorm.pkl
deleted file mode 100644
index 8870baa..0000000
Binary files a/training/runs/smoke_stage2/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/smoke_stage3/model.zip b/training/runs/smoke_stage3/model.zip
deleted file mode 100644
index 4640505..0000000
Binary files a/training/runs/smoke_stage3/model.zip and /dev/null differ
diff --git a/training/runs/smoke_stage3/timeseries.png b/training/runs/smoke_stage3/timeseries.png
deleted file mode 100644
index c548598..0000000
Binary files a/training/runs/smoke_stage3/timeseries.png and /dev/null differ
diff --git a/training/runs/smoke_stage3/trajectory.png b/training/runs/smoke_stage3/trajectory.png
deleted file mode 100644
index 1b70804..0000000
Binary files a/training/runs/smoke_stage3/trajectory.png and /dev/null differ
diff --git a/training/runs/smoke_stage3/vecnorm.pkl b/training/runs/smoke_stage3/vecnorm.pkl
deleted file mode 100644
index 6cd290a..0000000
Binary files a/training/runs/smoke_stage3/vecnorm.pkl and /dev/null differ
diff --git a/training/runs/sweep_20260425_124021/best.json b/training/runs/sweep_20260425_124021/best.json
deleted file mode 100644
index ee2c2ff..0000000
--- a/training/runs/sweep_20260425_124021/best.json
+++ /dev/null
@@ -1,41 +0,0 @@
-{
-  "trial": 0,
-  "config": {
-    "W_PER_SHEEP": 1.0,
-    "W_ALIGN": 0.1,
-    "W_PEN_BONUS": 10.0,
-    "W_STEP_COST": 0.02,
-    "W_COMPLETE": 100.0,
-    "W_COMPACT": 3.0,
-    "ALIGN_SHAPE": "standoff",
-    "ALIGN_GATED": false,
-    "ent_coef": 0.005
-  },
-  "score": 0.06,
-  "sr": {
-    "1": 0.3,
-    "2": 0.0,
-    "3": 0.0
-  },
-  "details": {
-    "1": {
-      "sr": 0.3,
-      "mean_len": 1252.2,
-      "mean_min_pen": 2.1085331559181215,
-      "mean_act": 0.07743233270979732
-    },
-    "2": {
-      "sr": 0.0,
-      "mean_len": 1500.0,
-      "mean_min_pen": 12.107558453083039,
-      "mean_act": 0.15608626089841424
-    },
-    "3": {
-      "sr": 0.0,
-      "mean_len": 1500.0,
-      "mean_min_pen": 13.675278377532958,
-      "mean_act": 0.10535904271739319
-    }
-  },
-  "elapsed_s": 307.773992061615
-}
\ No newline at end of file
diff --git a/training/runs/sweep_20260425_124021/results.jsonl b/training/runs/sweep_20260425_124021/results.jsonl
deleted file mode 100644
index 191ddee..0000000
--- a/training/runs/sweep_20260425_124021/results.jsonl
+++ /dev/null
@@ -1 +0,0 @@
-{"trial": 0, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1252.2, "mean_min_pen": 2.1085331559181215, "mean_act": 0.07743233270979732}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.107558453083039, "mean_act": 0.15608626089841424}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.675278377532958, "mean_act": 0.10535904271739319}}, "elapsed_s": 307.773992061615}
diff --git a/training/runs/sweep_20260425_124630/best.json b/training/runs/sweep_20260425_124630/best.json
deleted file mode 100644
index 4f41880..0000000
--- a/training/runs/sweep_20260425_124630/best.json
+++ /dev/null
@@ -1,41 +0,0 @@
-{
-  "trial": 13,
-  "config": {
-    "W_PER_SHEEP": 1.0,
-    "W_ALIGN": 0.0,
-    "W_PEN_BONUS": 5.0,
-    "W_STEP_COST": 0.02,
-    "W_COMPLETE": 200.0,
-    "W_COMPACT": 1.5,
-    "ALIGN_SHAPE": "standoff",
-    "ALIGN_GATED": false,
-    "ent_coef": 0.02
-  },
-  "score": 0.35,
-  "sr": {
-    "1": 1.0,
-    "2": 0.3,
-    "3": 0.0
-  },
-  "details": {
-    "1": {
-      "sr": 1.0,
-      "mean_len": 428.9,
-      "mean_min_pen": 3.731236696243286,
-      "mean_act": 0.33429858573849425
-    },
-    "2": {
-      "sr": 0.3,
-      "mean_len": 1242.7,
-      "mean_min_pen": 8.937442195415496,
-      "mean_act": 0.3998076917437125
-    },
-    "3": {
-      "sr": 0.0,
-      "mean_len": 1500.0,
-      "mean_min_pen": 14.061083602905274,
-      "mean_act": 0.5966902794524755
-    }
-  },
-  "elapsed_s": 313.8281009197235
-}
\ No newline at end of file
diff --git a/training/runs/sweep_20260425_124630/results.jsonl b/training/runs/sweep_20260425_124630/results.jsonl
deleted file mode 100644
index cbecd6f..0000000
--- a/training/runs/sweep_20260425_124630/results.jsonl
+++ /dev/null
@@ -1,25 +0,0 @@
-{"trial": 0, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.15000000000000002, "sr": {"1": 0.5, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.5, "mean_len": 1051.6, "mean_min_pen": 3.0551586985588073, "mean_act": 0.0887192903536989}, "2": {"sr": 0.1, "mean_len": 1438.1, "mean_min_pen": 10.993862140178681, "mean_act": 0.1723056222816755}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 11.92835488319397, "mean_act": 0.15403316749989074}}, "elapsed_s": 316.9084241390228}
-{"trial": 1, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.05, "W_COMPLETE": 200.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1153.8, "mean_min_pen": 3.8145030617713926, "mean_act": 0.15146865127462797}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.058024168014526, "mean_act": 0.10904584494279744}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.5988187789917, "mean_act": 0.09578829008591905}}, "elapsed_s": 310.8732409477234}
-{"trial": 2, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 50.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.01}, "score": 0.27, "sr": {"1": 0.7, "2": 0.2, "3": 0.1}, "details": {"1": {"sr": 0.7, "mean_len": 772.1, "mean_min_pen": 2.92204372882843, "mean_act": 0.1583604314471399}, "2": {"sr": 0.2, "mean_len": 1390.6, "mean_min_pen": 12.992859578132629, "mean_act": 0.16090679360424953}, "3": {"sr": 0.1, "mean_len": 1403.7, "mean_min_pen": 13.045468378067017, "mean_act": 0.07991531561051667}}, "elapsed_s": 303.7708294391632}
-{"trial": 3, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 50.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1137.5, "mean_min_pen": 2.1229824781417848, "mean_act": 0.08172097406143335}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 11.521494126319885, "mean_act": 0.16864279503144788}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.317158126831055, "mean_act": 0.05537428615499472}}, "elapsed_s": 301.6172459125519}
-{"trial": 4, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.02, "W_COMPLETE": 50.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": true, "ent_coef": 0.005}, "score": 0.2, "sr": {"1": 1.0, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 1.0, "mean_len": 567.0, "mean_min_pen": 3.2795117855072022, "mean_act": 0.1855437107780058}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 9.976170372962951, "mean_act": 0.2074074002778701}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.89306182861328, "mean_act": 0.21666522849385267}}, "elapsed_s": 313.525591135025}
-{"trial": 5, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.05, "W_COMPLETE": 200.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.01}, "score": 0.16000000000000003, "sr": {"1": 0.8, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.8, "mean_len": 675.5, "mean_min_pen": 3.1338732481002807, "mean_act": 0.11691584614814514}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 9.693846690654755, "mean_act": 0.19984676872865814}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.684805488586425, "mean_act": 0.06430307933471292}}, "elapsed_s": 312.4476580619812}
-{"trial": 6, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.005, "W_COMPLETE": 200.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.01}, "score": 0.08000000000000002, "sr": {"1": 0.4, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.4, "mean_len": 1343.9, "mean_min_pen": 4.092962062358856, "mean_act": 0.07675616785431166}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.157618689537049, "mean_act": 0.13906600509098352}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.079688358306885, "mean_act": 0.07073271389845953}}, "elapsed_s": 337.7615342140198}
-{"trial": 7, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.11, "sr": {"1": 0.3, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1177.5, "mean_min_pen": 2.261639392375946, "mean_act": 0.11013885321646562}, "2": {"sr": 0.1, "mean_len": 1437.5, "mean_min_pen": 5.9263048529624935, "mean_act": 0.16420815230170227}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.130784749984741, "mean_act": 0.20303070502222206}}, "elapsed_s": 451.2424490451813}
-{"trial": 8, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.02, "W_COMPLETE": 50.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.05}, "score": 0.19, "sr": {"1": 0.7, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.7, "mean_len": 874.2, "mean_min_pen": 4.152815592288971, "mean_act": 0.1303976929043709}, "2": {"sr": 0.1, "mean_len": 1381.4, "mean_min_pen": 12.115124177932739, "mean_act": 0.3749806733317197}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.596546864509582, "mean_act": 0.10082290474528718}}, "elapsed_s": 349.3926422595978}
-{"trial": 9, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.02, "W_COMPLETE": 200.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.05}, "score": 0.0, "sr": {"1": 0.0, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 8.404254817962647, "mean_act": 0.6749623541596586}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 11.970247220993041, "mean_act": 0.45562502020561796}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.029277420043945, "mean_act": 0.1599790089856222}}, "elapsed_s": 319.38924622535706}
-{"trial": 10, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.02, "W_COMPLETE": 200.0, "W_COMPACT": 0.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.16000000000000003, "sr": {"1": 0.8, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.8, "mean_len": 690.7, "mean_min_pen": 3.1264367938041686, "mean_act": 0.13493279961414406}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.040377330780029, "mean_act": 0.20203861368317985}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.379706478118896, "mean_act": 0.05979441475490263}}, "elapsed_s": 310.1806254386902}
-{"trial": 11, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.05, "W_COMPLETE": 50.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.24, "sr": {"1": 0.7, "2": 0.2, "3": 0.0}, "details": {"1": {"sr": 0.7, "mean_len": 727.5, "mean_min_pen": 2.933144009113312, "mean_act": 0.11888058594495643}, "2": {"sr": 0.2, "mean_len": 1317.8, "mean_min_pen": 10.2599928855896, "mean_act": 0.14370172662258304}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.231103086471558, "mean_act": 0.0614644922383149}}, "elapsed_s": 330.0620620250702}
-{"trial": 12, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 0.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.005}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1244.8, "mean_min_pen": 2.1193889737129212, "mean_act": 0.08216679023110932}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.745809042453766, "mean_act": 0.16497857472260813}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.016976690292358, "mean_act": 0.09897869050660908}}, "elapsed_s": 323.27931213378906}
-{"trial": 13, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.02, "W_COMPLETE": 200.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.35, "sr": {"1": 1.0, "2": 0.3, "3": 0.0}, "details": {"1": {"sr": 1.0, "mean_len": 428.9, "mean_min_pen": 3.731236696243286, "mean_act": 0.33429858573849425}, "2": {"sr": 0.3, "mean_len": 1242.7, "mean_min_pen": 8.937442195415496, "mean_act": 0.3998076917437125}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.061083602905274, "mean_act": 0.5966902794524755}}, "elapsed_s": 313.8281009197235}
-{"trial": 14, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.05}, "score": 0.13999999999999999, "sr": {"1": 0.7, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.7, "mean_len": 912.4, "mean_min_pen": 2.940706562995911, "mean_act": 1.3471978399000248}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 9.901372599601746, "mean_act": 0.9463685217667609}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.291404342651367, "mean_act": 0.08601266834173493}}, "elapsed_s": 322.57220220565796}
-{"trial": 15, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.01}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1210.5, "mean_min_pen": 2.107759189605713, "mean_act": 0.08131515106917063}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.824185514450074, "mean_act": 0.20362997558291535}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.042323064804076, "mean_act": 0.17125511734669563}}, "elapsed_s": 312.3465087413788}
-{"trial": 16, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 5.0, "W_STEP_COST": 0.005, "W_COMPLETE": 200.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "near", "ALIGN_GATED": true, "ent_coef": 0.05}, "score": 0.24, "sr": {"1": 0.7, "2": 0.2, "3": 0.0}, "details": {"1": {"sr": 0.7, "mean_len": 650.1, "mean_min_pen": 2.981771671772003, "mean_act": 0.1621352170537764}, "2": {"sr": 0.2, "mean_len": 1435.5, "mean_min_pen": 8.686615812778474, "mean_act": 0.3279171284351484}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.374159717559815, "mean_act": 0.04937917392927017}}, "elapsed_s": 303.71519470214844}
-{"trial": 17, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.005, "W_COMPLETE": 100.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.16, "sr": {"1": 0.3, "2": 0.2, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1088.1, "mean_min_pen": 3.4793057322502134, "mean_act": 0.09515179877670824}, "2": {"sr": 0.2, "mean_len": 1428.5, "mean_min_pen": 10.024536824226379, "mean_act": 0.4135459636897354}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.302330660820008, "mean_act": 0.34973196326509737}}, "elapsed_s": 315.76633620262146}
-{"trial": 18, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.05, "W_COMPLETE": 50.0, "W_COMPACT": 0.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.005}, "score": 0.16000000000000003, "sr": {"1": 0.8, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.8, "mean_len": 645.4, "mean_min_pen": 3.1326077818870544, "mean_act": 0.15081361126264722}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.723365247249603, "mean_act": 0.10806036127302399}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.303192138671875, "mean_act": 0.08246586098832388}}, "elapsed_s": 318.483638048172}
-{"trial": 19, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.02}, "score": 0.13, "sr": {"1": 0.4, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.4, "mean_len": 1231.4, "mean_min_pen": 2.6246669054031373, "mean_act": 0.07338090033141094}, "2": {"sr": 0.1, "mean_len": 1420.2, "mean_min_pen": 8.371916389465332, "mean_act": 0.16944798908643302}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 14.287557554244994, "mean_act": 0.09957915147298428}}, "elapsed_s": 315.07627868652344}
-{"trial": 20, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 0.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.005}, "score": 0.05, "sr": {"1": 0.0, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 1.5734932541847229, "mean_act": 0.08394606926547861}, "2": {"sr": 0.1, "mean_len": 1498.9, "mean_min_pen": 6.444609999656677, "mean_act": 0.2938110977638972}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 11.258054113388061, "mean_act": 0.16288984295733971}}, "elapsed_s": 309.5854580402374}
-{"trial": 21, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.05, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.02, "W_COMPLETE": 100.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.005}, "score": 0.11, "sr": {"1": 0.3, "2": 0.1, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1324.6, "mean_min_pen": 3.3425565361976624, "mean_act": 0.1115106962044226}, "2": {"sr": 0.1, "mean_len": 1443.0, "mean_min_pen": 11.069470012187958, "mean_act": 0.17271345215252376}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.845431709289551, "mean_act": 0.13337391122176}}, "elapsed_s": 315.54923272132874}
-{"trial": 22, "config": {"W_PER_SHEEP": 2.0, "W_ALIGN": 0.1, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.05, "W_COMPLETE": 100.0, "W_COMPACT": 1.5, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": true, "ent_coef": 0.05}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1220.2, "mean_min_pen": 2.1276236534118653, "mean_act": 0.4312911105166665}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 8.770305395126343, "mean_act": 0.6047595652043354}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.12634140253067, "mean_act": 0.14348885283676113}}, "elapsed_s": 471.740927696228}
-{"trial": 23, "config": {"W_PER_SHEEP": 6.0, "W_ALIGN": 0.025, "W_PEN_BONUS": 20.0, "W_STEP_COST": 0.005, "W_COMPLETE": 200.0, "W_COMPACT": 3.0, "ALIGN_SHAPE": "standoff", "ALIGN_GATED": false, "ent_coef": 0.01}, "score": 0.06, "sr": {"1": 0.3, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.3, "mean_len": 1209.4, "mean_min_pen": 3.811609184741974, "mean_act": 0.08888363576016632}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.143073177337646, "mean_act": 0.27062979487000655}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 15.135865116119385, "mean_act": 0.3670903712440903}}, "elapsed_s": 335.26912212371826}
-{"trial": 24, "config": {"W_PER_SHEEP": 1.0, "W_ALIGN": 0.0, "W_PEN_BONUS": 10.0, "W_STEP_COST": 0.05, "W_COMPLETE": 50.0, "W_COMPACT": 0.5, "ALIGN_SHAPE": "near", "ALIGN_GATED": true, "ent_coef": 0.02}, "score": 0.0, "sr": {"1": 0.0, "2": 0.0, "3": 0.0}, "details": {"1": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 10.014724779129029, "mean_act": 1.024556803444028}, "2": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 12.734652400016785, "mean_act": 1.0186923123559604}, "3": {"sr": 0.0, "mean_len": 1500.0, "mean_min_pen": 13.690151166915893, "mean_act": 1.000638129701217}}, "elapsed_s": 306.1110165119171}
diff --git a/training/runs/sweep_full.log b/training/runs/sweep_full.log
deleted file mode 100644
index a60c7e4..0000000
--- a/training/runs/sweep_full.log
+++ /dev/null
@@ -1,681 +0,0 @@
-Sweep dir: runs/sweep_20260425_124630
-Search space: ['W_PER_SHEEP', 'W_ALIGN', 'W_PEN_BONUS', 'W_STEP_COST', 'W_COMPLETE', 'W_COMPACT', 'ALIGN_SHAPE', 'ALIGN_GATED', 'ent_coef']
-Per-trial: 1,000,000 steps train + 30 eval eps
-Time budget: 7.5h
-
-[Trial   1] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.005}
-           ... [trial 1 | 1 sheep |  50,000 steps | ret(last 33)=-7.72  sr=6%]
-           ... [trial 1 | 1 sheep | 100,000 steps | ret(last 50)=-10.07  sr=2%]
-           ... [trial 1 | 1 sheep | 150,000 steps | ret(last 50)=-9.89  sr=2%]
-           ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=-7.94  sr=4%]
-           ... [trial 1 | 1 sheep | 250,000 steps | ret(last 50)=+2.69  sr=2%]
-           ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=+18.25  sr=24%]
-           ... [trial 1 | 1 sheep | 350,000 steps | ret(last 50)=+24.63  sr=20%]
-           ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=+24.83  sr=26%]
-           ... [trial 1 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 1 | 2 sheep | 459,608 steps | ret(last 32)=+10.08  sr=0%]
-           ... [trial 1 | 2 sheep | 509,608 steps | ret(last 50)=+11.51  sr=0%]
-           ... [trial 1 | 2 sheep | 559,608 steps | ret(last 50)=+12.82  sr=0%]
-           ... [trial 1 | 2 sheep | 609,608 steps | ret(last 50)=+14.39  sr=0%]
-           ... [trial 1 | 2 sheep | 659,608 steps | ret(last 50)=+14.14  sr=0%]
-           ... [trial 1 | 2 sheep | 709,608 steps | ret(last 50)=+12.36  sr=2%]
-           ... [trial 1 | 2 sheep | 759,608 steps | ret(last 50)=+13.08  sr=0%]
-           ... [trial 1 | 2 sheep | 809,608 steps | ret(last 50)=+13.24  sr=0%]
-           ... [trial 1 | 2 sheep | 859,608 steps | ret(last 50)=+13.23  sr=0%]
-           ... [trial 1 | 2 sheep | 909,608 steps | ret(last 50)=+14.23  sr=2%]
-           ... [trial 1 | 2 sheep | 959,608 steps | ret(last 50)=+14.69  sr=0%]
-           ... [trial 1 | 2 sheep | 1,009,608 steps | ret(last 50)=+20.23  sr=0%]
-           ... [trial 1 | eval n=1]
-           ... [trial 1 | eval n=2]
-           ... [trial 1 | eval n=3]
-           → score=0.150  sr1=0.50 sr2=0.10 sr3=0.00  [317s]
-[Trial   2] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.005}
-           ... [trial 2 | 1 sheep |  50,000 steps | ret(last 34)=-24.61  sr=9%]
-           ... [trial 2 | 1 sheep | 100,000 steps | ret(last 50)=-28.20  sr=10%]
-           ... [trial 2 | 1 sheep | 150,000 steps | ret(last 50)=-28.14  sr=8%]
-           ... [trial 2 | 1 sheep | 200,000 steps | ret(last 50)=-31.36  sr=2%]
-           ... [trial 2 | 1 sheep | 250,000 steps | ret(last 50)=-31.38  sr=6%]
-           ... [trial 2 | 1 sheep | 300,000 steps | ret(last 50)=-32.89  sr=4%]
-           ... [trial 2 | 1 sheep | 350,000 steps | ret(last 50)=-29.11  sr=8%]
-           ... [trial 2 | 1 sheep | 400,000 steps | ret(last 50)=-19.16  sr=30%]
-           ... [trial 2 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 2 | 2 sheep | 459,608 steps | ret(last 34)=-17.61  sr=9%]
-           ... [trial 2 | 2 sheep | 509,608 steps | ret(last 50)=-18.59  sr=2%]
-           ... [trial 2 | 2 sheep | 559,608 steps | ret(last 50)=-16.92  sr=0%]
-           ... [trial 2 | 2 sheep | 609,608 steps | ret(last 50)=-17.40  sr=0%]
-           ... [trial 2 | 2 sheep | 659,608 steps | ret(last 50)=-18.13  sr=0%]
-           ... [trial 2 | 2 sheep | 709,608 steps | ret(last 50)=-17.45  sr=0%]
-           ... [trial 2 | 2 sheep | 759,608 steps | ret(last 50)=-16.06  sr=0%]
-           ... [trial 2 | 2 sheep | 809,608 steps | ret(last 50)=-15.35  sr=0%]
-           ... [trial 2 | 2 sheep | 859,608 steps | ret(last 50)=-12.63  sr=0%]
-           ... [trial 2 | 2 sheep | 909,608 steps | ret(last 50)=-12.41  sr=0%]
-           ... [trial 2 | 2 sheep | 959,608 steps | ret(last 50)=-12.91  sr=0%]
-           ... [trial 2 | 2 sheep | 1,009,608 steps | ret(last 50)=-10.94  sr=0%]
-           ... [trial 2 | eval n=1]
-           ... [trial 2 | eval n=2]
-           ... [trial 2 | eval n=3]
-           → score=0.060  sr1=0.30 sr2=0.00 sr3=0.00  [311s]
-[Trial   3] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 50.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.01}
-           ... [trial 3 | 1 sheep |  50,000 steps | ret(last 32)=-1.75  sr=0%]
-           ... [trial 3 | 1 sheep | 100,000 steps | ret(last 50)=-3.70  sr=0%]
-           ... [trial 3 | 1 sheep | 150,000 steps | ret(last 50)=-6.09  sr=2%]
-           ... [trial 3 | 1 sheep | 200,000 steps | ret(last 50)=-3.44  sr=4%]
-           ... [trial 3 | 1 sheep | 250,000 steps | ret(last 50)=+6.68  sr=8%]
-           ... [trial 3 | 1 sheep | 300,000 steps | ret(last 50)=+14.58  sr=22%]
-           ... [trial 3 | 1 sheep | 350,000 steps | ret(last 50)=+15.28  sr=64%]
-           ... [trial 3 | 1 sheep | 400,000 steps | ret(last 50)=+14.70  sr=74%]
-           ... [trial 3 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 3 | 2 sheep | 459,608 steps | ret(last 35)=+0.82  sr=9%]
-           ... [trial 3 | 2 sheep | 509,608 steps | ret(last 50)=-0.66  sr=2%]
-           ... [trial 3 | 2 sheep | 559,608 steps | ret(last 50)=-0.02  sr=0%]
-           ... [trial 3 | 2 sheep | 609,608 steps | ret(last 50)=-0.02  sr=0%]
-           ... [trial 3 | 2 sheep | 659,608 steps | ret(last 50)=+1.37  sr=4%]
-           ... [trial 3 | 2 sheep | 709,608 steps | ret(last 50)=+2.75  sr=8%]
-           ... [trial 3 | 2 sheep | 759,608 steps | ret(last 50)=+1.25  sr=6%]
-           ... [trial 3 | 2 sheep | 809,608 steps | ret(last 50)=+4.20  sr=10%]
-           ... [trial 3 | 2 sheep | 859,608 steps | ret(last 50)=+2.14  sr=4%]
-           ... [trial 3 | 2 sheep | 909,608 steps | ret(last 50)=+3.13  sr=8%]
-           ... [trial 3 | 2 sheep | 959,608 steps | ret(last 50)=+5.16  sr=6%]
-           ... [trial 3 | 2 sheep | 1,009,608 steps | ret(last 50)=+5.95  sr=8%]
-           ... [trial 3 | eval n=1]
-           ... [trial 3 | eval n=2]
-           ... [trial 3 | eval n=3]
-           → score=0.270  sr1=0.70 sr2=0.20 sr3=0.10  [304s]
-[Trial   4] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 50.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.005}
-           ... [trial 4 | 1 sheep |  50,000 steps | ret(last 33)=-2.86  sr=9%]
-           ... [trial 4 | 1 sheep | 100,000 steps | ret(last 50)=-3.54  sr=6%]
-           ... [trial 4 | 1 sheep | 150,000 steps | ret(last 50)=-2.76  sr=8%]
-           ... [trial 4 | 1 sheep | 200,000 steps | ret(last 50)=-1.56  sr=8%]
-           ... [trial 4 | 1 sheep | 250,000 steps | ret(last 50)=+9.18  sr=24%]
-           ... [trial 4 | 1 sheep | 300,000 steps | ret(last 50)=+18.46  sr=46%]
-           ... [trial 4 | 1 sheep | 350,000 steps | ret(last 50)=+15.01  sr=34%]
-           ... [trial 4 | 1 sheep | 400,000 steps | ret(last 50)=+14.44  sr=42%]
-           ... [trial 4 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 4 | 2 sheep | 459,608 steps | ret(last 35)=+6.77  sr=9%]
-           ... [trial 4 | 2 sheep | 509,608 steps | ret(last 50)=+5.50  sr=6%]
-           ... [trial 4 | 2 sheep | 559,608 steps | ret(last 50)=+4.39  sr=0%]
-           ... [trial 4 | 2 sheep | 609,608 steps | ret(last 50)=+4.54  sr=0%]
-           ... [trial 4 | 2 sheep | 659,608 steps | ret(last 50)=+6.97  sr=0%]
-           ... [trial 4 | 2 sheep | 709,608 steps | ret(last 50)=+4.28  sr=4%]
-           ... [trial 4 | 2 sheep | 759,608 steps | ret(last 50)=+4.30  sr=2%]
-           ... [trial 4 | 2 sheep | 809,608 steps | ret(last 50)=+6.34  sr=4%]
-           ... [trial 4 | 2 sheep | 859,608 steps | ret(last 50)=+7.27  sr=2%]
-           ... [trial 4 | 2 sheep | 909,608 steps | ret(last 50)=+8.22  sr=4%]
-           ... [trial 4 | 2 sheep | 959,608 steps | ret(last 50)=+7.23  sr=6%]
-           ... [trial 4 | 2 sheep | 1,009,608 steps | ret(last 50)=+7.24  sr=2%]
-           ... [trial 4 | eval n=1]
-           ... [trial 4 | eval n=2]
-           ... [trial 4 | eval n=3]
-           → score=0.060  sr1=0.30 sr2=0.00 sr3=0.00  [302s]
-[Trial   5] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 50.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': True, 'ent_coef': 0.005}
-           ... [trial 5 | 1 sheep |  50,000 steps | ret(last 33)=+3.70  sr=6%]
-           ... [trial 5 | 1 sheep | 100,000 steps | ret(last 50)=-2.32  sr=0%]
-           ... [trial 5 | 1 sheep | 150,000 steps | ret(last 50)=-4.36  sr=4%]
-           ... [trial 5 | 1 sheep | 200,000 steps | ret(last 50)=-4.30  sr=6%]
-           ... [trial 5 | 1 sheep | 250,000 steps | ret(last 50)=-0.15  sr=14%]
-           ... [trial 5 | 1 sheep | 300,000 steps | ret(last 50)=+1.39  sr=8%]
-           ... [trial 5 | 1 sheep | 350,000 steps | ret(last 50)=+11.40  sr=36%]
-           ... [trial 5 | 1 sheep | 400,000 steps | ret(last 50)=+11.08  sr=24%]
-           ... [trial 5 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 5 | 2 sheep | 459,608 steps | ret(last 34)=+6.85  sr=6%]
-           ... [trial 5 | 2 sheep | 509,608 steps | ret(last 50)=+7.35  sr=8%]
-           ... [trial 5 | 2 sheep | 559,608 steps | ret(last 50)=+7.57  sr=4%]
-           ... [trial 5 | 2 sheep | 609,608 steps | ret(last 50)=+6.64  sr=2%]
-           ... [trial 5 | 2 sheep | 659,608 steps | ret(last 50)=+9.15  sr=10%]
-           ... [trial 5 | 2 sheep | 709,608 steps | ret(last 50)=+14.27  sr=10%]
-           ... [trial 5 | 2 sheep | 759,608 steps | ret(last 50)=+10.93  sr=6%]
-           ... [trial 5 | 2 sheep | 809,608 steps | ret(last 50)=+10.17  sr=12%]
-           ... [trial 5 | 2 sheep | 859,608 steps | ret(last 50)=+8.20  sr=8%]
-           ... [trial 5 | 2 sheep | 909,608 steps | ret(last 50)=+9.61  sr=14%]
-           ... [trial 5 | 2 sheep | 959,608 steps | ret(last 50)=+11.14  sr=10%]
-           ... [trial 5 | 2 sheep | 1,009,608 steps | ret(last 50)=+10.75  sr=12%]
-           ... [trial 5 | eval n=1]
-           ... [trial 5 | eval n=2]
-           ... [trial 5 | eval n=3]
-           → score=0.200  sr1=1.00 sr2=0.00 sr3=0.00  [314s]
-[Trial   6] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 200.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.01}
-           ... [trial 6 | 1 sheep |  50,000 steps | ret(last 32)=-13.18  sr=9%]
-           ... [trial 6 | 1 sheep | 100,000 steps | ret(last 50)=-10.28  sr=16%]
-           ... [trial 6 | 1 sheep | 150,000 steps | ret(last 50)=+5.28  sr=44%]
-           ... [trial 6 | 1 sheep | 200,000 steps | ret(last 50)=+9.40  sr=38%]
-           ... [trial 6 | 1 sheep | 250,000 steps | ret(last 50)=+8.62  sr=32%]
-           ... [trial 6 | 1 sheep | 300,000 steps | ret(last 50)=+9.14  sr=34%]
-           ... [trial 6 | 1 sheep | 350,000 steps | ret(last 50)=+12.59  sr=60%]
-           ... [trial 6 | 1 sheep | 400,000 steps | ret(last 50)=+14.10  sr=72%]
-           ... [trial 6 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 6 | 2 sheep | 459,608 steps | ret(last 34)=+0.12  sr=9%]
-           ... [trial 6 | 2 sheep | 509,608 steps | ret(last 50)=-2.84  sr=4%]
-           ... [trial 6 | 2 sheep | 559,608 steps | ret(last 50)=-2.11  sr=10%]
-           ... [trial 6 | 2 sheep | 609,608 steps | ret(last 50)=-1.91  sr=14%]
-           ... [trial 6 | 2 sheep | 659,608 steps | ret(last 50)=-2.14  sr=14%]
-           ... [trial 6 | 2 sheep | 709,608 steps | ret(last 50)=-4.30  sr=6%]
-           ... [trial 6 | 2 sheep | 759,608 steps | ret(last 50)=-1.89  sr=10%]
-           ... [trial 6 | 2 sheep | 809,608 steps | ret(last 50)=-3.47  sr=8%]
-           ... [trial 6 | 2 sheep | 859,608 steps | ret(last 50)=-1.45  sr=8%]
-           ... [trial 6 | 2 sheep | 909,608 steps | ret(last 50)=-3.55  sr=2%]
-           ... [trial 6 | 2 sheep | 959,608 steps | ret(last 50)=-2.93  sr=4%]
-           ... [trial 6 | 2 sheep | 1,009,608 steps | ret(last 50)=-1.45  sr=10%]
-           ... [trial 6 | eval n=1]
-           ... [trial 6 | eval n=2]
-           ... [trial 6 | eval n=3]
-           → score=0.160  sr1=0.80 sr2=0.00 sr3=0.00  [312s]
-[Trial   7] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.005, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.01}
-           ... [trial 7 | 1 sheep |  50,000 steps | ret(last 32)=-8.47  sr=0%]
-           ... [trial 7 | 1 sheep | 100,000 steps | ret(last 50)=-5.40  sr=4%]
-           ... [trial 7 | 1 sheep | 150,000 steps | ret(last 50)=-2.72  sr=10%]
-           ... [trial 7 | 1 sheep | 200,000 steps | ret(last 50)=-1.59  sr=10%]
-           ... [trial 7 | 1 sheep | 250,000 steps | ret(last 50)=-1.58  sr=6%]
-           ... [trial 7 | 1 sheep | 300,000 steps | ret(last 50)=-3.68  sr=2%]
-           ... [trial 7 | 1 sheep | 350,000 steps | ret(last 50)=+4.82  sr=10%]
-           ... [trial 7 | 1 sheep | 400,000 steps | ret(last 50)=+15.81  sr=54%]
-           ... [trial 7 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 7 | 2 sheep | 459,608 steps | ret(last 32)=-2.50  sr=6%]
-           ... [trial 7 | 2 sheep | 509,608 steps | ret(last 50)=-2.32  sr=2%]
-           ... [trial 7 | 2 sheep | 559,608 steps | ret(last 50)=+0.76  sr=4%]
-           ... [trial 7 | 2 sheep | 609,608 steps | ret(last 50)=+0.45  sr=0%]
-           ... [trial 7 | 2 sheep | 659,608 steps | ret(last 50)=+1.03  sr=8%]
-           ... [trial 7 | 2 sheep | 709,608 steps | ret(last 50)=+0.62  sr=6%]
-           ... [trial 7 | 2 sheep | 759,608 steps | ret(last 50)=+0.36  sr=8%]
-           ... [trial 7 | 2 sheep | 809,608 steps | ret(last 50)=+2.27  sr=10%]
-           ... [trial 7 | 2 sheep | 859,608 steps | ret(last 50)=+2.31  sr=6%]
-           ... [trial 7 | 2 sheep | 909,608 steps | ret(last 50)=+3.78  sr=4%]
-           ... [trial 7 | 2 sheep | 959,608 steps | ret(last 50)=+2.21  sr=10%]
-           ... [trial 7 | 2 sheep | 1,009,608 steps | ret(last 50)=+2.66  sr=4%]
-           ... [trial 7 | eval n=1]
-           ... [trial 7 | eval n=2]
-           ... [trial 7 | eval n=3]
-           → score=0.080  sr1=0.40 sr2=0.00 sr3=0.00  [338s]
-[Trial   8] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.005}
-           ... [trial 8 | 1 sheep |  50,000 steps | ret(last 32)=-7.73  sr=6%]
-           ... [trial 8 | 1 sheep | 100,000 steps | ret(last 50)=-9.58  sr=8%]
-           ... [trial 8 | 1 sheep | 150,000 steps | ret(last 50)=-10.87  sr=8%]
-           ... [trial 8 | 1 sheep | 200,000 steps | ret(last 50)=-9.79  sr=6%]
-           ... [trial 8 | 1 sheep | 250,000 steps | ret(last 50)=-7.19  sr=8%]
-           ... [trial 8 | 1 sheep | 300,000 steps | ret(last 50)=-3.84  sr=18%]
-           ... [trial 8 | 1 sheep | 350,000 steps | ret(last 50)=-0.03  sr=26%]
-           ... [trial 8 | 1 sheep | 400,000 steps | ret(last 50)=+6.80  sr=44%]
-           ... [trial 8 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 8 | 2 sheep | 459,608 steps | ret(last 35)=-3.00  sr=9%]
-           ... [trial 8 | 2 sheep | 509,608 steps | ret(last 50)=-4.26  sr=4%]
-           ... [trial 8 | 2 sheep | 559,608 steps | ret(last 50)=+1.91  sr=14%]
-           ... [trial 8 | 2 sheep | 609,608 steps | ret(last 50)=-0.57  sr=16%]
-           ... [trial 8 | 2 sheep | 659,608 steps | ret(last 50)=+1.65  sr=14%]
-           ... [trial 8 | 2 sheep | 709,608 steps | ret(last 50)=+2.90  sr=8%]
-           ... [trial 8 | 2 sheep | 759,608 steps | ret(last 50)=+0.98  sr=2%]
-           ... [trial 8 | 2 sheep | 809,608 steps | ret(last 50)=-2.52  sr=4%]
-           ... [trial 8 | 2 sheep | 859,608 steps | ret(last 50)=-1.11  sr=2%]
-           ... [trial 8 | 2 sheep | 909,608 steps | ret(last 50)=+2.74  sr=2%]
-           ... [trial 8 | 2 sheep | 959,608 steps | ret(last 50)=+2.94  sr=0%]
-           ... [trial 8 | 2 sheep | 1,009,608 steps | ret(last 50)=+5.13  sr=0%]
-           ... [trial 8 | eval n=1]
-           ... [trial 8 | eval n=2]
-           ... [trial 8 | eval n=3]
-           → score=0.110  sr1=0.30 sr2=0.10 sr3=0.00  [451s]
-[Trial   9] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 50.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.05}
-           ... [trial 9 | 1 sheep |  50,000 steps | ret(last 34)=-11.25  sr=15%]
-           ... [trial 9 | 1 sheep | 100,000 steps | ret(last 50)=-11.98  sr=8%]
-           ... [trial 9 | 1 sheep | 150,000 steps | ret(last 50)=-10.46  sr=14%]
-           ... [trial 9 | 1 sheep | 200,000 steps | ret(last 50)=-2.86  sr=14%]
-           ... [trial 9 | 1 sheep | 250,000 steps | ret(last 50)=+8.65  sr=60%]
-           ... [trial 9 | 1 sheep | 300,000 steps | ret(last 50)=+10.48  sr=58%]
-           ... [trial 9 | 1 sheep | 350,000 steps | ret(last 50)=+8.65  sr=56%]
-           ... [trial 9 | 1 sheep | 400,000 steps | ret(last 50)=+10.25  sr=68%]
-           ... [trial 9 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 9 | 2 sheep | 459,608 steps | ret(last 35)=-0.75  sr=20%]
-           ... [trial 9 | 2 sheep | 509,608 steps | ret(last 50)=-6.64  sr=2%]
-           ... [trial 9 | 2 sheep | 559,608 steps | ret(last 50)=-7.43  sr=4%]
-           ... [trial 9 | 2 sheep | 609,608 steps | ret(last 50)=-4.32  sr=6%]
-           ... [trial 9 | 2 sheep | 659,608 steps | ret(last 50)=-3.64  sr=6%]
-           ... [trial 9 | 2 sheep | 709,608 steps | ret(last 50)=-7.09  sr=0%]
-           ... [trial 9 | 2 sheep | 759,608 steps | ret(last 50)=-5.60  sr=4%]
-           ... [trial 9 | 2 sheep | 809,608 steps | ret(last 50)=-5.70  sr=6%]
-           ... [trial 9 | 2 sheep | 859,608 steps | ret(last 50)=-4.99  sr=4%]
-           ... [trial 9 | 2 sheep | 909,608 steps | ret(last 50)=-4.60  sr=6%]
-           ... [trial 9 | 2 sheep | 959,608 steps | ret(last 50)=-6.53  sr=4%]
-           ... [trial 9 | 2 sheep | 1,009,608 steps | ret(last 50)=-7.46  sr=2%]
-           ... [trial 9 | eval n=1]
-           ... [trial 9 | eval n=2]
-           ... [trial 9 | eval n=3]
-           → score=0.190  sr1=0.70 sr2=0.10 sr3=0.00  [349s]
-[Trial  10] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.05}
-           ... [trial 10 | 1 sheep |  50,000 steps | ret(last 32)=-13.35  sr=3%]
-           ... [trial 10 | 1 sheep | 100,000 steps | ret(last 50)=-12.49  sr=4%]
-           ... [trial 10 | 1 sheep | 150,000 steps | ret(last 50)=-13.24  sr=8%]
-           ... [trial 10 | 1 sheep | 200,000 steps | ret(last 50)=-12.73  sr=10%]
-           ... [trial 10 | 1 sheep | 250,000 steps | ret(last 50)=-15.27  sr=4%]
-           ... [trial 10 | 1 sheep | 300,000 steps | ret(last 50)=-9.43  sr=8%]
-           ... [trial 10 | 1 sheep | 350,000 steps | ret(last 50)=-2.65  sr=22%]
-           ... [trial 10 | 1 sheep | 400,000 steps | ret(last 50)=+5.12  sr=46%]
-           ... [trial 10 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 10 | 2 sheep | 459,608 steps | ret(last 34)=-4.93  sr=6%]
-           ... [trial 10 | 2 sheep | 509,608 steps | ret(last 50)=-6.25  sr=2%]
-           ... [trial 10 | 2 sheep | 559,608 steps | ret(last 50)=-5.57  sr=4%]
-           ... [trial 10 | 2 sheep | 609,608 steps | ret(last 50)=-6.24  sr=4%]
-           ... [trial 10 | 2 sheep | 659,608 steps | ret(last 50)=-9.34  sr=0%]
-           ... [trial 10 | 2 sheep | 709,608 steps | ret(last 50)=-8.23  sr=0%]
-           ... [trial 10 | 2 sheep | 759,608 steps | ret(last 50)=-8.34  sr=0%]
-           ... [trial 10 | 2 sheep | 809,608 steps | ret(last 50)=-5.27  sr=0%]
-           ... [trial 10 | 2 sheep | 859,608 steps | ret(last 50)=-8.24  sr=0%]
-           ... [trial 10 | 2 sheep | 909,608 steps | ret(last 50)=-8.75  sr=0%]
-           ... [trial 10 | 2 sheep | 959,608 steps | ret(last 50)=-9.15  sr=0%]
-           ... [trial 10 | 2 sheep | 1,009,608 steps | ret(last 50)=-9.75  sr=0%]
-           ... [trial 10 | eval n=1]
-           ... [trial 10 | eval n=2]
-           ... [trial 10 | eval n=3]
-           → score=0.000  sr1=0.00 sr2=0.00 sr3=0.00  [319s]
-[Trial  11] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 0.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-           ... [trial 11 | 1 sheep |  50,000 steps | ret(last 32)=-3.50  sr=12%]
-           ... [trial 11 | 1 sheep | 100,000 steps | ret(last 50)=-5.79  sr=6%]
-           ... [trial 11 | 1 sheep | 150,000 steps | ret(last 50)=-2.10  sr=18%]
-           ... [trial 11 | 1 sheep | 200,000 steps | ret(last 50)=+2.60  sr=8%]
-           ... [trial 11 | 1 sheep | 250,000 steps | ret(last 50)=+11.49  sr=8%]
-           ... [trial 11 | 1 sheep | 300,000 steps | ret(last 50)=+21.73  sr=26%]
-           ... [trial 11 | 1 sheep | 350,000 steps | ret(last 50)=+20.73  sr=36%]
-           ... [trial 11 | 1 sheep | 400,000 steps | ret(last 50)=+19.77  sr=62%]
-           ... [trial 11 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 11 | 2 sheep | 459,608 steps | ret(last 36)=+10.19  sr=11%]
-           ... [trial 11 | 2 sheep | 509,608 steps | ret(last 50)=+11.56  sr=6%]
-           ... [trial 11 | 2 sheep | 559,608 steps | ret(last 50)=+13.61  sr=2%]
-           ... [trial 11 | 2 sheep | 609,608 steps | ret(last 50)=+15.44  sr=4%]
-           ... [trial 11 | 2 sheep | 659,608 steps | ret(last 50)=+15.61  sr=10%]
-           ... [trial 11 | 2 sheep | 709,608 steps | ret(last 50)=+16.30  sr=6%]
-           ... [trial 11 | 2 sheep | 759,608 steps | ret(last 50)=+17.33  sr=4%]
-           ... [trial 11 | 2 sheep | 809,608 steps | ret(last 50)=+18.36  sr=2%]
-           ... [trial 11 | 2 sheep | 859,608 steps | ret(last 50)=+19.78  sr=8%]
-           ... [trial 11 | 2 sheep | 909,608 steps | ret(last 50)=+20.12  sr=14%]
-           ... [trial 11 | 2 sheep | 959,608 steps | ret(last 50)=+18.93  sr=8%]
-           ... [trial 11 | 2 sheep | 1,009,608 steps | ret(last 50)=+18.16  sr=2%]
-           ... [trial 11 | eval n=1]
-           ... [trial 11 | eval n=2]
-           ... [trial 11 | eval n=3]
-           → score=0.160  sr1=0.80 sr2=0.00 sr3=0.00  [310s]
-[Trial  12] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 50.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-           ... [trial 12 | 1 sheep |  50,000 steps | ret(last 32)=-42.77  sr=0%]
-           ... [trial 12 | 1 sheep | 100,000 steps | ret(last 50)=-39.16  sr=2%]
-           ... [trial 12 | 1 sheep | 150,000 steps | ret(last 50)=-35.02  sr=6%]
-           ... [trial 12 | 1 sheep | 200,000 steps | ret(last 50)=-31.49  sr=4%]
-           ... [trial 12 | 1 sheep | 250,000 steps | ret(last 50)=-8.31  sr=16%]
-           ... [trial 12 | 1 sheep | 300,000 steps | ret(last 50)=+7.97  sr=36%]
-           ... [trial 12 | 1 sheep | 350,000 steps | ret(last 50)=+11.77  sr=68%]
-           ... [trial 12 | 1 sheep | 400,000 steps | ret(last 50)=+12.47  sr=74%]
-           ... [trial 12 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 12 | 2 sheep | 459,608 steps | ret(last 34)=-9.76  sr=0%]
-           ... [trial 12 | 2 sheep | 509,608 steps | ret(last 50)=-4.85  sr=0%]
-           ... [trial 12 | 2 sheep | 559,608 steps | ret(last 50)=-2.81  sr=8%]
-           ... [trial 12 | 2 sheep | 609,608 steps | ret(last 50)=+2.27  sr=10%]
-           ... [trial 12 | 2 sheep | 659,608 steps | ret(last 50)=+1.66  sr=6%]
-           ... [trial 12 | 2 sheep | 709,608 steps | ret(last 50)=+3.42  sr=4%]
-           ... [trial 12 | 2 sheep | 759,608 steps | ret(last 50)=+4.08  sr=2%]
-           ... [trial 12 | 2 sheep | 809,608 steps | ret(last 50)=+5.49  sr=2%]
-           ... [trial 12 | 2 sheep | 859,608 steps | ret(last 50)=+7.12  sr=10%]
-           ... [trial 12 | 2 sheep | 909,608 steps | ret(last 50)=+7.91  sr=6%]
-           ... [trial 12 | 2 sheep | 959,608 steps | ret(last 50)=+6.87  sr=2%]
-           ... [trial 12 | 2 sheep | 1,009,608 steps | ret(last 50)=+5.83  sr=2%]
-           ... [trial 12 | eval n=1]
-           ... [trial 12 | eval n=2]
-           ... [trial 12 | eval n=3]
-           → score=0.240  sr1=0.70 sr2=0.20 sr3=0.00  [330s]
-[Trial  13] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 0.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.005}
-           ... [trial 13 | 1 sheep |  50,000 steps | ret(last 34)=-31.15  sr=9%]
-           ... [trial 13 | 1 sheep | 100,000 steps | ret(last 50)=-32.34  sr=4%]
-           ... [trial 13 | 1 sheep | 150,000 steps | ret(last 50)=-33.16  sr=0%]
-           ... [trial 13 | 1 sheep | 200,000 steps | ret(last 50)=-29.98  sr=6%]
-           ... [trial 13 | 1 sheep | 250,000 steps | ret(last 50)=-28.64  sr=4%]
-           ... [trial 13 | 1 sheep | 300,000 steps | ret(last 50)=-17.91  sr=14%]
-           ... [trial 13 | 1 sheep | 350,000 steps | ret(last 50)=-15.27  sr=22%]
-           ... [trial 13 | 1 sheep | 400,000 steps | ret(last 50)=-11.36  sr=16%]
-           ... [trial 13 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 13 | 2 sheep | 459,608 steps | ret(last 34)=-16.78  sr=0%]
-           ... [trial 13 | 2 sheep | 509,608 steps | ret(last 50)=-16.84  sr=2%]
-           ... [trial 13 | 2 sheep | 559,608 steps | ret(last 50)=-14.28  sr=0%]
-           ... [trial 13 | 2 sheep | 609,608 steps | ret(last 50)=-12.35  sr=6%]
-           ... [trial 13 | 2 sheep | 659,608 steps | ret(last 50)=-14.50  sr=2%]
-           ... [trial 13 | 2 sheep | 709,608 steps | ret(last 50)=-12.96  sr=2%]
-           ... [trial 13 | 2 sheep | 759,608 steps | ret(last 50)=-9.86  sr=4%]
-           ... [trial 13 | 2 sheep | 809,608 steps | ret(last 50)=-13.88  sr=2%]
-           ... [trial 13 | 2 sheep | 859,608 steps | ret(last 50)=-14.76  sr=0%]
-           ... [trial 13 | 2 sheep | 909,608 steps | ret(last 50)=-12.79  sr=0%]
-           ... [trial 13 | 2 sheep | 959,608 steps | ret(last 50)=-12.54  sr=0%]
-           ... [trial 13 | 2 sheep | 1,009,608 steps | ret(last 50)=-12.11  sr=8%]
-           ... [trial 13 | eval n=1]
-           ... [trial 13 | eval n=2]
-           ... [trial 13 | eval n=3]
-           → score=0.060  sr1=0.30 sr2=0.00 sr3=0.00  [323s]
-[Trial  14] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 200.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-           ... [trial 14 | 1 sheep |  50,000 steps | ret(last 32)=-20.15  sr=9%]
-           ... [trial 14 | 1 sheep | 100,000 steps | ret(last 50)=-15.28  sr=8%]
-           ... [trial 14 | 1 sheep | 150,000 steps | ret(last 50)=-8.87  sr=26%]
-           ... [trial 14 | 1 sheep | 200,000 steps | ret(last 50)=-9.94  sr=8%]
-           ... [trial 14 | 1 sheep | 250,000 steps | ret(last 50)=-9.04  sr=8%]
-           ... [trial 14 | 1 sheep | 300,000 steps | ret(last 50)=-7.40  sr=14%]
-           ... [trial 14 | 1 sheep | 350,000 steps | ret(last 50)=+2.22  sr=50%]
-           ... [trial 14 | 1 sheep | 400,000 steps | ret(last 50)=+4.06  sr=58%]
-           ... [trial 14 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 14 | 2 sheep | 459,608 steps | ret(last 33)=-5.93  sr=3%]
-           ... [trial 14 | 2 sheep | 509,608 steps | ret(last 50)=-6.85  sr=4%]
-           ... [trial 14 | 2 sheep | 559,608 steps | ret(last 50)=-6.81  sr=6%]
-           ... [trial 14 | 2 sheep | 609,608 steps | ret(last 50)=-4.80  sr=4%]
-           ... [trial 14 | 2 sheep | 659,608 steps | ret(last 50)=-6.55  sr=4%]
-           ... [trial 14 | 2 sheep | 709,608 steps | ret(last 50)=-4.81  sr=12%]
-           ... [trial 14 | 2 sheep | 759,608 steps | ret(last 50)=-5.41  sr=10%]
-           ... [trial 14 | 2 sheep | 809,608 steps | ret(last 50)=-0.00  sr=30%]
-           ... [trial 14 | 2 sheep | 859,608 steps | ret(last 50)=+1.17  sr=26%]
-           ... [trial 14 | 2 sheep | 909,608 steps | ret(last 50)=+0.17  sr=20%]
-           ... [trial 14 | 2 sheep | 959,608 steps | ret(last 50)=-0.96  sr=18%]
-           ... [trial 14 | 2 sheep | 1,009,608 steps | ret(last 50)=-1.33  sr=20%]
-           ... [trial 14 | eval n=1]
-           ... [trial 14 | eval n=2]
-           ... [trial 14 | eval n=3]
-           → score=0.350  sr1=1.00 sr2=0.30 sr3=0.00  [314s]
-[Trial  15] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.05}
-           ... [trial 15 | 1 sheep |  50,000 steps | ret(last 32)=-6.83  sr=3%]
-           ... [trial 15 | 1 sheep | 100,000 steps | ret(last 50)=-7.59  sr=4%]
-           ... [trial 15 | 1 sheep | 150,000 steps | ret(last 50)=-5.74  sr=6%]
-           ... [trial 15 | 1 sheep | 200,000 steps | ret(last 50)=-5.92  sr=6%]
-           ... [trial 15 | 1 sheep | 250,000 steps | ret(last 50)=+8.14  sr=22%]
-           ... [trial 15 | 1 sheep | 300,000 steps | ret(last 50)=+15.51  sr=22%]
-           ... [trial 15 | 1 sheep | 350,000 steps | ret(last 50)=+21.46  sr=20%]
-           ... [trial 15 | 1 sheep | 400,000 steps | ret(last 50)=+22.52  sr=16%]
-           ... [trial 15 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 15 | 2 sheep | 459,608 steps | ret(last 35)=+6.28  sr=0%]
-           ... [trial 15 | 2 sheep | 509,608 steps | ret(last 50)=+13.19  sr=2%]
-           ... [trial 15 | 2 sheep | 559,608 steps | ret(last 50)=+15.58  sr=4%]
-           ... [trial 15 | 2 sheep | 609,608 steps | ret(last 50)=+18.78  sr=10%]
-           ... [trial 15 | 2 sheep | 659,608 steps | ret(last 50)=+22.71  sr=10%]
-           ... [trial 15 | 2 sheep | 709,608 steps | ret(last 50)=+23.95  sr=6%]
-           ... [trial 15 | 2 sheep | 759,608 steps | ret(last 50)=+24.84  sr=14%]
-           ... [trial 15 | 2 sheep | 809,608 steps | ret(last 50)=+24.00  sr=8%]
-           ... [trial 15 | 2 sheep | 859,608 steps | ret(last 50)=+23.91  sr=2%]
-           ... [trial 15 | 2 sheep | 909,608 steps | ret(last 50)=+23.73  sr=4%]
-           ... [trial 15 | 2 sheep | 959,608 steps | ret(last 50)=+24.23  sr=4%]
-           ... [trial 15 | 2 sheep | 1,009,608 steps | ret(last 50)=+24.77  sr=4%]
-           ... [trial 15 | eval n=1]
-           ... [trial 15 | eval n=2]
-           ... [trial 15 | eval n=3]
-           → score=0.140  sr1=0.70 sr2=0.00 sr3=0.00  [323s]
-[Trial  16] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.01}
-           ... [trial 16 | 1 sheep |  50,000 steps | ret(last 32)=-7.14  sr=9%]
-           ... [trial 16 | 1 sheep | 100,000 steps | ret(last 50)=-5.58  sr=12%]
-           ... [trial 16 | 1 sheep | 150,000 steps | ret(last 50)=+5.93  sr=26%]
-           ... [trial 16 | 1 sheep | 200,000 steps | ret(last 50)=+15.53  sr=68%]
-           ... [trial 16 | 1 sheep | 250,000 steps | ret(last 50)=+14.88  sr=56%]
-           ... [trial 16 | 1 sheep | 300,000 steps | ret(last 50)=+13.86  sr=36%]
-           ... [trial 16 | 1 sheep | 350,000 steps | ret(last 50)=+14.84  sr=54%]
-           ... [trial 16 | 1 sheep | 400,000 steps | ret(last 50)=+15.15  sr=70%]
-           ... [trial 16 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 16 | 2 sheep | 459,608 steps | ret(last 34)=-1.47  sr=6%]
-           ... [trial 16 | 2 sheep | 509,608 steps | ret(last 50)=-1.63  sr=2%]
-           ... [trial 16 | 2 sheep | 559,608 steps | ret(last 50)=-3.78  sr=2%]
-           ... [trial 16 | 2 sheep | 609,608 steps | ret(last 50)=-2.17  sr=4%]
-           ... [trial 16 | 2 sheep | 659,608 steps | ret(last 50)=+1.25  sr=6%]
-           ... [trial 16 | 2 sheep | 709,608 steps | ret(last 50)=+0.28  sr=4%]
-           ... [trial 16 | 2 sheep | 759,608 steps | ret(last 50)=+2.74  sr=4%]
-           ... [trial 16 | 2 sheep | 809,608 steps | ret(last 50)=+7.19  sr=6%]
-           ... [trial 16 | 2 sheep | 859,608 steps | ret(last 50)=+7.68  sr=4%]
-           ... [trial 16 | 2 sheep | 909,608 steps | ret(last 50)=+2.38  sr=0%]
-           ... [trial 16 | 2 sheep | 959,608 steps | ret(last 50)=+3.43  sr=0%]
-           ... [trial 16 | 2 sheep | 1,009,608 steps | ret(last 50)=+11.11  sr=0%]
-           ... [trial 16 | eval n=1]
-           ... [trial 16 | eval n=2]
-           ... [trial 16 | eval n=3]
-           → score=0.060  sr1=0.30 sr2=0.00 sr3=0.00  [312s]
-[Trial  17] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 5.0, 'W_STEP_COST': 0.005, 'W_COMPLETE': 200.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': True, 'ent_coef': 0.05}
-           ... [trial 17 | 1 sheep |  50,000 steps | ret(last 32)=+2.15  sr=6%]
-           ... [trial 17 | 1 sheep | 100,000 steps | ret(last 50)=-0.51  sr=2%]
-           ... [trial 17 | 1 sheep | 150,000 steps | ret(last 50)=+0.84  sr=6%]
-           ... [trial 17 | 1 sheep | 200,000 steps | ret(last 50)=+2.96  sr=6%]
-           ... [trial 17 | 1 sheep | 250,000 steps | ret(last 50)=+3.04  sr=4%]
-           ... [trial 17 | 1 sheep | 300,000 steps | ret(last 50)=+10.58  sr=10%]
-           ... [trial 17 | 1 sheep | 350,000 steps | ret(last 50)=+21.95  sr=36%]
-           ... [trial 17 | 1 sheep | 400,000 steps | ret(last 50)=+19.20  sr=16%]
-           ... [trial 17 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 17 | 2 sheep | 459,608 steps | ret(last 32)=+10.27  sr=16%]
-           ... [trial 17 | 2 sheep | 509,608 steps | ret(last 50)=+12.25  sr=6%]
-           ... [trial 17 | 2 sheep | 559,608 steps | ret(last 50)=+12.94  sr=6%]
-           ... [trial 17 | 2 sheep | 609,608 steps | ret(last 50)=+11.82  sr=4%]
-           ... [trial 17 | 2 sheep | 659,608 steps | ret(last 50)=+13.45  sr=4%]
-           ... [trial 17 | 2 sheep | 709,608 steps | ret(last 50)=+13.03  sr=4%]
-           ... [trial 17 | 2 sheep | 759,608 steps | ret(last 50)=+10.69  sr=6%]
-           ... [trial 17 | 2 sheep | 809,608 steps | ret(last 50)=+7.79  sr=6%]
-           ... [trial 17 | 2 sheep | 859,608 steps | ret(last 50)=+12.16  sr=16%]
-           ... [trial 17 | 2 sheep | 909,608 steps | ret(last 50)=+11.75  sr=12%]
-           ... [trial 17 | 2 sheep | 959,608 steps | ret(last 50)=+13.65  sr=16%]
-           ... [trial 17 | 2 sheep | 1,009,608 steps | ret(last 50)=+12.43  sr=10%]
-           ... [trial 17 | eval n=1]
-           ... [trial 17 | eval n=2]
-           ... [trial 17 | eval n=3]
-           → score=0.240  sr1=0.70 sr2=0.20 sr3=0.00  [304s]
-[Trial  18] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.005, 'W_COMPLETE': 100.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-           ... [trial 18 | 1 sheep |  50,000 steps | ret(last 32)=-3.63  sr=3%]
-           ... [trial 18 | 1 sheep | 100,000 steps | ret(last 50)=-2.28  sr=12%]
-           ... [trial 18 | 1 sheep | 150,000 steps | ret(last 50)=-3.15  sr=10%]
-           ... [trial 18 | 1 sheep | 200,000 steps | ret(last 50)=-3.31  sr=6%]
-           ... [trial 18 | 1 sheep | 250,000 steps | ret(last 50)=-3.23  sr=2%]
-           ... [trial 18 | 1 sheep | 300,000 steps | ret(last 50)=+3.55  sr=22%]
-           ... [trial 18 | 1 sheep | 350,000 steps | ret(last 50)=+8.15  sr=28%]
-           ... [trial 18 | 1 sheep | 400,000 steps | ret(last 50)=+10.56  sr=18%]
-           ... [trial 18 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 18 | 2 sheep | 459,608 steps | ret(last 34)=+3.80  sr=0%]
-           ... [trial 18 | 2 sheep | 509,608 steps | ret(last 50)=+7.30  sr=4%]
-           ... [trial 18 | 2 sheep | 559,608 steps | ret(last 50)=+9.61  sr=10%]
-           ... [trial 18 | 2 sheep | 609,608 steps | ret(last 50)=+7.70  sr=8%]
-           ... [trial 18 | 2 sheep | 659,608 steps | ret(last 50)=+6.01  sr=2%]
-           ... [trial 18 | 2 sheep | 709,608 steps | ret(last 50)=+8.28  sr=6%]
-           ... [trial 18 | 2 sheep | 759,608 steps | ret(last 50)=+6.74  sr=0%]
-           ... [trial 18 | 2 sheep | 809,608 steps | ret(last 50)=+10.61  sr=0%]
-           ... [trial 18 | 2 sheep | 859,608 steps | ret(last 50)=+12.20  sr=0%]
-           ... [trial 18 | 2 sheep | 909,608 steps | ret(last 50)=+11.25  sr=2%]
-           ... [trial 18 | 2 sheep | 959,608 steps | ret(last 50)=+13.58  sr=4%]
-           ... [trial 18 | 2 sheep | 1,009,608 steps | ret(last 50)=+16.61  sr=20%]
-           ... [trial 18 | eval n=1]
-           ... [trial 18 | eval n=2]
-           ... [trial 18 | eval n=3]
-           → score=0.160  sr1=0.30 sr2=0.20 sr3=0.00  [316s]
-[Trial  19] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 50.0, 'W_COMPACT': 0.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.005}
-           ... [trial 19 | 1 sheep |  50,000 steps | ret(last 32)=-36.89  sr=3%]
-           ... [trial 19 | 1 sheep | 100,000 steps | ret(last 50)=-30.93  sr=4%]
-           ... [trial 19 | 1 sheep | 150,000 steps | ret(last 50)=-28.35  sr=12%]
-           ... [trial 19 | 1 sheep | 200,000 steps | ret(last 50)=-30.73  sr=8%]
-           ... [trial 19 | 1 sheep | 250,000 steps | ret(last 50)=-29.54  sr=4%]
-           ... [trial 19 | 1 sheep | 300,000 steps | ret(last 50)=-20.15  sr=20%]
-           ... [trial 19 | 1 sheep | 350,000 steps | ret(last 50)=-0.07  sr=68%]
-           ... [trial 19 | 1 sheep | 400,000 steps | ret(last 50)=+1.66  sr=52%]
-           ... [trial 19 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 19 | 2 sheep | 459,608 steps | ret(last 36)=-12.82  sr=19%]
-           ... [trial 19 | 2 sheep | 509,608 steps | ret(last 50)=-20.66  sr=0%]
-           ... [trial 19 | 2 sheep | 559,608 steps | ret(last 50)=-16.54  sr=4%]
-           ... [trial 19 | 2 sheep | 609,608 steps | ret(last 50)=-17.11  sr=4%]
-           ... [trial 19 | 2 sheep | 659,608 steps | ret(last 50)=-19.32  sr=0%]
-           ... [trial 19 | 2 sheep | 709,608 steps | ret(last 50)=-16.20  sr=0%]
-           ... [trial 19 | 2 sheep | 759,608 steps | ret(last 50)=-13.12  sr=2%]
-           ... [trial 19 | 2 sheep | 809,608 steps | ret(last 50)=-17.18  sr=4%]
-           ... [trial 19 | 2 sheep | 859,608 steps | ret(last 50)=-18.16  sr=2%]
-           ... [trial 19 | 2 sheep | 909,608 steps | ret(last 50)=-18.12  sr=4%]
-           ... [trial 19 | 2 sheep | 959,608 steps | ret(last 50)=-17.79  sr=2%]
-           ... [trial 19 | 2 sheep | 1,009,608 steps | ret(last 50)=-17.58  sr=0%]
-           ... [trial 19 | eval n=1]
-           ... [trial 19 | eval n=2]
-           ... [trial 19 | eval n=3]
-           → score=0.160  sr1=0.80 sr2=0.00 sr3=0.00  [318s]
-[Trial  20] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.02}
-           ... [trial 20 | 1 sheep |  50,000 steps | ret(last 33)=-15.83  sr=9%]
-           ... [trial 20 | 1 sheep | 100,000 steps | ret(last 50)=-18.74  sr=10%]
-           ... [trial 20 | 1 sheep | 150,000 steps | ret(last 50)=-22.88  sr=6%]
-           ... [trial 20 | 1 sheep | 200,000 steps | ret(last 50)=-23.86  sr=4%]
-           ... [trial 20 | 1 sheep | 250,000 steps | ret(last 50)=-21.10  sr=6%]
-           ... [trial 20 | 1 sheep | 300,000 steps | ret(last 50)=-18.42  sr=6%]
-           ... [trial 20 | 1 sheep | 350,000 steps | ret(last 50)=+1.74  sr=14%]
-           ... [trial 20 | 1 sheep | 400,000 steps | ret(last 50)=+7.62  sr=34%]
-           ... [trial 20 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 20 | 2 sheep | 459,608 steps | ret(last 34)=-2.63  sr=3%]
-           ... [trial 20 | 2 sheep | 509,608 steps | ret(last 50)=+1.10  sr=2%]
-           ... [trial 20 | 2 sheep | 559,608 steps | ret(last 50)=+5.57  sr=4%]
-           ... [trial 20 | 2 sheep | 609,608 steps | ret(last 50)=+8.54  sr=8%]
-           ... [trial 20 | 2 sheep | 659,608 steps | ret(last 50)=+12.02  sr=8%]
-           ... [trial 20 | 2 sheep | 709,608 steps | ret(last 50)=+11.28  sr=4%]
-           ... [trial 20 | 2 sheep | 759,608 steps | ret(last 50)=+11.45  sr=2%]
-           ... [trial 20 | 2 sheep | 809,608 steps | ret(last 50)=+9.52  sr=0%]
-           ... [trial 20 | 2 sheep | 859,608 steps | ret(last 50)=+9.07  sr=2%]
-           ... [trial 20 | 2 sheep | 909,608 steps | ret(last 50)=+12.06  sr=8%]
-           ... [trial 20 | 2 sheep | 959,608 steps | ret(last 50)=+12.77  sr=8%]
-           ... [trial 20 | 2 sheep | 1,009,608 steps | ret(last 50)=+11.55  sr=2%]
-           ... [trial 20 | eval n=1]
-           ... [trial 20 | eval n=2]
-           ... [trial 20 | eval n=3]
-           → score=0.130  sr1=0.40 sr2=0.10 sr3=0.00  [315s]
-[Trial  21] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 0.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.005}
-           ... [trial 21 | 1 sheep |  50,000 steps | ret(last 32)=-14.94  sr=6%]
-           ... [trial 21 | 1 sheep | 100,000 steps | ret(last 50)=-12.47  sr=4%]
-           ... [trial 21 | 1 sheep | 150,000 steps | ret(last 50)=-12.65  sr=6%]
-           ... [trial 21 | 1 sheep | 200,000 steps | ret(last 50)=-12.44  sr=2%]
-           ... [trial 21 | 1 sheep | 250,000 steps | ret(last 50)=-12.95  sr=6%]
-           ... [trial 21 | 1 sheep | 300,000 steps | ret(last 50)=-13.04  sr=6%]
-           ... [trial 21 | 1 sheep | 350,000 steps | ret(last 50)=-5.14  sr=8%]
-           ... [trial 21 | 1 sheep | 400,000 steps | ret(last 50)=-0.46  sr=8%]
-           ... [trial 21 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 21 | 2 sheep | 459,608 steps | ret(last 33)=-7.10  sr=0%]
-           ... [trial 21 | 2 sheep | 509,608 steps | ret(last 50)=-8.26  sr=0%]
-           ... [trial 21 | 2 sheep | 559,608 steps | ret(last 50)=-6.17  sr=4%]
-           ... [trial 21 | 2 sheep | 609,608 steps | ret(last 50)=-4.23  sr=4%]
-           ... [trial 21 | 2 sheep | 659,608 steps | ret(last 50)=-5.62  sr=0%]
-           ... [trial 21 | 2 sheep | 709,608 steps | ret(last 50)=-3.72  sr=0%]
-           ... [trial 21 | 2 sheep | 759,608 steps | ret(last 50)=-2.06  sr=0%]
-           ... [trial 21 | 2 sheep | 809,608 steps | ret(last 50)=-1.23  sr=0%]
-           ... [trial 21 | 2 sheep | 859,608 steps | ret(last 50)=-0.14  sr=0%]
-           ... [trial 21 | 2 sheep | 909,608 steps | ret(last 50)=+1.30  sr=2%]
-           ... [trial 21 | 2 sheep | 959,608 steps | ret(last 50)=+0.64  sr=2%]
-           ... [trial 21 | 2 sheep | 1,009,608 steps | ret(last 50)=+2.62  sr=6%]
-           ... [trial 21 | eval n=1]
-           ... [trial 21 | eval n=2]
-           ... [trial 21 | eval n=3]
-           → score=0.050  sr1=0.00 sr2=0.10 sr3=0.00  [310s]
-[Trial  22] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.05, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.005}
-           ... [trial 22 | 1 sheep |  50,000 steps | ret(last 32)=-11.10  sr=6%]
-           ... [trial 22 | 1 sheep | 100,000 steps | ret(last 50)=-10.61  sr=8%]
-           ... [trial 22 | 1 sheep | 150,000 steps | ret(last 50)=-11.16  sr=4%]
-           ... [trial 22 | 1 sheep | 200,000 steps | ret(last 50)=-11.15  sr=4%]
-           ... [trial 22 | 1 sheep | 250,000 steps | ret(last 50)=-10.56  sr=6%]
-           ... [trial 22 | 1 sheep | 300,000 steps | ret(last 50)=-14.90  sr=0%]
-           ... [trial 22 | 1 sheep | 350,000 steps | ret(last 50)=-5.11  sr=14%]
-           ... [trial 22 | 1 sheep | 400,000 steps | ret(last 50)=+2.22  sr=24%]
-           ... [trial 22 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 22 | 2 sheep | 459,608 steps | ret(last 35)=-4.69  sr=6%]
-           ... [trial 22 | 2 sheep | 509,608 steps | ret(last 50)=-3.17  sr=0%]
-           ... [trial 22 | 2 sheep | 559,608 steps | ret(last 50)=+2.18  sr=2%]
-           ... [trial 22 | 2 sheep | 609,608 steps | ret(last 50)=+4.53  sr=8%]
-           ... [trial 22 | 2 sheep | 659,608 steps | ret(last 50)=+4.97  sr=10%]
-           ... [trial 22 | 2 sheep | 709,608 steps | ret(last 50)=+5.06  sr=8%]
-           ... [trial 22 | 2 sheep | 759,608 steps | ret(last 50)=+6.04  sr=4%]
-           ... [trial 22 | 2 sheep | 809,608 steps | ret(last 50)=+5.95  sr=4%]
-           ... [trial 22 | 2 sheep | 859,608 steps | ret(last 50)=+3.34  sr=2%]
-           ... [trial 22 | 2 sheep | 909,608 steps | ret(last 50)=+6.80  sr=8%]
-           ... [trial 22 | 2 sheep | 959,608 steps | ret(last 50)=+4.13  sr=8%]
-           ... [trial 22 | 2 sheep | 1,009,608 steps | ret(last 50)=+4.17  sr=2%]
-           ... [trial 22 | eval n=1]
-           ... [trial 22 | eval n=2]
-           ... [trial 22 | eval n=3]
-           → score=0.110  sr1=0.30 sr2=0.10 sr3=0.00  [316s]
-[Trial  23] {'W_PER_SHEEP': 2.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 100.0, 'W_COMPACT': 1.5, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': True, 'ent_coef': 0.05}
-           ... [trial 23 | 1 sheep |  50,000 steps | ret(last 32)=-22.59  sr=9%]
-           ... [trial 23 | 1 sheep | 100,000 steps | ret(last 50)=-21.14  sr=6%]
-           ... [trial 23 | 1 sheep | 150,000 steps | ret(last 50)=-20.75  sr=6%]
-           ... [trial 23 | 1 sheep | 200,000 steps | ret(last 50)=-20.37  sr=8%]
-           ... [trial 23 | 1 sheep | 250,000 steps | ret(last 50)=-5.04  sr=18%]
-           ... [trial 23 | 1 sheep | 300,000 steps | ret(last 50)=+7.25  sr=12%]
-           ... [trial 23 | 1 sheep | 350,000 steps | ret(last 50)=+11.34  sr=32%]
-           ... [trial 23 | 1 sheep | 400,000 steps | ret(last 50)=+13.02  sr=24%]
-           ... [trial 23 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 23 | 2 sheep | 459,608 steps | ret(last 32)=+0.29  sr=3%]
-           ... [trial 23 | 2 sheep | 509,608 steps | ret(last 50)=-0.39  sr=4%]
-           ... [trial 23 | 2 sheep | 559,608 steps | ret(last 50)=+6.56  sr=2%]
-           ... [trial 23 | 2 sheep | 609,608 steps | ret(last 50)=+10.45  sr=2%]
-           ... [trial 23 | 2 sheep | 659,608 steps | ret(last 50)=+9.75  sr=2%]
-           ... [trial 23 | 2 sheep | 709,608 steps | ret(last 50)=+7.98  sr=6%]
-           ... [trial 23 | 2 sheep | 759,608 steps | ret(last 50)=+9.20  sr=4%]
-           ... [trial 23 | 2 sheep | 809,608 steps | ret(last 50)=+11.03  sr=6%]
-           ... [trial 23 | 2 sheep | 859,608 steps | ret(last 50)=+12.53  sr=6%]
-           ... [trial 23 | 2 sheep | 909,608 steps | ret(last 50)=+10.86  sr=6%]
-           ... [trial 23 | 2 sheep | 959,608 steps | ret(last 50)=+13.16  sr=14%]
-           ... [trial 23 | 2 sheep | 1,009,608 steps | ret(last 50)=+12.36  sr=12%]
-           ... [trial 23 | eval n=1]
-           ... [trial 23 | eval n=2]
-           ... [trial 23 | eval n=3]
-           → score=0.060  sr1=0.30 sr2=0.00 sr3=0.00  [472s]
-[Trial  24] {'W_PER_SHEEP': 6.0, 'W_ALIGN': 0.025, 'W_PEN_BONUS': 20.0, 'W_STEP_COST': 0.005, 'W_COMPLETE': 200.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.01}
-           ... [trial 24 | 1 sheep |  50,000 steps | ret(last 32)=-1.97  sr=0%]
-           ... [trial 24 | 1 sheep | 100,000 steps | ret(last 50)=-1.86  sr=2%]
-           ... [trial 24 | 1 sheep | 150,000 steps | ret(last 50)=-2.97  sr=4%]
-           ... [trial 24 | 1 sheep | 200,000 steps | ret(last 50)=-0.45  sr=8%]
-           ... [trial 24 | 1 sheep | 250,000 steps | ret(last 50)=-1.73  sr=4%]
-           ... [trial 24 | 1 sheep | 300,000 steps | ret(last 50)=+0.64  sr=4%]
-           ... [trial 24 | 1 sheep | 350,000 steps | ret(last 50)=+1.35  sr=2%]
-           ... [trial 24 | 1 sheep | 400,000 steps | ret(last 50)=+0.95  sr=4%]
-           ... [trial 24 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 24 | 2 sheep | 459,608 steps | ret(last 33)=+1.34  sr=0%]
-           ... [trial 24 | 2 sheep | 509,608 steps | ret(last 50)=+1.48  sr=0%]
-           ... [trial 24 | 2 sheep | 559,608 steps | ret(last 50)=+6.05  sr=0%]
-           ... [trial 24 | 2 sheep | 609,608 steps | ret(last 50)=+3.58  sr=0%]
-           ... [trial 24 | 2 sheep | 659,608 steps | ret(last 50)=+2.33  sr=0%]
-           ... [trial 24 | 2 sheep | 709,608 steps | ret(last 50)=+4.05  sr=2%]
-           ... [trial 24 | 2 sheep | 759,608 steps | ret(last 50)=+0.93  sr=0%]
-           ... [trial 24 | 2 sheep | 809,608 steps | ret(last 50)=-0.39  sr=0%]
-           ... [trial 24 | 2 sheep | 859,608 steps | ret(last 50)=-2.68  sr=0%]
-           ... [trial 24 | 2 sheep | 909,608 steps | ret(last 50)=+0.90  sr=0%]
-           ... [trial 24 | 2 sheep | 959,608 steps | ret(last 50)=+2.63  sr=0%]
-           ... [trial 24 | 2 sheep | 1,009,608 steps | ret(last 50)=+2.88  sr=0%]
-           ... [trial 24 | eval n=1]
-           ... [trial 24 | eval n=2]
-           ... [trial 24 | eval n=3]
-           → score=0.060  sr1=0.30 sr2=0.00 sr3=0.00  [335s]
-[Trial  25] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.0, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.05, 'W_COMPLETE': 50.0, 'W_COMPACT': 0.5, 'ALIGN_SHAPE': 'near', 'ALIGN_GATED': True, 'ent_coef': 0.02}
-           ... [trial 25 | 1 sheep |  50,000 steps | ret(last 32)=-56.03  sr=3%]
-           ... [trial 25 | 1 sheep | 100,000 steps | ret(last 50)=-53.61  sr=4%]
-           ... [trial 25 | 1 sheep | 150,000 steps | ret(last 50)=-54.50  sr=4%]
-           ... [trial 25 | 1 sheep | 200,000 steps | ret(last 50)=-57.55  sr=4%]
-           ... [trial 25 | 1 sheep | 250,000 steps | ret(last 50)=-54.77  sr=8%]
-           ... [trial 25 | 1 sheep | 300,000 steps | ret(last 50)=-55.53  sr=4%]
-           ... [trial 25 | 1 sheep | 350,000 steps | ret(last 50)=-55.26  sr=4%]
-           ... [trial 25 | 1 sheep | 400,000 steps | ret(last 50)=-56.11  sr=4%]
-           ... [trial 25 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 25 | 2 sheep | 459,608 steps | ret(last 32)=-48.36  sr=0%]
-           ... [trial 25 | 2 sheep | 509,608 steps | ret(last 50)=-54.87  sr=0%]
-           ... [trial 25 | 2 sheep | 559,608 steps | ret(last 50)=-56.08  sr=0%]
-           ... [trial 25 | 2 sheep | 609,608 steps | ret(last 50)=-54.86  sr=0%]
-           ... [trial 25 | 2 sheep | 659,608 steps | ret(last 50)=-50.62  sr=0%]
-           ... [trial 25 | 2 sheep | 709,608 steps | ret(last 50)=-49.92  sr=0%]
-           ... [trial 25 | 2 sheep | 759,608 steps | ret(last 50)=-50.11  sr=0%]
-           ... [trial 25 | 2 sheep | 809,608 steps | ret(last 50)=-51.41  sr=0%]
-           ... [trial 25 | 2 sheep | 859,608 steps | ret(last 50)=-51.02  sr=0%]
-           ... [trial 25 | 2 sheep | 909,608 steps | ret(last 50)=-50.80  sr=0%]
-           ... [trial 25 | 2 sheep | 959,608 steps | ret(last 50)=-50.01  sr=0%]
-           ... [trial 25 | 2 sheep | 1,009,608 steps | ret(last 50)=-49.71  sr=0%]
-           ... [trial 25 | eval n=1]
-           ... [trial 25 | eval n=2]
-           ... [trial 25 | eval n=3]
-           → score=0.000  sr1=0.00 sr2=0.00 sr3=0.00  [306s]
-
-============================================================================================
-  LEADERBOARD
-============================================================================================
-  rank  score   sr1   sr2   sr3  config
-  ----------------------------------------------------------------------------------------
-     1  0.350  1.00  0.30  0.00  W_PER_SHEEP=1.0 W_ALIGN=0.0 W_PEN_BONUS=5.0 W_STEP_COST=0.02 W_COMPLETE=200.0 W_COMPACT=1.5 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.02
-     2  0.270  0.70  0.20  0.10  W_PER_SHEEP=6.0 W_ALIGN=0.025 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=50.0 W_COMPACT=3.0 ALIGN_SHAPE=near ALIGN_GATED=False ent_coef=0.01
-     3  0.240  0.70  0.20  0.00  W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=5.0 W_STEP_COST=0.05 W_COMPLETE=50.0 W_COMPACT=3.0 ALIGN_SHAPE=near ALIGN_GATED=False ent_coef=0.02
-     4  0.240  0.70  0.20  0.00  W_PER_SHEEP=6.0 W_ALIGN=0.1 W_PEN_BONUS=5.0 W_STEP_COST=0.005 W_COMPLETE=200.0 W_COMPACT=0.0 ALIGN_SHAPE=near ALIGN_GATED=True ent_coef=0.05
-     5  0.200  1.00  0.00  0.00  W_PER_SHEEP=6.0 W_ALIGN=0.1 W_PEN_BONUS=5.0 W_STEP_COST=0.02 W_COMPLETE=50.0 W_COMPACT=3.0 ALIGN_SHAPE=near ALIGN_GATED=True ent_coef=0.005
-     6  0.190  0.70  0.10  0.00  W_PER_SHEEP=2.0 W_ALIGN=0.0 W_PEN_BONUS=20.0 W_STEP_COST=0.02 W_COMPLETE=50.0 W_COMPACT=0.0 ALIGN_SHAPE=near ALIGN_GATED=False ent_coef=0.05
-     7  0.160  0.80  0.00  0.00  W_PER_SHEEP=6.0 W_ALIGN=0.025 W_PEN_BONUS=20.0 W_STEP_COST=0.05 W_COMPLETE=200.0 W_COMPACT=3.0 ALIGN_SHAPE=standoff ALIGN_GATED=True ent_coef=0.01
-     8  0.160  0.80  0.00  0.00  W_PER_SHEEP=2.0 W_ALIGN=0.1 W_PEN_BONUS=20.0 W_STEP_COST=0.02 W_COMPLETE=200.0 W_COMPACT=0.5 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.02
-     9  0.160  0.80  0.00  0.00  W_PER_SHEEP=2.0 W_ALIGN=0.025 W_PEN_BONUS=10.0 W_STEP_COST=0.05 W_COMPLETE=50.0 W_COMPACT=0.0 ALIGN_SHAPE=standoff ALIGN_GATED=True ent_coef=0.005
-    10  0.160  0.30  0.20  0.00  W_PER_SHEEP=2.0 W_ALIGN=0.025 W_PEN_BONUS=10.0 W_STEP_COST=0.005 W_COMPLETE=100.0 W_COMPACT=1.5 ALIGN_SHAPE=near ALIGN_GATED=False ent_coef=0.02
-    11  0.150  0.50  0.10  0.00  W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=100.0 W_COMPACT=3.0 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.005
-    12  0.140  0.70  0.00  0.00  W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=100.0 W_COMPACT=1.5 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.05
-    13  0.130  0.40  0.10  0.00  W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=20.0 W_STEP_COST=0.05 W_COMPLETE=100.0 W_COMPACT=1.5 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.02
-    14  0.110  0.30  0.10  0.00  W_PER_SHEEP=6.0 W_ALIGN=0.025 W_PEN_BONUS=5.0 W_STEP_COST=0.05 W_COMPLETE=100.0 W_COMPACT=0.0 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.005
-    15  0.110  0.30  0.10  0.00  W_PER_SHEEP=2.0 W_ALIGN=0.05 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=100.0 W_COMPACT=3.0 ALIGN_SHAPE=standoff ALIGN_GATED=True ent_coef=0.005
-
-  Best config saved to runs/sweep_20260425_124630/best.json
-  Total trials: 25 (25 successful, 0 failed)
-  Total time:   2.28h
-
diff --git a/training/runs/sweep_smoke.log b/training/runs/sweep_smoke.log
deleted file mode 100644
index ae47b2a..0000000
--- a/training/runs/sweep_smoke.log
+++ /dev/null
@@ -1,43 +0,0 @@
-Sweep dir: runs/sweep_20260425_124021
-Search space: ['W_PER_SHEEP', 'W_ALIGN', 'W_PEN_BONUS', 'W_STEP_COST', 'W_COMPLETE', 'W_COMPACT', 'ALIGN_SHAPE', 'ALIGN_GATED', 'ent_coef']
-Per-trial: 1,000,000 steps train + 30 eval eps
-Time budget: 0.5h
-
-[Trial   1] {'W_PER_SHEEP': 1.0, 'W_ALIGN': 0.1, 'W_PEN_BONUS': 10.0, 'W_STEP_COST': 0.02, 'W_COMPLETE': 100.0, 'W_COMPACT': 3.0, 'ALIGN_SHAPE': 'standoff', 'ALIGN_GATED': False, 'ent_coef': 0.005}
-           ... [trial 1 | 1 sheep |  50,000 steps | ret(last 32)=-8.33  sr=6%]
-           ... [trial 1 | 1 sheep | 100,000 steps | ret(last 50)=-2.95  sr=6%]
-           ... [trial 1 | 1 sheep | 150,000 steps | ret(last 50)=+12.68  sr=10%]
-           ... [trial 1 | 1 sheep | 200,000 steps | ret(last 50)=+22.15  sr=22%]
-           ... [trial 1 | 1 sheep | 250,000 steps | ret(last 50)=+22.47  sr=18%]
-           ... [trial 1 | 1 sheep | 300,000 steps | ret(last 50)=+23.58  sr=24%]
-           ... [trial 1 | 1 sheep | 350,000 steps | ret(last 50)=+23.42  sr=18%]
-           ... [trial 1 | 1 sheep | 400,000 steps | ret(last 50)=+24.39  sr=32%]
-           ... [trial 1 | 2 sheep | 409,608 steps | ret(last 0)=+nan  sr=nan%]
-           ... [trial 1 | 2 sheep | 459,608 steps | ret(last 35)=+15.39  sr=3%]
-           ... [trial 1 | 2 sheep | 509,608 steps | ret(last 50)=+20.25  sr=0%]
-           ... [trial 1 | 2 sheep | 559,608 steps | ret(last 50)=+23.24  sr=4%]
-           ... [trial 1 | 2 sheep | 609,608 steps | ret(last 50)=+23.36  sr=4%]
-           ... [trial 1 | 2 sheep | 659,608 steps | ret(last 50)=+25.32  sr=2%]
-           ... [trial 1 | 2 sheep | 709,608 steps | ret(last 50)=+24.02  sr=4%]
-           ... [trial 1 | 2 sheep | 759,608 steps | ret(last 50)=+24.66  sr=4%]
-           ... [trial 1 | 2 sheep | 809,608 steps | ret(last 50)=+25.41  sr=4%]
-           ... [trial 1 | 2 sheep | 859,608 steps | ret(last 50)=+24.27  sr=4%]
-           ... [trial 1 | 2 sheep | 909,608 steps | ret(last 50)=+25.13  sr=8%]
-           ... [trial 1 | 2 sheep | 959,608 steps | ret(last 50)=+25.10  sr=2%]
-           ... [trial 1 | 2 sheep | 1,009,608 steps | ret(last 50)=+26.02  sr=2%]
-           ... [trial 1 | eval n=1]
-           ... [trial 1 | eval n=2]
-           ... [trial 1 | eval n=3]
-           → score=0.060  sr1=0.30 sr2=0.00 sr3=0.00  [308s]
-
-============================================================================================
-  LEADERBOARD
-============================================================================================
-  rank  score   sr1   sr2   sr3  config
-  ----------------------------------------------------------------------------------------
-     1  0.060  0.30  0.00  0.00  W_PER_SHEEP=1.0 W_ALIGN=0.1 W_PEN_BONUS=10.0 W_STEP_COST=0.02 W_COMPLETE=100.0 W_COMPACT=3.0 ALIGN_SHAPE=standoff ALIGN_GATED=False ent_coef=0.005
-
-  Best config saved to runs/sweep_20260425_124021/best.json
-  Total trials: 1 (1 successful, 0 failed)
-  Total time:   0.09h
-
diff --git a/training/smoke_test.py b/training/smoke_test.py
deleted file mode 100644
index 7892aca..0000000
--- a/training/smoke_test.py
+++ /dev/null
@@ -1,369 +0,0 @@
-"""
-Quick sanity check before committing to a full 15M-step training run.
-
-Trains 1 sheep for 500k steps (~5 min), then 3 sheep for 500k steps.
-If both pass, the obs/reward setup is sound and full training is worth running.
-If either fails, abort and fix before wasting 15M steps.
-
-Usage:
-    python smoke_test.py              # fresh run
-    python smoke_test.py --render     # watch episodes after each stage
-"""
-
-import argparse
-import os
-import sys
-import numpy as np
-from copy import deepcopy
-
-import matplotlib
-matplotlib.use("Agg")
-import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
-from matplotlib.collections import LineCollection
-
-from stable_baselines3 import PPO
-from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize
-
-from herding_env import HerdingEnv
-
-
-COMPACT_RADIUS = 5.0
-PASS_THRESHOLD = 0.60   # success rate required to pass each stage
-
-
-def make_env(n_sheep, seed, max_steps=2000):
-    def _init():
-        env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps)
-        env.reset(seed=seed)
-        return env
-    return _init
-
-
-def classify_failure(ep_radius, ep_com_dist, n_penned, n_sheep, success):
-    if success:
-        return "SUCCESS"
-    if min(ep_radius) > COMPACT_RADIUS:
-        return "NEVER_COMPACT"
-    first_compact = next(i for i, r in enumerate(ep_radius) if r <= COMPACT_RADIUS)
-    if min(ep_com_dist[first_compact:]) > 3.0:
-        return "COMPACT_CANT_DRIVE"
-    if n_penned == 0:
-        return "DROVE_NO_SHEEP"
-    return f"PARTIAL_{n_penned}of{n_sheep}"
-
-
-def run_episodes(model, eval_env, n_episodes=30, max_steps=2000, render=False):
-    """
-    Run N deterministic episodes.
-    Returns (success_rate, failure_counts, diagnostics_dict).
-    diagnostics_dict contains per-episode and aggregate stats useful for
-    understanding WHY the policy is failing without assuming the cause.
-    """
-    failure_counts = {}
-    successes = 0
-
-    all_action_mags   = []   # action magnitude every step across all episodes
-    all_pen_progress  = []   # per-episode: total pen-dist reduction (positive = good)
-    ep_steps_list     = []
-    ep_min_pen_list   = []   # min pen dist reached in each episode
-
-    for ep in range(n_episodes):
-        obs = eval_env.reset()
-        done = False
-        ep_radius, ep_com_dist = [], []
-        ep_action_mags = []
-        n_penned = 0
-        n_sheep  = 1
-        prev_pen_dist = None
-
-        while not done:
-            action, _ = model.predict(obs, deterministic=True)
-            obs, _, dones, infos = eval_env.step(action)
-            done = dones[0]
-
-            inner    = eval_env.envs[0]
-            com, radius, _ = inner._flock_stats()
-            com_dist = float(np.linalg.norm(com - inner.PEN_CENTER))
-            ep_radius.append(radius)
-            ep_com_dist.append(com_dist)
-
-            act_mag = float(np.linalg.norm(action[0]))
-            ep_action_mags.append(act_mag)
-
-            active = ~inner.penned[:inner.n_sheep]
-            if active.any():
-                pen_dist = float(np.linalg.norm(
-                    inner.sheep_pos[:inner.n_sheep][active] - inner.PEN_CENTER, axis=1
-                ).sum())
-            else:
-                pen_dist = 0.0
-            if prev_pen_dist is None:
-                prev_pen_dist = pen_dist
-            prev_pen_dist = pen_dist
-
-            if render and ep == 0:
-                inner.render()
-
-        info     = infos[0]
-        n_penned = info.get("n_penned", 0)
-        n_sheep  = info.get("n_sheep",  1)
-        success  = n_penned == n_sheep
-        successes += int(success)
-        mode = classify_failure(ep_radius, ep_com_dist, n_penned, n_sheep, success)
-        failure_counts[mode] = failure_counts.get(mode, 0) + 1
-
-        all_action_mags.extend(ep_action_mags)
-        ep_steps_list.append(len(ep_action_mags))
-        ep_min_pen_list.append(min(ep_com_dist))
-
-        # Per-episode one-liner for real-time feedback
-        mean_act = float(np.mean(ep_action_mags))
-        min_pen  = min(ep_com_dist)
-        print(f"  ep {ep+1:>3}  steps={len(ep_action_mags):>5}  "
-              f"penned={n_penned}/{n_sheep}  "
-              f"act={mean_act:.2f}  "
-              f"min_pen={min_pen:.1f}m  [{mode}]")
-
-    success_rate = successes / n_episodes
-
-    diag = {
-        "mean_action_mag"  : float(np.mean(all_action_mags)),
-        "p10_action_mag"   : float(np.percentile(all_action_mags, 10)),
-        "p90_action_mag"   : float(np.percentile(all_action_mags, 90)),
-        "mean_min_pen_dist": float(np.mean(ep_min_pen_list)),
-        "best_min_pen_dist": float(np.min(ep_min_pen_list)),
-        "mean_ep_steps"    : float(np.mean(ep_steps_list)),
-    }
-
-    print(f"\n  Action magnitude  mean={diag['mean_action_mag']:.3f}  "
-          f"p10={diag['p10_action_mag']:.3f}  p90={diag['p90_action_mag']:.3f}"
-          f"  (0=stopped, 1=full speed)")
-    print(f"  Pen distance      mean_min={diag['mean_min_pen_dist']:.1f}m  "
-          f"best_min={diag['best_min_pen_dist']:.1f}m  "
-          f"(how close sheep got to pen center)")
-
-    return success_rate, failure_counts, diag
-
-
-def train_stage(n_sheep, steps, n_envs=4, prev_model=None, prev_vecnorm=None):
-    """Train one stage; return (model, vecnorm)."""
-    train_env = SubprocVecEnv([make_env(n_sheep, i) for i in range(n_envs)])
-
-    if prev_vecnorm is not None:
-        vn = deepcopy(prev_vecnorm)
-        vn.set_venv(train_env)
-        vn.training    = True
-        vn.norm_reward = True
-    else:
-        vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
-
-    if prev_model is not None:
-        model = prev_model
-        model.set_env(vn)
-    else:
-        model = PPO(
-            "MlpPolicy", vn,
-            learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
-            gamma=0.995, gae_lambda=0.95, clip_range=0.2, ent_coef=0.02,
-            vf_coef=0.5, max_grad_norm=0.5,
-            policy_kwargs=dict(net_arch=[256, 256]),
-            verbose=1,
-        )
-
-    model.learn(total_timesteps=steps, reset_num_timesteps=(prev_model is None),
-                tb_log_name="ppo_smoke")
-    return model, vn
-
-
-def make_eval_env(model, vecnorm, n_sheep, max_steps=2000):
-    raw = DummyVecEnv([make_env(n_sheep, seed=9999, max_steps=max_steps)])
-    vn  = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
-    vn.obs_rms = deepcopy(vecnorm.obs_rms)
-    vn.ret_rms = deepcopy(vecnorm.ret_rms)
-    return vn
-
-
-def report(n_sheep, success_rate, failure_counts, n_episodes, threshold=PASS_THRESHOLD):
-    print(f"\n{'='*52}")
-    print(f"  Stage n_sheep={n_sheep}  |  success={success_rate*100:.0f}%  ({int(success_rate*n_episodes)}/{n_episodes})")
-    print(f"  {'─'*48}")
-    for mode, cnt in sorted(failure_counts.items(), key=lambda x: -x[1]):
-        bar = "█" * cnt
-        print(f"  {mode:<26} {cnt:>3}/{n_episodes}  {bar}")
-    print(f"{'='*52}")
-
-    passed = success_rate >= threshold
-    if passed:
-        print(f"  ✓ PASS  (threshold {threshold*100:.0f}%)")
-    else:
-        dominant = max(failure_counts, key=failure_counts.get)
-        print(f"  ✗ FAIL  — dominant: {dominant}")
-        if dominant == "NEVER_COMPACT":
-            print("    Dog can't compact flock. Check W_COLLECT, obs contains straggler positions?")
-        elif dominant == "COMPACT_CANT_DRIVE":
-            print("    Flock compacts but dog doesn't drive to pen. Check alignment reward / W_DRIVE.")
-        elif dominant.startswith("PARTIAL"):
-            print("    Flock splits near pen. Dog loses stragglers at the end.")
-    print()
-    return passed
-
-
-SHEEP_COLORS = ["#e41a1c","#377eb8","#4daf4a","#984ea3","#ff7f00",
-                "#a65628","#f781bf","#999999","#66c2a5","#fc8d62"]
-
-def _save_smoke_vis(model, vn, n_sheep, save_dir, seed=42, max_steps=2000):
-    """Run one episode and save trajectory + timeseries PNGs."""
-    from copy import deepcopy
-    raw  = DummyVecEnv([make_env(n_sheep, seed=seed, max_steps=max_steps)])
-    env  = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
-    env.obs_rms = deepcopy(vn.obs_rms)
-    env.ret_rms = deepcopy(vn.ret_rms)
-
-    obs = env.reset()
-    inner = env.envs[0]
-    dog_xs, dog_ys = [], []
-    sheep_xs = [[] for _ in range(n_sheep)]
-    sheep_ys = [[] for _ in range(n_sheep)]
-    radii, action_mags, rewards = [], [], []
-    pen_dists = [[] for _ in range(n_sheep)]
-    done = False
-
-    while not done:
-        action, _ = model.predict(obs, deterministic=True)
-        obs, reward, dones, _ = env.step(action)
-        done = dones[0]
-        dog_xs.append(float(inner.dog_pos[0])); dog_ys.append(float(inner.dog_pos[1]))
-        com, radius, _ = inner._flock_stats()
-        radii.append(radius)
-        rewards.append(float(reward[0]))
-        action_mags.append(float(np.linalg.norm(action[0])))
-        for i in range(n_sheep):
-            sheep_xs[i].append(float(inner.sheep_pos[i][0]))
-            sheep_ys[i].append(float(inner.sheep_pos[i][1]))
-            pen_dists[i].append(float(np.linalg.norm(inner.sheep_pos[i] - inner.PEN_CENTER)))
-    env.close()
-
-    steps = len(dog_xs)
-    # Trajectory
-    fig, ax = plt.subplots(figsize=(6,6))
-    ax.set_xlim(-16,16); ax.set_ylim(-16,16); ax.set_aspect("equal")
-    ax.set_facecolor("#dcedc8")
-    ax.add_patch(mpatches.Rectangle((-15,-15),30,30,fill=False,edgecolor="#795548",lw=2))
-    ax.add_patch(mpatches.Rectangle((10,-15),3,7,facecolor="#ffe082",edgecolor="#795548",lw=2))
-    ax.text(11.5,-11.5,"pen",ha="center",va="center",fontsize=8,color="#795548")
-    for i in range(n_sheep):
-        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
-        ax.plot(sheep_xs[i], sheep_ys[i], color=c, lw=1, alpha=0.6, label=f"sheep {i+1}")
-        ax.plot(sheep_xs[i][0], sheep_ys[i][0], "o", color=c, ms=7)
-        ax.plot(sheep_xs[i][-1], sheep_ys[i][-1], "*", color=c, ms=10)
-    ax.plot(dog_xs, dog_ys, color="#4e342e", lw=1.5, label="dog", alpha=0.8)
-    ax.plot(dog_xs[0], dog_ys[0], "s", color="#4e342e", ms=9)
-    ax.plot(dog_xs[-1], dog_ys[-1], "D", color="#4e342e", ms=9)
-    ax.set_title(f"n_sheep={n_sheep}  {steps} steps  min_r={min(radii):.1f}m")
-    ax.legend(fontsize=7, loc="upper left")
-    plt.tight_layout()
-    fig.savefig(os.path.join(save_dir, "trajectory.png"), dpi=100)
-    plt.close(fig)
-
-    # Timeseries
-    t = np.arange(steps)
-    fig, axes = plt.subplots(4,1,figsize=(10,8),sharex=True)
-    axes[0].plot(t, radii, color="steelblue"); axes[0].axhline(5,color="orange",ls="--",lw=1)
-    axes[0].set_ylabel("radius (m)"); axes[0].set_title("Flock radius (orange=5m threshold)")
-    for i in range(n_sheep):
-        axes[1].plot(t, pen_dists[i], color=SHEEP_COLORS[i%len(SHEEP_COLORS)], lw=1, label=f"sheep {i+1}")
-    axes[1].set_ylabel("pen dist (m)"); axes[1].set_title("Per-sheep distance to pen"); axes[1].legend(fontsize=7)
-    axes[2].plot(t, action_mags, color="tomato", lw=1, alpha=0.8)
-    axes[2].axhline(1.0,color="gray",ls="--",lw=1); axes[2].set_ylim(0,1.5)
-    axes[2].set_ylabel("action mag"); axes[2].set_title("Dog action magnitude (0=stopped)")
-    axes[3].plot(t, rewards, color="purple", lw=1, alpha=0.7); axes[3].axhline(0,color="black",lw=0.5)
-    axes[3].set_ylabel("reward"); axes[3].set_xlabel("step"); axes[3].set_title("Reward per step")
-    fig.suptitle(f"Smoke stage n_sheep={n_sheep}", fontsize=12)
-    plt.tight_layout()
-    fig.savefig(os.path.join(save_dir, "timeseries.png"), dpi=100)
-    plt.close(fig)
-    print(f"  Viz saved to {save_dir}/trajectory.png + timeseries.png")
-
-
-def main():
-    p = argparse.ArgumentParser()
-    p.add_argument("--steps",   type=int, default=500_000,
-                   help="Steps per smoke-test stage (default 500k)")
-    p.add_argument("--n-envs",  type=int, default=4)
-    p.add_argument("--episodes", type=int, default=30,
-                   help="Validation episodes per stage")
-    p.add_argument("--render",  action="store_true")
-    args = p.parse_args()
-
-    # 1 sheep (500k):  hard check — obs/reward structurally correct?
-    # Thresholds are MINIMUM bars — smoke test always runs ALL stages even on failure.
-    # The per-episode diagnostics tell you WHY a stage failed.
-    stages = [(1, args.steps, 0.10), (2, args.steps * 2, 0.20), (3, args.steps * 3, 0.10)]
-
-    model, vn = None, None
-    stage_results = []
-
-    for n_sheep, steps, threshold in stages:
-        print(f"\n{'#'*52}")
-        print(f"#  Smoke-test stage: n_sheep={n_sheep}, {steps:,} steps")
-        print(f"{'#'*52}")
-
-        model, vn = train_stage(n_sheep, steps, args.n_envs, model, vn)
-
-        eval_env = make_eval_env(model, vn, n_sheep)
-        success_rate, failure_counts, diag = run_episodes(
-            model, eval_env, args.episodes, render=args.render
-        )
-        eval_env.close()
-
-        save_dir = f"runs/smoke_stage{n_sheep}"
-        os.makedirs(save_dir, exist_ok=True)
-        model.save(os.path.join(save_dir, "model"))
-        vn.save(os.path.join(save_dir, "vecnorm.pkl"))
-        print(f"  Model saved to {save_dir}/")
-        _save_smoke_vis(model, vn, n_sheep, save_dir)
-
-        passed = report(n_sheep, success_rate, failure_counts, args.episodes, threshold)
-        stage_results.append((n_sheep, success_rate, passed, diag))
-
-        if not passed:
-            print(f"  ⚠  Stage {n_sheep} BELOW threshold — continuing to next stage.")
-            print(f"     mean_action={diag['mean_action_mag']:.3f}  "
-                  f"best_pen_approach={diag['best_min_pen_dist']:.1f}m")
-            if diag['mean_action_mag'] < 0.05:
-                print("  !! Dog is NOT moving (sit-still). "
-                      "Check ent_coef / step_cost / alignment.")
-            elif diag['best_min_pen_dist'] > 5.0:
-                print("  !! Dog never gets sheep near pen. "
-                      "Check reward direction / initialization.")
-            else:
-                print("  !! Dog moves and approaches pen but low success rate. "
-                      "Likely needs more training time.")
-
-    print(f"\n{'='*52}")
-    print("  SMOKE TEST SUMMARY")
-    print(f"{'='*52}")
-    all_passed = True
-    for n_sheep, sr, passed, diag in stage_results:
-        status = "PASS" if passed else "FAIL"
-        print(f"  n_sheep={n_sheep}  success={sr*100:.0f}%  "
-              f"act={diag['mean_action_mag']:.2f}  "
-              f"best_pen={diag['best_min_pen_dist']:.1f}m  [{status}]")
-        if not passed:
-            all_passed = False
-
-    if all_passed:
-        print("\n  All stages passed. Ready for full curriculum training:")
-        print("    python train.py --curriculum --steps-per-stage 1500000 "
-              "--total-steps 15000000 --n-sheep 1 --max-sheep 10 "
-              "--n-envs 8 --run-dir runs/ppo_v3")
-    else:
-        print("\n  Some stages below threshold — check diagnostics above.")
-        print("  Key signals: act<0.05=sit-still, best_pen>5=wrong direction, "
-              "else needs more training time.")
-    print()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/training/sweep_reward.py b/training/sweep_reward.py
deleted file mode 100644
index 03f318a..0000000
--- a/training/sweep_reward.py
+++ /dev/null
@@ -1,314 +0,0 @@
-"""
-Random-search sweep over reward-function hyperparameters.
-
-Each trial trains a fresh PPO policy through a 1→2-sheep curriculum on a tight
-budget, then evaluates at n=1,2,3 sheep. A composite score is computed and
-written to a JSONL log. After all trials, a leaderboard is printed and the
-best config is saved.
-
-Sized to fit in ~4 hours wall-clock with default settings on 8 envs.
-
-Usage
------
-    python sweep_reward.py                     # 25 trials, default budget
-    python sweep_reward.py --n-trials 15
-    python sweep_reward.py --time-budget 6     # stop adding trials past 6h
-    python sweep_reward.py --resume runs/sweep_<timestamp>   # continue logging
-
-Per-trial budget (see TRAIN_*_STEPS below): ~1.0M training steps + 30 eval
-episodes × 3 sheep counts. On this env that runs in ~8–12 min per trial.
-"""
-import argparse
-import json
-import os
-import time
-import traceback
-from copy import deepcopy
-
-import numpy as np
-from stable_baselines3 import PPO
-from stable_baselines3.common.callbacks import BaseCallback
-from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize
-
-from herding_env import HerdingEnv
-
-
-class ProgressCallback(BaseCallback):
-    """Print a one-line trial-progress summary every `freq` env steps.
-    Tracks per-env returns and success directly from rollout rewards/infos
-    (no Monitor wrapper needed). The success window is COUNT-BASED, not
-    time-based, so successful episodes (which finish faster) don't oversample
-    the window vs truncated episodes (which take max_steps)."""
-    def __init__(self, trial_id: int, stage_label: str, freq: int = 50_000):
-        super().__init__()
-        self.trial_id    = trial_id
-        self.stage_label = stage_label
-        self.freq        = freq
-        self._last       = 0
-        self._ep_returns = []
-        self._ep_success = []
-        self._completed_count = 0   # total completed episodes since callback start
-        self._success_count   = 0   # total successful episodes since callback start
-        self._cur_ret    = None    # per-env running return
-
-    def _on_step(self) -> bool:
-        rewards = self.locals.get("rewards")
-        dones   = self.locals.get("dones")
-        infos   = self.locals.get("infos", [])
-        if rewards is None or dones is None:
-            return True
-        if self._cur_ret is None or len(self._cur_ret) != len(rewards):
-            self._cur_ret = np.zeros(len(rewards), dtype=np.float64)
-        self._cur_ret += np.asarray(rewards, dtype=np.float64)
-        for i, d in enumerate(dones):
-            if not d: continue
-            self._ep_returns.append(float(self._cur_ret[i]))
-            info = infos[i] if i < len(infos) else {}
-            success = int(info.get("n_penned", 0) == info.get("n_sheep", -1))
-            self._ep_success.append(success)
-            self._completed_count += 1
-            self._success_count   += success
-            self._cur_ret[i] = 0.0
-            if len(self._ep_returns) > 50:
-                self._ep_returns.pop(0); self._ep_success.pop(0)
-        if self.num_timesteps - self._last >= self.freq:
-            self._last = self.num_timesteps
-            n_eps  = len(self._ep_returns)
-            mean_r = float(np.mean(self._ep_returns)) if n_eps else float("nan")
-            # Window sr (biased: short eps over-represented), and cumulative sr
-            # (unbiased over the whole stage).
-            win_sr = float(np.mean(self._ep_success)) if n_eps else float("nan")
-            cum_sr = (self._success_count / self._completed_count
-                      if self._completed_count else float("nan"))
-            print(f"           ... [trial {self.trial_id+1} | {self.stage_label} | "
-                  f"{self.num_timesteps:>7,} steps | "
-                  f"ret(last {n_eps})={mean_r:+.2f}  "
-                  f"win_sr={win_sr*100:.0f}%  cum_sr={cum_sr*100:.0f}%]",
-                  flush=True)
-        return True
-
-# ---------------------------------------------------------------------------
-# Search space — reward weights + a couple of hyperparams
-# ---------------------------------------------------------------------------
-SEARCH_SPACE = {
-    "W_PER_SHEEP": [1.0, 2.0, 4.0, 6.0],
-    "W_ALIGN":     [0.0, 0.025, 0.05, 0.1],
-    "W_PEN_BONUS": [5.0, 10.0, 20.0],
-    "W_STEP_COST": [0.005, 0.02, 0.05],
-    "W_COMPLETE":  [50.0, 100.0, 200.0],
-    "W_COMPACT":   [0.0, 0.5, 1.5, 3.0],
-    "ALIGN_SHAPE": ["standoff", "near"],
-    "ALIGN_GATED": [True, False],
-    "ent_coef":    [0.005, 0.01, 0.02, 0.05],
-}
-
-# Per-trial training budget — keep tight; total = sum + eval
-TRAIN_STAGE1_STEPS = 400_000   # 1 sheep
-TRAIN_STAGE2_STEPS = 600_000   # 2 sheep
-EVAL_EPISODES      = 10
-EVAL_NSHEEP        = (1, 2, 3)
-MAX_STEPS          = 1500
-N_ENVS             = 8
-
-
-def sample_config(rng: np.random.Generator) -> dict:
-    cfg = {}
-    for k, v in SEARCH_SPACE.items():
-        choice = v[int(rng.integers(0, len(v)))]
-        cfg[k] = bool(choice) if isinstance(choice, np.bool_) else choice
-    return cfg
-
-
-def reward_cfg(cfg: dict) -> dict:
-    """Strip non-env keys (anything that isn't a HerdingEnv attribute)."""
-    return {k: v for k, v in cfg.items() if k != "ent_coef"}
-
-
-def make_env(n_sheep, seed, max_steps, rcfg, random_n_sheep=False):
-    def _init():
-        env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
-                         reward_cfg=rcfg, random_n_sheep=random_n_sheep)
-        env.reset(seed=seed)
-        return env
-    return _init
-
-
-def evaluate(model, vn_template, n_sheep, n_episodes, max_steps, rcfg):
-    raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, rcfg)])
-    vn  = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
-    vn.obs_rms = deepcopy(vn_template.obs_rms)
-    vn.ret_rms = deepcopy(vn_template.ret_rms)
-    successes = 0
-    ep_lens, min_pen_list, action_mags = [], [], []
-    for _ in range(n_episodes):
-        obs = vn.reset()
-        done = False
-        steps, min_pen, mags = 0, float("inf"), []
-        while not done:
-            action, _ = model.predict(obs, deterministic=True)
-            obs, _, dones, infos = vn.step(action)
-            done = dones[0]
-            inner = vn.envs[0]
-            com, _, _ = inner._flock_stats()
-            min_pen = min(min_pen, float(np.linalg.norm(com - inner.PEN_CENTER)))
-            mags.append(float(np.linalg.norm(action[0])))
-            steps += 1
-        successes += int(infos[0].get("n_penned") == n_sheep)
-        ep_lens.append(steps)
-        min_pen_list.append(min_pen)
-        action_mags.extend(mags)
-    vn.close()
-    return {
-        "sr":        successes / n_episodes,
-        "mean_len":  float(np.mean(ep_lens)),
-        "mean_min_pen": float(np.mean(min_pen_list)),
-        "mean_act":  float(np.mean(action_mags)),
-    }
-
-
-def run_trial(trial_id: int, cfg: dict, log_path: str, run_dir: str) -> dict:
-    rcfg = reward_cfg(cfg)
-    trial_dir = os.path.join(run_dir, f"trial_{trial_id:03d}")
-    os.makedirs(trial_dir, exist_ok=True)
-    with open(os.path.join(trial_dir, "config.json"), "w") as f:
-        json.dump(cfg, f, indent=2)
-
-    train_env = SubprocVecEnv([
-        make_env(1, seed=trial_id * 100 + i, max_steps=MAX_STEPS, rcfg=rcfg)
-        for i in range(N_ENVS)
-    ])
-    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)
-
-    model = PPO(
-        "MlpPolicy", vn,
-        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
-        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
-        ent_coef=cfg["ent_coef"], vf_coef=0.5, max_grad_norm=0.5,
-        policy_kwargs=dict(net_arch=[256, 256]),
-        verbose=0,
-    )
-
-    try:
-        model.learn(total_timesteps=TRAIN_STAGE1_STEPS,
-                    reset_num_timesteps=True,
-                    callback=ProgressCallback(trial_id, "1 sheep"))
-        vn.env_method("set_n_sheep", 2)
-        model.learn(total_timesteps=TRAIN_STAGE2_STEPS,
-                    reset_num_timesteps=False,
-                    callback=ProgressCallback(trial_id, "2 sheep"))
-
-        per_sheep = {}
-        for n in EVAL_NSHEEP:
-            print(f"           ... [trial {trial_id+1} | eval n={n}]", flush=True)
-            per_sheep[n] = evaluate(model, vn, n, EVAL_EPISODES, MAX_STEPS, rcfg)
-
-        model.save(os.path.join(trial_dir, "model"))
-        vn.save(os.path.join(trial_dir, "vecnorm.pkl"))
-    finally:
-        try: vn.close()
-        except Exception: pass
-
-    sr = {n: per_sheep[n]["sr"] for n in EVAL_NSHEEP}
-    score = 0.2 * sr[1] + 0.5 * sr[2] + 0.3 * sr[3]
-    return {
-        "trial":    trial_id,
-        "config":   cfg,
-        "score":    score,
-        "sr":       sr,
-        "details":  per_sheep,
-    }
-
-
-def main():
-    p = argparse.ArgumentParser()
-    p.add_argument("--n-trials", type=int, default=25)
-    p.add_argument("--time-budget", type=float, default=7.5,
-                   help="Stop launching new trials past this many hours.")
-    p.add_argument("--seed", type=int, default=42)
-    p.add_argument("--run-dir", type=str, default=None,
-                   help="If unset, creates runs/sweep_<timestamp>/")
-    p.add_argument("--resume", type=str, default=None,
-                   help="Continue logging into an existing sweep dir")
-    args = p.parse_args()
-
-    run_dir = args.resume or args.run_dir or os.path.join(
-        "runs", "sweep_" + time.strftime("%Y%m%d_%H%M%S")
-    )
-    os.makedirs(run_dir, exist_ok=True)
-    log_path = os.path.join(run_dir, "results.jsonl")
-
-    rng = np.random.default_rng(args.seed)
-    start = time.time()
-    budget_s = args.time_budget * 3600
-    results = []
-
-    # If resuming, replay the existing log into memory
-    if args.resume and os.path.exists(log_path):
-        with open(log_path) as f:
-            for line in f:
-                try: results.append(json.loads(line))
-                except Exception: pass
-        print(f"Resumed sweep: {len(results)} prior trials loaded from {log_path}")
-
-    print(f"Sweep dir: {run_dir}")
-    print(f"Search space: {list(SEARCH_SPACE.keys())}")
-    print(f"Per-trial: {TRAIN_STAGE1_STEPS+TRAIN_STAGE2_STEPS:,} steps train + "
-          f"{EVAL_EPISODES * len(EVAL_NSHEEP)} eval eps")
-    print(f"Time budget: {args.time_budget}h\n")
-
-    n_done = sum(1 for r in results if "error" not in r)
-    trial_id = len(results)
-    while n_done < args.n_trials:
-        elapsed_h = (time.time() - start) / 3600
-        if elapsed_h >= args.time_budget:
-            print(f"\n[Sweep] time budget reached ({elapsed_h:.2f}h) — stopping.")
-            break
-
-        cfg = sample_config(rng)
-        t0 = time.time()
-        print(f"[Trial {trial_id+1:>3}] {cfg}")
-        try:
-            result = run_trial(trial_id, cfg, log_path, run_dir)
-            result["elapsed_s"] = time.time() - t0
-            sr = result["sr"]
-            print(f"           → score={result['score']:.3f}  "
-                  f"sr1={sr[1]:.2f} sr2={sr[2]:.2f} sr3={sr[3]:.2f}  "
-                  f"[{result['elapsed_s']:.0f}s]")
-            results.append(result)
-            n_done += 1
-        except Exception as e:
-            traceback.print_exc()
-            err = {"trial": trial_id, "config": cfg,
-                   "error": f"{type(e).__name__}: {e}",
-                   "elapsed_s": time.time() - t0}
-            results.append(err)
-            print(f"           ! FAILED: {err['error']}")
-        with open(log_path, "a") as f:
-            f.write(json.dumps(results[-1]) + "\n")
-        trial_id += 1
-
-    # Leaderboard
-    succ = [r for r in results if "error" not in r]
-    succ.sort(key=lambda r: -r["score"])
-    print("\n" + "=" * 92)
-    print("  LEADERBOARD")
-    print("=" * 92)
-    hdr = f"  {'rank':>4} {'score':>6} {'sr1':>5} {'sr2':>5} {'sr3':>5}  config"
-    print(hdr); print("  " + "-" * 88)
-    for i, r in enumerate(succ[:15], 1):
-        sr = r["sr"]
-        cfg_short = " ".join(f"{k}={v}" for k, v in r["config"].items())
-        print(f"  {i:>4d} {r['score']:>6.3f} {sr[1]:>5.2f} {sr[2]:>5.2f} {sr[3]:>5.2f}  {cfg_short}")
-
-    if succ:
-        best = succ[0]
-        with open(os.path.join(run_dir, "best.json"), "w") as f:
-            json.dump(best, f, indent=2)
-        print(f"\n  Best config saved to {run_dir}/best.json")
-        print(f"  Total trials: {len(results)} ({len(succ)} successful, "
-              f"{len(results)-len(succ)} failed)")
-        print(f"  Total time:   {(time.time()-start)/3600:.2f}h\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/training/train.py b/training/train.py
index b961cd6..7e549c4 100644
--- a/training/train.py
+++ b/training/train.py
@@ -1,414 +1,529 @@
 """
-PPO training script for the herding task.
+PPO training for the herding task with curriculum learning.
 
-Usage examples
---------------
-# Proper 5-sheep curriculum, 1 M steps per stage:
-    python train.py --curriculum --steps-per-stage 1000000 --total-steps 5000000
+Trains from scratch through a 1→max_sheep curriculum, evaluates after each
+stage, and auto-generates trajectory/timeseries plots plus a summary chart.
 
-# Success-rate curriculum (advances when 70 % success over 100 episodes):
-    python train.py --curriculum --threshold 0.70
+Usage
+-----
+    python train.py                                       # defaults from config.json
+    python train.py --config my_config.json --max-sheep 5
+    python train.py --max-sheep 3 --steps-per-stage 1000000
 
-# Resume from checkpoint at stage 3:
-    python train.py --resume runs/ppo_herding/ckpt_3000000_steps.zip --n-sheep 3 \
-                    --curriculum --steps-per-stage 1000000 --total-steps 5000000
-
-# Quick smoke-test:
-    python train.py --n-envs 1 --total-steps 50000
+Outputs (in runs/<timestamp>/):
+    config.json          resolved config
+    final_model.zip      trained PPO model
+    vecnorm.pkl          VecNormalize statistics
+    stage_results.json   per-stage evaluation metrics
+    success_rate.png     summary bar chart
+    eval/                trajectory & timeseries plots per sheep count
 """
 
 import argparse
+import json
 import os
+import time
 from copy import deepcopy
 
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
 import numpy as np
+from matplotlib.collections import LineCollection
 from stable_baselines3 import PPO
-from stable_baselines3.common.callbacks import (
-    BaseCallback,
-    CallbackList,
-    CheckpointCallback,
-    EvalCallback,
+from stable_baselines3.common.callbacks import BaseCallback
+from stable_baselines3.common.vec_env import (
+    DummyVecEnv,
+    SubprocVecEnv,
+    VecNormalize,
 )
-from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize
 
 from herding_env import HerdingEnv
 
+
+# ── Colours ──────────────────────────────────────────────────────────────────
+
+SHEEP_COLORS = [
+    "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00",
+    "#a65628", "#f781bf", "#999999", "#66c2a5", "#fc8d62",
+]
+DOG_COLOR = "#4e342e"
+
+
+# ── Callbacks ────────────────────────────────────────────────────────────────
+
+class ProgressCallback(BaseCallback):
+    """One-line progress summary every `freq` env steps."""
+
+    def __init__(self, stage_label: str, freq: int = 100_000):
+        super().__init__()
+        self.stage_label = stage_label
+        self.freq = freq
+        self._last = 0
+        self._ep_returns = []
+        self._ep_success = []
+        self._total_eps = 0
+        self._total_success = 0
+        self._cur_ret = None
+
+    def _on_step(self) -> bool:
+        rewards = self.locals.get("rewards")
+        dones = self.locals.get("dones")
+        infos = self.locals.get("infos", [])
+        if rewards is None or dones is None:
+            return True
+        if self._cur_ret is None or len(self._cur_ret) != len(rewards):
+            self._cur_ret = np.zeros(len(rewards), dtype=np.float64)
+        self._cur_ret += np.asarray(rewards, dtype=np.float64)
+        for i, d in enumerate(dones):
+            if not d:
+                continue
+            self._ep_returns.append(float(self._cur_ret[i]))
+            info = infos[i] if i < len(infos) else {}
+            success = int(info.get("n_penned", 0) == info.get("n_sheep", -1))
+            self._ep_success.append(success)
+            self._total_eps += 1
+            self._total_success += success
+            self._cur_ret[i] = 0.0
+            if len(self._ep_returns) > 50:
+                self._ep_returns.pop(0)
+                self._ep_success.pop(0)
+        if self.num_timesteps - self._last >= self.freq:
+            self._last = self.num_timesteps
+            n = len(self._ep_returns)
+            mean_r = float(np.mean(self._ep_returns)) if n else float("nan")
+            win_sr = float(np.mean(self._ep_success)) if n else float("nan")
+            cum_sr = (self._total_success / self._total_eps
+                      if self._total_eps else float("nan"))
+            print(f"           ... [{self.stage_label} | "
+                  f"{self.num_timesteps:>7,} steps | "
+                  f"ret(last {n})={mean_r:+.2f}  "
+                  f"win_sr={win_sr*100:.0f}%  cum_sr={cum_sr*100:.0f}%]",
+                  flush=True)
+        return True
+
+
+# ── Environment factory ──────────────────────────────────────────────────────
+
+def make_env(n_sheep, seed, max_steps, reward_cfg=None):
+    def _init():
+        env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
+                         reward_cfg=reward_cfg)
+        env.reset(seed=seed)
+        return env
+    return _init
+
+
+# ── Failure-mode classification ──────────────────────────────────────────────
+
 COMPACT_RADIUS = 5.0
 
 
-def _classify(ep_radius, ep_com_dist, n_penned, n_sheep, success):
-    if success:
+def _classify(ep_radii, ep_com_dists, n_penned, n_sheep):
+    if n_penned == n_sheep:
         return "SUCCESS"
-    if min(ep_radius) > COMPACT_RADIUS:
+    if min(ep_radii) > COMPACT_RADIUS:
         return "NEVER_COMPACT"
-    first = next(i for i, r in enumerate(ep_radius) if r <= COMPACT_RADIUS)
-    if min(ep_com_dist[first:]) > 3.0:
+    first = next(i for i, r in enumerate(ep_radii) if r <= COMPACT_RADIUS)
+    if min(ep_com_dists[first:]) > 3.0:
         return "COMPACT_CANT_DRIVE"
     if n_penned == 0:
         return "DROVE_NO_SHEEP"
     return f"PARTIAL_{n_penned}of{n_sheep}"
 
 
-# ---------------------------------------------------------------------------
-# Curriculum callback
-# ---------------------------------------------------------------------------
+# ── Evaluation ───────────────────────────────────────────────────────────────
 
-class CurriculumCallback(BaseCallback):
-    """
-    Advances n_sheep on both training and eval envs.
+def evaluate(model, vn_template, n_sheep, n_episodes, max_steps,
+             reward_cfg=None):
+    """Evaluate at a given sheep count; returns metrics dict."""
+    raw = DummyVecEnv([make_env(n_sheep, 9999, max_steps, reward_cfg)])
+    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
+    vn.obs_rms = deepcopy(vn_template.obs_rms)
+    vn.ret_rms = deepcopy(vn_template.ret_rms)
 
-    Two modes (mutually exclusive):
-      steps_per_stage — advance every N environment steps regardless of
-                        success rate (recommended for reliability).
-      threshold       — advance when rolling success rate exceeds this value
-                        (requires the policy to actually reach the threshold).
-    """
+    successes = 0
+    ep_lens = []
+    min_pen_list = []
+    action_mags = []
+    failure_counts = {}
+    rc_sums = {}
+    rc_n = 0
 
-    def __init__(self, start_sheep: int, max_sheep: int,
-                 eval_env=None,
-                 steps_per_stage: int = None,
-                 threshold: float = 0.75,
-                 window: int = 100,
-                 min_episodes: int = 50,
-                 verbose: int = 1):
-        super().__init__(verbose)
-        self.max_sheep       = max_sheep
-        self.eval_env        = eval_env
-        self.steps_per_stage = steps_per_stage
-        self.threshold       = threshold
-        self.window          = window
-        self.min_episodes    = min_episodes
-        self._cur_sheep      = start_sheep
-        self._successes      = []
-        self._stage_start    = 0
+    for _ in range(n_episodes):
+        obs = vn.reset()
+        done = False
+        steps = 0
+        min_pen = float("inf")
+        mags = []
+        ep_radii = []
+        ep_com_dists = []
+        while not done:
+            action, _ = model.predict(obs, deterministic=True)
+            obs, _, dones, infos = vn.step(action)
+            done = dones[0]
+            inner = vn.envs[0]
+            com, radius, _ = inner._flock_stats()
+            min_pen = min(min_pen, float(np.linalg.norm(com - inner.PEN_CENTER)))
+            mags.append(float(np.linalg.norm(action[0])))
+            ep_radii.append(radius)
+            ep_com_dists.append(float(np.linalg.norm(com - inner.PEN_CENTER)))
+            steps += 1
+            rc = infos[0].get("rcomps")
+            if rc:
+                for k, v in rc.items():
+                    rc_sums[k] = rc_sums.get(k, 0.0) + v
+                rc_n += 1
+        n_penned = infos[0].get("n_penned", 0)
+        success = n_penned == n_sheep
+        successes += int(success)
+        ep_lens.append(steps)
+        min_pen_list.append(min_pen)
+        action_mags.extend(mags)
+        mode = _classify(ep_radii, ep_com_dists, n_penned, n_sheep)
+        failure_counts[mode] = failure_counts.get(mode, 0) + 1
 
-    def _advance(self):
-        prev_sheep = self._cur_sheep
-        recent_sr = (np.mean(self._successes) if self._successes else float("nan"))
-        if self.verbose:
-            print(f"\n[Curriculum] leaving stage n_sheep={prev_sheep} "
-                  f"after {self.num_timesteps - self._stage_start:,} steps "
-                  f"| training success rate (last {len(self._successes)} eps) = "
-                  f"{recent_sr*100:.0f}%")
-        self._cur_sheep += 1
-        self.training_env.env_method("set_n_sheep", self._cur_sheep)
-        if self.eval_env is not None:
-            self.eval_env.env_method("set_n_sheep", self._cur_sheep)
-        self._stage_start = self.num_timesteps
-        self._successes.clear()
-        if self.verbose:
-            print(f"[Curriculum] → {self._cur_sheep} sheep "
-                  f"at step {self.num_timesteps:,}\n")
+    vn.close()
 
-    def _on_step(self) -> bool:
-        if self._cur_sheep >= self.max_sheep:
-            return True
-
-        # Always track training-side success (success = sheep all penned, not truncated)
-        for info, done in zip(self.locals["infos"], self.locals["dones"]):
-            if done:
-                npen = info.get("n_penned", 0)
-                nshp = info.get("n_sheep",  self._cur_sheep)
-                self._successes.append(1 if npen == nshp else 0)
-                if len(self._successes) > self.window:
-                    self._successes.pop(0)
-
-        if self.steps_per_stage is not None:
-            if self.num_timesteps - self._stage_start >= self.steps_per_stage:
-                self._advance()
-        else:
-            if (len(self._successes) >= self.min_episodes
-                    and np.mean(self._successes) >= self.threshold):
-                self._advance()
-
-        return True
+    result = {
+        "sr": successes / n_episodes,
+        "mean_len": float(np.mean(ep_lens)),
+        "mean_min_pen": float(np.mean(min_pen_list)),
+        "mean_act": float(np.mean(action_mags)) if action_mags else 0.0,
+        "failure_modes": failure_counts,
+    }
+    if rc_n > 0:
+        result["reward_per_step"] = {k: v / rc_n for k, v in rc_sums.items()}
+    return result
 
 
-# ---------------------------------------------------------------------------
-# Diagnostic callback — failure-mode breakdown every diag_freq steps
-# ---------------------------------------------------------------------------
+# ── Visualization helpers ────────────────────────────────────────────────────
 
-class DiagnosticCallback(BaseCallback):
-    """
-    Every diag_freq env steps: spin up a temporary eval env, run n_episodes
-    deterministic episodes, and print a failure-mode breakdown.
-    Aborts training (returns False) if the dominant failure mode hasn't
-    changed after two consecutive checks at the same n_sheep — a sign that
-    training has stalled and further steps are wasted.
-    """
-
-    def __init__(self, diag_freq: int = 500_000, n_episodes: int = 20,
-                 max_steps: int = 2000, abort_on_stall: bool = True,
-                 verbose: int = 1):
-        super().__init__(verbose)
-        self.diag_freq   = diag_freq
-        self.n_episodes  = n_episodes
-        self.max_steps   = max_steps
-        self.abort_on_stall = abort_on_stall
-        self._last_diag  = 0
-        self._prev_dominant = None   # (n_sheep, mode) from last check
-        self._stall_count   = 0
-
-    def _on_step(self) -> bool:
-        if self.num_timesteps - self._last_diag < self.diag_freq:
-            return True
-        self._last_diag = self.num_timesteps
-
-        n_sheep = self.training_env.get_attr("n_sheep")[0]
-
-        # Build a temporary single-env with copied VecNorm stats
-        raw = DummyVecEnv([lambda: HerdingEnv(n_sheep=n_sheep,
-                                              max_steps=self.max_steps)])
-        vn  = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
-        vn.obs_rms = deepcopy(self.training_env.obs_rms)
-        vn.ret_rms = deepcopy(self.training_env.ret_rms)
-
-        failure_counts = {}
-        successes = 0
-        all_action_mags  = []
-        ep_min_radii     = []
-        ep_min_dog_com   = []   # closest the dog ever got to flock COM
-        ep_min_pen_dists = []   # closest COM ever got to pen
-        rcomp_sums = {"progress":0.0,"alignment":0.0,"pen_bonus":0.0,
-                      "step_cost":0.0,"complete":0.0}
-        rcomp_n    = 0
-
-        for _ in range(self.n_episodes):
-            obs  = vn.reset()
-            done = False
-            ep_radius, ep_com_dist, ep_dog_com = [], [], []
-            ep_actions = []
-            n_penned = 0
-
-            while not done:
-                action, _ = self.model.predict(obs, deterministic=True)
-                obs, _, dones, infos = vn.step(action)
-                done = dones[0]
-                inner = vn.envs[0]
-                com, radius, _ = inner._flock_stats()
-                ep_radius.append(radius)
-                ep_com_dist.append(
-                    float(np.linalg.norm(com - inner.PEN_CENTER))
-                )
-                ep_dog_com.append(
-                    float(np.linalg.norm(inner.dog_pos - com))
-                )
-                ep_actions.append(float(np.linalg.norm(action[0])))
-                rc = infos[0].get("rcomps")
-                if rc is not None:
-                    for k in rcomp_sums: rcomp_sums[k] += rc[k]
-                    rcomp_n += 1
-
-            n_penned = infos[0].get("n_penned", 0)
-            success  = n_penned == n_sheep
-            successes += int(success)
-            mode = _classify(ep_radius, ep_com_dist, n_penned, n_sheep, success)
-            failure_counts[mode] = failure_counts.get(mode, 0) + 1
-            all_action_mags.extend(ep_actions)
-            ep_min_radii.append(min(ep_radius))
-            ep_min_dog_com.append(min(ep_dog_com))
-            ep_min_pen_dists.append(min(ep_com_dist))
-
-        vn.close()
-
-        success_rate = successes / self.n_episodes
-        dominant     = max(failure_counts, key=failure_counts.get)
-
-        if self.verbose:
-            print(f"\n[Diag @ {self.num_timesteps:,} | n_sheep={n_sheep} | "
-                  f"success={success_rate*100:.0f}%]")
-            for m, c in sorted(failure_counts.items(), key=lambda x: -x[1]):
-                print(f"  {m:<26} {c}/{self.n_episodes}")
-            mean_act = float(np.mean(all_action_mags)) if all_action_mags else 0.0
-            p10 = float(np.percentile(all_action_mags, 10)) if all_action_mags else 0.0
-            p90 = float(np.percentile(all_action_mags, 90)) if all_action_mags else 0.0
-            print(f"  action_mag mean={mean_act:.3f} p10={p10:.3f} p90={p90:.3f} "
-                  f"(0=stopped, 1=full speed)")
-            print(f"  min_flock_radius mean={np.mean(ep_min_radii):.2f}m "
-                  f"best={np.min(ep_min_radii):.2f}m  (target <5m to compact)")
-            print(f"  min_dog_to_com   mean={np.mean(ep_min_dog_com):.2f}m "
-                  f"best={np.min(ep_min_dog_com):.2f}m  (FLEE_DIST=7m)")
-            print(f"  min_com_to_pen   mean={np.mean(ep_min_pen_dists):.2f}m "
-                  f"best={np.min(ep_min_pen_dists):.2f}m")
-            if rcomp_n > 0:
-                print(f"  reward/step (mean): " + "  ".join(
-                    f"{k}={rcomp_sums[k]/rcomp_n:+.4f}" for k in
-                    ("progress","alignment","pen_bonus","step_cost","complete")
-                ))
-
-        # Stall detection — disabled when --no-stall-abort or when we've never
-        # seen any stage succeed (we want full visibility into what's happening).
-        key = (n_sheep, dominant)
-        if key == self._prev_dominant and dominant != "SUCCESS":
-            self._stall_count += 1
-            if (self.abort_on_stall and self._stall_count >= 5
-                    and self.num_timesteps >= 3_000_000):
-                print(f"\n[Diag] STALL DETECTED — '{dominant}' on {n_sheep} sheep "
-                      f"for {self._stall_count} consecutive checks. "
-                      f"Aborting training early.")
-                return False
-        else:
-            self._stall_count  = 0
-            self._prev_dominant = key
-
-        return True
+def _draw_field(ax):
+    ax.set_xlim(-16, 16)
+    ax.set_ylim(-16, 16)
+    ax.set_aspect("equal")
+    ax.set_facecolor("#dcedc8")
+    ax.add_patch(mpatches.Rectangle((-15, -15), 30, 30,
+                 fill=False, edgecolor="#795548", lw=2))
+    ax.add_patch(mpatches.Rectangle((10, -15), 3, 7,
+                 facecolor="#ffe082", edgecolor="#795548", lw=2))
+    ax.text(11.5, -11.5, "pen", ha="center", va="center",
+            fontsize=8, color="#795548")
 
 
-# ---------------------------------------------------------------------------
-# Environment factory
-# ---------------------------------------------------------------------------
-
-def make_env(n_sheep: int, seed: int, max_steps: int, random_n_sheep: bool = False):
-    def _init():
-        env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
-                         random_n_sheep=random_n_sheep)
-        env.reset(seed=seed)
-        return env
-    return _init
+def _faded_path(ax, xs, ys, color, lw=1.5, label=None):
+    n = len(xs)
+    if n < 2:
+        return
+    points = np.array([xs, ys]).T.reshape(-1, 1, 2)
+    segs = np.concatenate([points[:-1], points[1:]], axis=1)
+    alphas = np.linspace(0.15, 1.0, len(segs))
+    colors = [(*matplotlib.colors.to_rgb(color), a) for a in alphas]
+    ax.add_collection(LineCollection(segs, colors=colors, linewidth=lw))
+    if label:
+        ax.plot([], [], color=color, lw=lw, label=label)
 
 
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
+def run_and_record(model, vn_template, n_sheep, max_steps,
+                   reward_cfg=None, seed=42):
+    """Run one deterministic episode and return its per-step history dict.
+
+    Normalisation stats are deep-copied from *vn_template* into a fresh,
+    non-training VecNormalize, which is closed even if the rollout raises.
+    """
+    raw = DummyVecEnv([make_env(n_sheep, seed, max_steps, reward_cfg)])
+    vn = VecNormalize(raw, norm_obs=True, norm_reward=False, training=False)
+    dog_xs, dog_ys, radii, action_mags, rewards = [], [], [], [], []
+    sheep_xs = [[] for _ in range(n_sheep)]
+    sheep_ys = [[] for _ in range(n_sheep)]
+    pen_dists = [[] for _ in range(n_sheep)]
+    penned_at = [None] * n_sheep  # 1-based step at which each sheep was first seen penned
+    done, step = False, 0
+
+    try:
+        vn.obs_rms = deepcopy(vn_template.obs_rms)
+        vn.ret_rms = deepcopy(vn_template.ret_rms)
+        obs = vn.reset()
+        inner = vn.envs[0]
+
+        while not done:
+            action, _ = model.predict(obs, deterministic=True)
+            obs, reward, dones, infos = vn.step(action)
+            done = dones[0]
+            step += 1
+            # NOTE(review): env state is read after vn.step(); if the vec env
+            # auto-resets on done, the final sample is post-reset - confirm.
+            dog_xs.append(float(inner.dog_pos[0]))
+            dog_ys.append(float(inner.dog_pos[1]))
+            _, radius, _ = inner._flock_stats()
+            radii.append(radius)
+            rewards.append(float(reward[0]))
+            action_mags.append(float(np.linalg.norm(action[0])))
+            for i in range(n_sheep):
+                sheep_xs[i].append(float(inner.sheep_pos[i][0]))
+                sheep_ys[i].append(float(inner.sheep_pos[i][1]))
+                pen_dists[i].append(
+                    float(np.linalg.norm(inner.sheep_pos[i] - inner.PEN_CENTER)))
+                if inner.penned[i] and penned_at[i] is None:
+                    penned_at[i] = step
+        n_penned = infos[0].get("n_penned", 0)
+    finally:
+        vn.close()  # release the env on success and on failure alike
+
+    return dict(
+        dog_xs=dog_xs, dog_ys=dog_ys,
+        sheep_xs=sheep_xs, sheep_ys=sheep_ys,
+        radii=radii, pen_dists=pen_dists,
+        action_mags=action_mags, rewards=rewards,
+        penned_at=penned_at,
+        n_penned=n_penned, n_sheep=n_sheep,
+        success=n_penned == n_sheep, steps=step,
+    )
+
+
+def plot_trajectory(hist, out_path):
+    """Save a field-view PNG of the recorded episode *hist* to *out_path*."""
+    fig, ax = plt.subplots(figsize=(7, 7))
+    _draw_field(ax)
+    for i in range(hist["n_sheep"]):
+        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
+        xs, ys, pa = hist["sheep_xs"][i], hist["sheep_ys"][i], hist["penned_at"][i]
+        _faded_path(ax, xs, ys, c, lw=1.2, label=f"sheep {i+1}")
+        ax.plot(xs[0], ys[0], "o", color=c, ms=7, zorder=4)  # first recorded sample
+        # penned_at is a 1-based step count; the sample taken at step k is index k-1
+        end = pa - 1 if pa is not None else -1
+        ax.plot(xs[end], ys[end], "*", color=c, ms=11, zorder=5)  # penned / last sample
+    dog_x, dog_y = hist["dog_xs"], hist["dog_ys"]
+    _faded_path(ax, dog_x, dog_y, DOG_COLOR, lw=2.0, label="dog")
+    ax.plot(dog_x[0], dog_y[0], "s", color=DOG_COLOR, ms=10, zorder=5)
+    ax.plot(dog_x[-1], dog_y[-1], "D", color=DOG_COLOR, ms=10, zorder=5)
+    result = ("SUCCESS" if hist["success"]
+              else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})")
+    ax.set_title(f"n={hist['n_sheep']}  {result}  {hist['steps']} steps",
+                 fontsize=12)
+    ax.legend(loc="upper left", fontsize=8)
+    plt.tight_layout()
+    fig.savefig(out_path, dpi=120)
+    plt.close(fig)
+
+
+def plot_timeseries(hist, out_path):
+    """Save a 4-panel per-step PNG (flock radius, pen distance, action, reward)."""
+    t = np.arange(1, hist["steps"] + 1)  # 1-based steps, matching the penned_at markers
+    fig, (ax_rad, ax_pen, ax_act, ax_rew) = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
+
+    ax_rad.plot(t, hist["radii"], color="steelblue")
+    ax_rad.axhline(5.0, color="orange", ls="--", lw=1, label="compact (5m)")
+    ax_rad.set_ylabel("flock radius (m)")
+    ax_rad.legend(fontsize=8)
+    ax_rad.set_title("Flock radius")
+
+    for i in range(hist["n_sheep"]):
+        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
+        ax_pen.plot(t, hist["pen_dists"][i], color=c, lw=1, label=f"sheep {i+1}")
+        if hist["penned_at"][i] is not None:  # dotted marker at the penning step
+            ax_pen.axvline(hist["penned_at"][i], color=c, ls=":", lw=1)
+    ax_pen.set_ylabel("dist to pen (m)")
+    ax_pen.legend(fontsize=7, ncol=min(hist["n_sheep"], 5))
+    ax_pen.set_title("Per-sheep distance to pen")
+
+    ax_act.plot(t, hist["action_mags"], color="tomato", lw=1)
+    ax_act.axhline(1.0, color="gray", ls="--", lw=1, label="max")
+    ax_act.set_ylabel("action ||(vx,vy)||")
+    ax_act.set_ylim(0, 1.5)
+    ax_act.set_title("Dog action magnitude")
+    ax_act.legend(fontsize=8)
+
+    ax_rew.plot(t, hist["rewards"], color="purple", lw=1, alpha=0.7)
+    ax_rew.axhline(0, color="black", lw=0.5)
+    ax_rew.set_ylabel("reward")
+    ax_rew.set_xlabel("step")
+    ax_rew.set_title("Reward per step")
+
+    result = ("SUCCESS" if hist["success"]
+              else f"FAIL ({hist['n_penned']}/{hist['n_sheep']})")
+    fig.suptitle(f"n_sheep={hist['n_sheep']}  {result}  {hist['steps']} steps",
+                 fontsize=13)
+    plt.tight_layout()
+    fig.savefig(out_path, dpi=120)
+    plt.close(fig)
+
+
+def plot_success_rate(stage_results, out_path):
+    """Save a bar chart of success rate (%) per sheep count to *out_path*."""
+    fig, ax = plt.subplots(figsize=(8, 4))
+    counts = [stage["n_sheep"] for stage in stage_results]
+    percents = [stage["sr"] * 100 for stage in stage_results]
+    rects = ax.bar(counts, percents, color="steelblue", edgecolor="white")
+    ax.set_xlabel("Sheep count")
+    ax.set_ylabel("Success rate (%)")
+    ax.set_ylim(0, 105)
+    ax.axhline(90, color="orange", ls="--", lw=1, label="90% target")
+    for rect, pct in zip(rects, percents):  # percentage label just above each bar
+        x_mid = rect.get_x() + rect.get_width() / 2
+        ax.text(x_mid, rect.get_height() + 1, f"{pct:.0f}%", ha="center", fontsize=9)
+    ax.legend()
+    ax.set_title("Evaluation success rate per sheep count")
+    plt.tight_layout()
+    fig.savefig(out_path, dpi=120)
+    plt.close(fig)
+
+
+# ── CLI ──────────────────────────────────────────────────────────────────────
+
+DEFAULT_CONFIG = {  # base settings; main() overlays the --config JSON on a copy of this
+    "W_PER_SHEEP": 2.0,
+    "W_ALIGN": 0.05,
+    "W_PEN_BONUS": 10.0,
+    "W_COMPLETE": 100.0,
+    "W_STEP_COST": 0.02,
+    "W_COMPACT": 0.0,
+    "W_WALL_TOUCH": 0.15,
+    "WALL_TOUCH_BUFFER": 0.8,
+    "ALIGN_SHAPE": "standoff",
+    "ALIGN_GATED": True,
+    "ENTRY_AWARE": False,
+    "ent_coef": 0.02,  # read by main() and passed to PPO(ent_coef=...)
+}
+
 
 def parse_args():
-    p = argparse.ArgumentParser()
-    p.add_argument("--n-sheep",          type=int,   default=1,
-                   help="Starting sheep count")
-    p.add_argument("--max-sheep",        type=int,   default=5,
-                   help="Final sheep count for curriculum")
-    p.add_argument("--n-envs",           type=int,   default=8,
-                   help="Parallel training environments")
-    p.add_argument("--total-steps",      type=int,   default=5_000_000)
-    p.add_argument("--max-steps",        type=int,   default=2000,
-                   help="Episode step limit")
-    p.add_argument("--curriculum",       action="store_true",
-                   help="Enable curriculum advancement")
-    p.add_argument("--steps-per-stage",  type=int,   default=None,
-                   help="Advance curriculum every N steps (overrides --threshold)")
-    p.add_argument("--threshold",        type=float, default=0.75,
-                   help="Success-rate threshold to advance (used without --steps-per-stage)")
-    p.add_argument("--resume",           type=str,   default=None,
-                   help="Checkpoint .zip to resume from")
-    p.add_argument("--run-dir",          type=str,   default="runs/ppo_herding")
-    p.add_argument("--save-freq",        type=int,   default=100_000)
-    p.add_argument("--eval-freq",        type=int,   default=50_000)
-    p.add_argument("--eval-eps",         type=int,   default=20)
-    p.add_argument("--diag-freq",        type=int,   default=500_000,
-                   help="Run failure-mode diagnostics every N env steps")
-    p.add_argument("--no-stall-abort",   action="store_true",
-                   help="Disable early-abort on stall — run full --total-steps "
-                        "for diagnostics")
-    p.add_argument("--mixed",            action="store_true",
-                   help="Randomise n_sheep each episode (consolidation pass, "
-                        "use with --resume after curriculum training)")
+    p = argparse.ArgumentParser(
+        description="PPO training for herding task with curriculum learning")
+    p.add_argument("--config", type=str, default=None,
+                   help="JSON config file (reward weights + ent_coef)")
+    p.add_argument("--max-sheep", type=int, default=10)  # last stage of the 1..N curriculum
+    p.add_argument("--steps-per-stage", type=int, default=1_500_000)  # learn() budget per stage
+    p.add_argument("--n-envs", type=int, default=8)  # number of SubprocVecEnv workers
+    p.add_argument("--max-steps", type=int, default=2500)  # forwarded to make_env(max_steps=...)
+    p.add_argument("--eval-episodes", type=int, default=30)  # episodes per stage evaluation
+    p.add_argument("--run-dir", type=str, default=None)  # None -> main() uses runs/<timestamp>
     return p.parse_args()
 
 
+# ── Main ─────────────────────────────────────────────────────────────────────
+
 def main():
+    """Train PPO through the 1..max_sheep curriculum, evaluating and plotting every stage."""
     args = parse_args()
-    os.makedirs(args.run_dir, exist_ok=True)
-    ckpt_dir  = os.path.join(args.run_dir, "checkpoints")
-    best_dir  = os.path.join(args.run_dir, "best_model")
-    norm_path = os.path.join(args.run_dir, "vecnorm.pkl")
-    os.makedirs(ckpt_dir, exist_ok=True)
+
+    cfg = dict(DEFAULT_CONFIG)  # defaults first; a --config file overrides them
+    if args.config:
+        with open(args.config, encoding="utf-8") as f:
+            cfg.update(json.load(f))
+    # Reward settings: only the keys that exist as attributes on HerdingEnv
+    rcfg = {k: v for k, v in cfg.items() if hasattr(HerdingEnv, k)}
+
+    # Run directory
+    run_dir = args.run_dir or os.path.join(
+        "runs", time.strftime("%Y%m%d_%H%M%S"))
+    eval_dir = os.path.join(run_dir, "eval")
+    os.makedirs(eval_dir, exist_ok=True)
+    with open(os.path.join(run_dir, "config.json"), "w") as f:
+        json.dump(cfg, f, indent=2)
+
+    print(f"Config: {cfg}")
+    print(f"Run dir: {run_dir}")
+    print(f"Curriculum: 1 → {args.max_sheep} sheep, "
+          f"{args.steps_per_stage:,} steps/stage\n")
 
     # Training envs
     train_env = SubprocVecEnv([
-        make_env(args.n_sheep, seed=i, max_steps=args.max_steps,
-                 random_n_sheep=args.mixed)
+        make_env(1, seed=i, max_steps=args.max_steps, reward_cfg=rcfg)
         for i in range(args.n_envs)
     ])
-    if args.resume and os.path.exists(norm_path):
-        train_env = VecNormalize.load(norm_path, train_env)
-        train_env.training    = True
-        train_env.norm_reward = True
-    else:
-        train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True,
-                                 clip_obs=10.0)
-
-    # Eval env — starts at same difficulty, advances with curriculum callback
-    eval_env = SubprocVecEnv([
-        make_env(args.n_sheep, seed=1000 + i, max_steps=args.max_steps)
-        for i in range(2)
-    ])
-    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=False,
-                            clip_obs=10.0, training=False)
-
-    # Callbacks
-    checkpoint_cb = CheckpointCallback(
-        save_freq=max(args.save_freq // args.n_envs, 1),
-        save_path=ckpt_dir,
-        name_prefix="ckpt",
-        save_vecnormalize=True,
-    )
-    eval_cb = EvalCallback(
-        eval_env,
-        best_model_save_path=best_dir,
-        log_path=args.run_dir,
-        eval_freq=max(args.eval_freq // args.n_envs, 1),
-        n_eval_episodes=args.eval_eps,
-        deterministic=True,
-        verbose=1,
-    )
-    diag_cb = DiagnosticCallback(
-        diag_freq=args.diag_freq,
-        n_episodes=20,
-        max_steps=args.max_steps,
-        abort_on_stall=not args.no_stall_abort,
-    )
-    callbacks = [checkpoint_cb, eval_cb, diag_cb]
-
-    if args.curriculum:
-        cur_cb = CurriculumCallback(
-            start_sheep=args.n_sheep,
-            max_sheep=args.max_sheep,
-            eval_env=eval_env,
-            steps_per_stage=args.steps_per_stage,
-            threshold=args.threshold,
-        )
-        callbacks.append(cur_cb)
-
-    callback_list = CallbackList(callbacks)
+    vn = VecNormalize(train_env, norm_obs=True, norm_reward=True,
+                      clip_obs=10.0)
 
     # Model
-    ppo_kwargs = dict(
-        policy          = "MlpPolicy",
-        env             = train_env,
-        learning_rate   = 3e-4,
-        n_steps         = 2048,
-        batch_size      = 256,
-        n_epochs        = 10,
-        gamma           = 0.995,
-        gae_lambda      = 0.95,
-        clip_range      = 0.2,
-        ent_coef        = 0.01,
-        vf_coef         = 0.5,
-        max_grad_norm   = 0.5,
-        policy_kwargs   = dict(net_arch=[256, 256]),
-        tensorboard_log = args.run_dir,
-        verbose         = 1,
+    model = PPO(
+        "MlpPolicy", vn,
+        learning_rate=3e-4, n_steps=2048, batch_size=256, n_epochs=10,
+        gamma=0.995, gae_lambda=0.95, clip_range=0.2,
+        ent_coef=cfg.get("ent_coef", 0.02), vf_coef=0.5, max_grad_norm=0.5,
+        policy_kwargs=dict(net_arch=[256, 256]),
+        verbose=0,
     )
 
-    if args.resume:
-        print(f"Resuming from {args.resume}")
-        model = PPO.load(args.resume, env=train_env, **{
-            k: v for k, v in ppo_kwargs.items()
-            if k not in ("policy", "env")
-        })
-    else:
-        model = PPO(**ppo_kwargs)
+    # Curriculum training
+    stage_results = []
+    t0 = time.time()
 
-    model.learn(
-        total_timesteps=args.total_steps,
-        callback=callback_list,
-        reset_num_timesteps=args.resume is None,
-        tb_log_name="ppo",
-    )
+    try:
+        for n in range(1, args.max_sheep + 1):
+            if n > 1:
+                vn.env_method("set_n_sheep", n)
 
-    model.save(os.path.join(args.run_dir, "final_model"))
-    train_env.save(norm_path)
-    print(f"\nTraining complete. Artefacts saved to {args.run_dir}/")
+            print(f"\n[Stage n_sheep={n}] training {args.steps_per_stage:,} steps")
+            model.learn(
+                total_timesteps=args.steps_per_stage,
+                reset_num_timesteps=(n == 1),  # reset the counter only on the first stage
+                callback=ProgressCallback(f"{n} sheep", freq=100_000),
+            )
+
+            # Evaluate
+            print(f"[Stage n_sheep={n}] evaluating {args.eval_episodes} eps")
+            r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
+            print(f"[Stage n_sheep={n}] sr={r['sr']*100:.0f}%  "
+                  f"mean_len={r['mean_len']:.0f}  "
+                  f"mean_min_pen={r['mean_min_pen']:.1f}m  "
+                  f"mean_act={r['mean_act']:.2f}")
+
+            # Failure-mode breakdown
+            if r["failure_modes"]:
+                modes = "  ".join(
+                    f"{k}={v}" for k, v in sorted(
+                        r["failure_modes"].items(), key=lambda x: -x[1]))
+                print(f"  failure modes: {modes}")
+
+            # Reward breakdown
+            if "reward_per_step" in r:
+                rps = r["reward_per_step"]
+                print(f"  reward/step: " + "  ".join(
+                    f"{k}={v:+.4f}" for k, v in rps.items()))
+
+            # Episode visualization
+            hist = run_and_record(model, vn, n, args.max_steps, rcfg,
+                                  seed=1000 + n)
+            tag = "success" if hist["success"] else "fail"
+            plot_trajectory(
+                hist,
+                os.path.join(eval_dir, f"traj_{n}s_{tag}.png"))
+            plot_timeseries(
+                hist,
+                os.path.join(eval_dir, f"ts_{n}s_{tag}.png"))
+
+            r["n_sheep"] = n
+            stage_results.append(r)
+
+        # Save artefacts
+        model.save(os.path.join(run_dir, "final_model"))
+        vn.save(os.path.join(run_dir, "vecnorm.pkl"))
+        with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
+            json.dump(stage_results, f, indent=2)
+
+    finally:
+        try:
+            vn.close()
+        except Exception as exc:  # best-effort cleanup: report it, never mask the real error
+            print(f"[warn] could not close training envs cleanly: {exc!r}")
+
+    # Summary
+    elapsed = (time.time() - t0) / 60
+    print("\n" + "=" * 70)
+    print("  TRAINING SUMMARY")
+    print("=" * 70)
+    for r in stage_results:
+        print(f"  n_sheep={r['n_sheep']}  sr={r['sr']*100:>3.0f}%  "
+              f"len={r['mean_len']:>5.0f}  min_pen={r['mean_min_pen']:>5.1f}m  "
+              f"act={r['mean_act']:.2f}")
+    print(f"\n  Total time: {elapsed:.1f} min")
+    print(f"  Artefacts:  {run_dir}/")
+
+    plot_success_rate(stage_results, os.path.join(run_dir, "success_rate.png"))
+    print(f"  Plots:      {run_dir}/success_rate.png, {eval_dir}/")
 
 
 if __name__ == "__main__":
diff --git a/training/vis_final_10sheep/episode.gif b/training/vis_final_10sheep/episode.gif
deleted file mode 100644
index dbe4cd6..0000000
Binary files a/training/vis_final_10sheep/episode.gif and /dev/null differ
diff --git a/training/vis_final_10sheep/timeseries.png b/training/vis_final_10sheep/timeseries.png
deleted file mode 100644
index ae80df5..0000000
Binary files a/training/vis_final_10sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_final_10sheep/trajectory.png b/training/vis_final_10sheep/trajectory.png
deleted file mode 100644
index 2839c5b..0000000
Binary files a/training/vis_final_10sheep/trajectory.png and /dev/null differ
diff --git a/training/vis_final_1sheep/episode.gif b/training/vis_final_1sheep/episode.gif
deleted file mode 100644
index 47a9aa2..0000000
Binary files a/training/vis_final_1sheep/episode.gif and /dev/null differ
diff --git a/training/vis_final_1sheep/timeseries.png b/training/vis_final_1sheep/timeseries.png
deleted file mode 100644
index 7f5e026..0000000
Binary files a/training/vis_final_1sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_final_1sheep/trajectory.png b/training/vis_final_1sheep/trajectory.png
deleted file mode 100644
index de47bb5..0000000
Binary files a/training/vis_final_1sheep/trajectory.png and /dev/null differ
diff --git a/training/vis_final_5sheep/episode.gif b/training/vis_final_5sheep/episode.gif
deleted file mode 100644
index 4690c8f..0000000
Binary files a/training/vis_final_5sheep/episode.gif and /dev/null differ
diff --git a/training/vis_final_5sheep/timeseries.png b/training/vis_final_5sheep/timeseries.png
deleted file mode 100644
index 23dcde3..0000000
Binary files a/training/vis_final_5sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_final_5sheep/trajectory.png b/training/vis_final_5sheep/trajectory.png
deleted file mode 100644
index 2880d46..0000000
Binary files a/training/vis_final_5sheep/trajectory.png and /dev/null differ
diff --git a/training/vis_final_v2_10sheep/episode.gif b/training/vis_final_v2_10sheep/episode.gif
deleted file mode 100644
index 4daecf4..0000000
Binary files a/training/vis_final_v2_10sheep/episode.gif and /dev/null differ
diff --git a/training/vis_final_v2_10sheep/timeseries.png b/training/vis_final_v2_10sheep/timeseries.png
deleted file mode 100644
index 7ebaff5..0000000
Binary files a/training/vis_final_v2_10sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_final_v2_10sheep/trajectory.png b/training/vis_final_v2_10sheep/trajectory.png
deleted file mode 100644
index 46254d7..0000000
Binary files a/training/vis_final_v2_10sheep/trajectory.png and /dev/null differ
diff --git a/training/vis_final_v2_1sheep/episode.gif b/training/vis_final_v2_1sheep/episode.gif
deleted file mode 100644
index 2f10452..0000000
Binary files a/training/vis_final_v2_1sheep/episode.gif and /dev/null differ
diff --git a/training/vis_final_v2_1sheep/timeseries.png b/training/vis_final_v2_1sheep/timeseries.png
deleted file mode 100644
index 09a2634..0000000
Binary files a/training/vis_final_v2_1sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_final_v2_1sheep/trajectory.png b/training/vis_final_v2_1sheep/trajectory.png
deleted file mode 100644
index b955810..0000000
Binary files a/training/vis_final_v2_1sheep/trajectory.png and /dev/null differ
diff --git a/training/vis_final_v2_3sheep/episode.gif b/training/vis_final_v2_3sheep/episode.gif
deleted file mode 100644
index 02010c5..0000000
Binary files a/training/vis_final_v2_3sheep/episode.gif and /dev/null differ
diff --git a/training/vis_final_v2_3sheep/timeseries.png b/training/vis_final_v2_3sheep/timeseries.png
deleted file mode 100644
index f5bafb9..0000000
Binary files a/training/vis_final_v2_3sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_final_v2_3sheep/trajectory.png b/training/vis_final_v2_3sheep/trajectory.png
deleted file mode 100644
index a505c3d..0000000
Binary files a/training/vis_final_v2_3sheep/trajectory.png and /dev/null differ
diff --git a/training/vis_final_v2_5sheep/episode.gif b/training/vis_final_v2_5sheep/episode.gif
deleted file mode 100644
index 61ed892..0000000
Binary files a/training/vis_final_v2_5sheep/episode.gif and /dev/null differ
diff --git a/training/vis_final_v2_5sheep/timeseries.png b/training/vis_final_v2_5sheep/timeseries.png
deleted file mode 100644
index 36f4810..0000000
Binary files a/training/vis_final_v2_5sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_final_v2_5sheep/trajectory.png b/training/vis_final_v2_5sheep/trajectory.png
deleted file mode 100644
index 02c6430..0000000
Binary files a/training/vis_final_v2_5sheep/trajectory.png and /dev/null differ
diff --git a/training/vis_final_v2_7sheep/episode.gif b/training/vis_final_v2_7sheep/episode.gif
deleted file mode 100644
index bfe678b..0000000
Binary files a/training/vis_final_v2_7sheep/episode.gif and /dev/null differ
diff --git a/training/vis_final_v2_7sheep/timeseries.png b/training/vis_final_v2_7sheep/timeseries.png
deleted file mode 100644
index 623a8c1..0000000
Binary files a/training/vis_final_v2_7sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_final_v2_7sheep/trajectory.png b/training/vis_final_v2_7sheep/trajectory.png
deleted file mode 100644
index 6e8c92d..0000000
Binary files a/training/vis_final_v2_7sheep/trajectory.png and /dev/null differ
diff --git a/training/vis_random/episode.gif b/training/vis_random/episode.gif
deleted file mode 100644
index e2ba22a..0000000
Binary files a/training/vis_random/episode.gif and /dev/null differ
diff --git a/training/vis_random/timeseries.png b/training/vis_random/timeseries.png
deleted file mode 100644
index 98f2822..0000000
Binary files a/training/vis_random/timeseries.png and /dev/null differ
diff --git a/training/vis_random/trajectory.png b/training/vis_random/trajectory.png
deleted file mode 100644
index 19dbde1..0000000
Binary files a/training/vis_random/trajectory.png and /dev/null differ
diff --git a/training/vis_trained_1sheep/episode.gif b/training/vis_trained_1sheep/episode.gif
deleted file mode 100644
index 4d4af6c..0000000
Binary files a/training/vis_trained_1sheep/episode.gif and /dev/null differ
diff --git a/training/vis_trained_1sheep/timeseries.png b/training/vis_trained_1sheep/timeseries.png
deleted file mode 100644
index ce46af7..0000000
Binary files a/training/vis_trained_1sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_trained_1sheep/trajectory.png b/training/vis_trained_1sheep/trajectory.png
deleted file mode 100644
index 1ea89ff..0000000
Binary files a/training/vis_trained_1sheep/trajectory.png and /dev/null differ
diff --git a/training/vis_trained_3sheep/episode.gif b/training/vis_trained_3sheep/episode.gif
deleted file mode 100644
index e902074..0000000
Binary files a/training/vis_trained_3sheep/episode.gif and /dev/null differ
diff --git a/training/vis_trained_3sheep/timeseries.png b/training/vis_trained_3sheep/timeseries.png
deleted file mode 100644
index ba7685f..0000000
Binary files a/training/vis_trained_3sheep/timeseries.png and /dev/null differ
diff --git a/training/vis_trained_3sheep/trajectory.png b/training/vis_trained_3sheep/trajectory.png
deleted file mode 100644
index 58dbe70..0000000
Binary files a/training/vis_trained_3sheep/trajectory.png and /dev/null differ
diff --git a/training/visualize.py b/training/visualize.py
deleted file mode 100644
index ea616e9..0000000
--- a/training/visualize.py
+++ /dev/null
@@ -1,316 +0,0 @@
-"""
-Single-episode visualization for the herding policy.
-
-Outputs (all saved to --out-dir):
-  trajectory.png  — full field view: dog path + every sheep path
-  timeseries.png  — radius, per-sheep pen distance, action magnitude, reward
-  episode.gif     — animated replay (slow enough to read)
-
-Run with no model to watch a RANDOM policy (useful baseline):
-  python visualize.py --random --n-sheep 3 --out-dir vis_random/
-
-Usage:
-  python visualize.py \\
-      --model runs/ppo_consolidation/final_model.zip \\
-      --vecnorm runs/ppo_consolidation/vecnorm.pkl \\
-      --n-sheep 3 --out-dir vis_out/
-"""
-
-import argparse
-import os
-import math
-import numpy as np
-import matplotlib
-matplotlib.use("Agg")
-import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
-import matplotlib.animation as animation
-from matplotlib.collections import LineCollection
-from stable_baselines3 import PPO
-from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
-from herding_env import HerdingEnv
-
-
-# ── colours ──────────────────────────────────────────────────────────────────
-SHEEP_COLORS = [
-    "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00",
-    "#a65628", "#f781bf", "#999999", "#66c2a5", "#fc8d62",
-]
-DOG_COLOR   = "#4e342e"
-PEN_COLOR   = "#ffe082"
-FIELD_COLOR = "#dcedc8"
-
-
-def make_env(n_sheep, max_steps, seed=42):
-    def _init():
-        env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps)
-        env.reset(seed=seed)
-        return env
-    return _init
-
-
-def run_episode(model, env, n_sheep, max_steps):
-    """Run one deterministic episode; return recorded history."""
-    obs      = env.reset()
-    inner    = env.envs[0]
-    done     = False
-
-    dog_xs, dog_ys   = [], []
-    sheep_xs         = [[] for _ in range(n_sheep)]
-    sheep_ys         = [[] for _ in range(n_sheep)]
-    radii            = []
-    pen_dists        = [[] for _ in range(n_sheep)]
-    action_mags      = []
-    rewards          = []
-    penned_at        = [None] * n_sheep   # step when each sheep was penned
-
-    step = 0
-    while not done:
-        if model is None:
-            action = env.action_space.sample()[np.newaxis]
-        else:
-            action, _ = model.predict(obs, deterministic=True)
-
-        obs, reward, dones, infos = env.step(action)
-        done = dones[0]
-        step += 1
-
-        dx, dy = float(inner.dog_pos[0]), float(inner.dog_pos[1])
-        dog_xs.append(dx); dog_ys.append(dy)
-
-        com, radius, _ = inner._flock_stats()
-        radii.append(radius)
-        rewards.append(float(reward[0]))
-
-        act = action[0]
-        action_mags.append(float(np.linalg.norm(act)))
-
-        for i in range(n_sheep):
-            sx, sy = float(inner.sheep_pos[i][0]), float(inner.sheep_pos[i][1])
-            sheep_xs[i].append(sx)
-            sheep_ys[i].append(sy)
-            pen_dists[i].append(float(np.linalg.norm(inner.sheep_pos[i] - inner.PEN_CENTER)))
-            if inner.penned[i] and penned_at[i] is None:
-                penned_at[i] = step
-
-    info = infos[0]
-    n_penned = info.get("n_penned", 0)
-    success  = n_penned == n_sheep
-
-    return dict(
-        dog_xs=dog_xs, dog_ys=dog_ys,
-        sheep_xs=sheep_xs, sheep_ys=sheep_ys,
-        radii=radii, pen_dists=pen_dists,
-        action_mags=action_mags, rewards=rewards,
-        penned_at=penned_at,
-        n_penned=n_penned, n_sheep=n_sheep,
-        success=success, steps=step,
-    )
-
-
-# ── plot helpers ─────────────────────────────────────────────────────────────
-
-def draw_field(ax):
-    ax.set_xlim(-16, 16); ax.set_ylim(-16, 16)
-    ax.set_aspect("equal"); ax.set_facecolor(FIELD_COLOR)
-    ax.add_patch(mpatches.Rectangle((-15,-15), 30, 30,
-                 fill=False, edgecolor="#795548", lw=2))
-    ax.add_patch(mpatches.Rectangle((10,-15), 3, 7,
-                 facecolor=PEN_COLOR, edgecolor="#795548", lw=2))
-    ax.text(11.5, -11.5, "pen", ha="center", va="center",
-            fontsize=8, color="#795548")
-
-
-def faded_path(ax, xs, ys, color, lw=1.5, label=None):
-    """Draw a path with alpha fading from start (transparent) to end (opaque)."""
-    n = len(xs)
-    if n < 2:
-        return
-    points  = np.array([xs, ys]).T.reshape(-1, 1, 2)
-    segs    = np.concatenate([points[:-1], points[1:]], axis=1)
-    alphas  = np.linspace(0.15, 1.0, len(segs))
-    colors  = [(*matplotlib.colors.to_rgb(color), a) for a in alphas]
-    lc = LineCollection(segs, colors=colors, linewidth=lw)
-    ax.add_collection(lc)
-    if label:
-        ax.plot([], [], color=color, lw=lw, label=label)
-
-
-# ── main plots ────────────────────────────────────────────────────────────────
-
-def plot_trajectory(hist, out_path):
-    fig, ax = plt.subplots(figsize=(7, 7))
-    draw_field(ax)
-
-    # Sheep paths
-    for i in range(hist["n_sheep"]):
-        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
-        xs, ys = hist["sheep_xs"][i], hist["sheep_ys"][i]
-        faded_path(ax, xs, ys, c, lw=1.2, label=f"sheep {i+1}")
-        ax.plot(xs[0], ys[0], "o", color=c, ms=7, zorder=4)
-        pa = hist["penned_at"][i]
-        end = pa if pa is not None else -1
-        ax.plot(xs[end], ys[end], "*", color=c, ms=11, zorder=5)
-
-    # Dog path
-    faded_path(ax, hist["dog_xs"], hist["dog_ys"], DOG_COLOR, lw=2.0, label="dog")
-    ax.plot(hist["dog_xs"][0], hist["dog_ys"][0], "s", color=DOG_COLOR, ms=10, zorder=5)
-    ax.plot(hist["dog_xs"][-1], hist["dog_ys"][-1], "D", color=DOG_COLOR, ms=10, zorder=5)
-
-    result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']} penned)"
-    ax.set_title(f"Trajectory — {result} — {hist['steps']} steps", fontsize=12)
-    ax.legend(loc="upper left", fontsize=8)
-    plt.tight_layout()
-    fig.savefig(out_path, dpi=120)
-    plt.close(fig)
-    print(f"  saved {out_path}")
-
-
-def plot_timeseries(hist, out_path):
-    t      = np.arange(hist["steps"])
-    fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
-
-    # 1. Flock radius
-    axes[0].plot(t, hist["radii"], color="steelblue")
-    axes[0].axhline(5.0, color="orange", ls="--", lw=1, label="compact threshold (5m)")
-    axes[0].set_ylabel("flock radius (m)")
-    axes[0].legend(fontsize=8)
-    axes[0].set_title("Flock radius — goal: get below 5m")
-
-    # 2. Per-sheep distance to pen
-    for i in range(hist["n_sheep"]):
-        c = SHEEP_COLORS[i % len(SHEEP_COLORS)]
-        axes[1].plot(t, hist["pen_dists"][i], color=c, lw=1, label=f"sheep {i+1}")
-        pa = hist["penned_at"][i]
-        if pa is not None:
-            axes[1].axvline(pa, color=c, ls=":", lw=1)
-    axes[1].set_ylabel("dist to pen (m)")
-    axes[1].legend(fontsize=7, ncol=min(hist["n_sheep"], 5))
-    axes[1].set_title("Per-sheep distance to pen — goal: all reach 0")
-
-    # 3. Action magnitude (how fast dog is moving)
-    axes[2].plot(t, hist["action_mags"], color="tomato", lw=1)
-    axes[2].axhline(1.0, color="gray", ls="--", lw=1, label="max")
-    axes[2].set_ylabel("action ||(vx,vy)||")
-    axes[2].set_ylim(0, 1.5)
-    axes[2].set_title("Dog action magnitude — 0=stopped, 1=full speed")
-    axes[2].legend(fontsize=8)
-
-    # 4. Reward per step
-    axes[3].plot(t, hist["rewards"], color="purple", lw=1, alpha=0.7)
-    axes[3].axhline(0, color="black", lw=0.5)
-    axes[3].set_ylabel("reward")
-    axes[3].set_xlabel("step")
-    axes[3].set_title("Reward per step")
-
-    result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']} penned)"
-    fig.suptitle(f"n_sheep={hist['n_sheep']}  {result}  {hist['steps']} steps", fontsize=13)
-    plt.tight_layout()
-    fig.savefig(out_path, dpi=120)
-    plt.close(fig)
-    print(f"  saved {out_path}")
-
-
-def save_gif(hist, out_path, fps=15, skip=5):
-    """Animated replay, every `skip` steps."""
-    n    = hist["n_sheep"]
-    idxs = list(range(0, hist["steps"], skip))
-
-    fig, ax = plt.subplots(figsize=(6, 6))
-
-    def _frame(k):
-        ax.clear()
-        draw_field(ax)
-        t = idxs[k]
-
-        for i in range(n):
-            c  = SHEEP_COLORS[i % len(SHEEP_COLORS)]
-            s0 = max(0, t - 30)
-            ax.plot(hist["sheep_xs"][i][s0:t+1],
-                    hist["sheep_ys"][i][s0:t+1],
-                    color=c, lw=0.8, alpha=0.5)
-            color = "#ff69b4" if (hist["penned_at"][i] is not None
-                                   and t >= hist["penned_at"][i]) else c
-            ax.plot(hist["sheep_xs"][i][t], hist["sheep_ys"][i][t],
-                    "o", color=color, ms=10, zorder=4,
-                    markeredgecolor="#555", markeredgewidth=1)
-
-        s0 = max(0, t - 30)
-        ax.plot(hist["dog_xs"][s0:t+1], hist["dog_ys"][s0:t+1],
-                color=DOG_COLOR, lw=1.5, alpha=0.6)
-        ax.plot(hist["dog_xs"][t], hist["dog_ys"][t],
-                "s", color=DOG_COLOR, ms=13, zorder=5,
-                markeredgecolor="black", markeredgewidth=1.5)
-
-        r = hist["radii"][t]
-        ax.set_title(f"step {t}/{hist['steps']}  radius={r:.1f}m  "
-                     f"penned={hist['n_penned'] if t==hist['steps']-1 else '?'}/{n}",
-                     fontsize=10)
-
-    ani = animation.FuncAnimation(fig, _frame, frames=len(idxs), interval=1000//fps)
-    ani.save(out_path, writer="pillow", fps=fps)
-    plt.close(fig)
-    print(f"  saved {out_path}")
-
-
-# ── entry point ───────────────────────────────────────────────────────────────
-
-def parse_args():
-    p = argparse.ArgumentParser()
-    p.add_argument("--model",     default=None, help="Model .zip (omit for random policy)")
-    p.add_argument("--vecnorm",   default=None)
-    p.add_argument("--n-sheep",   type=int, default=3)
-    p.add_argument("--max-steps", type=int, default=2000)
-    p.add_argument("--seed",      type=int, default=42)
-    p.add_argument("--out-dir",   default="vis_out")
-    p.add_argument("--random",    action="store_true",
-                   help="Use random policy (baseline comparison)")
-    p.add_argument("--gif-fps",   type=int, default=15)
-    p.add_argument("--gif-skip",  type=int, default=5,
-                   help="Render every Nth step in the GIF")
-    p.add_argument("--no-gif",    action="store_true")
-    return p.parse_args()
-
-
-def main():
-    args = parse_args()
-    os.makedirs(args.out_dir, exist_ok=True)
-
-    raw = DummyVecEnv([make_env(args.n_sheep, args.max_steps, args.seed)])
-
-    if args.random or args.model is None:
-        print("Using RANDOM policy")
-        env   = raw
-        model = None
-    else:
-        if args.vecnorm:
-            env = VecNormalize.load(args.vecnorm, raw)
-            env.training    = False
-            env.norm_reward = False
-        else:
-            env = raw
-        model = PPO.load(args.model, env=env)
-        print(f"Loaded model: {args.model}")
-
-    print(f"Running episode  n_sheep={args.n_sheep}  seed={args.seed} ...")
-    hist = run_episode(model, env, args.n_sheep, args.max_steps)
-
-    result = "SUCCESS" if hist["success"] else f"FAIL ({hist['n_penned']}/{hist['n_sheep']} penned)"
-    print(f"Episode done: {result}  steps={hist['steps']}")
-    print(f"  min radius : {min(hist['radii']):.2f} m")
-    print(f"  mean reward: {np.mean(hist['rewards']):.4f}")
-    print(f"  mean action: {np.mean(hist['action_mags']):.3f}")
-
-    env.close()
-
-    plot_trajectory(hist, os.path.join(args.out_dir, "trajectory.png"))
-    plot_timeseries(hist, os.path.join(args.out_dir, "timeseries.png"))
-    if not args.no_gif:
-        save_gif(hist, os.path.join(args.out_dir, "episode.gif"),
-                 fps=args.gif_fps, skip=args.gif_skip)
-
-    print(f"\nAll outputs saved to {args.out_dir}/")
-
-
-if __name__ == "__main__":
-    main()