RL training ready to test

2026-04-22 23:34:58 +01:00
parent f256e99a76
commit 00eaf47d1f
5 changed files with 682 additions and 2 deletions
@@ -0,0 +1,143 @@
 """
 Evaluation script for a trained herding policy.
 Runs N episodes and reports the three project metrics:
  1. Success rate       — fraction of episodes where all sheep are penned
  2. Time-to-pen        — mean steps across successful episodes (per sheep)
  3. Flock dispersion   — mean pairwise distance among active sheep, averaged
                          over all timesteps (lower = tighter herding)
 Usage
 -----
    python evaluate.py --model runs/ppo_herding/best_model/best_model.zip \
                       --vecnorm runs/ppo_herding/vecnorm.pkl \
                       --n-sheep 5 --episodes 100
 Add --render to watch the first episode in a matplotlib window.
 """
 import argparse
 import numpy as np
 from stable_baselines3 import PPO
 from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
 from herding_env import HerdingEnv
 def make_single_env(n_sheep: int, max_steps: int, render_mode: str = None):
    """Return a zero-argument factory that builds one HerdingEnv.

    The factory form is what DummyVecEnv expects: the environment is only
    constructed when the returned callable is invoked.
    """
    env_kwargs = dict(n_sheep=n_sheep, max_steps=max_steps,
                      render_mode=render_mode)
    def _build():
        # Construction is deferred until the vectorised wrapper calls us.
        return HerdingEnv(**env_kwargs)
    return _build
 def pairwise_mean(positions: np.ndarray, n_active: int) -> float:
    """Mean pairwise distance among the first n_active sheep.

    Parameters
    ----------
    positions : array whose leading axis indexes sheep; each row is one
        sheep's coordinates.
    n_active : number of leading rows to include.

    Returns
    -------
    The mean Euclidean distance over all unordered pairs of the first
    ``n_active`` rows, or 0.0 when fewer than two rows are requested
    (there is no pair to measure).

    The distances are computed in one vectorised NumPy expression and
    accumulated in float64, so float32 inputs do not lose precision.
    """
    if n_active < 2:
        return 0.0
    pts = np.asarray(positions[:n_active], dtype=np.float64)
    # Index pairs (i, j) with i < j — every unordered pair exactly once.
    first, second = np.triu_indices(n_active, k=1)
    diffs = pts[first] - pts[second]
    # Flatten any trailing coordinate axes so the norm is taken per pair.
    gaps = np.linalg.norm(diffs.reshape(len(first), -1), axis=1)
    return float(gaps.mean())
 def parse_args():
    """Build the evaluation CLI and parse ``sys.argv``.

    Options are declared in a table and registered in order, so the
    generated ``--help`` output lists them in the same sequence.
    """
    option_specs = [
        ("--model", dict(required=True,
                         help="Path to saved model .zip")),
        ("--vecnorm", dict(default=None,
                           help="Path to VecNormalize stats .pkl (optional)")),
        ("--n-sheep", dict(type=int, default=1)),
        ("--episodes", dict(type=int, default=50)),
        ("--max-steps", dict(type=int, default=2000)),
        ("--render", dict(action="store_true",
                          help="Render first episode in matplotlib")),
        ("--seed", dict(type=int, default=42)),
    ]
    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
 def main():
    """Evaluate a saved PPO policy and print the three project metrics.

    Runs ``--episodes`` deterministic episodes on a single HerdingEnv and
    reports success rate, time-to-pen (steps per sheep, successful
    episodes only) and mean flock dispersion.

    Notes
    -----
    * ``--seed`` is applied to the vectorised env before the first reset
      so runs are reproducible.
    * DummyVecEnv resets the wrapped env as soon as an episode ends, so
      the env state visible after the final step already belongs to the
      next episode; no dispersion sample is taken on that step.
    * With ``--render`` only the first episode is drawn, as the help text
      promises; rendering is switched off afterwards.
    """
    args = parse_args()
    render_mode = "human" if args.render else None
    raw_env = DummyVecEnv([make_single_env(args.n_sheep, args.max_steps,
                                           render_mode)])
    if args.vecnorm:
        env = VecNormalize.load(args.vecnorm, raw_env)
        # Evaluation mode: training flag off, rewards left unnormalised.
        env.training  = False
        env.norm_reward = False
    else:
        env = raw_env
    # Seed the env for the first reset; previously --seed was parsed but unused.
    env.seed(args.seed)
    model = PPO.load(args.model, env=env)
    # Direct handle on the HerdingEnv, used to read sheep positions.
    inner = raw_env.envs[0]
    successes       = []
    steps_to_pen    = []   # steps for successful episodes
    dispersions     = []   # per-episode mean flock dispersion
    try:
        for ep in range(args.episodes):
            obs = env.reset()
            done = False
            ep_steps = 0
            ep_dispersion = []
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, _, dones, infos = env.step(action)
                done = bool(dones[0])
                ep_steps += 1
                if done:
                    # The wrapped env has already been reset for the next
                    # episode, so its positions must not be sampled here.
                    break
                if not inner.penned[:inner.n_sheep].all():
                    ep_dispersion.append(
                        pairwise_mean(inner.sheep_pos, inner.n_sheep)
                    )
            if ep == 0 and args.render:
                # Only the first episode is meant to be watched.
                inner.close()
                inner.render_mode = None
            # infos comes from the terminal step, so it describes the
            # episode that just finished.
            info = infos[0]
            n_penned = info.get("n_penned", 0)
            n_sheep  = info.get("n_sheep",  args.n_sheep)
            success  = n_penned == n_sheep
            successes.append(int(success))
            if success:
                steps_to_pen.append(ep_steps / n_sheep)
            if ep_dispersion:
                dispersions.append(float(np.mean(ep_dispersion)))
            if (ep + 1) % 10 == 0:
                print(f"  Episode {ep + 1:>4}/{args.episodes}  "
                      f"success={int(success)}  steps={ep_steps}")
    finally:
        # Release the env (and any matplotlib figure) even on failure.
        env.close()
    # -----------------------------------------------------------------------
    # Report
    # -----------------------------------------------------------------------
    # Each mean falls back to NaN when there is nothing to average.
    success_rate = float(np.mean(successes))    if successes    else float("nan")
    mean_ttp     = float(np.mean(steps_to_pen)) if steps_to_pen else float("nan")
    mean_disp    = float(np.mean(dispersions))   if dispersions  else float("nan")
    print("\n" + "=" * 50)
    print(f"  Model           : {args.model}")
    print(f"  Sheep           : {args.n_sheep}")
    print(f"  Episodes        : {args.episodes}")
    print("-" * 50)
    print(f"  Success rate    : {success_rate * 100:.1f}%"
          f"  ({sum(successes)}/{args.episodes})")
    print(f"  Time-to-pen     : {mean_ttp:.1f} steps/sheep"
          f"  (successful episodes only)")
    print(f"  Flock dispersion: {mean_disp:.2f} m"
          f"  (mean pairwise distance while active)")
    print("=" * 50)
 # Run the evaluation only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,319 @@
 """
 2D herding environment for PPO training (Gymnasium-compatible).
 The dog agent (action: 2D velocity vector) must herd n_sheep into the
 quarantine pen.  Sheep dynamics mirror the Webots controller exactly:
 flee (quadratic ramp), separation (inverse-distance), cohesion, wall
 avoidance, and wander.
 Coordinate system matches the Webots world file:
    field  : x ∈ [-15, 15],  y ∈ [-15, 15]
    pen    : x ∈ [10, 13],   y ∈ [-15, -8]   (SE corner, open north)
 Observation is always sized for MAX_SHEEP (currently 5) regardless of
 how many sheep are active.  Inactive slots are pre-penned at the pen
 centre with flag=1.  This keeps the model input dimension fixed across
 curriculum stages so VecNormalize statistics are preserved throughout.
 """
 import numpy as np
 import gymnasium as gym
 from gymnasium import spaces
 class HerdingEnv(gym.Env):
    """Gymnasium environment in which a dog herds sheep into a pen.

    The agent controls the dog with a 2D velocity action in [-1, 1]²
    (scaled by DOG_SPEED).  Up to MAX_SHEEP sheep move under boid-style
    forces: flee from the dog, separation, cohesion, wall avoidance and
    wander.  An episode terminates when every active sheep is inside the
    pen and is truncated after ``max_steps`` steps.

    The observation always has 2 + 3 * MAX_SHEEP entries (dog xy, sheep xy,
    penned flags) whatever the number of active sheep; unused slots sit at
    the pen centre with their penned flag set.

    Parameters
    ----------
    n_sheep : number of active sheep, 1 .. MAX_SHEEP.
    max_steps : step budget before the episode is truncated.
    render_mode : ``"human"`` draws a matplotlib window on every step;
        ``None`` disables rendering.
    """
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 30}
    # -----------------------------------------------------------------------
    # World constants — must match Webots world file
    # -----------------------------------------------------------------------
    MAX_SHEEP = 5
    FIELD     = 15.0                         # half-size; positions ∈ [-FIELD, FIELD]
    PEN_X     = (10.0, 13.0)                 # quarantine pen x bounds
    PEN_Y     = (-15.0, -8.0)               # quarantine pen y bounds
    PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
    # -----------------------------------------------------------------------
    # Dynamics — calibrated to match Webots robot specs
    # wheel radius 0.031 m; sheep FLEE_SPEED 20 rad/s → 0.62 m/s
    # wheel radius 0.038 m; dog  maxVelocity 70 rad/s → 2.66 m/s
    # -----------------------------------------------------------------------
    DOG_SPEED      = 2.5    # m/s
    SHEEP_FLEE_V   = 0.65   # m/s
    SHEEP_WANDER_V = 0.20   # m/s
    DT             = 0.1    # seconds per step
    # Boid parameters — identical to sheep.py
    FLEE_DIST       = 7.0
    SEPARATION_DIST = 2.5
    COHESION_DIST   = 8.0
    WALL_MARGIN     = 3.5
    # -----------------------------------------------------------------------
    # Reward weights
    # -----------------------------------------------------------------------
    W_APPROACH   = 0.3     # dense: dog distance to nearest active sheep
    W_SHAPING    = 0.5     # dense: mean sheep distance to pen  (was 0.01)
    W_PEN_BONUS  = 5.0     # sparse: per sheep successfully penned
    W_COMPLETE   = 20.0    # bonus when ALL active sheep are penned
    W_STEP_COST  = 0.002   # penalty per step (encourages efficiency)
    def __init__(self, n_sheep: int = 1, max_steps: int = 2000,
                 render_mode: str = None):
        """Create the environment; state is populated by reset()."""
        super().__init__()
        assert 1 <= n_sheep <= self.MAX_SHEEP
        self.n_sheep    = n_sheep
        self.max_steps  = max_steps
        self.render_mode = render_mode
        # Sheep count requested via set_n_sheep(), applied on the next reset().
        self._pending_n_sheep = None
        # Observation: dog(x,y) + MAX_SHEEP×sheep(x,y) + MAX_SHEEP×penned
        # Fixed size across all curriculum stages.
        obs_dim = 2 + 2 * self.MAX_SHEEP + self.MAX_SHEEP
        self.observation_space = spaces.Box(
            low=-1.0, high=1.0, shape=(obs_dim,), dtype=np.float32
        )
        # Action: desired velocity (vx, vy) ∈ [-1, 1]², scaled by DOG_SPEED
        self.action_space = spaces.Box(
            low=-1.0, high=1.0, shape=(2,), dtype=np.float32
        )
        # Runtime state (populated by reset)
        self._step_count   = 0
        self._prev_penned  = 0
        self.dog_pos       = np.zeros(2, dtype=np.float32)
        self.sheep_pos     = np.zeros((self.MAX_SHEEP, 2), dtype=np.float32)
        self.penned        = np.ones(self.MAX_SHEEP, dtype=bool)
        self.wander_ang    = np.zeros(self.MAX_SHEEP, dtype=np.float32)
        self._fig = None    # lazy matplotlib figure
    # ------------------------------------------------------------------
    # Curriculum interface
    # ------------------------------------------------------------------
    def set_n_sheep(self, n: int):
        """Advance curriculum difficulty; takes effect on next reset().

        The new count is only recorded here.  Changing ``n_sheep`` in the
        middle of an episode would pull a pre-penned inactive slot into
        the active range, and step() would then count it as newly penned
        and pay out W_PEN_BONUS for a sheep that was never herded.
        """
        assert 1 <= n <= self.MAX_SHEEP
        self._pending_n_sheep = n
    # ------------------------------------------------------------------
    # Gymnasium API
    # ------------------------------------------------------------------
    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Applies any sheep count requested through set_n_sheep(), then
        places the dog and the active sheep at random positions outside
        the pen.  ``options`` is accepted for API compatibility and is
        not used.
        """
        super().reset(seed=seed)
        if self._pending_n_sheep is not None:
            # Curriculum change requested since the last reset.
            self.n_sheep = self._pending_n_sheep
            self._pending_n_sheep = None
        self._step_count  = 0
        self._prev_penned = 0
        # Dog: random start in the open field (not near the pen)
        self.dog_pos = self.np_random.uniform(-8.0, 5.0, size=(2,)).astype(np.float32)
        # Active sheep (0 .. n_sheep-1): random non-pen positions
        self.sheep_pos[:] = self.PEN_CENTER   # default all to pen centre
        self.penned[:]    = True
        placed = 0
        while placed < self.n_sheep:
            # Rejection sampling: redraw any candidate that lands in the pen.
            p = self.np_random.uniform(-12.0, 12.0, size=(2,)).astype(np.float32)
            if not self._in_pen(p):
                self.sheep_pos[placed] = p
                self.penned[placed]    = False
                placed += 1
        # Inactive slots (n_sheep .. MAX_SHEEP-1): already at pen centre, penned=True
        self.wander_ang = self.np_random.uniform(
            -np.pi, np.pi, size=(self.MAX_SHEEP,)
        ).astype(np.float32)
        return self._obs(), {}
    def step(self, action):
        """Advance one timestep.

        Returns ``(obs, reward, terminated, truncated, info)`` where
        ``terminated`` means every active sheep is penned, ``truncated``
        means the step budget is exhausted, and ``info`` carries
        ``n_penned`` and ``n_sheep``.
        """
        self._step_count += 1
        # Move dog — clip each axis independently so the agent can idle
        act = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0)
        self.dog_pos = np.clip(
            self.dog_pos + act * self.DOG_SPEED * self.DT,
            -self.FIELD, self.FIELD
        )
        # Step sheep dynamics
        for i in range(self.n_sheep):
            if self.penned[i]:
                continue   # penned sheep no longer move
            self.sheep_pos[i] = self._step_sheep(i)
            if self._in_pen(self.sheep_pos[i]):
                self.penned[i] = True
        n_penned     = int(self.penned[:self.n_sheep].sum())
        newly_penned = n_penned - self._prev_penned
        self._prev_penned = n_penned
        reward     = self._reward(n_penned, newly_penned)
        terminated = n_penned == self.n_sheep
        truncated  = self._step_count >= self.max_steps
        info       = {"n_penned": n_penned, "n_sheep": self.n_sheep}
        if self.render_mode == "human":
            self.render()
        return self._obs(), float(reward), terminated, truncated, info
    def render(self):
        """Draw the field, pen, active sheep and dog in a matplotlib window.

        The figure is created lazily on first use and reused afterwards.
        Nothing is returned; no frame array is produced for the declared
        "rgb_array" mode.
        """
        import matplotlib.pyplot as plt
        import matplotlib.patches as mpatches
        if self._fig is None:
            plt.ion()
            self._fig, self._ax = plt.subplots(figsize=(6, 6))
        ax = self._ax
        ax.clear()
        ax.set_xlim(-16, 16)
        ax.set_ylim(-16, 16)
        ax.set_aspect("equal")
        ax.set_facecolor("#dcedc8")
        # Field boundary
        ax.add_patch(mpatches.Rectangle(
            (-15, -15), 30, 30, fill=False, edgecolor="#795548", linewidth=2
        ))
        # Pen
        pw = self.PEN_X[1] - self.PEN_X[0]
        ph = self.PEN_Y[1] - self.PEN_Y[0]
        ax.add_patch(mpatches.Rectangle(
            (self.PEN_X[0], self.PEN_Y[0]), pw, ph,
            facecolor="#ffe082", edgecolor="#795548", linewidth=2
        ))
        ax.text(11.5, -11.5, "pen", ha="center", va="center",
                fontsize=8, color="#795548")
        # Sheep
        for i in range(self.MAX_SHEEP):
            if i >= self.n_sheep:
                continue   # inactive slot — not shown
            color = "deeppink" if self.penned[i] else "white"
            ax.plot(*self.sheep_pos[i], "o", color=color, markersize=11,
                    markeredgecolor="#555", markeredgewidth=1.5)
        # Dog
        ax.plot(*self.dog_pos, "s", color="#4e342e", markersize=13,
                markeredgecolor="black", markeredgewidth=1.5)
        ax.set_title(
            f"step {self._step_count} | "
            f"penned {int(self.penned[:self.n_sheep].sum())}/{self.n_sheep}",
            fontsize=11
        )
        self._fig.canvas.draw()
        self._fig.canvas.flush_events()
        plt.pause(0.001)
    def close(self):
        """Close the matplotlib figure if one was opened; safe to call twice."""
        if self._fig is not None:
            import matplotlib.pyplot as plt
            plt.close(self._fig)
            self._fig = None
    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------
    def _in_pen(self, pos: np.ndarray) -> bool:
        """Return True when ``pos`` lies strictly inside the pen rectangle."""
        return (self.PEN_X[0] < pos[0] < self.PEN_X[1] and
                self.PEN_Y[0] < pos[1] < self.PEN_Y[1])
    def _obs(self) -> np.ndarray:
        """Build the float32 observation with positions scaled by 1/FIELD."""
        scale = 1.0 / self.FIELD
        return np.concatenate([
            self.dog_pos * scale,                          # 2
            (self.sheep_pos * scale).flatten(),            # 2 * MAX_SHEEP
            self.penned.astype(np.float32),                # MAX_SHEEP
        ]).astype(np.float32)
    def _reward(self, n_penned: int, newly_penned: int) -> float:
        """Combine dense shaping, penning bonuses and the per-step cost."""
        active_mask = ~self.penned[:self.n_sheep]
        if active_mask.any():
            active_pos = self.sheep_pos[:self.n_sheep][active_mask]
            # Sheep-to-pen shaping: encourages moving sheep toward pen.
            # Normalised by the field width; ≤ 0 and about -1.25 at worst,
            # since the far corner is ~37.5 m from the pen centre.
            dists_pen = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1)
            shaping   = -(dists_pen.mean() / (2 * self.FIELD))
            # Dog-to-nearest-sheep approach: incentivises the dog to stay
            # within flee range (FLEE_DIST=7m) rather than wandering away.
            # ≤ 0 and about -1.41 at worst (field diagonal / field width).
            dists_dog = np.linalg.norm(active_pos - self.dog_pos, axis=1)
            approach  = -(dists_dog.min() / (2 * self.FIELD))
        else:
            shaping = approach = 0.0
        reward  = shaping  * self.W_SHAPING
        reward += approach * self.W_APPROACH
        reward += newly_penned * self.W_PEN_BONUS
        reward -= self.W_STEP_COST
        if n_penned == self.n_sheep:
            reward += self.W_COMPLETE
        return reward
    def _step_sheep(self, i: int) -> np.ndarray:
        """Apply one timestep of boid dynamics to sheep i.

        Sums the flee, separation, cohesion, wall-avoidance and wander
        forces, then moves the sheep along the resulting direction at a
        capped speed.  Returns the new position; the caller stores it.
        """
        pos = self.sheep_pos[i].copy()
        fx, fy = 0.0, 0.0
        fleeing = False
        # Flee from dog — quadratic ramp (mirrors sheep.py)
        diff = self.dog_pos - pos
        dist = float(np.linalg.norm(diff))
        if 0.01 < dist < self.FLEE_DIST:
            t = 1.0 - dist / self.FLEE_DIST
            s = t * t * 5.0
            fx -= (diff[0] / dist) * s
            fy -= (diff[1] / dist) * s
            fleeing = True
        # Separation (inverse-distance) + Cohesion
        cx, cy, cn = 0.0, 0.0, 0
        for j in range(self.n_sheep):
            if j == i or self.penned[j]:
                continue
            dv = self.sheep_pos[j] - pos
            dj = float(np.linalg.norm(dv))
            if 0.3 < dj < self.COHESION_DIST:
                cx += self.sheep_pos[j][0]
                cy += self.sheep_pos[j][1]
                cn += 1
            if 0.05 < dj < self.SEPARATION_DIST:
                push = (self.SEPARATION_DIST - dj) / dj
                fx -= (dv[0] / dj) * push * 2.5
                fy -= (dv[1] / dj) * push * 2.5
        if cn > 0:
            # Cohesion pulls toward the neighbours' centroid, more weakly while fleeing.
            w = 0.08 if fleeing else 0.15
            fx += (cx / cn - pos[0]) * w
            fy += (cy / cn - pos[1]) * w
        # Wall avoidance
        m, F = self.WALL_MARGIN, self.FIELD
        if pos[0] < -F + m: fx += ((-F + m - pos[0]) / m) * 6.0
        if pos[0] >  F - m: fx -= ((pos[0] - (F - m)) / m) * 6.0
        if pos[1] < -F + m: fy += ((-F + m - pos[1]) / m) * 6.0
        if pos[1] >  F - m: fy -= ((pos[1] - (F - m)) / m) * 6.0
        # Wander — suppressed while fleeing
        if not fleeing:
            if self.np_random.random() < 0.02:
                self.wander_ang[i] += float(self.np_random.uniform(-0.6, 0.6))
            fx += float(np.cos(self.wander_ang[i])) * 0.5
            fy += float(np.sin(self.wander_ang[i])) * 0.5
        # Integrate
        force = np.array([fx, fy])
        mag   = float(np.linalg.norm(force))
        if mag > 0.01:
            # Speed scales with force magnitude up to the mode's top speed.
            top_speed = self.SHEEP_FLEE_V if fleeing else self.SHEEP_WANDER_V
            speed = min(top_speed, mag * 0.3)
            pos   = np.clip(pos + (force / mag) * speed * self.DT,
                            -self.FIELD, self.FIELD)
        return pos.astype(np.float32)
@@ -0,0 +1,6 @@
 gymnasium>=0.29
 stable-baselines3>=2.3
 torch>=2.2
 numpy>=1.26
 matplotlib>=3.8
 tensorboard>=2.16
@@ -0,0 +1,211 @@
 """
 PPO training script for the herding task.
 Usage examples
 --------------
 # Start fresh with curriculum (1 → 5 sheep):
    python train.py --curriculum
 # Resume from checkpoint, skip directly to 3 sheep:
    python train.py --resume runs/ppo_herding/checkpoints/ckpt_200000_steps.zip --n-sheep 3
 # Quick smoke-test (no curriculum, single env):
    python train.py --n-envs 1 --total-steps 50000
 """
 import argparse
 import os
 import numpy as np
 from stable_baselines3 import PPO
 from stable_baselines3.common.callbacks import (
    BaseCallback,
    CallbackList,
    CheckpointCallback,
    EvalCallback,
 )
 from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize
 from herding_env import HerdingEnv
 # ---------------------------------------------------------------------------
 # Curriculum callback
 # ---------------------------------------------------------------------------
 class CurriculumCallback(BaseCallback):
    """
    Raises the number of active sheep once the policy is reliably succeeding.
    A rolling window of recent episode outcomes is kept; when its mean
    reaches THRESHOLD (and enough episodes have been seen) every training
    env is told to use one more sheep and the window is cleared.
    An episode counts as a success when it ended without being truncated,
    i.e. all sheep were penned before the step limit.
    """
    THRESHOLD   = 0.75   # success rate to graduate
    WINDOW      = 100    # episodes to average over
    MIN_EPISODES = 50    # don't graduate before seeing this many episodes
    def __init__(self, start_sheep: int, max_sheep: int, verbose: int = 1):
        super().__init__(verbose)
        self._cur_sheep = start_sheep
        self._successes = []
        self.max_sheep  = max_sheep
    def _record_finished_episodes(self) -> None:
        """Append a 1/0 outcome for each env whose episode just ended."""
        step_infos = self.locals["infos"]
        step_dones = self.locals["dones"]
        for env_info, finished in zip(step_infos, step_dones):
            if not finished:
                continue
            was_truncated = env_info.get("TimeLimit.truncated", False)
            self._successes.append(0 if was_truncated else 1)
            # Keep only the most recent WINDOW outcomes.
            if len(self._successes) > self.WINDOW:
                del self._successes[0]
    def _on_step(self) -> bool:
        self._record_finished_episodes()
        # Guard clauses: bail out unless every graduation condition holds.
        if self._cur_sheep >= self.max_sheep:
            return True
        if len(self._successes) < self.MIN_EPISODES:
            return True
        if not (np.mean(self._successes) >= self.THRESHOLD):
            return True
        self._cur_sheep += 1
        self.training_env.env_method("set_n_sheep", self._cur_sheep)
        self._successes.clear()
        if self.verbose:
            print(f"\n[Curriculum] Advanced to {self._cur_sheep} sheep "
                  f"at step {self.num_timesteps}\n")
        return True
 # ---------------------------------------------------------------------------
 # Environment factory
 # ---------------------------------------------------------------------------
 def make_env(n_sheep: int, seed: int, max_steps: int):
    """Return a zero-argument factory producing a seeded HerdingEnv.

    The environment is built and reset with ``seed`` only when the
    returned callable is invoked.
    """
    def _thunk():
        herding = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps)
        # The initial reset fixes this env's random stream to its own seed.
        herding.reset(seed=seed)
        return herding
    return _thunk
 # ---------------------------------------------------------------------------
 # Main
 # ---------------------------------------------------------------------------
 def parse_args():
    """Build the training CLI and parse ``sys.argv``.

    ``--save-freq`` and ``--eval-freq`` are expressed in total environment
    steps: main() divides them by ``--n-envs`` before handing them to the
    callbacks.  The help text now says so; it previously described
    ``--save-freq`` as per-env.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--n-sheep",     type=int,   default=1,
                   help="Starting number of sheep (or fixed count if no curriculum)")
    p.add_argument("--max-sheep",   type=int,   default=5,
                   help="Maximum sheep for curriculum (ignored without --curriculum)")
    p.add_argument("--n-envs",      type=int,   default=8,
                   help="Number of parallel environments")
    p.add_argument("--total-steps", type=int,   default=5_000_000,
                   help="Total environment steps to train for")
    p.add_argument("--max-steps",   type=int,   default=2000,
                   help="Episode step limit inside each env")
    p.add_argument("--curriculum",  action="store_true",
                   help="Enable automatic curriculum advancement")
    p.add_argument("--resume",      type=str,   default=None,
                   help="Path to a .zip checkpoint to resume training from")
    p.add_argument("--run-dir",     type=str,   default="runs/ppo_herding",
                   help="Output directory for checkpoints and logs")
    p.add_argument("--save-freq",   type=int,   default=100_000,
                   help="Checkpoint every N total environment steps "
                        "(summed over all envs)")
    p.add_argument("--eval-freq",   type=int,   default=50_000,
                   help="Evaluate every N total environment steps "
                        "(summed over all envs)")
    p.add_argument("--eval-eps",    type=int,   default=20,
                   help="Episodes per evaluation run")
    return p.parse_args()
 def main():
    """Train (or resume) a PPO herding policy and save the artefacts.

    Builds normalised training and evaluation vec-envs, wires up the
    checkpoint / evaluation / optional curriculum callbacks, runs
    ``model.learn`` and finally writes ``final_model`` and ``vecnorm.pkl``
    into the run directory.

    The worker subprocesses of both vec-envs are shut down in a
    ``finally`` block, so they are not left running if training fails
    or is interrupted.
    """
    args = parse_args()
    os.makedirs(args.run_dir, exist_ok=True)
    ckpt_dir = os.path.join(args.run_dir, "checkpoints")
    best_dir = os.path.join(args.run_dir, "best_model")
    norm_path = os.path.join(args.run_dir, "vecnorm.pkl")
    os.makedirs(ckpt_dir, exist_ok=True)
    # Training envs
    train_env = SubprocVecEnv([
        make_env(args.n_sheep, seed=i, max_steps=args.max_steps)
        for i in range(args.n_envs)
    ])
    eval_env = None
    try:
        if args.resume and os.path.exists(norm_path):
            train_env = VecNormalize.load(norm_path, train_env)
            train_env.training = True
            train_env.norm_reward = True
        else:
            if args.resume:
                # The checkpoint was trained on normalised inputs; starting
                # from fresh statistics may degrade it until they re-converge.
                print(f"Warning: {norm_path} not found; resuming with fresh "
                      f"normalisation statistics")
            train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True,
                                     clip_obs=10.0)
        # Eval env (no reward normalisation, deterministic)
        eval_env = SubprocVecEnv([
            make_env(args.n_sheep, seed=1000 + i, max_steps=args.max_steps)
            for i in range(2)
        ])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=False,
                                clip_obs=10.0, training=False)
        # Callbacks.  Both frequencies are given in total env steps on the
        # command line and divided by the env count here.
        checkpoint_cb = CheckpointCallback(
            save_freq=max(args.save_freq // args.n_envs, 1),
            save_path=ckpt_dir,
            name_prefix="ckpt",
            save_vecnormalize=True,
        )
        eval_cb = EvalCallback(
            eval_env,
            best_model_save_path=best_dir,
            log_path=args.run_dir,
            eval_freq=max(args.eval_freq // args.n_envs, 1),
            n_eval_episodes=args.eval_eps,
            deterministic=True,
            verbose=1,
        )
        callbacks = [checkpoint_cb, eval_cb]
        if args.curriculum:
            # Never ask the env for more sheep than it supports; its
            # set_n_sheep() asserts on anything above MAX_SHEEP.
            max_sheep = min(args.max_sheep, HerdingEnv.MAX_SHEEP)
            callbacks.append(CurriculumCallback(start_sheep=args.n_sheep,
                                                max_sheep=max_sheep))
        callback_list = CallbackList(callbacks)
        # Model
        ppo_kwargs = dict(
            policy          = "MlpPolicy",
            env             = train_env,
            learning_rate   = 3e-4,
            n_steps         = 2048,
            batch_size      = 256,
            n_epochs        = 10,
            gamma           = 0.995,
            gae_lambda      = 0.95,
            clip_range      = 0.2,
            ent_coef        = 0.005,
            vf_coef         = 0.5,
            max_grad_norm   = 0.5,
            policy_kwargs   = dict(net_arch=[256, 256]),
            tensorboard_log = args.run_dir,
            verbose         = 1,
        )
        if args.resume:
            print(f"Resuming from {args.resume}")
            # "policy" and "env" are not passed as overrides: the policy
            # comes from the checkpoint and the env is given explicitly.
            model = PPO.load(args.resume, env=train_env, **{
                k: v for k, v in ppo_kwargs.items()
                if k not in ("policy", "env")
            })
        else:
            model = PPO(**ppo_kwargs)
        model.learn(
            total_timesteps=args.total_steps,
            callback=callback_list,
            # Keep the step counter running when continuing a previous run.
            reset_num_timesteps=args.resume is None,
            tb_log_name="ppo",
        )
        # Save final artefacts
        model.save(os.path.join(args.run_dir, "final_model"))
        train_env.save(norm_path)
        print(f"\nTraining complete. Artefacts saved to {args.run_dir}/")
    finally:
        # Always stop the worker subprocesses.
        train_env.close()
        if eval_env is not None:
            eval_env.close()
 # Entry-point guard: training starts only when this file is run as a
 # script, not when it is imported.
 if __name__ == "__main__":
    main()
@@ -364,6 +364,7 @@ Solid {
 # ==================== SCARECROW (east side, outside fence) ====================
 Solid {
  translation 20 -10 0
  rotation 0 0 1 2.61799
  children [
    Transform { translation 0 0 1.22 children [ Shape { appearance USE TRUNK geometry Cylinder { height 2.44 radius 0.045 subdivision 8 } } ] }
    Transform { translation 0 0 2.02 rotation 1 0 0 1.5708 children [ Shape { appearance USE TRUNK geometry Cylinder { height 1.60 radius 0.032 subdivision 8 } } ] }
@@ -391,12 +392,12 @@ Solid {
 # ==================== HAY BALES (near barn) ====================
 Solid { translation 25.75 13.76 0.62 children [ Transform { rotation 1 0 0 1.5708 children [ Shape { appearance USE HAY geometry Cylinder { height 1.30 radius 0.62 subdivision 14 } } ] } ] boundingObject Box { size 1.30 1.24 1.24 } }
-Solid { translation 24.34 12.32 0.62 children [ Transform { rotation 1 0 0 1.5708 children [ Shape { appearance USE HAY geometry Cylinder { height 1.30 radius 0.62 subdivision 14 } } ] } ] boundingObject Box { size 1.30 1.24 1.24 } }
+Solid { translation 24.34 12.32 0.62 rotation -1 0 0 1.5708 children [ Transform { rotation 1 0 0 1.5708 children [ Shape { appearance USE HAY geometry Cylinder { height 1.30 radius 0.62 subdivision 14 } } ] } ] boundingObject Box { size 1.30 1.24 1.24 } }
 Solid { translation 24.28 13.79 0.62 children [ Transform { rotation 1 0 0 1.5708 children [ Shape { appearance USE HAY geometry Cylinder { height 1.30 radius 0.62 subdivision 14 } } ] } ] boundingObject Box { size 1.30 1.24 1.24 } }
 # ==================== TRACTOR (near barn) ====================
 Solid {
-  translation 17 19 0
+  translation 17 19 0.18
  rotation 0 0 1 1.9
  children [
    # Chassis