Checkpoint 7

This commit is contained in:
Johnny Fernandes
2026-05-11 12:21:51 +01:00
parent fce0e0c786
commit a01a5c9cef
34 changed files with 1266 additions and 1038 deletions
+73 -179
View File
@@ -1,61 +1,30 @@
"""Gymnasium environment for the shepherd-dog herding task.
Single-agent: the agent is the dog. Sheep are environment-controlled
flocking agents whose dynamics are imported verbatim from
``herding.flocking_sim`` so a policy trained here transfers to Webots
without re-tuning. Differential-drive kinematics for both dog and sheep
match the proto specs (wheel radius, base, max wheel ω) via
``herding.diffdrive``.
Single-agent: the dog is the policy; sheep are env-controlled flocking
agents (``herding.world.flocking_sim``). Differential-drive kinematics
match the proto specs (``herding.world.diffdrive``) so a policy trained
here transfers to Webots without re-tuning.
Action space
------------
Box(-1, 1, (2,)) — the dog's desired (vx, vy) velocity *intent*. This
matches the high-level action representation the Webots controller
already uses; the env converts (vx, vy) → wheel speeds with the same
formula.
Observation space
-----------------
Box(-inf, inf, (28,)) — the order-invariant feature vector built by
``herding.obs.build_obs``. See ``herding/obs.py`` for the layout.
Reset
-----
``options["n_sheep"]`` (1..MAX_SHEEP) overrides the default flock size
for the next episode. If absent, flock size is sampled uniformly from
[1, max_n_sheep] each reset, where ``max_n_sheep`` can be raised over
training time by an outer callback.
Reward
------
Sparse + shaping (see :func:`HerdingEnv._compute_reward` for weights).
+2.0 per newly penned sheep
+0.5 · ΔCoM-distance-to-pen (positive when CoM moves closer)
+0.2 · ΔFlock-radius (positive when flock tightens)
-0.005 per step (encourages speed)
- wall and collision penalties
+10.0 terminal bonus when all sheep penned
* **Action**: ``Box(-1, 1, (2,))`` — desired ``(vx, vy)`` intent.
* **Observation**: ``Box(-inf, inf, (OBS_DIM·K,))`` (``32·K`` at this
commit) from ``herding.perception.obs.build_obs`` with optional frame
stacking K (concatenated oldest → newest).
* **Reset**: ``options["n_sheep"]`` overrides flock size; otherwise
sampled uniformly from ``[1, max_n_sheep]``.
* **Reward**: dense shaping (per-sheep distance progress, time
penalty, Strömbom-imitation cosine bonus) + sparse pen/done jackpots.
Weights live as class attributes on :class:`HerdingEnv`.
"""
from __future__ import annotations
import math
import os
import random
import sys
from typing import Optional
import gymnasium as gym
import numpy as np
from gymnasium import spaces
# Make herding/ importable when run from anywhere.
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
from herding.world.diffdrive import (
heading_speed_to_wheels, kinematics_step, velocity_to_wheels,
)
@@ -71,7 +40,7 @@ from herding.world.geometry import (
)
from herding.perception.lidar_perception import detections_from_scan
from herding.perception.lidar_sim import simulate_scan
from herding.obs import OBS_DIM, build_obs
from herding.perception.obs import OBS_DIM, build_obs
from herding.perception.sheep_tracker import SheepTracker
from herding.control.strombom import compute_action as strombom_action
@@ -85,45 +54,23 @@ class HerdingEnv(gym.Env):
metadata = {"render_modes": []}
# Reward shaping weights. Re-tuned after the first run got stuck at
# 0% success: progress reward must dominate the time penalty by a
# large margin, and the pen-event bonus must be big enough that PPO's
# advantage estimator can credit-assign across the long path that
# leads to it. Per-step shaping is bounded by the clamps inside
# _compute_reward.
# Drastically simplified after two runs got stuck farming a position
# bonus instead of penning sheep. Reward now is essentially:
# • huge jackpot for actually penning sheep (+100 per pen, +500 done)
# • small dense gradient: per-sheep mean distance to pen
# No position shaping (gameable), no compactness shaping (gameable),
# no engagement bonus (gameable). The terminal per-unpenned penalty
# forbids "good enough" partial herds.
# We have a working analytic baseline (Strömbom, 100 % on easy mode).
# Use it as a teacher: per-step bonus proportional to the cosine
# similarity between the policy's action and what Strömbom would do
# at the same state. This drags the policy out of "do nothing" local
# optima without locking it to the teacher — PPO can still find
# improvements over Strömbom because pen jackpots dominate.
W_PEN_DELTA = 100.0
W_PROGRESS = 20.0
W_IMITATE = 0.5 # per-step max ±0.5 (action cosine sim, [-1, 1])
W_TIME = 0.0
W_WALL = 0.0
W_COLLISION = 0.0
W_DONE = 500.0
# Reward weights. Sparse jackpots (W_PEN_DELTA, W_DONE) dominate;
# dense shaping (W_PROGRESS on Δ mean-distance-to-pen) provides the
# gradient; W_IMITATE adds a small cosine bonus toward the analytic
# teacher's action; W_TIME is added to the reward on every step (0 by
# default; set it negative to penalise long episodes).
W_PEN_DELTA = 100.0
W_PROGRESS = 20.0
W_IMITATE = 0.5
W_TIME = 0.0
W_WALL = 0.0
W_COLLISION = 0.0
W_DONE = 500.0
# Action smoothing during training: 0 = none. The Webots controller
# still applies its own EMA at inference for actuator stability, so
# the policy doesn't need to learn smoothness explicitly.
# In-env action EMA. 0 = none; the Webots controller applies its own
# EMA at inference, so the policy needn't learn smoothness.
ACTION_SMOOTH = 0.0
# Episode budget: 5000 steps ≈ 80 s of sim time at dt=0.016. The new
# external-pen layout has paths up to ~28 m from spawn to pen entry; at
# sheep flee speed ~0.4 m/s, that's 70 s minimum. The earlier budget of
# 3000 steps (48 s) left the dog no margin for collect-then-drive on
# multi-sheep cases.
DEFAULT_MAX_STEPS = 5000
# Distance under which the dog is considered "colliding" with a sheep.
COLLISION_DIST = 0.30
def __init__(
@@ -137,19 +84,15 @@ class HerdingEnv(gym.Env):
frame_stack: int = 1,
):
super().__init__()
# When True (default), the obs and the imitation-reward teacher
# see only LiDAR-perceived sheep positions through a tracker
# matching what the Webots controller has access to. When False,
# both consume ground-truth positions (legacy "privileged" mode,
# kept for ablation).
# ``use_lidar=True`` (default): obs and imitation-reward teacher
# see only LiDAR-perceived positions via a tracker, matching the
# Webots controller. ``False`` exposes ground truth for ablation.
self._use_lidar = bool(use_lidar)
self._tracker = SheepTracker() if self._use_lidar else None
self._np_rng_lidar: Optional[np.random.Generator] = None
# Frame stacking: the policy receives the last K single-frame
# observations concatenated. Lets a memoryless MLP integrate
# information across time, partly compensating for the limited
# LiDAR FOV. K=1 reproduces the legacy single-frame obs.
# Frame stacking: the policy receives the last K obs concatenated,
# giving a memoryless MLP temporal context. K=1 → single frame.
self._frame_stack = max(1, int(frame_stack))
self._frame_buffer: list[np.ndarray] = []
self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)
@@ -159,18 +102,16 @@ class HerdingEnv(gym.Env):
shape=(OBS_DIM * self._frame_stack,), dtype=np.float32,
)
# If n_sheep is None, env will sample uniformly from [1, max_n_sheep]
# on every reset — this is the default for curriculum-free training.
# n_sheep=None → sample uniformly from [1, max_n_sheep] each reset.
self._fixed_n_sheep = n_sheep
self._max_n_sheep = max_n_sheep
self.max_steps = max_steps
# difficulty ∈ [0, 1]: 0 = sheep spawn next to the gate (easy),
# 1 = sheep spawn anywhere in the field (hard, the deployment
# distribution). Curriculum bumps this from 0 → 1 over training.
# difficulty ∈ [0, 1]: 0 = sheep spawn near the gate (easy);
# 1 = sheep spawn anywhere in the field (deployment distribution).
self._difficulty = float(difficulty)
self._initial_seed = seed
# State (initialized in reset)
# State (initialised in reset)
self.dog_x = self.dog_y = self.dog_heading = 0.0
self.sheep_x = np.zeros(0, dtype=np.float32)
self.sheep_y = np.zeros(0, dtype=np.float32)
@@ -186,12 +127,10 @@ class HerdingEnv(gym.Env):
self.prev_d_pen = 0.0
self.prev_radius = 0.0
# Env-owned RNG for the flocking wander-jitter, seeded fresh on each
# reset so determinism is preserved without touching the global
# random module.
# Env-owned RNG for wander jitter, re-seeded from np_random on reset.
self._py_rng = random.Random()
# ---- public knobs (used by curriculum callback) ----
# --- Public knobs ---
def set_max_n_sheep(self, value: int) -> None:
self._max_n_sheep = int(np.clip(value, 1, MAX_SHEEP))
@@ -199,22 +138,18 @@ class HerdingEnv(gym.Env):
self._difficulty = float(np.clip(value, 0.0, 1.0))
def set_imitate_weight(self, value: float) -> None:
"""Override W_IMITATE (instance-level) — used to disable the
Strömbom imitation reward during BC fine-tuning, when the policy
already mimics a stronger teacher (sequential)."""
"""Override the instance W_IMITATE — used to disable Strömbom
imitation (any value ≤ 0 switches the bonus off), e.g. during PPO
fine-tune."""
self.W_IMITATE = float(value)
def set_time_weight(self, value: float) -> None:
"""Override W_TIME (instance-level). Default 0.0; a small
negative value (e.g. -0.1) adds a per-step penalty that
explicitly rewards fast time-to-pen during PPO fine-tune."""
"""Override the instance W_TIME — set negative to penalise step
count and encourage faster time-to-pen during PPO fine-tune."""
self.W_TIME = float(value)
# ---- gym API ----
# --- gym API ---
def reset(self, *, seed=None, options=None):
super().reset(seed=seed)
# Re-seed the flocking RNG from np_random so flocking jitter is
# reproducible alongside everything else the env samples.
self._py_rng.seed(int(self.np_random.integers(0, 2**31 - 1)))
opts = options or {}
@@ -230,28 +165,26 @@ class HerdingEnv(gym.Env):
self.dog_y = float(self.np_random.uniform(-2.5, 2.5))
self.dog_heading = float(self.np_random.uniform(-math.pi, math.pi))
# Sheep spawn region scales with difficulty:
# 0.0 → narrow box just north of the gate (x ∈ [7, 14], y ∈ [-12, -6])
# 1.0 → full field (x ∈ [-13, 13], y ∈ [-12, 13])
# Linear interpolation between the two for intermediate values.
# Sheep spawn region linearly interpolates with difficulty:
# 0 → small box near the gate, 1 → full field.
d = self._difficulty
sx_lo = 7.0 - d * 20.0 # → -13 at d=1
sx_hi = 14.0 - d * 1.0 # → 13 at d=1
sy_lo = -12.0 + d * 0.0 # → -12 at d=1
sy_hi = -6.0 + d * 19.0 # → 13 at d=1
sx_lo = 7.0 - d * 20.0
sx_hi = 14.0 - d * 1.0
sy_lo = -12.0 + d * 0.0
sy_hi = -6.0 + d * 19.0
sxs, sys_, shs, sws = [], [], [], []
for _ in range(self.n_sheep):
for _try in range(100):
sx = float(self.np_random.uniform(sx_lo, sx_hi))
sy = float(self.np_random.uniform(sy_lo, sy_hi))
# Reject too close to dog or to other sheep.
# Reject if too close to the dog or another sheep, or
# already in the gate column (would start "penned").
if math.hypot(sx - self.dog_x, sy - self.dog_y) < 3.0:
continue
if any(math.hypot(sx - x, sy - y) < 1.5
for x, y in zip(sxs, sys_)):
continue
# Reject inside the gate column already (they'd start "penned").
if PEN_X[0] <= sx <= PEN_X[1] and sy < -8.0:
continue
break
@@ -275,10 +208,8 @@ class HerdingEnv(gym.Env):
self._tracker.reset()
self._np_rng_lidar = np.random.default_rng(
int(self.np_random.integers(0, 2**31 - 1)))
# Prime the tracker with one scan so the first obs isn't empty.
self._update_tracker()
# Clear the frame stack — the next _build_obs will repopulate.
self._frame_buffer = []
obs = self._build_obs()
@@ -288,7 +219,6 @@ class HerdingEnv(gym.Env):
def step(self, action):
action = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0)
# EMA smoothing — the Webots controller does this too.
self.smoothed_action = (
self.ACTION_SMOOTH * self.prev_action
+ (1.0 - self.ACTION_SMOOTH) * action
@@ -296,12 +226,11 @@ class HerdingEnv(gym.Env):
self.prev_action = self.smoothed_action.copy()
vx, vy = float(self.smoothed_action[0]), float(self.smoothed_action[1])
# Safety supervisor mirrored from the controller — keeps the dog
# north of the gate so the policy can't strand itself in the pen.
# Safety supervisor — dog stays north of the gate.
if self.dog_y < DOG_SOUTH_LIMIT and vy < 0.0:
vx, vy = 0.0, 1.0
# --- Step the dog ---
# Step the dog.
wL, wR = velocity_to_wheels(
vx, vy, self.dog_heading,
max_linear=DOG_MAX_LINEAR,
@@ -313,27 +242,22 @@ class HerdingEnv(gym.Env):
self.dog_x, self.dog_y, self.dog_heading,
wL, wR, DOG_WHEEL_RADIUS, DOG_WHEEL_BASE, WEBOTS_DT,
)
# Clip dog to field bounds and out of pen — same as the Webots stone walls.
self.dog_x = float(np.clip(self.dog_x, FIELD_X[0] + 0.3, FIELD_X[1] - 0.3))
self.dog_y = float(np.clip(self.dog_y, DOG_SOUTH_LIMIT, FIELD_Y[1] - 0.3))
# --- Step each sheep ---
# Step sheep and update penned flags (GT-based).
for i in range(self.n_sheep):
self._step_one_sheep(i)
# --- Update penned state ---
for i in range(self.n_sheep):
if (not self.sheep_penned[i]
and is_penned_position(self.sheep_x[i], self.sheep_y[i])):
self.sheep_penned[i] = True
# --- Run LiDAR perception on this step's state (after sheep have
# moved). Updates the tracker that obs and the imitation-
# reward teacher consume. Reward / termination still use GT. ---
# LiDAR perception runs after sheep move; feeds the obs and the
# imitation reward. Reward/termination still use GT.
if self._tracker is not None:
self._update_tracker()
# --- Reward, termination ---
d_pen, radius = self._flock_metrics()
reward = self._compute_reward(d_pen, radius, action=action)
self.prev_d_pen = d_pen
@@ -346,12 +270,6 @@ class HerdingEnv(gym.Env):
truncated = self.steps >= self.max_steps
if all_penned:
reward += self.W_DONE
# No timeout penalty: a per-unpenned penalty made "do nothing"
# strictly preferable to noisy-random under reward-progress shaping
# (random sometimes pushes sheep away → negative progress, then
# always ate the timeout penalty), which collapsed exploration to
# tiny actions. The pen jackpot alone provides the directional
# signal once exploration is wide enough to find it.
obs = self._build_obs()
info = {
@@ -362,7 +280,7 @@ class HerdingEnv(gym.Env):
}
return obs, float(reward), terminated, truncated, info
# ---- internals ----
# --- Internals ---
def _step_one_sheep(self, i: int) -> None:
x, y = float(self.sheep_x[i]), float(self.sheep_y[i])
peers = [(float(self.sheep_x[j]), float(self.sheep_y[j]))
@@ -386,8 +304,7 @@ class HerdingEnv(gym.Env):
SHEEP_WHEEL_RADIUS, SHEEP_WHEEL_BASE, WEBOTS_DT,
)
# Wall clipping — matches Webots stone walls, except in the gate column
# where the south wall is absent.
# Wall clipping (south wall absent inside the gate column).
nx = float(np.clip(nx, FIELD_X[0] + 0.2, FIELD_X[1] - 0.2))
in_gate_col = PEN_X[0] <= nx <= PEN_X[1]
if in_gate_col:
@@ -400,12 +317,11 @@ class HerdingEnv(gym.Env):
self.sheep_h[i] = nh
def _flock_metrics(self):
"""(per-sheep mean distance to pen entry, max-radius).
"""Return (per-sheep mean distance to pen, max radius from CoM).
Using the per-sheep mean instead of CoM-distance ensures stragglers
keep contributing to the progress signal — the dog can't game the
shaping by herding the bulk of the flock and abandoning one
outlier (CoM moves toward pen, but mean-distance doesn't).
The per-sheep mean (not CoM distance) makes the progress signal
sensitive to stragglers: the dog can't game it by herding most of
the flock and abandoning one outlier.
"""
active_mask = ~self.sheep_penned
if not active_mask.any():
@@ -422,24 +338,14 @@ class HerdingEnv(gym.Env):
return d_pen, radius
def _compute_reward(self, d_pen: float, radius: float, action=None) -> float:
"""Sparse + per-sheep distance shaping + Strömbom imitation.
d_pen is the *mean* distance over active sheep, so progress only
accrues when ALL active sheep get closer to the pen on average —
the dog can't farm it by herding one sheep while ignoring others.
The imitation term is computed by querying Strömbom for the
recommended action at the *current* (post-step) state and
rewarding cosine similarity with what the policy actually did.
"""
"""Sparse pen jackpot + dense progress shaping + Strömbom imitation."""
n_penned = int(self.sheep_penned.sum())
delta_pen = n_penned - self.prev_n_penned
d_progress = max(-5.0, min(5.0, self.prev_d_pen - d_pen))
r = self.W_PEN_DELTA * delta_pen + self.W_PROGRESS * d_progress
# Per-step time penalty (0 by default). When negative, encourages
# the policy to finish quickly — used during PPO fine-tune.
r += self.W_TIME
r = (self.W_PEN_DELTA * delta_pen
+ self.W_PROGRESS * d_progress
+ self.W_TIME)
if action is not None and self.W_IMITATE > 0.0:
positions = self._perceived_positions()
@@ -457,10 +363,7 @@ class HerdingEnv(gym.Env):
def _build_single_obs(self) -> np.ndarray:
if self._tracker is not None:
# Obs sees only the tracker's active set; penned tracks are
# intentionally excluded (matches the prior receiver-based
# behaviour where penned sheep stopped contributing to the
# symbolic obs).
# LiDAR mode: the obs sees only the tracker's active set.
active = self._tracker.get_positions()
sheep_xy_list = list(active.values())
sheep_penned_list = [False] * len(sheep_xy_list)
@@ -477,22 +380,18 @@ class HerdingEnv(gym.Env):
single = self._build_single_obs()
if self._frame_stack <= 1:
return single
# On a fresh reset the buffer is empty — duplicate the first
# frame so the stack is always full-length.
# On reset the buffer is empty — pad with copies of the first frame.
if not self._frame_buffer:
self._frame_buffer = [single.copy() for _ in range(self._frame_stack)]
else:
self._frame_buffer.append(single)
if len(self._frame_buffer) > self._frame_stack:
self._frame_buffer = self._frame_buffer[-self._frame_stack:]
# Concatenate oldest → newest.
return np.concatenate(self._frame_buffer, axis=0).astype(np.float32)
# ------------------------------------------------------------------
# LiDAR perception helpers
# ------------------------------------------------------------------
# --- LiDAR perception ---
def _all_sheep_xy(self) -> list[tuple[float, float]]:
"""Every sheep, including penned ones (the LiDAR sees them)."""
"""Every sheep, including penned (the LiDAR sees them)."""
return [(float(self.sheep_x[i]), float(self.sheep_y[i]))
for i in range(self.n_sheep)]
@@ -508,19 +407,14 @@ class HerdingEnv(gym.Env):
self._tracker.update(detections)
def perceived_positions(self) -> dict[str, tuple[float, float]]:
"""Public accessor — what the controller would 'see' this step.
LiDAR mode → the tracker's active set.
Privileged mode → ground-truth active sheep.
Used by ``training.eval`` and ``tools.collect_demos`` so analytic
teachers run on the same perception the deployed controller has.
"""What the controller would "see" this step: tracker output in
LiDAR mode, ground truth in privileged mode. Used by demo
collection and analytic-policy eval so the teacher runs on the
same perception the deployed controller has.
"""
if self._tracker is not None:
return self._tracker.get_positions()
return {f"s{i}": (float(self.sheep_x[i]), float(self.sheep_y[i]))
for i in range(self.n_sheep) if not self.sheep_penned[i]}
# Internal alias so the imitation reward path doesn't need to know
# which mode it's in.
_perceived_positions = perceived_positions